Reputation: 15558
I am trying to crawl a paginated list of Catalog entries, which works fine.
But each Catalog also has a paginated list of DataSet entries,
and only the first page of those appears in the result. I am trying to get a result that looks like the one below, but with all 24
nodes present, corresponding to the 24
DataSets spread over pages of 6 items each.
[{'data_sets_count': 24,
'description': 'The catalog contains data regarding various indicators of '
'HMIS like Health, Abortions, Immunisation, AEFI, Adolescent, '
'Bite, Sting, Disease, Diarrhoeal, Hypertension, HIV, AIDS, '
'Malaria, Neurological, Stroke, Fever, Respiratory, '
'Infection, suicide, Trauma, Accident, Burn, Tuberculosis, '
'VHND, ASHA, JSY, CHC, PHC, SDH, DH, Hospital.',
'last_updated': '11/08/17',
'ministry_department': 'Ministry of Health and Family Welfare, Department of '
'Health and Family Welfare',
'nodes': [{'node': '3183861',
'title': 'Item-wise report for North Goa of Goa upto '
'April-2014-15'},
{'node': '3183881',
'title': 'Item-wise report for North Goa of Goa upto May-2014-15'},
{'node': '3183981',
'title': 'Item-wise report for North Goa of Goa upto '
'October-2014-15'},
{'node': '3184021',
'title': 'Item-wise report for North Goa of Goa upto '
'December-2014-15'},
{'node': '3184061',
'title': 'Item-wise report for North Goa of Goa upto '
'February-2014-15'},
{'node': '3183961',
'title': 'Item-wise report for North Goa of Goa upto '
'September-2014-15'}],
'state_department': None,
'title': 'HMIS sub district level item-wise monthly report of Goa',
'url': '/catalog/hmis-sub-district-level-item-wise-monthly-report-goa'}]
import scrapy
class Category(scrapy.Item):
    """Container for one catalog entry scraped from data.gov.in.

    Fields are populated by CatalogSpider.parse; `nodes` is filled in
    incrementally by the dataset-page callback.
    """

    title = scrapy.Field()                # catalog title text
    url = scrapy.Field()                  # relative link to the catalog page
    ministry_department = scrapy.Field()  # owning central ministry/department
    description = scrapy.Field()          # free-text catalog description
    state_department = scrapy.Field()     # owning state department, if any
    last_updated = scrapy.Field()         # last-changed date string
    data_sets_count = scrapy.Field()      # dataset count advertised on the listing
    data_sets = scrapy.Field()
    item = scrapy.Field()
    nodes = scrapy.Field()                # list of {'node', 'title'} dicts, one per dataset
class CatalogSpider(scrapy.Spider):
    """Crawl the paginated catalog listing on data.gov.in and, for each
    catalog, follow its (also paginated) dataset listing, accumulating
    every dataset node into a single Category item.

    The Category item is carried from page to page in ``request.meta``
    and is yielded exactly once, when the last dataset page is reached.
    """

    name = 'catalogspider'
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1']

    def parse(self, response):
        """Parse one catalog listing page: build a Category per row and
        dispatch a request for its dataset listing."""
        for catalog in response.css('.view-catalogs > div > .views-row-6'):
            category = Category()
            category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first()
            category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first()
            category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first()
            category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first()
            category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first()
            category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first()
            # The listing shows the count as "(N)"; pull out the number.
            category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0])
            category['nodes'] = []
            # Carry the partially-built item to the dataset callback via meta
            # so nodes can keep accumulating across dataset pages.
            yield scrapy.Request(
                response.urljoin(category['url']),
                callback=self.parseDataSets,
                meta={'item': category},
            )

        # Follow catalog-list pagination.
        for next_page in response.css('li.pager-next > a'):
            yield response.follow(next_page, self.parse)

    def parseDataSets(self, response):
        """Parse one dataset page, appending its rows to the in-flight item.

        Yields a follow-up Request while more pages remain; yields the
        completed item only on the last page.
        """
        item = response.meta['item']
        for dataset in response.css('.view-resource-detail-popup > div > .views-row'):
            item['nodes'].append({
                # The node id is the first class token on the CSV link element.
                'node': dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0],
                'title': dataset.css('.views-field-title .field-content .title-content::text').extract_first(),
            })

        next_page = response.css('li.pager-next a::attr(href)').extract_first()
        if next_page:
            # BUG FIX: the original built this Request but yielded the item
            # instead, so pages past the first were never fetched and a
            # partial item was emitted once per page.
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parseDataSets,
                meta={'item': item},
            )
        else:
            # Last page reached: the item now holds nodes from every page.
            yield item
Upvotes: 1
Views: 644
Reputation: 15558
I got it working with the code below, though I'm not sure if this is the correct way to do it. I add each DataSet
to a meta variable category
and yield None
for intermediate pages; on the last page I yield the meta variable category
. It sounds a bit hacky, but it works now.
import scrapy
class Category(scrapy.Item):
    """One catalog entry from data.gov.in, with its accumulated datasets.

    `nodes` grows as dataset pages are crawled; `data_sets_actual_count`
    records how many nodes were actually collected, for comparison with
    the advertised `data_sets_count`.
    """

    title = scrapy.Field()                   # catalog title text
    url = scrapy.Field()                     # relative link to the catalog page
    ministry_department = scrapy.Field()     # owning central ministry/department
    description = scrapy.Field()             # free-text catalog description
    state_department = scrapy.Field()        # owning state department, if any
    last_updated = scrapy.Field()            # last-changed date string
    data_sets_count = scrapy.Field()         # dataset count advertised on the listing
    data_sets_actual_count = scrapy.Field()  # len(nodes) after the crawl finishes
    data_sets = scrapy.Field()
    item = scrapy.Field()
    nodes = scrapy.Field()                   # list of {'node', 'title', 'url'} dicts
class CatalogSpider(scrapy.Spider):
    """Crawl the catalog listing and, per catalog, every page of its
    dataset listing, yielding one completed Category item per catalog.

    The in-flight Category travels between callbacks in ``request.meta``
    and is yielded exactly once, when no further dataset page exists.
    """

    name = 'catalogspider'
    start_urls = ['https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1']

    def parse(self, response):
        """Parse one catalog listing page into Category items and request
        each catalog's dataset listing."""
        for catalog in response.css('.view-catalogs > div > .views-row-6'):
            category = Category()
            category['title'] = catalog.css('.views-field-title .field-content a::text').extract_first()
            category['url'] = catalog.css('.views-field-title .field-content a::attr(href)').extract_first()
            category['ministry_department'] = catalog.css('.views-field-field-ministry-department .field-content ::text').extract_first()
            category['description'] = catalog.css('.views-field-body .field-content ::text').extract_first()
            category['state_department'] = catalog.css('.views-field-field-state-department .field-content ::text').extract_first()
            category['last_updated'] = catalog.css('.views-field-changed .field-content ::text').extract_first()
            # The listing shows the count as "(N)"; pull out the number.
            category['data_sets_count'] = int(catalog.css('.views-field-resource-count-last .count-resource::text').re(r'\((.*?)\)')[0])
            category['nodes'] = []
            yield scrapy.Request(
                response.urljoin(category['url']),
                callback=self.parse_data_sets,
                meta={'category': category},
            )

        # Catalog-list pagination intentionally disabled during testing:
        # for next_page in response.css('li.pager-next > a'):
        #     yield response.follow(next_page, self.parse)

    def parse_data_sets(self, response):
        """Parse one dataset page, appending rows to the in-flight category.

        Restructured from the original: no ``yield None`` placeholder is
        needed (a callback may simply not yield), and the item is yielded
        from a single place so it can never be emitted twice.
        """
        category = response.meta['category']
        for dataset in response.css('.view-resource-detail-popup > div > .views-row'):
            # The node id is the first class token on the CSV link element.
            node = dataset.css('.data-extension.csv::attr(class)').extract_first().split()[0]
            title = dataset.css('.views-field-title .field-content .title-content::text').extract_first()
            category['nodes'].append({
                'node': node,
                'title': title,
                'url': 'https://data.gov.in/node/' + node + '/download',
            })

        next_page = response.css('li.pager-next a::attr(href)').extract_first()
        if next_page:
            # More dataset pages remain: keep accumulating into the same item.
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse_data_sets,
                meta={'category': category},
            )
        else:
            # Last page: record the real node count and emit the item once.
            category['data_sets_actual_count'] = len(category['nodes'])
            yield category
One of my problems was setting the wrong depth limit in my command; I later changed it to a bigger number, which avoided random issues on unknown domains:
scrapy parse --spider=catalogspider -d 60 'https://data.gov.in/catalogs#sort_by=created&sort_order=DESC&items_per_page=9&page=1'
Upvotes: 1