How to parse this info into individual items?

Question

I have scraped information from a web page using the Scrapy spider below. How can I turn it into individual items, so that each item consists of a name, size, link, extension, month, and year?

Here is the code of the spider:

import scrapy
from scrapy.crawler import CrawlerProcess


class MapSpider(scrapy.Spider):
    """Scrape the 'nominas-de-empleados' transparency page on map.gob.do.

    ``parse`` yields three kinds of dicts, in this order:

    * one per year tab: ``{'year', 'id'}``
    * one per year table: ``{'yearId', 'months', 'monthsIds'}``
    * one per month inside a year table:
      ``{'monthId', 'itemsNames', 'itemsSizes', 'itemsLinks', 'itemsExt'}``
      where each ``items*`` value is a list of strings.
    """

    name = 'map'
    allowed_domains = ['map.gob.do']

    def start_requests(self):
        """Yield the initial request(s); every response is handled by ``parse``."""
        start_urls = [
            'https://map.gob.do/transparencia/recursos-humanos/nominas-de-empleados/']
        for url in start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract year tabs, per-year month lists and per-month file lists.

        Tables without an id, and month links whose href is not a
        ``#fragment``, are skipped with a warning: their values are spliced
        into CSS selectors below, and an empty or non-fragment value would
        produce an invalid selector and abort the whole callback.
        """
        # Absolute XPaths into the page layout: the <li> year tabs and the
        # <div> containers those tabs refer to.
        panes = response.xpath('/html/body/div[8]/div/section/div/div/div[2]/div/div/div[3]/ul/li')
        tables = response.xpath('/html/body/div[8]/div/section/div/div/div[2]/div/div/div[3]/div/div')

        for pane in panes:
            yield {
                'year': pane.css('::text').get(default=''),
                'id': pane.css('::attr(href)').get(default=''),
            }

        for d, table in enumerate(tables, 1):
            year_id = table.css('.tab-pane ::attr(id)').get(default='')
            if not year_id:
                # '#' + '' would give the invalid selector '#.tab-pane ...'.
                self.logger.warning('Skipping table %d: no tab-pane id found', d)
                continue

            # Shared selector prefix for the month navigation links of this year.
            nav_links = '#' + year_id + '.tab-pane .vr-tabs-nav-link '
            months = table.css(nav_links + '::text').getall()
            month_ids = table.css(nav_links + '::attr(href)').getall()
            print(f'|||YEAR \' {d} \' INFO |||')
            yield {
                'yearId': year_id,
                'months': months,
                'monthsIds': month_ids,
            }

            for c, month_id in enumerate(month_ids, 1):
                # The href is used verbatim as an id selector, so it must
                # look like '#some-id'.
                if not month_id.startswith('#') or month_id == '#':
                    self.logger.warning(
                        'Skipping month link %r in %s: not a #fragment',
                        month_id, year_id)
                    continue

                # NOTE(review): names are selected under every <tr>, while
                # links/sizes/extensions only under <tr class="file">, so the
                # four lists may differ in length -- confirm this is intended.
                item_names = table.css(month_id + ' tr .wpfd_downloadlink ::attr(title)').getall()
                file_rows = month_id + ' tr.file '
                item_links = table.css(file_rows + '.wpfd_downloadlink ::attr(href)').getall()
                item_sizes = table.css(file_rows + '.file_size::text').getall()
                item_exts = table.css(file_rows + '.wpfd_downloadlink > span > span ::attr(class)').getall()
                print(f'|||MONTH \' {c} \' INFO |||')
                yield {
                    'monthId': month_id,
                    'itemsNames': item_names,
                    'itemsSizes': item_sizes,
                    'itemsLinks': item_links,
                    'itemsExt': item_exts,
                }

# Run the spider directly from this script: create a CrawlerProcess,
# register MapSpider with it, then start the crawl.
process = CrawlerProcess()
process.crawl(MapSpider)
process.start()

Lukas Schmid · Accepted Answer

Currently, each table.css(...).getall() call returns a list of values, and you pack all of these lists into a single yield. The advantage of yield over return is that you can choose how much to emit at a time.

Replace the general yield with the more specific one you want, e.g.:

for i in range(min(map(len, [itemNames, itemsLinks, itemsSizes, itemsExt]))):
    yield {
         'monthId': monthId,
         'itemsNames': itemNames[i],
         'itemsSizes': itemsSizes[i], 
         'itemsLinks': itemsLinks[i],
         'itemsExt': itemsExt[i]
          }

How to parse this info into individual items?

Answers (1)

Related Questions