Reputation: 46
I have a spider which exports data to different CSV files (named after the item class definitions used by the spider). However, I also wanted the fields to be written in a specific order as they were processed and exported into their respective CSV files.
For example, this is my items.py:
import scrapy
class first_class_def_Item(scrapy.Item):
    """Item with fields f1, f2 and f3, declared in the desired CSV order."""

    # f1 is an arbitrary id used by both item class definitions.
    f1 = scrapy.Field()
    f2 = scrapy.Field()
    f3 = scrapy.Field()
class second_class_def_Item(scrapy.Item):
    """Item with fields f1, f4, f5 and f6, declared in the desired CSV order."""

    # f1 is the same arbitrary id that first_class_def_Item carries.
    f1 = scrapy.Field()
    f4 = scrapy.Field()
    f5 = scrapy.Field()
    f6 = scrapy.Field()
This is my pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher
def item_type(item):
    """Return the CSV base name for *item*.

    The CSV file names are derived from the item class names used by the
    spider: the class name with a trailing ``_Item`` removed, so a
    ``first_class_def_Item`` instance maps to ``first_class_def`` (i.e.
    "first_class_def.csv" rather than "first_class_def_Item.csv").
    Class names without that suffix are returned unchanged.
    """
    suffix = '_Item'
    name = type(item).__name__
    # Strip only the trailing suffix; str.replace() would also remove an
    # '_Item' that happens to occur in the middle of a class name.
    if name.endswith(suffix):
        return name[:-len(suffix)]
    return name
class SomeSitePipeline(object):
    """Route each scraped item to a CSV file named after its item class.

    One ``CsvItemExporter`` is created per entry in ``SaveTypes`` when the
    spider opens. Items whose type name is not listed are passed through
    without being exported.
    """

    # For simplicity, these are the same class-definition names as found in
    # the main scrapy spider and in items.py (without the "_Item" suffix).
    SaveTypes = ['first_class_def', 'second_class_def']

    def __init__(self):
        # Start with empty maps so spider_closed is safe even if
        # spider_opened never ran.
        self.files = {}
        self.exporters = {}
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Open one CSV file per saved type and start its exporter."""
        for name in self.SaveTypes:
            # Binary mode: the exporter is handed the raw file object.
            csv_file = open("/somefolder/" + name + '.csv', 'wb')
            self.files[name] = csv_file
            exporter = CsvItemExporter(csv_file)
            exporter.start_exporting()
            self.exporters[name] = exporter

    def spider_closed(self, spider):
        """Finish every exporter, then close the underlying files."""
        # Plain loops: these calls are made purely for their side effects.
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for csv_file in self.files.values():
            csv_file.close()

    def process_item(self, item, spider):
        """Export *item* if its type has an exporter; always return it."""
        types_item = item_type(item)
        # Direct dict membership replaces rebuilding a set for every item.
        if types_item in self.exporters:
            self.exporters[types_item].export_item(item)
        return item
And this is my spider.py:
import os
import scrapy
from itertools import zip_longest
from somesite.items import first_class_def_Item, second_class_def_Item
from csv import DictReader
path = os.path.join(os.path.expanduser('~'), 'user', 'somefolder', 'IDs.csv')
class SomeSiteSpider(scrapy.Spider):
    """Log in to somesite.com, then request two reports per id in IDs.csv.

    Page 1 results are emitted as ``first_class_def_Item`` and page 2
    results as ``second_class_def_Item``.
    """

    name = 'somesite'
    allowed_domains = ['somesite.com']
    start_urls = ['https://somesite.com/login.aspx']

    def parse(self, response):
        """Submit the login form; ``Tables`` handles the response."""
        return scrapy.FormRequest.from_response(
            response,
            formdata={'txtLogin$txtInput': 'User',
                      'txtPassword$txtInput': 'pass',
                      'btnLogin.x': '53',
                      'btnLogin.y': '33'},
            callback=self.Tables)

    def Tables(self, response):
        """Yield one page-1 and one page-2 request for every row of IDs.csv."""
        with open(path) as rows:
            for row in DictReader(rows):
                # The callbacks read meta['id'] and meta['some_form_control'],
                # so the requests must carry exactly those keys.
                # NOTE(review): some_form_control is not defined in the
                # visible code; it has to be provided at module level.
                meta = {'id': row["id"],
                        'some_form_control': some_form_control}
                yield scrapy.Request("https://somesite.com/page1.aspx",
                                     meta=dict(meta),
                                     dont_filter=True,
                                     callback=self.first_class_def)
                yield scrapy.Request("https://somesite.com/page2.aspx",
                                     meta=dict(meta),
                                     dont_filter=True,
                                     callback=self.second_class_def)

    def first_class_def(self, response):
        """Submit the page-1 report form for the id carried in the meta."""
        return scrapy.FormRequest.from_response(
            response,
            formdata={'id': response.meta['id'],
                      'form_control': response.meta['some_form_control'],
                      'SearchControl$btnCreateReport': 'Create Report'},
            meta={'id': response.meta['id']},
            callback=self.scrap_page_1)

    def scrap_page_1(self, response):
        """Yield one ``first_class_def_Item`` per (text, currency) pair."""
        field_1 = response.xpath('//*[@class="formatText"][1]/text()').extract()
        field_2 = response.xpath('//*[@class="formatCurrency"][1]/text()').extract()
        for a, b in zip(field_1, field_2):
            # Build a fresh item per row; re-yielding one shared instance
            # would let later rows overwrite earlier ones.
            item = first_class_def_Item()
            item['f1'] = response.meta['id']
            item['f2'] = a
            item['f3'] = b
            yield item

    def second_class_def(self, response):
        """Submit the page-2 report form for the id carried in the meta."""
        return scrapy.FormRequest.from_response(
            response,
            formdata={'id': response.meta['id'],
                      'form_control': response.meta['some_form_control'],
                      'form_control_two': 'some_form_control_two',
                      'SearchControl$btnCreateReport': 'Create Report'},
            meta={'id': response.meta['id']},
            callback=self.scrap_page_2)

    def scrap_page_2(self, response):
        """Yield one ``second_class_def_Item`` per row of the three columns."""
        field_1 = response.xpath('//*[@class="formatText"][1]/text()').extract()
        field_2 = response.xpath('//*[@class="formatCurrency"][1]/text()').extract()
        field_3 = response.xpath('//*[@class="formatText"][3]/text()').extract()
        for a, b, c in zip(field_1, field_2, field_3):
            # Fresh item per row, for the same reason as in scrap_page_1.
            item = second_class_def_Item()
            item['f1'] = response.meta['id']
            item['f4'] = a
            item['f5'] = b
            item['f6'] = c
            yield item
As the spider processed and exported data, I was looking for a way to have the fields in the generated CSV files, "first_class_def.csv" and "second_class_def.csv", exported in the same order as they are declared in items.py:
f1,f2,f3
and
f1,f4,f5,f6
However, whenever I would crawl the spider, the fields within the CSV files were being exported in random order:
f2,f1,f3 and f5,f1,f4,f6
The solution is posted below!
Upvotes: 1
Views: 1212
Reputation: 46
This is the solution to my specific problem: export fields organized per the items class definition as defined in the items.py of a scrapy spider project.
So after tinkering with this problem and implementing @stranac's suggestion of getting rid of the list comprehension, I came up with the following solution, which exports all fields in order into their respective CSV files:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher
def item_type(item):
    """Return the item's class name without a trailing ``_Item``.

    This yields "first_class_def" (for "first_class_def.csv") rather than
    "first_class_def_Item". Names without the suffix are returned unchanged.
    """
    suffix = '_Item'
    name = type(item).__name__
    # Only the trailing suffix is removed; str.replace() would also strip
    # an '_Item' appearing in the middle of a class name.
    if name.endswith(suffix):
        return name[:-len(suffix)]
    return name
class SomeSitePipeline(object):
    """Export each item type to its own CSV file with a fixed column order.

    ``fileNamesCsv`` lists the CSV base names to create, and
    ``fieldsToExport`` gives the column order for each of them. A name
    without an entry falls back to the exporter's default field handling.
    """

    fileNamesCsv = ['first_class_def', 'second_class_def']

    # Column order per CSV base name; replaces the per-name if-branches.
    fieldsToExport = {
        'first_class_def': ['f1', 'f2', 'f3'],
        'second_class_def': ['f1', 'f4', 'f5', 'f6'],
    }

    def __init__(self):
        self.files = {}
        self.exporters = {}
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Open one CSV file per name and start an ordered exporter on it."""
        for name in self.fileNamesCsv:
            csv_file = open("/somefolder/" + name + '.csv', 'wb')
            self.files[name] = csv_file
            exporter = CsvItemExporter(
                csv_file,
                fields_to_export=self.fieldsToExport.get(name),
            )
            # Started for every name, not only the two hard-coded ones.
            exporter.start_exporting()
            self.exporters[name] = exporter

    def spider_closed(self, spider):
        """Finish every exporter, then close the underlying files."""
        # Plain loops: these calls are made purely for their side effects.
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for csv_file in self.files.values():
            csv_file.close()

    def process_item(self, item, spider):
        """Export *item* if its type has an exporter; always return it."""
        types_item = item_type(item)
        # Direct dict membership replaces rebuilding a set for every item.
        if types_item in self.exporters:
            self.exporters[types_item].export_item(item)
        return item
Now everything works as I originally intended.
Upvotes: 0
Reputation: 28256
Unfortunately, due to the way scrapy's Item is implemented, the information about the order of field definitions is not preserved.
If the order matters, the best you can do is define the order you want as a separate class variable, and use that in your pipeline. Passing the fields_to_export argument to CsvItemExporter would probably be simplest.
Here's a basic idea you can play around with:
# items.py
class Item1(scrapy.Item):
    """Example item that declares its own CSV column order."""

    # Desired export order; every name must match a Field declared below
    # (the original listed 'fi', a typo for 'f1').
    fields_to_export = ['f1', 'f2']
    f1 = scrapy.Field()
    f2 = scrapy.Field()
# pipelines.py
from project.items import Item1
class SomeSitePipeline(object):
    """Sketch: give each exporter the column order its item class declares."""

    # CSV base name -> item class that carries a fields_to_export list.
    save_types = {'item1': Item1}

    def spider_opened(self, spider):
        # (...)
        exporters = {}
        for name, item_cls in self.save_types.items():
            exporters[name] = CsvItemExporter(
                self.files[name],
                fields_to_export=item_cls.fields_to_export,
            )
        self.exporters = exporters
        # (...)
Also, I just noticed you're using list comprehensions for side effects, which is a bad idea; you should just use a normal loop instead.
Upvotes: 1