Reputation:
I am writing a Scrapy crawler and I want it to send the data to a database, but I can't make it work, maybe because of the pipeline. Here is my spider:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request


class YourCrawler(CrawlSpider):
    name = "bookstore"
    start_urls = [
        'https://example.com/materias/?novedades=LC&p',
    ]
    allowed_domains = ["example.com"]

    def parse(self, response):
        # Go to the urls in the list
        s = Selector(response)
        page_list_urls = s.xpath('//*[@id="results"]/ul/li/div[1]/h4/a[2]/@href').extract()
        for url in page_list_urls:
            yield Request(response.urljoin(url), callback=self.parse_following_urls, dont_filter=True)

        # Go back and follow the next page in div#paginat ul li.next a::attr(href), then begin again
        next_page = response.css('div#paginat ul li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

        # Don't know if this has to go here
        if not s.xpath('//*[@id="logo"]/a/img'):
            yield Request(url=response.url, dont_filter=True)

    # For the urls in the list, go inside, and in div#main take div.ficha > div.caracteristicas > ul > li
    def parse_following_urls(self, response):
        # Parsing rules go here
        for each_book in response.css('div#main'):
            yield {
                'book_isbn': each_book.css('div.ficha > div.caracteristicas > ul > li').extract(),
            }

    custom_settings = {
        "DOWNLOAD_DELAY": 5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 2,
    }
And I want it to send the data to a database, so in pipelines.py I have:
import pymysql
from scrapy.exceptions import DropItem
from scrapy.http import Request


class to_mysql(object):

    def __init__(self):
        self.connection = pymysql.connect("***", "***", "***", "***", charset="utf8", use_unicode=True)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO _b (book_isbn) VALUES (%s)", (item['book_isbn'].encode('utf-8')))
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connection.close()
And in settings.py I have:

ITEM_PIPELINES = {
    'bookstore.pipelines.BookstorePipeline': 300,
    'bookstore.pipelines.to_mysql': 300,
}
If I activate the «to_mysql» pipeline in settings.py, it doesn't work and returns this traceback:
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/***/scrapy/bookstore/bookstore/pipelines.py", line 27, in process_item
    self.cursor.execute("INSERT INTO _b (book_isbn) VALUES (%s)", (item['book_isbn'].encode('utf-8')))
AttributeError: 'list' object has no attribute 'encode'
2017-07-09 16:19:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://example.com/book/?id=9788416495412> (referer: https://example.com/materias/?novedades=LC&p)
2017-07-09 16:19:48 [scrapy.core.scraper] ERROR: Error processing {'book_isbn': [u'<li>Editorial: <a href="/search/avanzada/?go=1&editorial=Galaxia%20Gutenberg">Galaxia Gutenberg</a></li>', u'<li>P\xe1ginas: 325</li>', u'<li>A\xf1o: 2017</li>', u'<li>Precio: 21.90 \u20ac</li>', u'<li>Traductor: Pablo Moreno</li>', u'<li>EAN: 9788416495412</li>']}
Any idea why this is happening?
Upvotes: 0
Views: 146
Reputation: 18799
This is because you are returning a list in the book_isbn field: .extract() returns a list, and a list can't be encoded into an SQL query. You'll have to serialize that value, or, if you didn't want a list at all, use extract_first() instead, as in the sketch below.
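A minimal, untested sketch of both options, reusing the table and field names from your code (the comma separator for the join and dropping the manual .encode('utf-8') call are my own choices; pymysql encodes query parameters itself when the connection uses charset="utf8"):

# Option 1: keep the list in the spider and serialize it in the pipeline
def process_item(self, item, spider):
    isbn = u', '.join(item['book_isbn'])  # list of strings -> one unicode string
    self.cursor.execute("INSERT INTO _b (book_isbn) VALUES (%s)", (isbn,))
    self.connection.commit()
    return item

# Option 2: yield a single value from the spider instead of a list
def parse_following_urls(self, response):
    for each_book in response.css('div#main'):
        yield {
            'book_isbn': each_book.css('div.ficha > div.caracteristicas > ul > li').extract_first(),
        }

Note that extract_first() only returns the first matching <li>, so with option 2 you may also want a more specific selector that targets just the EAN/ISBN line.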
Upvotes: 1