user7499416

Scrapy: connect to MySQL

I am writing a Scrapy crawler, and I want it to send the data to a database. But I can't make it work, maybe because of the pipeline. Here is my spider:

from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request

class YourCrawler(CrawlSpider):
    name = "bookstore"
    start_urls = [
    'https://example.com/materias/?novedades=LC&p',
    ]
    allowed_domains = ["example.com"]

    def parse(self, response):
        # go to the urls in the list
        s = Selector(response)
        page_list_urls = s.xpath('//*[@id="results"]/ul/li/div[1]/h4/a[2]/@href').extract()
        for url in page_list_urls:
            yield Request(response.urljoin(url), callback=self.parse_following_urls, dont_filter=True)

        # Go back and follow the next page in div#paginat ul li.next a::attr(href), then begin again
        next_page = response.css('div#paginat ul li.next a::attr(href)').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

        # Don't know if this has to go here
        if not s.xpath('//*[@id="logo"]/a/img'):
            yield Request(url=response.url, dont_filter=True)

    # For the urls in the list, go inside, and in div#main, take the div.ficha > div.caracteristicas > ul > li
    def parse_following_urls(self, response):
        #Parsing rules go here
        for each_book in response.css('div#main'):
            yield {
                'book_isbn': each_book.css('div.ficha > div.caracteristicas > ul > li').extract(),
            }
    custom_settings = {
        "DOWNLOAD_DELAY": 5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 2
    }

And I want it to send the data to a database, so in pipelines.py I have

import pymysql
from scrapy.exceptions import DropItem
from scrapy.http import Request

class to_mysql(object):
    def __init__(self):
        self.connection = pymysql.connect("***","***","***","***", charset="utf8", use_unicode=True)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        self.cursor.execute("INSERT INTO _b (book_isbn) VALUES (%s)", (item['book_isbn'].encode('utf-8')))
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connection.close()

And in settings.py

ITEM_PIPELINES = {
   'bookstore.pipelines.BookstorePipeline': 300,
   'bookstore.pipelines.to_mysql': 300,
}

If I activate the «to_mysql» pipeline in settings.py, it doesn't work and returns this traceback:

Traceback (most recent call last):
  File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/***/scrapy/bookstore/bookstore/pipelines.py", line 27, in process_item
    self.cursor.execute("INSERT INTO _b (book_isbn) VALUES (%s)", (item['book_isbn'].encode('utf-8')))
AttributeError: 'list' object has no attribute 'encode'
2017-07-09 16:19:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://example.com/book/?id=9788416495412> (referer: https://example.com/materias/?novedades=LC&p)
2017-07-09 16:19:48 [scrapy.core.scraper] ERROR: Error processing {'book_isbn': [u'<li>Editorial: <a href="/search/avanzada/?go=1&amp;editorial=Galaxia%20Gutenberg">Galaxia Gutenberg</a></li>', u'<li>P\xe1ginas: 325</li>', u'<li>A\xf1o: 2017</li>', u'<li>Precio: 21.90 \u20ac</li>', u'<li>Traductor: Pablo Moreno</li>', u'<li>EAN: 9788416495412</li>']}

Any idea why this is happening?

Upvotes: 0

Views: 146

Answers (1)

eLRuLL

Reputation: 18799

This is because you are returning a list in the book_isbn field: .extract() returns a list, and a list can't be encoded into an SQL query.

You'll have to serialize that value; or, if you didn't want a list, use extract_first().
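For illustration, a minimal sketch of both fixes, reusing the field and table names from your question (the comma-joined string format is my own assumption; adjust the serialization to whatever your schema expects):

# Option 1: keep the list in the spider and serialize it in the pipeline
def process_item(self, item, spider):
    # join the extracted <li> strings into a single string MySQL can store
    book_isbn = ', '.join(item['book_isbn'])
    self.cursor.execute("INSERT INTO _b (book_isbn) VALUES (%s)", (book_isbn,))
    self.connection.commit()
    return item

# Option 2: yield a single string from the spider instead of a list
def parse_following_urls(self, response):
    for each_book in response.css('div#main'):
        yield {
            # extract_first() returns the first match as a string (or None)
            'book_isbn': each_book.css('div.ficha > div.caracteristicas > ul > li').extract_first(),
        }

Either way, passing the value as a query parameter lets pymysql handle escaping and encoding (you already set charset="utf8" on the connection), so the manual .encode('utf-8') is unnecessary.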

Upvotes: 1
