Reputation: 3062
I wrote a custom spider to recursively scrape pages of a website and store the details of each crawl in my Postgres database:
import scrapy
import transaction  # the transaction manager used with my SQLAlchemy Session
from models import Session, Crawl  # my SQLAlchemy session and Crawl model

class MySpider(scrapy.Spider):
    name = 'my_spider'

    def __init__(self):
        self.start_urls = ['http://www.example.com']

    def parse(self, response):
        yield scrapy.Request(self.start_urls[0], callback=self.parse_page)

    def parse_page(self, response):
        with transaction.manager:
            crawl = Crawl()
            crawl.url = response.request.url
            crawl.response_body = response.body
            Session.add(crawl)
            Session.flush()
        if len(response.css('.pager-next')) == 1:
            # build url for the next page to crawl
            # ...
            yield scrapy.Request(url=full_url, callback=self.parse_page)
The problem is that I want to get back a list of ids for the crawls that were added to the database, so that another function can use them:
from scrapy.crawler import CrawlerProcess

def scrape_website():
    process = CrawlerProcess()
    process.crawl(MySpider)
    process.start()  # <-- how to return crawl ids?
    parse_crawls(crawl_ids)
Any ideas?
Upvotes: 0
Views: 652
Reputation: 5181
You should use an Item Pipeline to store your data in PostgreSQL.
Take a look at this pipelines.py example from this article:
import psycopg2

from scrapy_example_com.items import *

class ScrapyExampleComPipeline(object):
    def __init__(self):
        self.connection = psycopg2.connect(host='localhost',
                                           database='scrapy_example_com',
                                           user='postgres')
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        # check the item type to decide which table to insert into
        try:
            if type(item) is CustomerItem:
                self.cursor.execute(
                    """INSERT INTO customers
                       (id, firstname, lastname, phone, created_at, updated_at, state)
                       VALUES (%s, %s, %s, %s, %s, %s, %s)""",
                    (item.get('id'), item.get('firstname'), item.get('lastname'),
                     item.get('phone'), item.get('created_at'),
                     item.get('updated_at'), item.get('state')))
            elif type(item) is CategoryItem:
                self.cursor.execute(
                    """INSERT INTO categories (id, name) VALUES (%s, %s)""",
                    (item.get('id'), item.get('code')))
            self.connection.commit()
        except psycopg2.DatabaseError as e:
            self.connection.rollback()
            print("Error: %s" % e)
        return item
And don't forget to update your settings.py:
ITEM_PIPELINES = {
    'scrapy_example_com.pipelines.ScrapyExampleComPipeline': 300,
}
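To actually get the ids back into scrape_website(), one approach is to have the pipeline collect each inserted id on the spider instance, and to keep a handle on the crawler so you can read them off after process.start() returns. A minimal sketch, not from the article: the crawls table, its columns, and the crawl_ids attribute are illustrative names, and it assumes PostgreSQL's INSERT ... RETURNING id to get the new primary key back.
import psycopg2
from scrapy.crawler import CrawlerProcess

class CrawlIdPipeline(object):
    def open_spider(self, spider):
        self.connection = psycopg2.connect(host='localhost',
                                           database='scrapy_example_com',
                                           user='postgres')
        self.cursor = self.connection.cursor()
        spider.crawl_ids = []  # collected ids live on the spider and outlive the crawl

    def process_item(self, item, spider):
        # RETURNING id (PostgreSQL) hands back the freshly inserted primary key
        self.cursor.execute(
            """INSERT INTO crawls (url, response_body)
               VALUES (%s, %s) RETURNING id""",
            (item.get('url'), item.get('response_body')))
        spider.crawl_ids.append(self.cursor.fetchone()[0])
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connection.close()

def scrape_website():
    process = CrawlerProcess()
    crawler = process.create_crawler(MySpider)  # keep a reference to the crawler
    process.crawl(crawler)
    process.start()  # blocks until the crawl finishes
    parse_crawls(crawler.spider.crawl_ids)
If you keep the SQLAlchemy code from your question instead, the same idea applies: append crawl.id to spider.crawl_ids right after Session.flush(), since the flush is what populates the primary key.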
Upvotes: 1