Reputation: 41
My Scrapy crawler collects data from the PTT website and writes the crawled data into a Google spreadsheet using gspread. My PTT spider parses the latest 40 posts on the site every day, and I would now like to drop duplicates from those 40 posts: if the post title or post link is the same as one from yesterday, the post should not be written to the spreadsheet.
I know I should use DropItem in Scrapy, but I honestly don't know how to fix my code (I am a complete beginner in Python), so I would like to ask for help with this. Thanks.
This is my PTT spider code:
# -*- coding: utf-8 -*-
import scrapy

from myFirstScrapyProject.items import MyfirstscrapyprojectItem

class PttSpider(scrapy.Spider):
    name = 'ptt'
    allowed_domains = ['www.ptt.cc']
    start_urls = [
        'https://www.ptt.cc/bbs/e-shopping/search?q=%E8%9D%A6%E7%9A%AE',
        'https://www.ptt.cc/bbs/e-seller/search?q=%E8%9D%A6%E7%9A%AE',
    ]

    def parse(self, response):
        for q in response.css('div.r-ent'):
            # create a fresh item per post row, instead of mutating
            # and re-yielding one shared item instance
            items = MyfirstscrapyprojectItem()
            items['push'] = q.css('div.nrec > span.h1::text').extract_first()
            items['title'] = q.css('div.title > a::text').extract_first()
            items['href'] = q.css('div.title > a::attr(href)').extract_first()
            items['date'] = q.css('div.meta > div.date::text').extract_first()
            items['author'] = q.css('div.meta > div.author::text').extract_first()
            yield items
And this is my pipeline:
from myFirstScrapyProject.exporters import GoogleSheetItemExporter
from scrapy.exceptions import DropItem

class MyfirstscrapyprojectPipeline(object):

    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
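(Side note: a pipeline only runs if it is enabled in the project settings. A minimal sketch, assuming the pipeline lives in myFirstScrapyProject/pipelines.py:)

# settings.py -- enable the pipeline; the number (300) only sets its
# priority relative to other pipelines (lower values run first)
ITEM_PIPELINES = {
    'myFirstScrapyProject.pipelines.MyfirstscrapyprojectPipeline': 300,
}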
Thanks to Sharmiko, I rewrote it, but it still doesn't seem to work. What should I fix?
from myFirstScrapyProject.exporters import GoogleSheetItemExporter
from scrapy.exceptions import DropItem

class MyfirstscrapyprojectPipeline(object):

    # merged in from what used to be a separate DuplicatesTitlePipeline class
    def __init__(self):
        self.article = set()

    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()

    def process_item(self, item, spider):
        href = item['href']
        if href in self.article:
            raise DropItem('duplicates href found %s' % item)
        self.exporter.export_item(item)
        return item
This is the code that exports to the Google sheet:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from scrapy.exporters import BaseItemExporter

class GoogleSheetItemExporter(BaseItemExporter):

    def __init__(self):
        scope = ['https://spreadsheets.google.com/feeds',
                 'https://www.googleapis.com/auth/drive']
        credentials = ServiceAccountCredentials.from_json_keyfile_name(
            'pythonupload.json', scope)
        gc = gspread.authorize(credentials)
        self.spreadsheet = gc.open('Community')
        # get_worksheet() is zero-indexed, so this opens the second sheet
        self.worksheet = self.spreadsheet.get_worksheet(1)

    def export_item(self, item):
        # one spreadsheet row per scraped item
        self.worksheet.append_row([item['push'], item['title'],
                                   item['href'], item['date'], item['author']])
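Since the goal is to skip posts already written on a previous day, an in-memory set resets on every run and cannot see yesterday's data. One sketch, assuming the worksheet layout written by export_item above (href is the third cell of each row), is to seed the set from the sheet itself when the spider opens:

from myFirstScrapyProject.exporters import GoogleSheetItemExporter

class MyfirstscrapyprojectPipeline(object):

    def open_spider(self, spider):
        self.exporter = GoogleSheetItemExporter()
        self.exporter.start_exporting()
        # gspread columns are 1-indexed; col_values(3) returns every cell
        # in the href column, i.e. all hrefs exported on previous runs
        self.article = set(self.exporter.worksheet.col_values(3))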
Upvotes: 4
Views: 1200
Reputation: 623
You should modify your process_item function to check for duplicate elements, and drop the item if a duplicate is found:
from scrapy.exceptions import DropItem
...
def process_item(self, item, spider):
    if [your duplicate check logic goes here]:
        raise DropItem('Duplicate element found')
    else:
        self.exporter.export_item(item)
        return item
Dropped items are no longer passed to other pipeline components. You can read more about this in the Scrapy Item Pipeline documentation.
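As one way to fill in that placeholder (a sketch only, reusing the article set from the question's pipeline): note that the set must also be added to — the rewritten pipeline above never calls self.article.add(), so no href is ever remembered and nothing is detected as a duplicate:

def process_item(self, item, spider):
    href = item['href']
    if href in self.article:
        raise DropItem('duplicate href found: %s' % href)
    # remember this href so later items can be compared against it
    self.article.add(href)
    self.exporter.export_item(item)
    return item

On its own this only deduplicates within a single run; combined with seeding the set from the sheet when the spider opens (as sketched under the exporter code), it also skips posts already exported on earlier days.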
Upvotes: 1