Reputation: 328
I have a spider that starts with four different start_urls and then goes on to crawl certain links inside each of them. All of them have the same domain and structure; the only thing that changes between them is a query parameter. I use two rules: one for opening and parsing each link, and one for following the pagination.
My problem is: I don't want to crawl ALL links, because pagination generates an extensive number of them. So I need to check each crawled link for a condition (a publication year), and as soon as that year is different from the year I want, the spider should skip all remaining links belonging to that start_url and move on to the links generated by the next start_url. How can I go about doing this? Here is the code for my spider:
class articleSpider(CrawlSpider):
    """Crawl the search listings and save every article published in 2020.

    Two rules drive the crawl: the first opens each article link found in
    the results list and passes the response to ``parse_item``; the second
    follows the pagination link so that further result pages are visited.
    """

    name = 'article'
    allowed_domains = ['website.com']
    start_urls = [
        'https://www.website.com/search/?category=value1',
        'https://www.website.com/search/?category=value2',
        'https://www.website.com/search/?category=value3',
        'https://www.website.com/search/?category=value4',
    ]

    rules = (
        # Article links inside the results list are parsed by parse_item.
        Rule(
            LinkExtractor(
                restrict_xpaths="//div[@class='results-post']/article/a"
            ),
            callback='parse_item',
            follow=True,
        ),
        # Pagination link: no callback, so it is only followed.
        Rule(
            LinkExtractor(
                restrict_xpaths="//section[@class='results-navi'][1]/div/div[@class='prevpageNav left']"
            )
        ),
    )

    def parse_item(self, response):
        """Write the article page to ``./src/data/<name>.html`` if it is from 2020.

        ``<name>`` is the last path segment of the response URL. Pages
        without a ``<time datetime=...>`` attribute are skipped.
        """
        name = response.url.strip('/').split('/')[-1]
        published = response.xpath("//section/p/time/@datetime").get()
        # .get() returns None when nothing matches; slicing None would
        # raise TypeError, so bail out before looking at the year.
        if not published:
            return
        if published[:4] == '2020':
            with open(f'./src/data/{name}.html', 'wb') as f:
                f.write(response.text.encode('utf8'))
Thanks in advance for the help.
Upvotes: 1
Views: 835
Reputation: 2116
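I don't know of an easy way to achieve this, but maybe the (untested) code below can help you get started. The logic is as follows: instead of letting Scrapy schedule everything at once, the spider issues one request at a time and carries the remaining work (the remaining start URLs, the remaining item URLs of the current page, and the next-page link) in each request's meta. After an item has been parsed, the spider decides what to request next: the next item, the next page, or, once the year no longer matches, the next start URL.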
from scrapy import Spider, Request
class articleSpider(Spider):
    """Crawl each search category one request at a time, with an early stop.

    Only one request is in flight at any moment. The remaining work (the
    start URLs not yet visited, the item URLs left on the current results
    page, and the next-page URL) travels along in ``Request.meta``. After
    each item the spider picks the next request:

    * year matches ``target_year``: save the page, then continue with the
      next item, else the next results page, else the next start URL;
    * year differs: drop the rest of this category and jump straight to
      the next start URL.

    NOTE(review): because the crawl is one sequential chain, a request
    that fails or is dropped ends the chain. Consider an ``errback`` if
    that matters for your site.
    """

    name = 'article'
    allowed_domains = ['website.com']
    start_urls = [
        'https://www.website.com/search/?category=value1',
        'https://www.website.com/search/?category=value2',
        'https://www.website.com/search/?category=value3',
        'https://www.website.com/search/?category=value4',
    ]
    # Publication year (first four characters of the datetime attribute)
    # an article must have to be saved.
    target_year = '2020'

    def start_requests(self):
        """Yield the request for the first start URL only.

        The other start URLs are handed over in ``meta`` and requested
        later, once the current category is finished or abandoned.
        """
        # Copy, so the class-level list is not consumed by the crawl.
        pending = list(self.start_urls)
        request = self._next_start_request(pending)
        if request is not None:
            yield request

    def parse(self, response):
        """Collect item and next-page URLs from a results page.

        Yields the request for the first item (or, on a page without
        items, for the next page / next start URL).
        """
        start_urls = response.meta['start_urls']

        # Select the href attributes (not the <a> elements themselves) and
        # make them absolute so they are valid Request URLs.
        item_urls = [
            response.urljoin(href)
            for href in response.xpath(
                '//div[@class="results-post"]/article/a/@href'
            ).getall()
        ]

        # assumes the pagination link is an <a> inside this div - TODO confirm
        next_href = response.xpath(
            '//section[@class="results-navi"][1]/div'
            '/div[@class="prevpageNav left"]//a/@href'
        ).get()
        next_page = response.urljoin(next_href) if next_href else None

        request = self._next_request(item_urls, next_page, start_urls)
        if request is not None:
            yield request

    def parse_item(self, response):
        """Save the article if its year matches, then yield the next request.

        An article whose year differs from ``target_year`` ends the current
        category. An article without a date is skipped but does not end
        the category, since a missing date says nothing about the year.
        """
        item_urls = response.meta['item_urls']
        next_page = response.meta['next_page']
        start_urls = response.meta['start_urls']

        name = response.url.strip('/').split('/')[-1]
        published = response.xpath("//section/p/time/@datetime").get()
        # .get() returns None when nothing matches; guard before slicing.
        year = published[:4] if published else None

        if year is None:
            self.logger.warning('No publication date found on %s', response.url)
            request = self._next_request(item_urls, next_page, start_urls)
        elif year == self.target_year:
            with open(f'./src/data/{name}.html', 'wb') as f:
                f.write(response.text.encode('utf8'))
            request = self._next_request(item_urls, next_page, start_urls)
        else:
            # Year changed: ignore the remaining items and pages of this
            # category and go to the next start URL.
            request = self._next_start_request(start_urls)

        if request is not None:
            yield request

    def _next_request(self, item_urls, next_page, start_urls):
        """Return the next request within the current category, or beyond it."""
        if item_urls:
            # Take items in document order so the year check sees them in
            # the same order as they are listed on the page.
            item_url = item_urls.pop(0)
            meta = {
                'next_page': next_page,
                'item_urls': item_urls,
                # Must be passed on every hop; parse_item reads it.
                'start_urls': start_urls,
            }
            return Request(item_url, callback=self.parse_item, meta=meta)
        if next_page:
            # All items of this page are done - go to the next page.
            meta = {'start_urls': start_urls}
            return Request(next_page, callback=self.parse, meta=meta)
        # No items and no pages left - go to the next start URL.
        return self._next_start_request(start_urls)

    def _next_start_request(self, start_urls):
        """Return the request for the next start URL, or None when done."""
        if not start_urls:
            # Nothing left to do.
            return None
        start_url = start_urls.pop(0)
        meta = {'start_urls': start_urls}
        return Request(start_url, callback=self.parse, meta=meta)
Upvotes: 3