Reputation: 3
How do I stop it from logging the same URL more than once?
This is my code so far:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class someSpider(CrawlSpider):
    name = "My script"
    domain = raw_input("Enter the domain:\n")
    allowed_domains = [domain]
    starting_url = raw_input("Enter the starting url with protocol:\n")
    start_urls = [starting_url]
    f = open("items.txt", "w")

    rules = (Rule(LxmlLinkExtractor(allow_domains=(domain)), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        for link in LxmlLinkExtractor(allow_domains=(self.domain)).extract_links(response):
            item = MyItem()
            item['url'] = link.url
            self.f.write(item['url'] + "\n")
Right now it will log thousands of duplicates of a single link on, for example, a vBulletin forum with around 250,000 posts.
Edit: Note that the crawler will encounter millions upon millions of links, so the duplicate check needs to be very fast.
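At that scale the choice of data structure matters: a hash-based container such as a Python set gives O(1)-average membership tests, whereas checking a list is O(n) per lookup. A minimal sketch of the idea, independent of Scrapy:

    seen = set()

    def is_new(url):
        # Hash lookup: constant time on average, even with millions of entries.
        if url in seen:
            return False
        seen.add(url)
        return True

    print(is_new("http://example.com/a"))  # True: first sighting
    print(is_new("http://example.com/a"))  # False: duplicate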
Upvotes: 0
Views: 270
Reputation: 6331
Keep a collection of already-visited URLs and check every newly found URL against it: if the URL is not in the collection yet, process it and add it; otherwise skip it.
I.e.:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class someSpider(CrawlSpider):
    name = "My script"
    domain = raw_input("Enter the domain:\n")
    allowed_domains = [domain]
    starting_url = raw_input("Enter the starting url with protocol:\n")
    start_urls = [starting_url]
    items = []  # list with your URLs
    f = open("items.txt", "w")

    rules = (Rule(LxmlLinkExtractor(allow_domains=(domain)), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        for link in LxmlLinkExtractor(allow_domains=(self.domain)).extract_links(response):
            if link.url not in self.items:  # check if it's already been logged
                self.items.append(link.url)  # add to the list if it's new
                # do your job on adding it to a file
                item = MyItem()
                item['url'] = link.url
                self.f.write(item['url'] + "\n")
Dictionary version:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field

class MyItem(Item):
    url = Field()

class someSpider(CrawlSpider):
    name = "My script"
    domain = raw_input("Enter the domain:\n")
    allowed_domains = [domain]
    starting_url = raw_input("Enter the starting url with protocol:\n")
    start_urls = [starting_url]
    items = {}  # dictionary with your URLs as keys, for fast lookups
    f = open("items.txt", "w")

    rules = (Rule(LxmlLinkExtractor(allow_domains=(domain)), callback='parse_obj', follow=True),)

    def parse_obj(self, response):
        for link in LxmlLinkExtractor(allow_domains=(self.domain)).extract_links(response):
            if link.url not in self.items:  # key lookup is O(1) on average
                self.items[link.url] = 1  # add as a key; the stored value can be anything
                # do your job on adding it to a file
                item = MyItem()
                item['url'] = link.url
                self.f.write(item['url'] + "\n")
P.S. You can also collect the items first and then write them all to a file at the end.
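A minimal sketch of that variant, building on the dictionary version above (Scrapy calls a spider's closed() method once when the crawl finishes):

    def parse_obj(self, response):
        for link in LxmlLinkExtractor(allow_domains=(self.domain)).extract_links(response):
            if link.url not in self.items:
                self.items[link.url] = 1  # collect now, write later

    def closed(self, reason):
        # Called by Scrapy when the spider finishes; write everything in one pass.
        with open("items.txt", "w") as f:
            for url in self.items:
                f.write(url + "\n")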
There are many other improvements to be made to this code; one is sketched below, and I leave the rest for you to study.
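For instance, a set expresses "URLs seen so far" more directly than a dictionary with dummy values, with the same O(1)-average lookups; the class body and rules stay as above:

    items = set()  # URLs already logged

    def parse_obj(self, response):
        for link in LxmlLinkExtractor(allow_domains=(self.domain)).extract_links(response):
            if link.url not in self.items:
                self.items.add(link.url)
                self.f.write(link.url + "\n")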
Upvotes: 2