Reputation: 1263
I have already built a scraper which can scrape data from multiple pages. My question is: I have a bunch of URLs (say around 10) which I need to pass in each time.
Here is my code:
# -*- coding: utf-8 -*-
import scrapy
import csv
import re
import sys
import os
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem


class DatabloggerSpider(CrawlSpider):
    # The name of the spider
    name = "datablogger"

    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ["cityofalabaster.com"]
    print type(allowed_domains)

    # The URLs to start with
    start_urls = ["http://www.cityofalabaster.com/"]
    print type(start_urls)

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_items method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_items"
        )
    ]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    # Method for parsing items
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = DatabloggerScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
        # Return all the found items
        return items
If you have a look at my code, you can see that the allowed_domains and start_urls links are passed manually. Instead, I have a CSV which contains the URLs to be passed.
Input:-
http://www.daphneal.com/
http://www.digitaldecatur.com/
http://www.demopolisal.com/
http://www.dothan.org/
http://www.cofairhope.com/
http://www.florenceal.org/
http://www.fortpayne.org/
http://www.cityofgadsden.com/
http://www.cityofgardendale.com/
http://cityofgeorgiana.com/Home/
http://www.goodwater.org/
http://www.guinal.org/
http://www.gulfshoresal.gov/
http://www.guntersvilleal.org/index.php
http://www.hartselle.org/
http://www.headlandalabama.org/
http://www.cityofheflin.org/
http://www.hooveral.org/
Here is the code to pass the URLs and domains to start_urls and allowed_domains:
import csv
import re
import sys
import os

with open("urls.csv") as csvfile:
    csvreader = csv.reader(csvfile, delimiter=",")
    for line in csvreader:
        start_urls = line[0]
        start_urls1 = start_urls.split()
        print start_urls1
        print type(start_urls1)
        if start_urls[7:10] == 'www':
            p = re.compile(ur'(?<=http://www.).*(?=\/|.*)')
        elif start_urls[7:10] != 'www' and start_urls[-1] == '/':
            p = re.compile(ur'(?<=http://).*(?=\/|\s)')
        elif start_urls[7:10] != 'www' and start_urls[-1] != '/':
            p = re.compile(ur'(?<=http://).*(?=\/|.*)')
        else:
            p = re.compile(ur'(?<=https://).*(?=\/|.*)')
        allowed_domains = re.search(p, start_urls).group()
        allowed_domains1 = allowed_domains.split()
        print allowed_domains1
        print type(allowed_domains1)
The above code reads each URL, converts it into a single-element list (the format start_urls expects), extracts the domain by applying a regex, and wraps that domain in a list as well (the format allowed_domains expects).
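As an aside, urlparse from the standard library gives the domain directly and could replace the regex branches above. This is only a sketch, assuming the CSV has one URL in the first column of each row, as in the input shown above:

# Sketch only: derive start_urls/allowed_domains from urls.csv with urlparse
# instead of the regexes above (assumes one URL per row, first column).
import csv
from urlparse import urlparse  # Python 2; on Python 3 use urllib.parse

start_urls = []
allowed_domains = []
with open("urls.csv") as csvfile:
    for row in csv.reader(csvfile, delimiter=","):
        url = row[0].strip()
        start_urls.append(url)
        # netloc is the host part, e.g. "www.daphneal.com"
        domain = urlparse(url).netloc
        # optionally strip a leading "www." (this mirrors what the regexes above do)
        if domain.startswith("www."):
            domain = domain[4:]
        allowed_domains.append(domain)

print start_urls
print allowed_domains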
How should I integrate the above code into my main code so that allowed_domains and start_urls don't have to be passed manually?
Thanks in advance!
Upvotes: 1
Views: 600
Reputation: 2624
You can run the spider from a Python script; see the Scrapy documentation on running Scrapy from a script for more details:
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })

    # parse from csv file
    allowed_domains = ...
    start_urls = ...

    DatabloggerSpider.allowed_domains = allowed_domains
    DatabloggerSpider.start_urls = start_urls

    process.crawl(DatabloggerSpider)
    process.start()
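As a minimal sketch of how the "parse from csv file" part might be filled in, the snippet below reads the urls.csv format from the question and uses urlparse for the domains. The helper name load_urls_and_domains and the spider import path are my own assumptions, not from the original project:

import csv
from urlparse import urlparse  # Python 2; on Python 3 use urllib.parse

from scrapy.crawler import CrawlerProcess
# Hypothetical import path; adjust to wherever DatabloggerSpider lives in your project
from datablogger_scraper.spiders.datablogger import DatabloggerSpider


def load_urls_and_domains(path):
    # Hypothetical helper: read one URL per row and derive its domain
    start_urls = []
    allowed_domains = []
    with open(path) as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            url = row[0].strip()
            start_urls.append(url)
            allowed_domains.append(urlparse(url).netloc)
    return start_urls, allowed_domains


if __name__ == '__main__':
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })

    start_urls, allowed_domains = load_urls_and_domains("urls.csv")
    DatabloggerSpider.allowed_domains = allowed_domains
    DatabloggerSpider.start_urls = start_urls

    process.crawl(DatabloggerSpider)
    process.start()

If the "www." prefix should be dropped from the domains (as the question's regexes do), it can be stripped from netloc before appending, as in the earlier sketch.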
Upvotes: 2