Reputation: 61
I am very new to Python and I am in the process of learning how to scrape web pages (1 day in). The task I want to achieve is to loop through a list of 2000 companies and extract revenue data and the number of employees. I started by using Scrapy, and I have managed to get the workflow to work for one company (not elegant, but at least I am trying) - but I cannot figure out how to load the list of companies and loop through it to carry out multiple searches. I have a feeling this is a fairly simple procedure.
So, my main question is: where in the spider class should I define the query array of companies to loop through? I do not know the exact URLs, since each company has a unique ID and belongs to a specific market, so I cannot input them as start_urls.
Is Scrapy the right tool for this type of task, or should I have used mechanize?
Here is my current code.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
from scrapy.http import Request
from tutorial.items import DmozItem
import json

class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["proff.se"]
    start_urls = ["http://www.proff.se"]

    # Search on the website. Currently I have just put in a static search term
    # here, but I would like to loop over a list of companies.
    def parse(self, response):
        return FormRequest.from_response(response, formdata={'q': 'rebtel'}, callback=self.search_result)

    # I fetch the url from the search result and convert it to the corresponding
    # financial page (nyckeltal) where the information is located.
    def search_result(self, response):
        sel = HtmlXPathSelector(response)
        link = sel.xpath('//ul[@class="company-list two-columns"]/li/a/@href').extract()
        finance_url = str(link[0]).replace("/foretag", "http://www.proff.se/nyckeltal")
        return Request(finance_url, callback=self.parse_finance)

    # I scrape the information for this particular company; this is hardcoded and will not
    # work for other responses. I had some issues with character encoding initially since
    # the pages are in Swedish. I also tried to target the JSON element directly with
    #   revenue = sel.xpath('//*[@id="accountTable1"]/tbody/tr[3]/@data-chart').extract()
    # but was not able to parse it (error - expected string or buffer). Converting it to a
    # string with str() did not help either; something is off with the formatting, which is
    # messing up the data types.
    def parse_finance(self, response):
        sel = HtmlXPathSelector(response)
        datachart = sel.xpath('//tr/@data-chart').extract()
        employees = json.loads(datachart[36])
        revenue = json.loads(datachart[0])
        items = []
        item = DmozItem()
        item['company'] = response.url.split("/")[-5]
        item['market'] = response.url.split("/")[-3]
        item['employees'] = employees
        item['revenue'] = revenue
        items.append(item)
        return item
Upvotes: 4
Views: 3677
Reputation: 753
The first thing I'd do is to create a list of companies and find a way to get the URL of each one. After that, the crawling itself is easy. Below is a crawler I wrote to extract disease information from Wikipedia, starting from a list of diseases; see how it fits your use case (a sketch adapted to the company case follows after the code).
import requests
from bs4 import BeautifulSoup
import sys
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from subprocess import Popen, check_call
from multiprocessing import Pool

#nltk.download()


def crawlwiki(keywords):
    print (keywords)
    columns = ['Category', 'Text']
    page = 1
    print ('Fetching for {}....'.format(keywords))
    url = 'https://en.wikipedia.org/wiki/'
    for i in range(len(keywords)):
        url = url + keywords[i]
        url = url + '%20'
    url = url[0:(len(url) - 3)]
    output_obj = {}
    #curr_page = url+str(page)

    while True:
        try:
            page_source = requests.get(url)
        except:
            #What you should do if internet connection fails
            break

        plain_text = page_source.text
        bs_obj = BeautifulSoup(plain_text, "lxml")

        '''toc_links = bs_obj.findAll('div', {'class': 'toc-links'})
        base_url = 'http://www.webmd.com'
        for div in toc_links:
            links = div.findAll('a')
            for a in links:
                output_obj[a.text] = base_url + a.get('href')
                print (base_url + a.get('href'))

        data = bs_obj.findAll('div', {'class': 'search-text-container'})
        for div in data:
            links = div.findAll('a')
            for a in links:
                output_obj[a.text] = a.get('href')
                print (a.get('href'))'''

        """
        Mapping:
        1 : Signs and symptoms
        2 : Diagnosis
        3 : Prognosis
        4 : Treatment
        """

        symptom_text = re.findall('<h2><span class="mw-headline" id="Signs_and_symptoms">Signs and symptoms</span>(.*?)<h2>', plain_text, re.DOTALL)
        str1 = ''.join(symptom_text)
        symptoms_object = BeautifulSoup(str1, "lxml")
        #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
        symptom_data = symptoms_object.findAll('p')
        symptom_paragraphs = ""
        for p in symptom_data:
            symptom_paragraphs += p.text
        symptom_paragraphs = re.sub(r"/?\[\d+]", '', symptom_paragraphs, re.DOTALL)
        df_1 = pd.DataFrame(data=[['1', symptom_paragraphs]], columns=columns)

        diagnosis_text = re.findall('<h2><span class="mw-headline" id="Diagnosis">Diagnosis</span>(.*?)<h2>', plain_text, re.DOTALL)
        str1 = ''.join(diagnosis_text)
        diagnosis_object = BeautifulSoup(str1, "lxml")
        #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
        diagnosis_data = diagnosis_object.findAll('p')
        diagnosis_paragraphs = ""
        for p in diagnosis_data:
            diagnosis_paragraphs += p.text
        diagnosis_paragraphs = re.sub(r"/?\[\d+]", '', diagnosis_paragraphs, re.DOTALL)
        df_2 = pd.DataFrame(data=[['2', diagnosis_paragraphs]], columns=columns)

        prognosis_text = re.findall('<h2><span class="mw-headline" id="Prognosis">Prognosis</span>(.*?)<h2>', plain_text, re.DOTALL)
        str1 = ''.join(prognosis_text)
        prognosis_object = BeautifulSoup(str1, "lxml")
        #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
        prognosis_data = prognosis_object.findAll('p')
        prognosis_paragraphs = ""
        for p in prognosis_data:
            prognosis_paragraphs += p.text
        prognosis_paragraphs = re.sub(r"/?\[\d+]", '', prognosis_paragraphs, re.DOTALL)
        df_3 = pd.DataFrame(data=[['3', prognosis_paragraphs]], columns=columns)

        treatment_text = re.findall('<h2><span class="mw-headline" id="Treatment">Treatment</span>(.*?)<h2>', plain_text, re.DOTALL)
        str1 = ''.join(treatment_text)
        treatment_object = BeautifulSoup(str1, "lxml")
        #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
        treatment_data = treatment_object.findAll('p')
        treatment_paragraphs = ""
        for p in treatment_data:
            treatment_paragraphs += p.text
        treatment_paragraphs = re.sub(r"/?\[\d+]", '', treatment_paragraphs, re.DOTALL)
        df_4 = pd.DataFrame(data=[['4', treatment_paragraphs]], columns=columns)

        df = pd.DataFrame(columns=columns)
        df = df.append(df_1.append(df_2.append(df_3.append(df_4))))
        return df

    print('Fetch completed....')


def main():
    disease_df = pd.read_csv("disease.txt", sep="\n", header=None)
    columns = ['Category', 'Text']
    df_data = pd.DataFrame(columns=columns)
    size = disease_df.size
    print("Initializing....")
    p = Pool(5)
    df_data = p.map(crawlwiki, disease_df.values.tolist())
    """for index, row in disease_df.iterrows():
        print('Iteration {0} out of {1}.....'.format(index+1, size))
        df = crawlwiki(row, columns)
        df_data = df_data.append(df)"""

    df_data.to_csv("TagDataset.csv", index=False)


if __name__ == '__main__':
    main()
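Applied to your company case, the same pattern would look roughly like the sketch below. It is only an outline, not tested against proff.se: the search URL and the q parameter are assumptions taken from your FormRequest (check the real form action in the page source), and companies.txt is a hypothetical input file with one company name per line.

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Assumption: the search form on the start page takes a 'q' field; the real
# endpoint should be confirmed by inspecting the form on www.proff.se.
SEARCH_URL = 'http://www.proff.se/'


def crawl_company(name):
    # Submit the search and pick the first hit, mirroring the XPath in the question.
    search_page = requests.get(SEARCH_URL, params={'q': name})
    soup = BeautifulSoup(search_page.text, 'lxml')
    first_hit = soup.select_one('ul.company-list.two-columns li a')
    if first_hit is None:
        return None
    finance_url = first_hit['href'].replace('/foretag', 'http://www.proff.se/nyckeltal')
    finance_page = requests.get(finance_url)
    finance_soup = BeautifulSoup(finance_page.text, 'lxml')
    # Collect the raw data-chart attributes, as in parse_finance() above.
    charts = [tr['data-chart'] for tr in finance_soup.find_all('tr', attrs={'data-chart': True})]
    return {'company': name, 'revenue': charts[0] if charts else None}


def main():
    companies = [line.strip() for line in open('companies.txt') if line.strip()]
    rows = [row for row in (crawl_company(c) for c in companies) if row]
    pd.DataFrame(rows).to_csv('companies_out.csv', index=False)


if __name__ == '__main__':
    main()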
Upvotes: 0
Reputation: 298532
The common approach is to do this with a command-line argument. Give the spider's __init__ method an argument:
class ProffSpider(BaseSpider):
    name = "proff"

    ...

    def __init__(self, query):
        self.query = query

    def parse(self, response):
        return FormRequest.from_response(response,
            formdata={'q': self.query},
            callback=self.search_result
        )

    ...
And then start your spiders (maybe with Scrapyd):
$ scrapy crawl proff -a query="something"
$ scrapy crawl proff -a query="something else"
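If you go the Scrapyd route, each crawl can be scheduled through its schedule.json endpoint, and any extra POST field is passed through to the spider as an argument. A minimal sketch, assuming Scrapyd is running on its default port 6800, the project was deployed under the (hypothetical) name tutorial, and the company names live in companies.txt:

import requests

SCRAPYD_URL = 'http://localhost:6800/schedule.json'  # Scrapyd's default address

with open('companies.txt') as handle:
    for company in (line.strip() for line in handle):
        if not company:
            continue
        # 'project' and 'spider' are required; any other field ('query' here)
        # is forwarded to the spider like a -a argument.
        response = requests.post(SCRAPYD_URL, data={
            'project': 'tutorial',  # assumption: whatever name you deployed under
            'spider': 'proff',
            'query': company,
        })
        print(company, response.json().get('status'))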
If you want to run a bunch of spiders at once by passing in the arguments from a file, you can create a new command to run multiple instances of a spider. This is just mixing the builtin crawl command with the example code for running multiple spiders with a single crawler:
your_project/settings.py
COMMANDS_MODULE = 'your_project_module.commands'
your_project/commands/__init__.py
# empty file
your_project/commands/crawl_many.py
import os
import csv

from scrapy.commands import ScrapyCommand
from scrapy.utils.python import without_none_values
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Run many instances of a spider'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)

        parser.add_option('-f', '--input-file', metavar='FILE', help='CSV file to load arguments from')
        parser.add_option('-o', '--output', metavar='FILE', help='dump scraped items into FILE (use - for stdout)')
        parser.add_option('-t', '--output-format', metavar='FORMAT', help='format to use for dumping items with -o')

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)

        if not opts.output:
            return

        if opts.output == '-':
            self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
        else:
            self.settings.set('FEED_URI', opts.output, priority='cmdline')

        feed_exporters = without_none_values(self.settings.getwithbase('FEED_EXPORTERS'))
        valid_output_formats = feed_exporters.keys()

        if not opts.output_format:
            opts.output_format = os.path.splitext(opts.output)[1].replace('.', '')

        if opts.output_format not in valid_output_formats:
            raise UsageError('Unrecognized output format "%s". Valid formats are: %s' % (opts.output_format, tuple(valid_output_formats)))

        self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')

    def run(self, args, opts):
        if args:
            raise UsageError()

        with open(opts.input_file, 'rb') as handle:
            for spider_options in csv.DictReader(handle):
                spider = spider_options.pop('spider')
                self.crawler_process.crawl(spider, **spider_options)

        self.crawler_process.start()
You can run it like so:
$ scrapy crawl_many -f crawl_options.csv -o output_file.jsonl
The format of the crawl options CSV is simple:
spider,query,arg2,arg3
proff,query1,value2,value3
proff,query2,foo,bar
proff,query3,baz,asd
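Every column beyond spider ends up as a keyword argument to crawler_process.crawl(), so the spider's __init__ has to accept those names. A minimal sketch using the example columns above (arg2/arg3 are just the placeholder names from the CSV, not anything required):

from scrapy.spider import BaseSpider


class ProffSpider(BaseSpider):
    name = "proff"

    def __init__(self, query, arg2=None, arg3=None, **kwargs):
        # Keep the parent initialisation so the usual name/start_urls handling still works.
        super(ProffSpider, self).__init__(**kwargs)
        self.query = query
        self.arg2 = arg2
        self.arg3 = arg3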
Upvotes: 2