Axel Ericsson

Reputation: 61

Scrapy multiple search terms

I am very new to Python and one day into learning how to scrape web pages. The task I want to achieve is to loop through a list of 2000 companies and extract revenue data and the number of employees. I started with Scrapy and have managed to get the workflow working for one company (not elegant, but at least I am trying), but I cannot figure out how to load the list of companies and loop through it to carry out multiple searches. I have a feeling this is a fairly simple procedure.

So, my main question is: where in the spider class should I define the list of companies to loop through? I do not know the exact URLs, since each company has a unique ID and belongs to a specific market, so I cannot put them in start_urls.
Is Scrapy the right tool for this type of task, or should I have used mechanize?
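
Conceptually, something like the sketch below is what I am picturing; company_list is just a placeholder, and where it should be defined is exactly what I cannot figure out:

    def parse(self, response):
        # Submit one search per company instead of a single hard-coded term
        for company in company_list:
            yield FormRequest.from_response(response,
                formdata={'q': company},
                callback=self.search_result)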

Here is my current code.

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
from scrapy.http import Request
from tutorial.items import DmozItem
import json

class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["proff.se"]
    start_urls = ["http://www.proff.se"]

# Search on the website. For now a single search term is hard-coded here, but I would like to loop over a list of companies.

    def parse(self, response):
        return FormRequest.from_response(response, formdata={'q': 'rebtel'}, callback=self.search_result)

# Fetch the URL from the search result and convert it to the financials URL where the information is located.

    def search_result(self,response):
        sel = HtmlXPathSelector(response)
        link = sel.xpath('//ul[@class="company-list two-columns"]/li/a/@href').extract()
        finance_url = link[0].replace("/foretag", "http://www.proff.se/nyckeltal")
        return Request(finance_url, callback=self.parse_finance)

# Scrape the information for this particular company. This is hard-coded and will not
# work for other responses. I initially had some issues with character encoding since
# the text is Swedish. I also tried to target the JSON element directly with
# revenue = sel.xpath('//*[@id="accountTable1"]/tbody/tr[3]/@data-chart').extract()
# but was not able to parse it (error: expected string or buffer). Converting it with
# str() did not help either; something is off with the formatting, which messes up the data types.

    def parse_finance(self, response):
        sel = HtmlXPathSelector(response)
        datachart = sel.xpath('//tr/@data-chart').extract()
        employees = json.loads(datachart[36])
        revenue = json.loads(datachart[0])
        item = DmozItem()
        item['company'] = response.url.split("/")[-5]
        item['market'] = response.url.split("/")[-3]
        item['employees'] = employees
        item['revenue'] = revenue
        return item

Upvotes: 4

Views: 3677

Answers (2)

Abhishek Vijayan

Reputation: 753

The first thing I'd do is create a list of companies and find a way to get the URL of each one. After that, crawling is easy. I have written a crawler that extracts disease information from Wikipedia for a list of diseases; see how it fits your use case. A sketch of that first companies-to-URLs step follows after the code.

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from multiprocessing import Pool

def crawlwiki(keywords):
    print (keywords)
    columns = ['Category', 'Text']
    print ('Fetching for {}....'.format(keywords))
    url = 'https://en.wikipedia.org/wiki/' + '%20'.join(keywords)
    output_obj = {}
    try:
        page_source = requests.get(url)
    except requests.RequestException:
        # What you should do if the internet connection fails;
        # here we simply give up and return an empty frame
        return pd.DataFrame(columns=columns)
    plain_text = page_source.text
    bs_obj = BeautifulSoup(plain_text, "lxml")
    '''toc_links = bs_obj.findAll('div', {'class': 'toc-links'})
    base_url = 'http://www.webmd.com'
    for div in toc_links:
        links = div.findAll('a')
        for a in links:
            output_obj[a.text] = base_url + a.get('href')
            print (base_url + a.get('href'))
    data = bs_obj.findAll('div', {'class':'search-text-container'})
    for div in data:
        links = div.findAll('a')
        for a in links:
            output_obj[a.text] = a.get('href')
            print (a.get('href'))'''


    """
        Mapping:
        1 : Signs and symptoms
        2 : Diagnosis
        3 : Prognosis
        4 : Treatment

    """

    symptom_text = re.findall ( '<h2><span class="mw-headline" id="Signs_and_symptoms">Signs and symptoms</span>(.*?)<h2>', plain_text, re.DOTALL)
    str1 = ''.join(symptom_text)
    symptoms_object = BeautifulSoup(str1, "lxml")
    #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
    symptom_data = symptoms_object.findAll('p')
    symptom_paragraphs = ""
    for p in symptom_data:
        symptom_paragraphs += p.text

    symptom_paragraphs = re.sub(r"/?\[\d+]", '', symptom_paragraphs, flags=re.DOTALL)
    df_1 = pd.DataFrame(data=[['1', symptom_paragraphs]], columns=columns)

    diagnosis_text = re.findall ( '<h2><span class="mw-headline" id="Diagnosis">Diagnosis</span>(.*?)<h2>', plain_text, re.DOTALL)
    str1 = ''.join(diagnosis_text)
    diagnosis_object = BeautifulSoup(str1, "lxml")
    #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
    diagnosis_data = diagnosis_object.findAll('p')
    diagnosis_paragraphs = ""
    for p in diagnosis_data:
        diagnosis_paragraphs += p.text

    diagnosis_paragraphs = re.sub(r"/?\[\d+]", '', diagnosis_paragraphs, flags=re.DOTALL)
    df_2 = pd.DataFrame(data=[['2', diagnosis_paragraphs]], columns=columns)

    prognosis_text = re.findall ( '<h2><span class="mw-headline" id="Prognosis">Prognosis</span>(.*?)<h2>', plain_text, re.DOTALL)
    str1 = ''.join(prognosis_text)
    prognosis_object = BeautifulSoup(str1, "lxml")
    #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
    prognosis_data = prognosis_object.findAll('p')
    prognosis_paragraphs = ""
    for p in prognosis_data:
        prognosis_paragraphs += p.text

    prognosis_paragraphs = re.sub(r"/?\[\d+]", '', prognosis_paragraphs, flags=re.DOTALL)
    df_3 = pd.DataFrame(data=[['3', prognosis_paragraphs]], columns=columns)

    treatment_text = re.findall ( '<h2><span class="mw-headline" id="Treatment">Treatment</span>(.*?)<h2>', plain_text, re.DOTALL)
    str1 = ''.join(treatment_text)
    treatment_object = BeautifulSoup(str1, "lxml")
    #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
    treatment_data = treatment_object.findAll('p')
    treatment_paragraphs = ""
    for p in treatment_data:
        treatment_paragraphs += p.text

    treatment_paragraphs = re.sub(r"/?\[\d+]", '', treatment_paragraphs, flags=re.DOTALL)
    df_4 = pd.DataFrame(data=[['4', treatment_paragraphs]], columns=columns)

    df = pd.concat([df_1, df_2, df_3, df_4])

    print('Fetch completed....')
    return df



def main():

    disease_df = pd.read_csv("disease.txt", sep="\n", header=None)

    columns = ['Category', 'Text']
    df_data = pd.DataFrame(columns=columns)
    size = disease_df.size
    print("Initializing....")
    p = Pool(5)
    df_data = pd.concat(p.map(crawlwiki, disease_df.values.tolist()))
    """for index, row in disease_df.iterrows():
        print('Iteration {0} out of {1}.....'.format(index+1, size))
        df = crawlwiki(row, columns)
        df_data = df_data.append(df)"""

    df_data.to_csv("TagDataset.csv", index=False)




if __name__ == '__main__':
    main()
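
A minimal sketch of that first "list of companies -> URL for each company" step, in the same requests/BeautifulSoup style. The search URL and its q parameter are only guesses based on the search form field in the question, and companies.txt is a placeholder, so adjust both to whatever the site actually uses:

import requests
from bs4 import BeautifulSoup

def company_urls(names, search_url='http://www.proff.se/search'):
    # search_url and the 'q' parameter are assumptions, not the site's documented API
    urls = {}
    for name in names:
        page = requests.get(search_url, params={'q': name})
        soup = BeautifulSoup(page.text, 'lxml')
        link = soup.select_one('ul.company-list li a')
        if link is not None:
            # Turn the /foretag/... profile link into the financials page,
            # mirroring the replace() in the question
            urls[name] = link['href'].replace('/foretag', 'http://www.proff.se/nyckeltal')
    return urls

with open('companies.txt') as handle:   # companies.txt is a placeholder
    names = [line.strip() for line in handle if line.strip()]
print(company_urls(names))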

Upvotes: 0

Blender

Reputation: 298532

The common approach is to do this with a command-line argument. Give the spider's __init__ method an argument:

class ProffSpider(BaseSpider):
    name = "proff"
    ...

    def __init__(self, query=None, *args, **kwargs):
        super(ProffSpider, self).__init__(*args, **kwargs)
        self.query = query

    def parse(self, response):
        return FormRequest.from_response(response,
            formdata={'q': self.query},
            callback=self.search_result
        )

    ...

And then start your spiders (maybe with Scrapyd):

$ scrapy crawl proff -a query="something"
$ scrapy crawl proff -a query="something else"

If you want to run a bunch of spiders at once by passing in the arguments from a file, you can create a new command that runs multiple instances of a spider. This is just a mix of the built-in crawl command and the example code for running multiple spiders with a single crawler:

your_project/settings.py

COMMANDS_MODULE = 'your_project.commands'

your_project/commands/__init__.py

# empty file

your_project/commands/crawl_many.py

import os
import csv

from scrapy.commands import ScrapyCommand
from scrapy.utils.python import without_none_values
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Run many instances of a spider'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)

        parser.add_option('-f', '--input-file', metavar='FILE', help='CSV file to load arguments from')
        parser.add_option('-o', '--output', metavar='FILE', help='dump scraped items into FILE (use - for stdout)')
        parser.add_option('-t', '--output-format', metavar='FORMAT', help='format to use for dumping items with -o')

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)

        if not opts.output:
            return

        if opts.output == '-':
            self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
        else:
            self.settings.set('FEED_URI', opts.output, priority='cmdline')

        feed_exporters = without_none_values(self.settings.getwithbase('FEED_EXPORTERS'))
        valid_output_formats = feed_exporters.keys()

        if not opts.output_format:
            opts.output_format = os.path.splitext(opts.output)[1].replace('.', '')

        if opts.output_format not in valid_output_formats:
            raise UsageError('Unrecognized output format "%s". Valid formats are: %s' % (opts.output_format, tuple(valid_output_formats)))

        self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')

    def run(self, args, opts):
        if args:
            raise UsageError()

        with open(opts.input_file, 'rb') as handle:
            for spider_options in csv.DictReader(handle):
                spider = spider_options.pop('spider')
                self.crawler_process.crawl(spider, **spider_options)

        self.crawler_process.start()

You can run it like so:

$ scrapy crawl_many -f crawl_options.csv -o output_file.jl

The format of the crawl options CSV is simple:

spider,query,arg2,arg3
proff,query1,value2,value3
proff,query2,foo,bar
proff,query3,baz,asd
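
Each extra column ends up as a keyword argument to the spider's __init__ (the same mechanism as -a on the command line), so the __init__ shown earlier needs matching parameters. A sketch, where arg2 and arg3 are just placeholders mirroring the example CSV:

    def __init__(self, query=None, arg2=None, arg3=None, *args, **kwargs):
        super(ProffSpider, self).__init__(*args, **kwargs)
        self.query = query
        self.arg2 = arg2    # arg2/arg3 are hypothetical, named after the CSV columns
        self.arg3 = arg3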

Upvotes: 2
