Samsul Islam
Samsul Islam

Reputation: 2619

scrapy: "load more result" pages

I wrote the following Scrapy script to scrape items from the following website. I was able to scrape the items on the first page, but there are about 2000 more pages, and I want to scrape all of them. The site has a "load more results" option; I also tried to scrape the pages it loads, but was unable to do so. Please help me.

from scrapy.shell import open_in_browser
import scrapy
from scrapy import Selector
import math
import json

class MyItems(scrapy.Item):
    """Scrapy item declaring three fields: ``date``, ``title`` and ``link``."""

    date = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()

class ProductSpider(scrapy.Spider):
    """Spider for a Reuters news search.

    ``parse`` follows every result link found with the
    ``h3.search-result-title a`` selector and hands the article page to
    ``parse_article``. It then schedules one request per remaining page of
    the "load more results" endpoint (``pn=2`` up to and including the last
    page).
    """

    name = 'reuters'
    allowed_domains = ['reuters.com']
    start_urls = ['https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.']
    download_delay = 1.5

    # Hard-coded result count and page size used to derive the number of
    # "load more results" pages to request.
    job_count = 1970
    job_per_page = 10

    def parse(self, response):
        """Yield article requests for the result links, then pagination requests.

        Args:
            response: the downloaded search page (or a "load more" response,
                since those requests use this method as their callback too).

        Yields:
            ``scrapy.Request`` objects: one per result link (callback
            ``parse_article``) followed by one per additional result page.
        """
        for url in response.css('h3.search-result-title a ::attr(href)').extract():
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_article)

        # "load more result"

        # Integer ceiling division: gives the same page count on Python 2 and
        # Python 3 and always hands ``range`` an int.
        pages = (self.job_count + self.job_per_page - 1) // self.job_per_page

        # The headers do not depend on the page number, so build them once.
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9,bn;q=0.8,af;q=0.7',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
            'accept': '*/*',
            'referer': 'https://www.reuters.com/search/news?blob=National+Health+Investors%2c+Inc.',
            'authority': 'www.reuters.com',
            'cookie': '_ga=GA1.2.592162541.1518081459; _gid=GA1.2.1478931362.1518081459; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%22e58b8e9e-8674-49b4-aaff-0248b6976654%22; _cb_ls=1; OX_plg=pm; __gads=ID=3c74f81d13d6c1b4:T=1518081460:S=ALNI_MZsx67ijryijAj2JcD2YXXZw20zIA; _cb=sjG2aCNHffBaLnBl; AAMC_reuters_0=REGION%7C3; aam_uuid=06971314173867630360429126725673522696; _cb_svref=null; D_DUID=334503eb-dac8-49cd-babd-02081b0b6d24; D_TOKEN=1.0:a25bacf1dbb943e3ba1e93edb2093843:9841e8a348072081c4b770cfdd017d59831a31e6d41f368c89065cd08eec79bb34c9020669a0d8cbd7a670e4e11de2e762b5f67038115c02ba5fcbd9da8de4078116daf500471d1d6440734c181cb49859090467365cbf9d646c0d3fc7e7bb7e4e2643ea7a20bf00f9a695f9bf30b0df402746b31e429526a87ed7aa3c9da9bb:4b5290392fda7a6ff1f0f529cfad0d027a406ae35b6edb8e7cd3f6493ca8b99d; OX_sd=2; mnet_session_depth=2%7C1518104359854; _chartbeat2=.1518081466539.1518104385876.1.k_ivd8UuDjDegChcDsjhRBbcy9U',
        }

        # ``pages + 1`` so the final page is requested as well; page 1 is the
        # start URL itself.
        for page in range(2, pages + 1):
            url = 'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=10&pn={}&callback=addMoreNewsResults'.format(page)

            # All parameters are already in the query string, so a plain GET
            # request is enough; no form data is submitted.
            # NOTE(review): this URL asks for an ``addMoreNewsResults``
            # callback wrapper, so the response is presumably not HTML and the
            # CSS selector in ``parse`` may match nothing - confirm, and give
            # these requests a dedicated callback if so.
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse_article(self, response):
        """Print the headline, URL and date extracted from an article page.

        Args:
            response: the downloaded article page.
        """
        print('\n')
        print('***Heading:***', response.css('h1.ArticleHeader_headline_2zdFM ::text').extract_first())
        print('***Url-Link:***', response.url)
        print('***Date :***', response.css('div.ArticleHeader_date_V9eGk ::text').extract())
        print('\n')

Upvotes: 1

Views: 1105

Answers (1)

gangabass
gangabass

Reputation: 10666

Each click on "LOAD MORE RESULTS" returns a JavaScript response with a JSON-like object inside:

if (typeof addMoreNewsResults == 'function') { 
addMoreNewsResults( {
    blob: 'National+Health+Investors%2C+Inc.',
    sortBy: 'relevance',
    dateRange: 'all',
    totalResultNumber: 1970,
    totalResultNumberStr: "1,970",
    news: [ 
        {
        id: "-pbm-push-idUSKBN1DG2CP",
        headline: "Diplomat Pharmacy plunges as <b>investors<\/b> fret over rapid PBM push",
        date: "November 16, 2017 11:22am EST",
        href: "/article/us-diplomat-stocks/diplomat-pharmacy-plunges-as-investors-fret-over-rapid-pbm-push-idUSKBN1DG2CP",
        blurb: "...(Reuters) - Shares of Diplomat Pharmacy <b>Inc<\/b> &lt;DPLO.N&gt; tumbled 20... <b>National<\/b> Pharmaceutical Services.\nSome analysts were not excited...",
        mainPicUrl: ""
        }, 
        {....

So you need to use a different parsing mechanism to get the information you want (import json, json.loads(), etc.).

There is a much easier way. You can get everything in one request (just change the numResultsToShow param to get everything): https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=2000&pn=1&callback=addMoreNewsResults

UPDATE

# -*- coding: utf-8 -*-

import scrapy
import re
import json

class ReutersSpider(scrapy.Spider):
    """Spider that requests the Reuters "load more" endpoint once.

    The start URL sets ``numResultsToShow=2000`` and
    ``callback=addMoreNewsResults``; ``parse`` extracts the argument of that
    callback, rewrites it into JSON and yields one dict per entry of its
    ``news`` list.
    """

    name = "reuters"
    start_urls = [
        'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob=National+Health+Investors%2C+Inc.&bigOrSmall=big&articleWithBlog=true&sortBy=&dateRange=&numResultsToShow=2000&pn=1&callback=addMoreNewsResults',
    ]

    def parse(self, response):
        """Yield ``{"href": ..., "date": ...}`` for every news entry.

        Args:
            response: the downloaded ``addMoreNewsResults( {...} );`` payload.

        Yields:
            dict: the ``href`` and ``date`` values of each entry in ``news``.

        Raises:
            ValueError: from ``json.loads`` if the rewritten text is still not
                valid JSON.
        """
        # Search the decoded text: ``response.body`` is bytes, and a str
        # pattern cannot be applied to bytes on Python 3.
        match = re.search(r'addMoreNewsResults\((.+?) \);', response.text, re.DOTALL)
        if match is None:
            # Report the unexpected payload instead of failing with an opaque
            # AttributeError on ``None.group``.
            self.logger.error("addMoreNewsResults payload not found in %s", response.url)
            return
        json_string = match.group(1)

        # Below code is used to transform from Javascript-ish JSON-like structure to JSON
        # 1) quote bare keys at the start of a line
        json_string = re.sub(r'^\s*(\w+):', r'"\1":', json_string, flags=re.MULTILINE)
        # 2) quote bare word/number values that end a line (they become strings)
        json_string = re.sub(r'(\w+),\s*$', r'"\1",', json_string, flags=re.MULTILINE)
        # 3) turn single-quoted values into double-quoted ones
        json_string = re.sub(r':\s*\'(.+?)\',\s*$', r': "\1",', json_string, flags=re.MULTILINE)

        results = json.loads(json_string)

        for result in results["news"]:
            yield {
                "href": result["href"],
                "date": result["date"],
            }

Upvotes: 1

Related Questions