scraping url and title from nested anchor tag

Question

This is my first scraper using scrapy.

I am trying to scrape the video URL and title from the https://www.google.co.in/trends/hotvideos#hvsm=0 site.

import scrapy
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class CraigslistItem(Item):
    """Container for one scraped record: a ``title`` and a ``link``."""

    # Two independent Field instances, one per key the spiders assign to.
    title, link = Field(), Field()

class DmozSpider(scrapy.Spider):
    """Spider that extracts title/link pairs from the hot-videos page markup."""

    name = "google"
    allowed_domains = ["google.co.in"]
    start_urls = [
        "https://www.google.co.in/trends/hotvideos#hvsm=0"
    ]

    def parse(self, response):
        """Yield one ``CraigslistItem`` per video container in ``response``.

        Args:
            response: the downloaded page for one of ``start_urls``.

        Yields:
            CraigslistItem: ``title`` and ``link`` are the lists returned by
            ``extract()`` (empty when the XPath matches nothing).
        """
        # The XPaths below are relative, so each lookup stays scoped to its
        # own container span instead of searching the whole document.
        # NOTE(review): 'a' only matches anchors that are direct children of
        # the span; use './/a' if the anchor is nested deeper -- confirm
        # against the page markup.
        for container in response.xpath(
                "//span[@class='single-video-image-container']"):
            item = CraigslistItem()
            item['title'] = container.xpath('a/text()').extract()
            item['link'] = container.xpath('a/@href').extract()
            # Yield the item so Scrapy receives it; a callback that only
            # prints hands nothing to the item pipeline or feed exports.
            yield item

A general walkthrough of what I am doing wrong would be much appreciated.

Jithin · Accepted Answer

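Use Scrapy's FormRequest to get it done. Instead of parsing the page's HTML, post the form data to the `hotItems` endpoint and read the videos from the JSON response: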

from scrapy.http import FormRequest
import json

class DmozSpider(scrapy.Spider):
    """Spider that reads hot-video titles and URLs from a JSON endpoint."""

    name = "google"
    allowed_domains = ["google.co.in"]
    start_urls = [
        "https://www.google.co.in/trends/hotvideos#hvsm=0"
    ]

    def parse(self, response):
        """Issue the form POST whose response carries the video list.

        Args:
            response: the downloaded start page; its content is not inspected.

        Yields:
            FormRequest: a request to the ``hotItems`` endpoint, handled by
            ``parse_data``.
        """
        url = 'https://www.google.co.in/trends/hotvideos/hotItems'
        # NOTE(review): presumably these mirror the parameters the page itself
        # submits -- confirm against the browser's network traffic.
        formdata = {'hvd': '', 'geo': 'IN', 'mob': '0', 'hvsm': '0'}
        yield FormRequest(url=url, formdata=formdata, callback=self.parse_data)

    def parse_data(self, response):
        """Yield one ``CraigslistItem`` per entry in the JSON ``videoList``.

        Args:
            response: the endpoint response; its body is decoded as JSON.

        Yields:
            CraigslistItem: ``title`` and ``link`` taken from each entry's
            ``title`` and ``url`` keys (``None`` when a key is absent).
        """
        json_response = json.loads(response.body)
        # Fall back to an empty list so a missing or null 'videoList' yields
        # nothing instead of raising TypeError when iterated.
        videos = json_response.get('videoList') or []
        for video in videos:
            item = CraigslistItem()
            item['title'] = video.get('title')
            item['link'] = video.get('url')
            yield item

scraping url and title from nested anchor tag

Answers (1)

Related Questions