Amen Aziz
Amen Aziz

Reputation: 779

Scrape information using scrapy

I am trying to scrape the information shown below, but my spider produces the wrong output. What mistakes am I making? This is the page link: https://www.thegrommet.com/products/the-vintage-pearlmini-peas-in-the-pod-necklace

from scrapy import Spider
from scrapy.http import Request


class AuthorSpider(Spider):
    """Spider that follows product links from a listing page and yields one item per product.

    ``parse`` collects hrefs from the start page and schedules ``parse_book``
    for each; ``parse_book`` emits a dict with the keys ``title``, ``d1``,
    ``d2`` and ``d3``.
    """
    name = 'book'
    start_urls = ['https://www.thegrommet.com/gifts/by-type/personalized-gifts']
    # Per-spider settings: one concurrent request per domain, a 1 second
    # download delay, and a browser-like User-Agent header.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }



    def parse(self, response):
        """Yield a Request (handled by ``parse_book``) for every href found on the listing page."""
        # Every href attribute located anywhere below a <div> whose class
        # attribute is exactly 'flex-grow | p-t-s'.
        books = response.xpath("//div[@class='flex-grow | p-t-s']//@href").extract()
        for book in books:
            # Resolve the (possibly relative) href against the page URL.
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield a single dict with the product title and three text snippets."""
        # NOTE(review): .get() returns None when nothing matches, in which
        # case the .strip() on the next line raises AttributeError.
        title=response.xpath("//div[@class='f-heading-xl']//text()").get()
        title=title.strip()
        d3=response.xpath("//div[@class='accordion-section | p-t-s p-b-m']")
        for pro in d3:
            # NOTE(review): an XPath starting with '//' is evaluated from the
            # document root even when called on a sub-selector, so this
            # collects the text nodes of every <div> on the page and is the
            # same for each `pro`. A path relative to `pro` would start with
            # './/'.
            data=[tup for tup in pro.xpath('//div//text()')]
            # Indices 1, 2 and 3 are read below; index 0 is never used.
            # NOTE(review): the bare `except:` clauses catch every exception,
            # not only the IndexError raised when `data` is too short.
            try:
                trip=data[1].get()
            except:
                trip=''
            trip=trip.strip()
            try:
                tuck=data[2].get()
            except:
                tuck=''
            tuck=tuck.strip()
            try:
                tup=data[3].get()
            except:
                tup=''
            tup=tup.strip()
            
        # NOTE(review): the yield is outside the loop, so only the values from
        # the last iteration are emitted; if `d3` is empty, `trip`, `tuck` and
        # `tup` were never assigned and this raises UnboundLocalError.
        yield{ 
            'title':title,
            'd1':trip,
            'd2':tuck,
            'd3':tup,
            
            
            }

enter image description here

PIC2:

enter image description here

Upvotes: 0

Views: 73

Answers (1)

Md. Fazlul Hoque
Md. Fazlul Hoque

Reputation: 16187

You can select d1, d2 and d3 with the XPath expressions shown below. There is no need to use try/except, because Scrapy returns None instead of raising when a selector matches nothing. You can also use the XPath function normalize-space(), which Scrapy selectors support, to remove leading and trailing whitespace and newlines.

Full working code:

from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    """Scrape product pages linked from a thegrommet.com listing page.

    ``parse`` follows every product link found on the start page and
    ``parse_book`` yields one dict per product with the keys ``title``,
    ``d1``, ``d2``, ``d3`` and ``url``.

    The class names used in the XPath expressions are copied from the site's
    markup and will need updating if the site changes its layout.
    """

    name = 'book'
    start_urls = ['https://www.thegrommet.com/gifts/by-type/personalized-gifts']
    # Crawl politely: one request at a time per domain, one second apart,
    # with a browser-like User-Agent header.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.62 Safari/537.36'
    }

    # Direct <div> children of every accordion section. Kept in one place so
    # the three detail fields cannot drift apart.
    _DETAIL_DIVS = '//*[@class="accordion-section | p-t-s p-b-m"]/div'

    def parse(self, response):
        """Yield a Request (handled by ``parse_book``) for each product href."""
        hrefs = response.xpath("//div[@class='flex-grow | p-t-s']//@href").getall()
        for href in hrefs:
            # hrefs may be relative; resolve them against the listing URL.
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield a dict with the product title, three detail snippets and the URL.

        Missing elements produce empty strings rather than raising, so a
        product page with an unexpected layout still yields an item.
        """
        # default='' keeps .strip() safe when the heading is absent
        # (.get() would otherwise return None).
        title = response.xpath("//div[@class='f-heading-xl']//text()").get(default='')
        item = {'title': title.strip()}

        # d1..d3 are the first text node of the 1st, 2nd and 3rd detail div.
        # normalize-space() trims the text and evaluates to '' when the node
        # does not exist, so no exception handling is required here.
        for position in (1, 2, 3):
            query = 'normalize-space(({divs})[{pos}]/text()[1])'.format(
                divs=self._DETAIL_DIVS, pos=position)
            item['d{}'.format(position)] = response.xpath(query).get()

        item['url'] = response.url
        yield item

Upvotes: 1

Related Questions