Reputation: 779
I am trying to scrape information as shown below, but it produces the wrong output. What mistakes am I making? This is the page link: https://www.thegrommet.com/products/the-vintage-pearlmini-peas-in-the-pod-necklace
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Crawl the personalized-gifts listing, follow each product link, and
    yield the product title plus the first three accordion-section text
    snippets from every product page."""

    name = 'book'
    start_urls = ['https://www.thegrommet.com/gifts/by-type/personalized-gifts']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        """Queue a parse_book request for every product href on the listing page."""
        books = response.xpath("//div[@class='flex-grow | p-t-s']//@href").extract()
        for book in books:
            yield Request(response.urljoin(book), callback=self.parse_book)

    def parse_book(self, response):
        """Yield one item per accordion section on the product page.

        Fixes relative to the original:
        * The inner XPath must be RELATIVE (``.//``).  An absolute ``//``
          expression evaluated on a sub-selector still searches the WHOLE
          document, so every loop iteration returned the same (wrong) data —
          this is the bug that produced the asker's wrong output.
        * ``.get()`` returns None when the title XPath matches nothing, so
          guard before calling ``.strip()``.
        * Bare ``except:`` hid every error; only IndexError (fewer than four
          text nodes) is expected here.
        """
        title = (response.xpath("//div[@class='f-heading-xl']//text()").get() or '').strip()
        for section in response.xpath("//div[@class='accordion-section | p-t-s p-b-m']"):
            # Relative selection: only text nodes inside THIS section.
            texts = section.xpath('.//div//text()')
            values = []
            for index in (1, 2, 3):
                try:
                    values.append((texts[index].get() or '').strip())
                except IndexError:
                    # Section has fewer text nodes than expected — emit ''.
                    values.append('')
            yield {
                'title': title,
                'd1': values[0],
                'd2': values[1],
                'd3': values[2],
            }
PIC2:
Upvotes: 0
Views: 73
Reputation: 16187
You can select the XPath expressions for d1, d2, and d3
in the following way, and there is no need to use try/except because Scrapy handles None values itself. You can also use the built-in XPath function normalize-space
to remove leading and trailing whitespace and newlines.
Full working code:
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Follow every product link on the personalized-gifts listing and emit,
    for each product page, its title, three normalized description snippets,
    and the page URL."""

    name = 'book'
    start_urls = ['https://www.thegrommet.com/gifts/by-type/personalized-gifts']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.62 Safari/537.36'
    }

    def parse(self, response):
        """Schedule a parse_book request for each product href found on the listing."""
        for href in response.xpath("//div[@class='flex-grow | p-t-s']//@href").extract():
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Build and yield a single item for the product page.

        normalize-space() trims leading/trailing whitespace and newlines in
        the XPath engine itself, and ``.get()`` yields None for a missing
        node, so no try/except handling is required.
        """
        raw_title = response.xpath("//div[@class='f-heading-xl']//text()").get()
        item = {'title': raw_title.strip()}
        item['d1'] = response.xpath('normalize-space((//*[@class="accordion-section | p-t-s p-b-m"]/div)[1]/text()[1])').get()
        item['d2'] = response.xpath('normalize-space((//*[@class="accordion-section | p-t-s p-b-m"]/div)[2]/text()[1])').get()
        item['d3'] = response.xpath('normalize-space((//*[@class="accordion-section | p-t-s p-b-m"]/div)[3]/text()[1])').get()
        item['url'] = response.url
        yield item
Upvotes: 1