Reputation: 65
I'm practicing web scraping with Python using the code below.
One of the fields can appear under either of two XPaths. I would like to know if there is a way to capture whichever one is present using an "if" condition, but I don't know how to insert it into my code. Can anyone guide me?
For example, if one of the XPaths returns null, the value will surely be under the other. In other words, given a and b: if a is null, then use b.
'vlr_atual' can be either of the following:
product.xpath(".//span[@id='priceblock_ourprice']/text()").get()
product.xpath(".//span[@id='priceblock_saleprice']/text()").get()
import scrapy
import datetime
class ProductsSpider(scrapy.Spider):
    """Crawl the Amazon.com.br computers listing and yield one record per product.

    ``parse`` walks the paginated listing and follows each product link;
    ``parse_details`` extracts the fields from each product page.
    """

    name = 'products'
    allowed_domains = ['www.amazon.com.br']
    start_urls = ['https://www.amazon.com.br/s?i=computers&bbn=16339926011&rh=n%3A16364756011&fs=true&qid=1615634908&ref=sr_pg_1']

    def parse(self, response):
        """Follow every product on a listing page, then the next listing page.

        The badge text ("selo") is read from the listing card here and
        forwarded to ``parse_details`` through the request ``meta``.
        """
        for produto in response.xpath("//div[@class='a-section a-spacing-medium']"):
            href = produto.xpath(".//h2/a/@href").get()
            if href is None:
                # Joining a missing href would resolve to the listing URL
                # itself, sending the listing page to parse_details.
                continue
            selo = produto.xpath(".//span[@class='a-badge-text']/text()").get()
            # response.follow resolves relative URLs against the current page.
            yield response.follow(url=href, callback=self.parse_details, meta={'selo': selo})

        # Test the raw href, not the joined URL: urljoin(None) returns the
        # current page URL, which is always truthy and would make the spider
        # re-request the last page instead of stopping.
        next_href = response.xpath("//li[@class='a-last']/a/@href").get()
        if next_href is not None:
            yield response.follow(url=next_href, callback=self.parse)

    def parse_details(self, response):
        """Yield one dict per product container found on a product page.

        The price can be under either of two span ids: the "our price" text
        is preferred and the "sale price" text is used when it is absent.
        """
        selo = response.meta['selo']
        for produto in response.xpath("//div[@id='dp']"):
            vlr_atual = produto.xpath(".//span[@id='priceblock_ourprice']/text()").get()
            if vlr_atual is None:
                # Fall back to the alternative price element.
                vlr_atual = produto.xpath(".//span[@id='priceblock_saleprice']/text()").get()
            yield {
                'data': datetime.datetime.now().strftime("%Y%m%d"),
                'selo': selo,
                'nome': produto.xpath("normalize-space(.//span[@id='productTitle']/text())").get(),
                'vlr_atual': vlr_atual,
                'estoque': produto.xpath("normalize-space(.//select[@name='quantity']/option[last()]/text())").get(),
                'ean': produto.xpath("normalize-space(.//table[@id='productDetails_techSpec_section_1']//tr[last()]/td/text())").get(),
            }
Upvotes: 0
Views: 103
Reputation: 10666
I highly recommend using Item Loaders. They let you populate and post-process the selected fields in a single place: take the first non-blank value, join several results, etc.
First, define your product item in items.py with the `TakeFirst` processor:
class ProductItem(scrapy.Item):
    """Item with the product fields filled in by the spider's loader."""

    # Fields declared without any processor.
    data = scrapy.Field()
    selo = scrapy.Field()

    # Several XPaths may feed this field; TakeFirst is declared as its
    # output processor so a single value is kept.
    vlr_atual = scrapy.Field(output_processor=TakeFirst())
Next, use it in your spider (remember to yield the result of `l.load_item()` so that Scrapy actually receives the item):
from scrapy.loader import ItemLoader
....
for produto in response.xpath("//div[@id='dp']"):
l = ItemLoader(item=ProductItem(), selector=produto)
l.add_value('data', datetime.datetime.now().strftime("%Y%m%d"))
l.add_xpath("vlr_atual", ".//span[@id='priceblock_ourprice']/text()")
l.add_xpath("vlr_atual", ".//span[@id='priceblock_saleprice']/text()")
...
l.load_item()
Upvotes: 1
Reputation: 2407
How about something really simple:
def parse_details(self, response):
    """Yield one record per product container found on a detail page.

    The price may be under either of two span ids: the "our price" text is
    preferred, and the "sale price" text is used when the former is absent.
    """
    badge = response.request.meta['selo']
    for node in response.xpath("//div[@id='dp']"):
        # Both candidates are queried up front, then the preferred one wins.
        regular_price = node.xpath(".//span[@id='priceblock_ourprice']/text()").get()
        sale_price = node.xpath(".//span[@id='priceblock_saleprice']/text()").get()
        current_price = sale_price if regular_price is None else regular_price

        # Keys are inserted in the same order as the output columns.
        record = {}
        record['data'] = datetime.datetime.now().strftime("%Y%m%d")
        record['selo'] = badge
        record['nome'] = node.xpath(
            "normalize-space(.//span[@id='productTitle']/text())"
        ).get()
        record['vlr_atual'] = current_price
        record['estoque'] = node.xpath(
            "normalize-space(.//select[@name='quantity']/option[last()]/text())"
        ).get()
        record['ean'] = node.xpath(
            "normalize-space(.//table[@id='productDetails_techSpec_section_1']//tr[last()]/td/text())"
        ).get()
        yield record
Upvotes: 1
Reputation: 8510
You can use the `or` operator to choose between two values while giving preference to the first one:
>>> a="www.example.com"
>>> b="www.example2.com"
>>> a or b
'www.example.com'
>>> a=None
>>> a or b
'www.example2.com'
>>>
The way this works is that if `a` has a truthy value, then `a or b` returns `a`; otherwise it returns `b`. (Note that an empty string is falsy too, not only `None`.)
So you can do:
product.xpath (".//span[@id='priceblock_ourprice']/text()").get() or product.xpath (".//span[@id='priceblock_saleprice']/text()").get()
Edit
you can also encapsulate this into its own function, like so
def get_vlr_atual(product, default=None):
    """Return the current price text from the first XPath that matches.

    Args:
        product: Selector-like object whose ``xpath(query).get()`` returns
            the matched text, or ``None`` when nothing matches.
        default: Value returned when none of the candidate XPaths match.

    Returns:
        The first non-``None`` result, trying the candidates in order of
        preference, or ``default`` if every candidate yields ``None``.
    """
    # Ordered by preference; append more XPaths here as needed.
    candidate_xpaths = (
        ".//span[@id='priceblock_ourprice']/text()",
        ".//span[@id='priceblock_saleprice']/text()",
    )
    # The loop must iterate the same name the candidates are bound to;
    # a mismatched name here raises NameError on every call.
    for path in candidate_xpaths:
        result = product.xpath(path).get()
        if result is not None:
            return result
    return default
This does basically the same as before, but it can easily be expanded to as many XPaths as you like, and if all of them fail it just returns a convenient default value.
Then simply use it like this:
...
{...
'vlr_atual': get_vlr_atual(product),
...
}
...
Upvotes: 1