Reputation: 51
Unable to get the proper XPath for a list of items. I keep getting an empty list; the problem is with selecting the whole list.
The link:
https://globaldrive.ru/moskva/motory/2х-тактный-лодочный-мотор-hangkai-m3.5-hp/
Here is the HTML code that I'm trying to parse:
<div id="content_features" class="ty-wysiwyg-content content-features">
<div class="ty-product-feature">
<span class="ty-product-feature__label">Бренды:</span>
<div class="ty-product-feature__value">Hangkai</div>
</div>
<div class="ty-product-feature">
<span class="ty-product-feature__label">Вес:</span>
<div class="ty-product-feature__value">УТОЧНЯЙТЕ У МЕНЕДЖЕРА<span class="ty-product-feature__suffix">кг</span></div>
</div>
</div>
My code:
# Excerpt: builds item['properties'] as a list of {'name', 'value'} dicts
# and then yields the item.
# NOTE(review): the loop selects the #content_features container itself, and
# './*' only inspects that container's direct children. In the HTML shown
# above those children are the "ty-product-feature" wrappers, so the
# label/value lookups match nothing and normalize-space() yields ''.
for prop in response.xpath('//div[@id="content_features"]'):
    item['properties'].append(
        {
            'name': prop.xpath('normalize-space(./*[@class="ty-product-feature__label"])').extract_first(),
            'value': prop.xpath('normalize-space(./*[@class="ty-product-feature__value"])').extract_first(),
        }
    )
yield item
Complete parser:
import scrapy
class GlobaldriveruSpider(scrapy.Spider):
    """Spider that follows product links from a listing page and yields product dicts."""

    name = 'globaldriveru'
    allowed_domains = ['globaldrive.ru']
    start_urls = ['https://globaldrive.ru/moskva/motory/?items_per_page=500']

    def parse(self, response):
        """Request every product link found on the listing page.

        Each link is resolved against the response URL and handed to
        ``parse_products``.
        """
        links = response.xpath('//div[@class="ty-grid-list__item-name"]/a/@href').extract()
        for link in links:
            yield scrapy.Request(response.urljoin(link), callback=self.parse_products, dont_filter=True)
            #yield scrapy.Request(link, callback=self.parse_products, dont_filter=True)

    def parse_products(self, response):
        """Yield one item dict for each matching product container on the page."""
        # NOTE(review): the loop variable is never used in the body; every
        # XPath below is evaluated against ``response``, not the matched node.
        for parse_products in response.xpath('//div[contains(@class, "container-fluid products_block_page")]'):
            item = dict()
            item['title'] = response.xpath('//h1[@class="ty-product-block-title"]/text()').extract_first()
            item['price'] = response.xpath('//meta[@itemprop="price"]/@content').get()
            # NOTE(review): the id ends in "5511", which looks product-specific;
            # confirm it matches on other product pages.
            item['available'] = response.xpath('normalize-space(//span[@id="in_stock_info_5511"])').extract_first()
            item['image'] = response.xpath('//meta[@property="og:image"]/@content').get()
            item['brand'] = response.xpath('normalize-space(//div[contains(@class,"ty-features-list")])').get()
            # .extract() returns a list, so this field is a list, not a string.
            item['department'] = response.xpath('normalize-space(//a[@class="ty-breadcrumbs__a"][2]/text())').extract()
            item['properties'] = list()
            # The selector matches the #content_features container itself, and
            # './*' below only looks at that container's direct children.
            for prop in response.xpath('//div[@id="content_features"]'):
                item['properties'].append(
                    {
                        'name': prop.xpath('normalize-space(./*[@class="ty-product-feature__label"])').extract_first(),
                        'value': prop.xpath('normalize-space(./*[@class="ty-product-feature__value"])').extract_first(),
                    }
                )
            yield item
Upvotes: 0
Views: 25
Reputation: 3717
Your code is almost right; I just made some corrections to your properties XPath. It also seems that your main "product" loop is useless, so I removed it. Check this code:
def parse_products(self, response):
    """Build one item dict from a product page and yield it.

    The item carries the title, price, availability text, image URL,
    brand text, department and a list of ``{'name', 'value'}`` feature
    pairs read from the ``#content_features`` block.

    Args:
        response: page response exposing ``.xpath()`` (a Scrapy response).

    Yields:
        dict: the scraped product item.
    """
    item = {}
    item['title'] = response.xpath('//h1[@class="ty-product-block-title"]/text()').get()
    item['price'] = response.xpath('//meta[@itemprop="price"]/@content').get()
    # The numeric suffix of this id (e.g. "in_stock_info_5511") looks
    # product-specific, so match on the stable prefix instead of one
    # hard-coded id; normalize-space() uses the first matching node.
    item['available'] = response.xpath(
        'normalize-space(//span[starts-with(@id, "in_stock_info_")])'
    ).get()
    item['image'] = response.xpath('//meta[@property="og:image"]/@content').get()
    item['brand'] = response.xpath('normalize-space(//div[contains(@class,"ty-features-list")])').get()
    # .extract() returns a list, so this field stays a one-element list;
    # kept as-is so consumers of the item see the same shape.
    item['department'] = response.xpath('normalize-space(//a[@class="ty-breadcrumbs__a"][2]/text())').extract()
    item['properties'] = []

    # Text nodes of the first value element, including nested ones such as
    # the unit suffix <span>.
    value_texts_xpath = '(./*[@class="ty-product-feature__value"])[1]//text()'
    for prop in response.xpath('//div[@id="content_features"]/div[@class="ty-product-feature"]'):
        # Join the text nodes with a space so a suffix nested in its own
        # <span> is not glued to the value; then collapse whitespace the
        # way normalize-space() would. A missing value still yields ''.
        value_text = ' '.join(prop.xpath(value_texts_xpath).getall())
        item['properties'].append(
            {
                'name': prop.xpath('normalize-space(./*[@class="ty-product-feature__label"])').get(),
                'value': ' '.join(value_text.split()),
            }
        )
    yield item
Upvotes: 1