Reputation: 341
I am trying to scrape the following website with Scrapy (https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date). I can see that the page gets crawled, but none of the items are returned. Everything works within the Scrapy shell.
Here is the code I have:
class LeadHomeSpider(scrapy.Spider):
    """Crawl Leadhome search-result pages and follow every property card.

    ``parse`` handles a search page: it schedules one request per property
    card (carrying the fields already known from the card in ``meta``) and
    then schedules the next search page. ``parse_property`` handles the
    individual property page.
    """

    name = "lead_home"
    start_urls = [
        'https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date',
    ]

    # parse search page
    def parse(self, response):
        """Yield a request per property card, then a request for the next page."""
        # .get() returns None when nothing matches; fall back to '' so the
        # membership test cannot raise TypeError on a page without an <h1>.
        heading = response.css('h1::text').get() or ''
        offering = 'buy' if 'sale' in heading else 'rent'

        # follow property link
        for prop in response.css('div.search__PropertyCardWrapper-sc-1j5dndx-0.bsqBpI'):
            href = prop.css('a::attr(href)').get()
            if href is None:
                # Card without an anchor: nothing to follow.
                continue
            # urljoin resolves root-relative hrefs against the site and leaves
            # absolute hrefs untouched.
            link = response.urljoin(href)
            labels = prop.css('p.styles__Label-h53xsw-16.bcSkCI::text').getall()
            #prop_type = attempt_get_property_type(labels[0]) if len(labels) != 0 else None
            # The second label on the card is used as the area, when present.
            area = labels[1] if len(labels) > 1 else None
            yield scrapy.Request(
                link,
                meta={'item': {
                    'agency': self.name,
                    'url': link,
                    'area': area,
                    'offering': offering,
                    #'property_type': prop_type,
                }},
                callback=self.parse_property,
            )

        # follow to next page
        next_page_number = response.xpath(
            '//a[contains(@class, "styles__PageNumber-zln67a-0 jRCKhp")]/following-sibling::a/text()').get()
        if next_page_number is not None:
            new_page_link = 'https://www.leadhome.co.za/search/property-for-sale/western-cape/4?sort=date&page=' + next_page_number
            next_page = response.urljoin(new_page_link)
            yield scrapy.Request(next_page, callback=self.parse)

    # parse property
    def parse_property(self, response):
        """Fill in property-page fields on the item passed through ``meta``."""
        item = response.meta.get('item')
        # Text of the <p> that follows the "Uncovered Parking:" label, or None.
        item['parking'] = response.xpath('//p[contains(text(), "Uncovered Parking:")]/following-sibling::p/text()').get()
        # The rest of this method was elided in the original post.
        ...
Any idea what might be wrong here? Any suggestions are welcome! Thank you in advance!
Upvotes: 0
Views: 55
Reputation: 10666
You're using randomly generated class values (1j5dndx-0.bsqBpI, etc.) in your CSS expressions; that's why your code doesn't work. Here is the same code, but using XPath's contains() to match only the stable part of each class name:
def parse(self, response):
    """Yield one request per property card found on a search page.

    Cards and labels are located with XPath ``contains()`` on the leading
    part of the class attribute instead of the full class name, so the
    selectors do not depend on the hashed class suffix.
    """
    # .get() returns None when nothing matches; fall back to '' so the
    # membership test cannot raise TypeError on a page without an <h1>.
    heading = response.css('h1::text').get() or ''
    offering = 'buy' if 'sale' in heading else 'rent'

    # follow property link
    # was: response.css('div.search__PropertyCardWrapper-sc-1j5dndx-0.bsqBpI')
    for prop in response.xpath('//div[contains(@class, "search__PropertyCardWrapper-sc-")]'):
        href = prop.xpath('.//a/@href').get()
        if href is None:
            # urljoin(None) would resolve to the current page URL, so a card
            # without a link must be skipped rather than followed.
            continue
        # was: prop.css('p.styles__Label-h53xsw-16.bcSkCI::text').getall()
        # The first label on the card is used as the property type.
        prop_type = prop.xpath('(.//p[contains(@class, "styles__Label-")])[1]/text()').get()
        # area = a[1] if len(a) > 1 else None
        link = response.urljoin(href)
        yield scrapy.Request(
            url=link,
            meta={'item': {
                'agency': self.name,
                'url': link,
                # 'area': area,
                'offering': offering,
                'property_type': prop_type,
            }},
            callback=self.parse_property,
        )
Upvotes: 1