Reputation: 11
I am facing a weird issue here: the crawler is running without any errors, but it is not yielding any data.
Here is the starter code for one page:
# zillow scraper class
class ZillowScraper(scrapy.Spider):
    # scraper/spider name
    name = "zillow"

    # custom_settings = {
    #     "FEED_FORMAT": "csv",
    #     "FEED_URI": "zillow_data.csv",
    # }

    # base URL
    base_url = "https://www.zillow.com/homes/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-118.34704399108887%2C%22east%22%3A-118.24130058288574%2C%22south%22%3A34.05770827438846%2C%22north%22%3A34.12736593680466%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A13%7D"
    # custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
    }
    # string query parameters
    params = {
        "searchQueryState": '{"pagination":{"currentPage":2},"usersSearchTerm":"Los Angeles, CA","mapBounds":{"west":-119.257679765625,"east":-117.565785234375,"south":33.46151132910718,"north":34.57696456062683},"mapZoom":9,"regionSelection":[{"regionId":12447,"regionType":6}],"isMapVisible":false,"filterState":{"ah":{"value":true},"sort":{"value":"globalrelevanceex"}},"isListVisible":true}',
    }

    def __init__(self):
        self.zpid = []

    def start_requests(self):
        yield scrapy.Request(
            url=self.base_url, headers=self.headers, callback=self.parse_links
        )
Here is the parse_links callback, in which I get the data from the JSON, extract the IDs, and append them to a class-variable list so I can later compare them against the listing ID:
def parse_links(self, response):
    results_selector = response.css(
        'script[data-zrr-shared-data-key="mobileSearchPageStore"]'
    ).get()
    clean_json = (
        results_selector.replace(
            '<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore"><!--',
            "",
        )
        .replace("</script>", "")
        .replace("-->", "")
    )
    parsed_data = json.loads(clean_json)
    data = parsed_data["cat1"]["searchResults"]["listResults"]
    for zid in data:
        self.zpid.append(zid)
    for listing in data:
        yield scrapy.Request(
            url=listing["detailUrl"],
            headers=self.headers,
            callback=self.parse_detail,
        )
Here is the final callback, parse_detail. In this function I again get the data from the JSON. First I parse the URL to extract the ID and compare it with the self.zpid list: I loop over self.zpid and check whether listing_id (the URL ID) equals one of the list's IDs. I then generate the keys dynamically from the ID to get the detailed data:
def parse_detail(self, response):
    item = {}
    listing_url = response.url.split("/")
    parse_id = [u for u in listing_url if u]
    listing_id = parse_id[4][:8]
    for zid in self.zpid:
        if zid == listing_id:
            print(zid)
            api_endpoint = response.css('script[id="hdpApolloPreloadedData"]').get()
            clean_json = api_endpoint.replace(
                '<script id="hdpApolloPreloadedData" type="application/json">', ""
            ).replace("</script>", "")
            parsed_data = json.loads(clean_json)
            sub_data = json.loads(parsed_data["apiCache"])
            item["date"] = sub_data[
                f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
            ]["property"]["datePostedString"]
            item["home_status"] = sub_data[
                f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
            ]["property"]["hdpTypeDimension"]
            item["home_type"] = sub_data[
                f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
            ]["property"]["homeType"]
            item["sqft"] = sub_data[
                f'ForSaleDoubleScrollFullRenderQuery{{"zpid": {zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
            ]["property"]["livingArea"]
            item["street_address"] = sub_data[
                f'VariantQuery{{"zpid":{zid},"altId":null}}'
            ]["property"]["streetAddress"]
            item["city"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
                "property"
            ]["city"]
            item["state"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
                "property"
            ]["state"]
            item["zipcode"] = sub_data[
                f'VariantQuery{{"zpid":{zid},"altId":null}}'
            ]["property"]["zipcode"]
            item["price"] = sub_data[f'VariantQuery{{"zpid":{zid},"altId":null}}'][
                "property"
            ]["price"]
            item["zestimate"] = sub_data[
                f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
            ]["property"]["zestimate"]
            item["parcel_number"] = sub_data[
                f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{zid},"contactFormRenderParameter":{{"zpid":{zid},"platform":"desktop","isDoubleScroll":true}}}}'
            ]["property"]["resoFacts"]["parcelNumber"]
    yield item
# main driver
if __name__ == "__main__":
    # run scraper
    process = CrawlerProcess()
    process.crawl(ZillowScraper)
    process.start()
Right now the crawler is running, hitting the URLs and getting 200 responses, but it is not yielding the data. What am I doing wrong here?
I tried running the crawler without comparing the IDs and it outputs a KeyError, which makes sense, but apart from that the crawler just runs, hitting the URLs and getting 200 responses while yielding empty dictionaries. I tried
response.follow
instead of initiating
scrapy.Request
but got no output, just {} empty dictionaries.
I am expecting:
{'date': '2022-03-11', 'home_status': 'For sale', 'home_type': 'Residential', 'sqft': '2,249', 'street_address': '659 Erskine Dr', 'city': 'Pacific Palisades', 'state': 'CA', 'zipcode': '90272', 'price': '$2,995,000', 'zestimate': '$3,356,900', 'parcel_number': 4413016022}
Instead, this is the log from the run:
2022-03-24 01:04:17 [scrapy.core.engine] INFO: Closing spider (finished)
2022-03-24 01:04:17 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 54014,
'downloader/request_count': 41,
'downloader/request_method_count/GET': 41,
'downloader/response_bytes': 9157579,
'downloader/response_count': 41,
'downloader/response_status_count/200': 41,
'elapsed_time_seconds': 15.943654,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 3, 23, 20, 4, 17, 44889),
'httpcompression/response_bytes': 49582733,
'httpcompression/response_count': 41,
'item_scraped_count': 40,
'log_count/DEBUG': 90,
'log_count/INFO': 10,
'memusage/max': 54341632,
'memusage/startup': 54341632,
'request_depth_max': 1,
'response_received_count': 41,
'scheduler/dequeued': 41,
'scheduler/dequeued/memory': 41,
'scheduler/enqueued': 41,
'scheduler/enqueued/memory': 41,
'start_time': datetime.datetime(2022, 3, 23, 20, 4, 1, 101235)}
2022-03-24 01:04:17 [scrapy.core.engine] INFO: Spider closed (finished)
Upvotes: 0
Views: 219
Reputation: 143197
You have the same problem in many places.
First place:
if zid == listing_id:
Here listing_id is a plain ID string (sliced from the URL), but zid is a whole dictionary. You have to use ["id"] to get the ID out of the dictionary:
if zid["id"] == listing_id:
And later you have the same problem in all keys with "zpid":{zid}: you need "zpid":{zid["id"]} instead. In one key you also have an extra space, "zpid": {zid}, which you have to remove, or the generated key will not match the apiCache entry.
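For illustration, a tiny sketch of how the corrected f-string renders (the zpid value is made up):

zid = {"id": "20533444"}  # hypothetical entry
key = f'VariantQuery{{"zpid":{zid["id"]},"altId":null}}'
print(key)  # VariantQuery{"zpid":20533444,"altId":null}

The doubled braces {{ and }} produce literal braces, so only the ID is interpolated into the key.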
EDIT:
Another small problem: you yield item outside the if, so when zid["id"] == listing_id never matches, it generates an empty row in the file. You should yield inside the if.
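Roughly, the intended control flow looks like this minimal sketch (the dict contents are made up, and the real field extraction is reduced to a single key):

def parse_detail_sketch(zpids, listing_id):
    # hypothetical reduction of parse_detail, showing only yield placement
    for zid in zpids:
        if zid["id"] == listing_id:
            yield {"zpid": zid["id"]}  # yield inside the if: no match, no item

items = list(parse_detail_sketch([{"id": "111"}, {"id": "222"}], "222"))
print(items)  # [{'zpid': '222'}] - one matching item, no empty dicts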
BTW:
Frankly, I don't like the idea of the list self.zpid, because every request has to search all values in the list. The code may also run on many workers, each with its own separate self.zpid, so an element may not be found on this list. The standard method is to send the value to the next callback using
Request(..., meta={"data": zid})
and the callback reads it as
zid = response.meta["data"]
But the newest Scrapy can send it as a parameter for the callback:
Request(..., cb_kwargs={"data": zid})
and the callback gets it as an argument in
def parse_detail(self, response, data):
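A minimal sketch of both patterns with a hypothetical spider (cb_kwargs needs Scrapy 1.7+):

import scrapy

class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["https://example.com"]  # hypothetical

    def parse(self, response):
        listing = {"id": "20533444"}  # hypothetical data to forward
        # older pattern: pass the data via meta
        yield scrapy.Request(response.urljoin("/detail-meta"),
                             callback=self.parse_meta,
                             meta={"data": listing})
        # newer pattern: pass the data via cb_kwargs
        yield scrapy.Request(response.urljoin("/detail-kwargs"),
                             callback=self.parse_kwargs,
                             cb_kwargs={"data": listing})

    def parse_meta(self, response):
        zid = response.meta["data"]
        self.logger.info("meta id: %s", zid["id"])

    def parse_kwargs(self, response, data):
        self.logger.info("cb_kwargs id: %s", data["id"])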
Full working code with other changes:
import scrapy
import json


class ZillowScraper(scrapy.Spider):
    name = "zillow"

    # custom_settings = {
    #     "FEED_FORMAT": "csv",
    #     "FEED_URI": "zillow_data.csv",
    # }

    # base URL
    base_url = "https://www.zillow.com/homes/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-118.34704399108887%2C%22east%22%3A-118.24130058288574%2C%22south%22%3A34.05770827438846%2C%22north%22%3A34.12736593680466%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A13%7D"
    # custom headers
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
    }
    # string query parameters
    params = {
        "searchQueryState": '{"pagination":{"currentPage":2},"usersSearchTerm":"Los Angeles, CA","mapBounds":{"west":-119.257679765625,"east":-117.565785234375,"south":33.46151132910718,"north":34.57696456062683},"mapZoom":9,"regionSelection":[{"regionId":12447,"regionType":6}],"isMapVisible":false,"filterState":{"ah":{"value":true},"sort":{"value":"globalrelevanceex"}},"isListVisible":true}',
    }

    def start_requests(self):
        yield scrapy.Request(
            url=self.base_url, headers=self.headers, callback=self.parse_links
        )

    def parse_links(self, response):
        print('[parse_links] url:', response.url)
        results_selector = response.css(
            'script[data-zrr-shared-data-key="mobileSearchPageStore"]'
        ).get()
        clean_json = (
            results_selector.replace(
                '<script type="application/json" data-zrr-shared-data-key="mobileSearchPageStore"><!--',
                "",
            )
            .replace("</script>", "")
            .replace("-->", "")
        )
        parsed_data = json.loads(clean_json)
        data = parsed_data["cat1"]["searchResults"]["listResults"]
        for listing in data:
            yield scrapy.Request(
                url=listing["detailUrl"],
                headers=self.headers,
                callback=self.parse_detail,
                meta={'data': listing},
            )

    def parse_detail(self, response):
        print('[parse_detail] url:', response.url)
        listing_url = response.url.split("/")
        parse_id = [u for u in listing_url if u]
        listing_id = parse_id[4][:8]
        zid = response.meta['data']
        #print('listing_id:', listing_id)
        #print("zid['id']:", zid['id'])
        if zid['id'] == listing_id:
            api_endpoint = response.css('script[id="hdpApolloPreloadedData"]').get()
            clean_json = api_endpoint.replace(
                '<script id="hdpApolloPreloadedData" type="application/json">', ""
            ).replace("</script>", "")
            parsed_data = json.loads(clean_json)
            sub_data = json.loads(parsed_data["apiCache"])

            id_ = zid['id']
            key_1 = f'ForSaleDoubleScrollFullRenderQuery{{"zpid":{id_},"contactFormRenderParameter":{{"zpid":{id_},"platform":"desktop","isDoubleScroll":true}}}}'
            key_2 = f'VariantQuery{{"zpid":{id_},"altId":null}}'
            properties_1 = sub_data[key_1]["property"]
            properties_2 = sub_data[key_2]["property"]

            item = {}
            item["date"] = properties_1["datePostedString"]
            item["home_status"] = properties_1["hdpTypeDimension"]
            item["home_type"] = properties_1["homeType"]
            item["sqft"] = properties_1["livingArea"]
            item["street_address"] = properties_2["streetAddress"]
            item["city"] = properties_2["city"]
            item["state"] = properties_2["state"]
            item["zipcode"] = properties_2["zipcode"]
            item["price"] = properties_2["price"]
            item["zestimate"] = properties_1["zestimate"]
            item["parcel_number"] = properties_1["resoFacts"]["parcelNumber"]
            yield item


from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ZillowScraper)
c.start()
Upvotes: 1