Reputation: 487
I have written the code below to scrape the cargurus website. A search shows 15 entries per page.
I want to move iteratively from page 1 to page n and scrape each page. The code below is supposed to do that; however, at the end of the script the dataframe df just contains the first page duplicated numPages times.
I thought the code wasn't giving the server enough time to respond to the request, so I added a time.sleep(1) line, but that doesn't seem to help.
What am I doing wrong?
# Import Modules
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import seaborn as sns
import time
#Utility Functions
def to_number(s):
    """Convert a digit string with thousands separators to an int.

    Args:
        s: Text such as ``"12,345"``. Commas are removed before conversion;
            surrounding whitespace is tolerated by ``int`` itself.

    Returns:
        The integer value of ``s``.

    Raises:
        ValueError: If ``s`` is not an integer once the commas are removed.
    """
    return int(s.replace(',', ''))
def get_location(s):
    """Split a location string into city, state and ZIP code.

    Commas are dropped and the text is split on whitespace. The last token
    is taken as the ZIP code, the one before it as the state, and everything
    before that is re-joined with single spaces as the city name.

    Args:
        s: Text such as ``"New York, NY 10001"``.

    Returns:
        A ``(City, State, Zip)`` tuple of strings. ``City`` is ``''`` when
        the input holds only a state and a ZIP code.

    Raises:
        ValueError: If ``s`` contains fewer than two whitespace-separated
            tokens, so a state and a ZIP code cannot both be read.
    """
    tokens = s.replace(',', '').split()
    # Without this guard a single token would be returned as both the state
    # and the ZIP code, because index -1 and index 0 name the same element.
    if len(tokens) < 2:
        raise ValueError(
            'expected "<city>, <state> <zip>", got {!r}'.format(s)
        )
    City = ' '.join(tokens[:-2])
    State = tokens[-2]
    Zip = tokens[-1]
    return City, State, Zip
def get_YearMakeModelTrim(s):
    """Split a listing title into year, make, model and trim.

    The title is split on whitespace. The first three tokens are the year,
    make and model; every remaining token belongs to the trim, so multi-word
    trims such as ``"Touring Elite"`` are kept intact.

    Args:
        s: Title text such as ``"2016 Honda Odyssey Touring Elite"``.

    Returns:
        A ``(Year, Make, Model, Trim)`` tuple of strings. ``Trim`` is the
        literal string ``"None"`` when the title has no trim tokens.

    Raises:
        IndexError: If ``s`` contains fewer than three tokens.
    """
    tokens = s.split()
    Year = tokens[0]
    Make = tokens[1]
    Model = tokens[2]
    # Join everything after the model; fall back to the "None" placeholder
    # the rest of the script uses for listings without a trim.
    Trim = ' '.join(tokens[3:]) or "None"
    return Year, Make, Model, Trim
numPages = 10
# NOTE: the page number is placed after '#', i.e. in the URL *fragment*.
# HTTP clients never send the fragment to the server, so every request made
# from this template asks for the same resource and gets page 1 back. The
# site's own JavaScript reportedly pages through results with a POST to
# .../ajaxFetchSubsetInventoryListing.action carrying a `page` form field.
# Until the scraper is moved to that endpoint, the loop below stops as soon
# as a page repeats the previous one instead of silently piling up copies.
baseURL = 'https://www.cargurus.com/Cars/inventorylisting/viewDetailsFilterViewInventoryListing.action?sourceContext=forSaleTab_false_0&newSearchFromOverviewPage=true&inventorySearchWidgetType=AUTO&entitySelectingHelper.selectedEntity=c24578&entitySelectingHelper.selectedEntity2=c25202&zip=03062&distance=50000&searchChanged=true&modelChanged=false&filtersModified=true#resultsPage={}'
data = []
previous_rows = None
for ii in range(numPages):
    URL = baseURL.format(ii+1)
    print(URL)
    # Fail fast on network stalls and on HTTP error pages rather than
    # parsing an error document as if it were a result list.
    response = requests.get(URL, timeout=30)
    response.raise_for_status()
    r = response.text
    # Pause between requests to avoid hammering the site.
    time.sleep(1)
    soup = bs(r, 'html.parser')
    stats = soup.find_all("div", attrs={"class": "cg-dealFinder-result-stats"})
    deals = soup.find_all("div", attrs={"class": "cg-dealFinder-result-deal"})
    titles = soup.find_all("h4", {"class": "cg-dealFinder-result-model"})
    page_rows = []
    for title, deal, stat in zip(titles, deals, stats):
        paragraphs = stat.find_all("p")
        row = {}
        # The fixed slice offsets skip the label text in front of each value.
        row["Price"] = to_number(stat.find('span').get_text()[1:])
        row["Mileage"] = to_number(paragraphs[1].text[9:])
        row["City"], row["State"], row["Zip"] = get_location(paragraphs[2].text[10:])
        row["natAvgPrice"] = to_number(deal.find('span', attrs={'class': 'nationalAvg'}).get_text()[17:])
        row["Year"], row["Make"], row["Model"], row["Trim"] = get_YearMakeModelTrim(title.find('span', attrs={'itemprop': 'name'}).get_text())
        row["NewUsed"] = title.find('span', attrs={'class': 'invisibleLayer'}).get_text()[:-5]
        page_rows.append(row)
    if page_rows and page_rows == previous_rows:
        # Same listings as the previous request: the server ignored the
        # page number, so further requests would only add duplicates.
        print('Page {} repeats the previous page; stopping early.'.format(ii + 1))
        break
    data.extend(page_rows)
    previous_rows = page_rows
df = pd.DataFrame(data)
# `height` is the current name of the keyword formerly called `size`.
sns.pairplot(x_vars=["Mileage"], y_vars=["Price"], data=df, hue="Trim", height=5)
Upvotes: 0
Views: 878
Reputation: 142641
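The `#resultsPage=N` part of your URL is a fragment. Fragments are handled by the browser and are never sent to the server, so every `requests.get()` call receives the same first page. The listings themselves are loaded by JavaScript/AJAX from this URL: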
https://www.cargurus.com/Cars/inventorylisting/ajaxFetchSubsetInventoryListing.action?sourceContext=forSaleTab_false_0
It uses a `POST` request with form parameters, and one of those parameters is `page`, which selects the results page.
from bs4 import BeautifulSoup
import requests
# Form fields sent with every POST to the listing endpoint. Key order and
# values are kept exactly as captured; only 'page' is changed per request.
params = {
    # --- search location ---
    'zip': '03062',
    # NOTE(review): the '+' looks like a form-encoded space copied from the
    # browser; requests will encode it again -- confirm the intended value.
    'address': 'Nashua,+NH',
    'latitude': "42.73040008544922",
    'longitude': '-71.49479675292969',
    'distance': 50000,

    # --- vehicle selection ---
    # NOTE(review): the search URL in the question spells the first key
    # 'entitySelectingHelper.selectedEntity' -- confirm the bare
    # 'selectedEntity' is what the endpoint expects.
    'selectedEntity': 'c24578',
    'entitySelectingHelper.selectedEntity2': 'c25202',

    # --- filters (empty string = not set) ---
    'minPrice': '',
    'maxPrice': '',
    'minMileage': '',
    'maxMileage': '',
    'transmission': 'ANY',
    'bodyTypeGroup': '',
    'serviceProvider': '',

    # --- paging: overwritten before each request ---
    'page': 1,

    # --- remaining fields ---
    'filterBySourcesString': '',
    'filterFeaturedBySourcesString': '',
    'displayFeaturedListings': True,
    'searchSeoPageType': '',
    'inventorySearchWidgetType': 'AUTO',
    'allYearsForTrimName': False,
    'daysOnMarketMin': '',
    'daysOnMarketMax': '',
    'vehicleDamageCategoriesRaw': '',
    'minCo2Emission': '',
    'maxCo2Emission': '',
    'vatOnly': False,
    'minEngineDisplacement': '',
    'maxEngineDisplacement': '',
    'minMpg': '',
    'maxMpg': '',
    'startYear': 2015,
    'endYear': 2016,
    'isRecentSearchView': False,
}
# Endpoint the page's own JavaScript calls to load listings.
url = 'https://www.cargurus.com/Cars/inventorylisting/ajaxFetchSubsetInventoryListing.action?sourceContext=forSaleTab_false_0'

# Print the available JSON keys once, on the first response only.
display_keys = True

for x in range(1, 4):
    # The 'page' form field selects which page of results is returned.
    params['page'] = x

    response = requests.post(url, data=params, timeout=30)
    # Stop on HTTP errors instead of trying to decode an error page as JSON.
    response.raise_for_status()
    data = response.json()
    listings = data['listings']

    if display_keys:
        display_keys = False
        for key in data:
            print('key:', key)
        # An empty result list has no first item to inspect.
        if listings:
            for key in listings[0]:
                print("data['listings'] key:", key)
        print('-----')

    print('--- offers number:', len(listings), '---')

    # Show only the first 10 offers of each response to keep output short.
    for item in listings[:10]:
        print(item['id'], data['modelName'], item['modelName'], item['trimName'])
Result - keys
key: listings
key: modelName
key: styleSet
key: modelId
key: serviceProviders
key: page
key: sellers
key: remainingResults
data['listings'] key: bodyType
data['listings'] key: fleet
data['listings'] key: serviceProviderId
data['listings'] key: saved
data['listings'] key: highwayFuelEconomy
data['listings'] key: modelId
data['listings'] key: nonwholesaleSellerId
data['listings'] key: isFranchiseDealer
data['listings'] key: regressionPrice
data['listings'] key: rating
data['listings'] key: listedDate
data['listings'] key: dealerRatingPriceAdjustment
data['listings'] key: isOEMCPO
data['listings'] key: sellerId
data['listings'] key: transmission
data['listings'] key: mainPictureUrl
data['listings'] key: monthlyPayment
data['listings'] key: price
data['listings'] key: exteriorColorName
data['listings'] key: id
data['listings'] key: isFeatured
data['listings'] key: mileage
data['listings'] key: makeId
data['listings'] key: zip
data['listings'] key: noPhotos
data['listings'] key: isCertified
data['listings'] key: msrpString
data['listings'] key: engineCylinders
data['listings'] key: expectedPriceString
data['listings'] key: trimName
data['listings'] key: daysOnMarket
data['listings'] key: scaleMainPictureOnLoad
data['listings'] key: vehicleDamageCategory
data['listings'] key: monthlyPaymentString
data['listings'] key: isOutlier
data['listings'] key: cityFuelEconomy
data['listings'] key: savingsAmount
data['listings'] key: ownerCount
data['listings'] key: absoluteRating
data['listings'] key: salvage
data['listings'] key: contacted
data['listings'] key: priceString
data['listings'] key: distance
data['listings'] key: originalPrice
data['listings'] key: sellerRating
data['listings'] key: mileageString
data['listings'] key: engineType
data['listings'] key: wheelSystemDisplay
data['listings'] key: isDisplayConquestSection
data['listings'] key: serviceProviderName
data['listings'] key: carYear
data['listings'] key: savingsRecommendation
data['listings'] key: specificOptionIds
data['listings'] key: lemon
data['listings'] key: vehicleIdentifier
data['listings'] key: bodyTypeGroupId
data['listings'] key: useAnonymousContactEmail
data['listings'] key: msrp
data['listings'] key: sellerCity
data['listings'] key: bodyTypeGroupName
data['listings'] key: savingsArrowImage
data['listings'] key: dealScore
data['listings'] key: frameDamaged
data['listings'] key: hasAccidents
data['listings'] key: isCPO
data['listings'] key: expectedPrice
data['listings'] key: engineDisplacement
data['listings'] key: priceDifferentialString
data['listings'] key: trimLevelName
data['listings'] key: isNew
data['listings'] key: modelName
data['listings'] key: bodyTypeId
data['listings'] key: theftTitle
data['listings'] key: fuelType
data['listings'] key: maxSeating
data['listings'] key: wheelSystem
data['listings'] key: isConquestEnabled
data['listings'] key: autoEntityId
data['listings'] key: franchiseMake
data['listings'] key: optionIds
data['listings'] key: makeName
-----
Result - I display only the first 10 items for every request (each made with a different `page` value)
--- offers number: 2000 ---
190057566 Honda Odyssey Odyssey Touring Elite
194518873 Honda Odyssey Odyssey
184211547 Honda Odyssey Odyssey Touring Elite
185999601 Honda Odyssey Odyssey EX-L
191225205 Honda Odyssey Odyssey EX-L
192457272 Honda Odyssey Odyssey EX-L
190727203 Honda Odyssey Odyssey EX-L
189805101 Honda Odyssey Odyssey EX-L
190017310 Honda Odyssey Odyssey EX-L
185841600 Honda Odyssey Odyssey SE
--- offers number: 1985 ---
189574780 Honda Odyssey Odyssey EX-L
185923444 Honda Odyssey Odyssey EX-L
193088921 Honda Odyssey Odyssey Touring Elite
191861106 Honda Odyssey Odyssey EX-L
188361750 Honda Odyssey Odyssey Touring
185077447 Honda Odyssey Odyssey EX-L
182773821 Honda Odyssey Odyssey SE
189573553 Honda Odyssey Odyssey EX
191224649 Honda Odyssey Odyssey EX-L
179786502 Honda Odyssey Odyssey EX
--- offers number: 1970 ---
192649298 Honda Odyssey Odyssey Touring Elite
188612484 Honda Odyssey Odyssey EX-L
182338399 Honda Odyssey Odyssey EX
193159667 Honda Odyssey Odyssey EX-L
188979870 Honda Odyssey Odyssey EX-L
194311827 Honda Odyssey Odyssey EX
181047736 Honda Odyssey Odyssey EX-L
189115988 Honda Odyssey Odyssey EX-L
183408178 Honda Odyssey Odyssey EX-L
188950701 Honda Odyssey Odyssey EX-L
Upvotes: 2