Reputation: 117
This is my first time using Python and doing web scraping. I have been looking around but am still unable to work out how to do what I need.
Below are screenshots of the elements that I inspected in Chrome.
I am trying to get the apartment names and addresses for each city selected from the drop-down list.
import requests
from bs4 import BeautifulSoup

# NOTE: the site URL is blank in this listing; set it before running.
rootURL = ''
response = requests.get(rootURL)
html = response.content
soup = BeautifulSoup(html, 'lxml')

# Each anchor in the drop-down menu links to one city page.
dropdown_list = soup.select(".primary .child-pages a")
cityLinks = [rootURL + dropdown_list_value['href'] for dropdown_list_value in dropdown_list]

# Walk the anchors and their absolute URLs together: requests.get() needs a
# single URL string per call, not the whole list of links.
for cityLinks_select, city_url in zip(dropdown_list, cityLinks):
    print('Selecting city:', cityLinks_select.text)
    cityResponse = requests.get(city_url)
    cityHtml = cityResponse.content
    citySoup = BeautifulSoup(cityHtml, 'lxml')

    # "a[h2 h3]" is not a valid CSS selector, so select the anchors and look
    # for the <h2>/<h3> children in Python instead.
    # NOTE(review): assumes the name is in an <h2> and the address in an <h3>
    # inside each anchor -- confirm against the real page markup.
    community_list = citySoup.select(".extended-search .property-container a")
    for community in community_list:
        name_tag = community.find('h2')
        address_tag = community.find('h3')
        if name_tag is None or address_tag is None:
            # not an apartment entry; skip unrelated anchors
            continue
        print(community.get('href'))              # apartment link
        print(name_tag.get_text(strip=True))      # apartment name
        print(address_tag.get_text(strip=True))   # apartment address
Upvotes: 3
Views: 4037
Reputation: 180441
As I commented, some of the data is dynamically created. If we look at the page source itself, we see:
<div class="content">
<div class="title-container">
<h2 class="building-name"><%= building.get('name') %></h2>
<h3 class="address"><%= building.get('address').address %></h3>
<div class="rent">
<h4 class="sub-title">Rent from</h4>
<% if (building.get('statistics').suites.rates.min !== 'undefined') { %>
<% $min_rate = commaSeparateNumber(parseInt(building.get('statistics').suites.rates.min)); %>
<span class="rent-value">$<%= $min_rate %></span>
<% } %>
All we can get from the source is the building name, the address and the phone number:
# Build the absolute URL of every city page from the drop-down anchors.
cityLinks = [rootURL + dropdown_list_value['href'] for dropdown_list_value in dropdown_list]

# you need to iterate over the joined urls
for city in cityLinks:  # Looping each city from the Apartment drop down list
    cityResponse = requests.get(city)
    cityHtml = cityResponse.content
    citySoup = BeautifulSoup(cityHtml, 'lxml')
    # all the info we can parse is inside the div class="building-info"
    for div in citySoup.select("div.building-info"):
        # NOTE(review): the original loop body is missing from this listing;
        # printing the div's text is a stand-in that shows whatever static
        # details the page carries for each building.
        print(div.get_text(" ", strip=True))
We can get all the data in json format if we mimic an ajax request:
import requests
from bs4 import BeautifulSoup
from pprint import pprint as pp

# NOTE: the site URL is blank in this listing; set it before running.
rootURL = ''
response = requests.get(rootURL)
html = response.content
soup = BeautifulSoup(html, 'lxml')

# Each anchor in the drop-down menu links to one city page.
dropdown_list = soup.select(".primary .child-pages a")
cityLinks = (rootURL + dropdown_list_value['href'] for dropdown_list_value in dropdown_list)

# params for our request
params = {"show_promotions": "true",
          "show_custom_fields": "true",
          "client_id": "6",
          "auth_token": "sswpREkUtyeYjeoahA2i",
          "min_bed": "-1",
          "max_bed": "100",
          "min_bath": "0",
          "max_bath": "10",
          "min_rate": "0",
          "max_rate": "4000",
          "keyword": "false",
          "property_types": "low-rise-apartment,mid-rise-apartment,high-rise-apartment,luxury-apartment,townhouse,house,multi-unit-house,single-family-home,duplex,tripex,semi",
          "order": "max_rate ASC, min_rate ASC, min_bed ASC, max_bath ASC",
          "limit": "50",
          "offset": "0",
          "count": "false"}

for city in cityLinks:  # Looping each city from the Apartment drop down list
    with requests.Session() as s:
        r = s.get(city)
        # we need to parse the city_id for our next request to work
        citySoup = BeautifulSoup(r.content, 'lxml')
        # NOTE(review): the original selector is blank in this listing; match
        # any element that carries the data-city-id attribute read below.
        city_tag = citySoup.select_one("[data-city-id]")
        if city_tag is None:
            # no city id on this page, so there is nothing to query
            continue
        # update params with the city id
        params["city_id"] = city_tag["data-city-id"]
        # NOTE: the JSON endpoint URL is blank in this listing; fill it in
        # from the request shown in the browser's network tools.
        js = s.get("", params=params).json()
        pp(js)
Now we get data like:
[{u'address': {u'address': u'325 North Park Street',
u'city': u'Brantford',
u'city_id': 332,
u'country': u'Canada',
u'country_code': u'CAN',
u'intersection': u'',
u'neighbourhood': u'',
u'postal_code': u'N3R 2X4',
u'province': u'Ontario',
u'province_code': u'ON'},
u'availability_count': 6,
u'availability_status': 1,
u'availability_status_label': u'Available Now',
u'building_header': u'',
u'client': {u'email': u'[email protected]',
u'id': 6,
u'name': u'Homestead Land Holdings',
u'phone': u'613-546-3146',
u'website': u''},
u'contact': {u'alt_extension': u'',
u'alt_phone': u'',
u'email': u'[email protected]',
u'extension': u'',
u'fax': u'(519) 752-6855',
u'name': u'',
u'phone': u'519-752-3596'},
u'details': {u'features': u'',
u'location': u'',
u'overview': u"Located on North Park Street and Memorial Avenue,this quiet building is within walking distance of the following: - Zehrs Plaza, North Park Plaza, Shoppers Drug Mart, Zehrs Grocery Store, Zellers, Pet Store, Party Supply Store, furniture store, variety store, Black's Photography, paint shop and veterinary clinic\xa0 - Restaurants and coffee shops\xa0 - Wayne Gretzky Recreational Arena\xa0 - Medical Clinic,Shoppers Home Health Care Clinic and Pharmacy\xa0 - Catholic Elementary School\xa0 - On bus route ",
u'suite': u''},
u'geocode': {u'distance': None,
u'latitude': u'43.1703624',
u'longitude': u'-80.2605725'},
u'id': 309,
u'matched_beds': [u'0', u'1', u'2'],
u'matched_suite_names': [u'Bachelor', u'One Bedroom', u'Two Bedroom'],
u'min_availability_date': u'',
u'name': u'North Park Tower',
u'office_hours': u'',
u'parking': {u'additional': u'', u'indoor': u'', u'outdoor': u''},
u'permalink': u'',
u'pet_friendly': True,
u'photo': u'1443018148_2.jpg',
u'photo_path': u'',
u'promotion': {u'featured': 0},
u'property_type': u'High-rise-apartment',
u'statistics': {u'suites': {u'bathrooms': {u'average': 1.0,
u'max': 1.0,
u'min': 1.0},
u'bedrooms': {u'average': u'1.0',
u'max': 2,
u'min': 0},
u'rates': {u'average': 950.0,
u'max': 1275.0,
u'min': 625.0},
u'square_feet': {u'average': 0.0,
u'max': u'0.0',
u'min': u'0.0'}}},
u'thumbnail_path': u'',
u'website': {u'description': u'', u'title': u'', u'url': u''}},
{u'address': {u'address': u'661 West Street',
u'city': u'Brantford',
u'city_id': 332,
u'country': u'Canada',
u'country_code': u'CAN',
u'intersection': u'',
u'neighbourhood': u'',
u'postal_code': u'N3R 6W9',
u'province': u'Ontario',
u'province_code': u'ON'},
u'availability_count': 6,
u'availability_status': 1,
u'availability_status_label': u'Available Now',
u'building_header': u'',
u'client': {u'email': u'[email protected]',
u'id': 6,
u'name': u'Homestead Land Holdings',
u'phone': u'613-546-3146',
u'website': u''},
u'contact': {u'alt_extension': u'',
u'alt_phone': u'',
u'email': u'[email protected]',
u'extension': u'',
u'fax': u'(519) 751-0379',
u'name': u'',
u'phone': u'519-751-3867'},
u'details': {u'features': u'',
u'location': u'',
u'overview': u'Located in the North end of Brantford, Westgate Tower is in an area that resembles a city within a city. There are a variety of banks, grocery stores, drug stores, malls, a wide selection of fast food, fine dining restaurants and an after hours medical centre, within waking distance.',
u'suite': u''},
u'geocode': {u'distance': None,
u'latitude': u'43.1733242',
u'longitude': u'-80.2482991'},
u'id': 310,
u'matched_beds': [u'0', u'1', u'2'],
u'matched_suite_names': [u'Bachelor', u'One Bedroom', u'Two Bedroom'],
u'min_availability_date': u'',
u'name': u'Westgate Apartments',
u'office_hours': u'',
u'parking': {u'additional': u'', u'indoor': u'', u'outdoor': u''},
u'permalink': u'',
u'pet_friendly': True,
u'photo': u'1443017488_1.jpg',
u'photo_path': u'',
u'promotion': {u'featured': 0},
u'property_type': u'High-rise-apartment',
u'statistics': {u'suites': {u'bathrooms': {u'average': 1.0,
u'max': 1.0,
u'min': 1.0},
u'bedrooms': {u'average': u'1.0',
u'max': 2,
u'min': 0},
u'rates': {u'average': 975.0,
u'max': 1300.0,
u'min': 650.0},
u'square_feet': {u'average': 0.0,
u'max': u'0.0',
u'min': u'0.0'}}},
u'thumbnail_path': u'',
u'website': {u'description': u'', u'title': u'', u'url': u''}},
{u'address': {u'address': u'321 Fairview Drive',
u'city': u'Brantford',
u'city_id': 332,
u'country': u'Canada',
u'country_code': u'CAN',
u'intersection': u'',
u'neighbourhood': u'',
u'postal_code': u'N3R 2X6',
u'province': u'Ontario',
u'province_code': u'ON'},
u'availability_count': 8,
u'availability_status': 1,
u'availability_status_label': u'Available Now',
u'building_header': u'',
u'client': {u'email': u'[email protected]',
u'id': 6,
u'name': u'Homestead Land Holdings',
u'phone': u'613-546-3146',
u'website': u''},
u'contact': {u'alt_extension': u'',
u'alt_phone': u'',
u'email': u'[email protected]',
u'extension': u'',
u'fax': u'(519) 752-6855',
u'name': u'',
u'phone': u'519-752-3596'},
u'details': {u'features': u'',
u'location': u'',
u'overview': u'Dornia Manor is a quiet, ninety-two unit apartment building located in the North end of Brantford. We offer one, two and three bedroom units and one penthouse suite. The building is located in close proximity to many major services such as banking, shopping, health services, recreational facilities, beauty shops, dry cleaners, schools and churches. There is a bus stop at the front door and highway 403 is within minutes.',
u'suite': u''},
u'geocode': {u'distance': None,
u'latitude': u'43.1706331',
u'longitude': u'-80.2584034'},
u'id': 308,
u'matched_beds': [u'1', u'2', u'3'],
u'matched_suite_names': [u'One Bedroom', u'Two Bedroom', u'Three Bedroom'],
u'min_availability_date': u'',
u'name': u'Dornia Manor',
u'office_hours': u'',
u'parking': {u'additional': u'', u'indoor': u'', u'outdoor': u''},
u'permalink': u'',
u'pet_friendly': True,
u'photo': u'1443017947_1.jpg',
u'photo_path': u'',
u'promotion': {u'featured': 0},
u'property_type': u'High-rise-apartment',
u'statistics': {u'suites': {u'bathrooms': {u'average': 1.375,
u'max': 2.0,
u'min': 1.0},
u'bedrooms': {u'average': u'2.25',
u'max': 3,
u'min': 1},
u'rates': {u'average': 1124.5,
u'max': 1350.0,
u'min': 899.0},
u'square_feet': {u'average': 0.0,
u'max': u'0.0',
u'min': u'0.0'}}},
u'thumbnail_path': u'',
u'website': {u'description': u'', u'title': u'', u'url': u''}}]
That gives you the url, bedrooms and pretty much everything you could want. Each dict in the list is one listing; you just need to use the keys to pull out the data you want, for example:
# Print the postal code of every listing returned by the JSON request.
for dct in js:
    # the address details of a listing live in a nested dict
    add = dct["address"]
    print(add["postal_code"])
Would give you:
N3R 2X4
N3R 6W9
N3R 2X6
The contact info is under dct["contact"]
and the stats are under dct["statistics"]:
# Print the contact details and the suite statistics of every listing.
for dct in js:
    contact = dct["contact"]
    stats = dct["statistics"]
    print(contact)
    # NOTE(review): the sample output shows the dict nested under "suites"
    # (rates, bedrooms, bathrooms, square_feet), so print that level.
    print(stats["suites"])
Which would give you:
{u'alt_phone': u'', u'fax': u'(519) 752-6855', u'name': u'', u'alt_extension': u'', u'phone': u'519-752-3596', u'extension': u'', u'email': u'[email protected]'}
{u'rates': {u'max': 1275.0, u'average': 950.0, u'min': 625.0}, u'bedrooms': {u'max': 2, u'average': u'1.0', u'min': 0}, u'bathrooms': {u'max': 1.0, u'average': 1.0, u'min': 1.0}, u'square_feet': {u'max': u'0.0', u'average': 0.0, u'min': u'0.0'}}
{u'alt_phone': u'', u'fax': u'(519) 751-0379', u'name': u'', u'alt_extension': u'', u'phone': u'519-751-3867', u'extension': u'', u'email': u'[email protected]'}
{u'rates': {u'max': 1300.0, u'average': 975.0, u'min': 650.0}, u'bedrooms': {u'max': 2, u'average': u'1.0', u'min': 0}, u'bathrooms': {u'max': 1.0, u'average': 1.0, u'min': 1.0}, u'square_feet': {u'max': u'0.0', u'average': 0.0, u'min': u'0.0'}}
{u'alt_phone': u'', u'fax': u'(519) 752-6855', u'name': u'', u'alt_extension': u'', u'phone': u'519-752-3596', u'extension': u'', u'email': u'[email protected]'}
{u'rates': {u'max': 1350.0, u'average': 1124.5, u'min': 899.0}, u'bedrooms': {u'max': 3, u'average': u'2.25', u'min': 1}, u'bathrooms': {u'max': 2.0, u'average': 1.375, u'min': 1.0}, u'square_feet': {u'max': u'0.0', u'average': 0.0, u'min': u'0.0'}}
You can put all that together to get whatever you need. You can tweak the params, and there are actually more available if you check out the request in Chrome dev tools or Firebug.
Upvotes: 3