Reputation: 4807
I am working with the following code:
import requests, pandas as pd
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # HAR search endpoint; every search filter is carried in the query string.
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    # Download the page body as text.
    page_html = requests.get(url).text
    # Parse with lxml and keep every <div class="mpi_info"> element;
    # note that `soup` therefore holds the list of matches, not the parsed document.
    soup = BeautifulSoup(page_html, "lxml").find_all("div", class_="mpi_info")
I am trying to get all the URLs like "/homedetail/30729-mcguinness-dr-spring-tx-77386/5204857"
into a DataFrame, but I am not sure how to go about this.
Upvotes: 1
Views: 55
Reputation: 906
import requests, pandas as pd
from bs4 import BeautifulSoup
def scraper():
    """Collect the home-detail links from the HAR search-results page.

    Fetches the hard-coded search URL, parses the returned HTML and gathers
    the ``href`` of every ``<a>`` tag whose target starts with
    ``/homedetail/``.

    Returns:
        list[str]: Matching hrefs in document order. Duplicates are kept:
        the same link can occur more than once in the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    # Without a timeout requests can block indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on whichever
    # parser bs4 happens to pick in the current environment.
    soup = BeautifulSoup(response.text, "lxml")
    # href=True restricts the search to anchors that actually carry an href.
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('/homedetail/')
    ]
if __name__ == '__main__':
    # Wrap the scraped links in a single-column DataFrame and print it
    # using pandas' default (possibly truncated) representation.
    print(pd.DataFrame(scraper()))
Output:
0 /homedetail/30729-mcguinness-dr-spring-tx-7738...
1 /homedetail/30729-mcguinness-dr-spring-tx-7738...
2 /homedetail/11-dovecote-spring-tx-77382/5323232
3 /homedetail/11-dovecote-spring-tx-77382/5323232
4 /homedetail/9934-crestwater-cir-magnolia-tx-77...
5 /homedetail/9934-crestwater-cir-magnolia-tx-77...
6 /homedetail/3-shanewood-ct-spring-tx-77382/532...
7 /homedetail/3-shanewood-ct-spring-tx-77382/532...
8 /homedetail/22-solebrook-path-tomball-tx-77375...
9 /homedetail/22-solebrook-path-tomball-tx-77375...
10 /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
11 /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
12 /homedetail/26-freestone-pl-spring-tx-77382/97...
13 /homedetail/26-freestone-pl-spring-tx-77382/97...
14 /homedetail/8557-alford-point-dr-magnolia-tx-7...
15 /homedetail/8557-alford-point-dr-magnolia-tx-7...
16 /homedetail/210-spyglass-park-loop-montgomery-...
17 /homedetail/210-spyglass-park-loop-montgomery-...
18 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
19 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
20 /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
21 /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
22 /homedetail/19-s-garnet-bnd-spring-tx-77382/91...
23 /homedetail/19-s-garnet-bnd-spring-tx-77382/91...
Upvotes: 1
Reputation: 20118
The addresses are under the class "address". Create a list containing all the hrefs
and pass it to a DataFrame:
import requests, pandas as pd
from bs4 import BeautifulSoup
# HAR search endpoint; every search filter is carried in the query string.
url = "https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1"
# Bound the wait and stop on an HTTP error rather than parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
# Only anchors with class "address" are collected; href=True skips any such
# anchor that has no href, which would otherwise raise KeyError on tag["href"].
address_links = [
    tag["href"] for tag in soup.find_all("a", class_="address", href=True)
]
df = pd.DataFrame(address_links)
# to_string() renders the whole frame so long links are not abbreviated.
print(df.to_string())
Output:
0
0 /homedetail/30729-mcguinness-dr-spring-tx-77386/5204857
1 /homedetail/11-dovecote-spring-tx-77382/5323232
2 /homedetail/9934-crestwater-cir-magnolia-tx-77354/11567525
3 /homedetail/3-shanewood-ct-spring-tx-77382/5325643
4 /homedetail/22-solebrook-path-tomball-tx-77375/12190176
5 /homedetail/24-snowdrop-lily-dr-tomball-tx-77375/14652805
6 /homedetail/26-freestone-pl-spring-tx-77382/9791228
7 /homedetail/8557-alford-point-dr-magnolia-tx-77354/13580284?lid=6218369
8 /homedetail/210-spyglass-park-loop-montgomery-tx-77316/12783261
9 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
10 /homedetail/51-lenox-hill-dr-spring-tx-77382/5331072
11 /homedetail/19-s-garnet-bnd-spring-tx-77382/9164284
Upvotes: 1