Reputation: 87
I am learning python & trying to scrape a website, having 10 listing of properties on each page. I want to extract information from each listing on each page. My code for first 5 pages is as follows :-
import requests
from bs4 import BeautifulSoup
urls = []
for i in range(1,5):
pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
urls.append(pages)
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'details-panel'})
hrefs = [link['href'] for link in links]
Data = []
for urls in hrefs:
pages = requests.get(urls)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
Address = [Address.text.strip() for Address in Address_1]
Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
Area_1 =soup_2.find_all('ul', attrs={'class' :'summaryList'})
Area = [Area.text.strip() for Area in Area_1]
Agency_1=soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
Agency_Name=[Agency_Name.text.strip() for Agency_Name in Agency_1]
Agent_1=soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
Agent_Name=[Agent_Name.text.strip() for Agent_Name in Agent_1]
Data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)
The above code is not working for me. Please let me know the correct coding to achieve the purpose.
Upvotes: 0
Views: 160
Reputation: 46779
You can tell BeautifulSoup to only give you links containing a href
to make your code safer. Also, rather than modifying your URL to include a page number, you could extract the next >
link at the bottom. This would also then automatically stop when the final page has been returned:
import requests
from bs4 import BeautifulSoup
base_url = r"http://www.realcommercial.com.au"
url = base_url + "/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-1?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true"
data = []
for _ in range(10):
print(url)
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
hrefs = [link['href'] for link in soup.find_all('a', attrs={'class' : 'details-panel'}, href=True)]
for href in hrefs:
pages = requests.get(href)
soup_2 = BeautifulSoup(pages.content, 'html.parser')
Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
Address = [Address.text.strip() for Address in Address_1]
Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
Area_1 = soup_2.find_all('ul', attrs={'class' :'summaryList'})
Area = [Area.text.strip() for Area in Area_1]
Agency_1 = soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
Agency_Name = [Agency_Name.text.strip() for Agency_Name in Agency_1]
Agent_1 = soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
Agent_Name = [Agent_Name.text.strip() for Agent_Name in Agent_1]
data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)
# Find next page (if any)
next_button = soup.find('li', class_='rui-pagination-next')
if next_button:
url = base_url + next_button.parent['href']
else:
break
for entry in data:
print(entry)
print("---------")
Upvotes: 1
Reputation: 100
Use headers in the code and use string concatenation instead of .format(i)
The code looks like this
import requests
from bs4 import BeautifulSoup
urls = []
for i in range(1,6):
pages = 'http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-'i+'?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true'
urls.append(pages)
Data = []
for info in urls:
headers = {'User-agent':'Mozilla/5.0'}
page = requests.get(info,headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'details-panel'})
hrefs = [link['href'] for link in links]
for href in hrefs:
pages = requests.get(href)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
Address = [Address.text.strip() for Address in Address_1]
Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
Area_1 =soup_2.find_all('ul', attrs={'class' :'summaryList'})
Area = [Area.text.strip() for Area in Area_1]
Agency_1=soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
Agency_Name=[Agency_Name.text.strip() for Agency_Name in Agency_1]
Agent_1=soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
Agent_Name=[Agent_Name.text.strip() for Agent_Name in Agent_1]
Data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)
print Data
Upvotes: 1
Reputation: 301
There is one problem in your code is that you declared the variable "urls" twice. You need to update the code like below:
import requests
from bs4 import BeautifulSoup
urls = []
for i in range(1,6):
pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
urls.append(pages)
Data = []
for info in urls:
page = requests.get(info)
soup = BeautifulSoup(page.content, 'html.parser')
links = soup.find_all('a', attrs ={'class' :'details-panel'})
hrefs = [link['href'] for link in links]
for href in hrefs:
pages = requests.get(href)
soup_2 =BeautifulSoup(pages.content, 'html.parser')
Address_1 = soup_2.find_all('p', attrs={'class' :'full-address'})
Address = [Address.text.strip() for Address in Address_1]
Date = soup_2.find_all('li', attrs ={'class' :'sold-date'})
Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
Area_1 =soup_2.find_all('ul', attrs={'class' :'summaryList'})
Area = [Area.text.strip() for Area in Area_1]
Agency_1=soup_2.find_all('div', attrs={'class' :'agencyName ellipsis'})
Agency_Name=[Agency_Name.text.strip() for Agency_Name in Agency_1]
Agent_1=soup_2.find_all('div', attrs={'class' :'agentName ellipsis'})
Agent_Name=[Agent_Name.text.strip() for Agent_Name in Agent_1]
Data.append(Sold_Date+Address+Area+Agency_Name+Agent_Name)
print Data
Upvotes: 2