Renu sharma

Reputation: 87

Web scraping to get data from website

I am learning Python and trying to scrape a website that shows 10 property listings per page. I want to extract the information from each listing on each page. My code for the first 5 pages is as follows:

import requests
from bs4 import BeautifulSoup

urls = []
for i in range(1, 5):
    pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
    urls.append(pages)
    for info in urls:
        page = requests.get(info)
        soup = BeautifulSoup(page.content, 'html.parser')
        links = soup.find_all('a', attrs={'class': 'details-panel'})
        hrefs = [link['href'] for link in links]
        Data = []
        for urls in hrefs:
            pages = requests.get(urls)
            soup_2 = BeautifulSoup(pages.content, 'html.parser')
            Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
            Address = [Address.text.strip() for Address in Address_1]
            Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
            Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
            Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
            Area = [Area.text.strip() for Area in Area_1]
            Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
            Agency_Name = [Agency_Name.text.strip() for Agency_Name in Agency_1]
            Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
            Agent_Name = [Agent_Name.text.strip() for Agent_Name in Agent_1]
            Data.append(Sold_Date + Address + Area + Agency_Name + Agent_Name)

The above code is not working for me. Please let me know how to correct it so that it extracts the data from every listing.

Upvotes: 0

Views: 160

Answers (3)

Martin Evans

Reputation: 46779

You can tell BeautifulSoup to only give you links that contain an href, which makes your code safer. Also, rather than modifying your URL to include a page number, you could extract the next > link at the bottom of each page. The scrape would then stop automatically when the final page has been returned:

import requests
from bs4 import BeautifulSoup

base_url = r"http://www.realcommercial.com.au"
url = base_url + "/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-1?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true"
data = []

for _ in range(10):     # upper bound on the number of pages to follow
    print(url)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    # href=True skips any matching <a> tag that has no href attribute
    hrefs = [link['href'] for link in soup.find_all('a', attrs={'class': 'details-panel'}, href=True)]

    for href in hrefs:
        pages = requests.get(href)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [a.text.strip() for a in Address_1]
        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [d.text.strip() for d in Date]
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area = [a.text.strip() for a in Area_1]
        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [a.text.strip() for a in Agency_1]
        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [a.text.strip() for a in Agent_1]

        data.append(Sold_Date + Address + Area + Agency_Name + Agent_Name)

    # Find the next page (if any); stop when there is no "next >" link
    next_button = soup.find('li', class_='rui-pagination-next')

    if next_button:
        url = base_url + next_button.parent['href']
    else:
        break


for entry in data:
    print(entry)
    print("---------")

Upvotes: 1

Lakshmana Deepesh

Reputation: 100

Add a User-agent header to your requests (some sites block the default requests user agent) and build the URL with string concatenation instead of .format(i).

The code then looks like this:

import requests
from bs4 import BeautifulSoup

urls = []
for i in range(1, 6):
    pages = 'http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-' + str(i) + '?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true'
    urls.append(pages)

Data = []
for info in urls:
    headers = {'User-agent': 'Mozilla/5.0'}
    page = requests.get(info, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', attrs={'class': 'details-panel'})
    hrefs = [link['href'] for link in links]

    for href in hrefs:
        pages = requests.get(href)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [a.text.strip() for a in Address_1]
        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [d.text.strip() for d in Date]
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area = [a.text.strip() for a in Area_1]
        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [a.text.strip() for a in Agency_1]
        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [a.text.strip() for a in Agent_1]
        Data.append(Sold_Date + Address + Area + Agency_Name + Agent_Name)

print(Data)
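
If the site still rejects your requests, it can help to check the status code before parsing. A minimal sketch (the base URL stands in here for any of the listing-page URLs built above):

import requests

headers = {'User-agent': 'Mozilla/5.0'}
# The base URL stands in for any of the listing-page URLs built above
response = requests.get('http://www.realcommercial.com.au/', headers=headers)

if response.status_code == 200:
    print('OK:', len(response.content), 'bytes received')
else:
    print('Request failed with status', response.status_code)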

Upvotes: 1

htn

Reputation: 301

One problem in your code is that you use the name "urls" twice: the inner loop "for urls in hrefs" rebinds the very list that the outer loop is iterating over. The page loop is also nested inside the URL-building loop, so the same pages get scraped repeatedly. You need to update the code as below:

import requests
from bs4 import BeautifulSoup

urls = []
for i in range(1, 6):
    pages = "http://www.realcommercial.com.au/sold/property-offices-retail-showrooms+bulky+goods-land+development-hotel+leisure-medical+consulting-other-in-vic/list-{0}?includePropertiesWithin=includesurrounding&activeSort=list-date&autoSuggest=true".format(i)
    urls.append(pages)

Data = []
for info in urls:
    page = requests.get(info)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', attrs={'class': 'details-panel'})
    hrefs = [link['href'] for link in links]

    for href in hrefs:
        pages = requests.get(href)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [a.text.strip() for a in Address_1]
        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [d.text.strip() for d in Date]
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area = [a.text.strip() for a in Area_1]
        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [a.text.strip() for a in Agency_1]
        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [a.text.strip() for a in Agent_1]
        Data.append(Sold_Date + Address + Area + Agency_Name + Agent_Name)

print(Data)

Upvotes: 2
