Renu sharma
Renu sharma

Reputation: 87

webscraping through beautifulsoup python

I am trying to scrape data from a website that has many pages, each with 10 listings. On each listing's page there is a hyperlink showing the contact number, but only its first few digits. The entire number becomes visible once I click on that hyperlink. I am unable to figure out how to include these full numbers in my data. Below is my code:

# Parse the listing page that was already fetched into `pages`.
soup_2 = BeautifulSoup(pages.content, 'html.parser')
# Every anchor whose href is just "#"; the contact-number link is one of them.
con_lin = soup_2.find_all('a', attrs={'href': '#'})
# Collect the visible text of every matched link. Building the list in one
# pass records all of them (not only the last one) and also works when no
# link matches at all.
Contact_Number = [number.text for number in con_lin]

P.S : I am using Python3

Any help/input would be highly appreciated

Thanks

Thanks for the reply, My entire code is :

import requests 
from bs4 import BeautifulSoup

# Search-results URL for the sold listings; {0} is the results page number.
search_url = "https://www.realcommercial.com.au/sold/in-luddenham%2c+nsw+2745%3bbadgerys+creek%2c+nsw+2555%3bkemps+creek%2c+nsw+2178%3bmount+vernon%2c+nsw+2178%3bcecil+park%2c+nsw+2178%3bgreendale%2c+nsw+2550%3baustral%2c+nsw+2179%3bwallacia%2c+nsw+2745%3berskine+park%2c+nsw+2759%3bbringelly%2c+nsw+2556%3brossmore%2c+nsw+2557/list-{0}?activeSort=date-desc&autoSuggest=true&includePropertiesWithin=includesurrounding&minFloorArea=10000"

# Result pages 1 through 4.
urls = [search_url.format(page_no) for page_no in range(1, 5)]

# One entry per listing; each entry is a list of the scraped field lists.
Data = []
for info in urls:
    # Fetch one search-results page and collect the links to its listings.
    page = requests.get(info)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', attrs={'class': 'details-panel'})
    hrefs = [link['href'] for link in links]

    for href in hrefs:
        # Fetch and parse the individual listing page.
        pages = requests.get(href)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')

        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [tag.text.strip() for tag in Address_1]

        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [tag.text.strip() for tag in Date]

        # Bind the stripped summary texts to `Area` explicitly: a name used
        # only as a comprehension variable is not visible afterwards in
        # Python 3, so appending it to the entry raised NameError.
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area = [tag.text.strip() for tag in Area_1]

        # `+=` with a tag extends the list with that tag's direct children.
        Prop_Type = soup_2.find('div', attrs={'class': 'propTypes ellipsis'}).findChildren()
        Property_Type = []
        for span in Prop_Type:
            Property_Type += span

        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [tag.text.strip() for tag in Agency_1]

        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [tag.text.strip() for tag in Agent_1]

        # Text of every link whose href is "#". All matches are recorded, and
        # a page without such links yields an empty list instead of failing.
        con_lin = soup_2.find_all('a', attrs={'href': '#'})
        Contact_Number = [number.text for number in con_lin]

        entry = [
            Address,
            Sold_Date,
            Area,
            Property_Type,
            Agency_Name,
            Agent_Name,
            Contact_Number,
        ]
        Data.append(entry)

@Andersson: the edit you suggested didn't work. I am getting the output below:

[[['Kemps Creek, address available on request'],
  ['Thu 01-Sep-16'],
  ['Land Area 10.00ha (24.71 acres) (approx)', 'Floor Area 10,000 m²'],
  ['Land/Development', 'Commercial Farming'],
  ['CBRE - Western Sydney'],
  ['Jason Edge'],
  ['MyCommercial',
   'Previous',
   'Next',
   'Map',
   '0410 6...',
   ' Save Property',
   'Get Email Alerts',
   'Real Estate Directory']],
 [['320 - 340 Badgerys Creek Road, Badgerys Creek, NSW 2555'],
  ['Mon 22-Apr-13'],
  ['Land Area 10.00ha (24.71 acres) (approx)', 'Floor Area 10,000 m²'],
  ['Land/Development', 'Industrial/Warehouse', 'Retail'],
  ['CBRE - Western Sydney'],
  ['Frank Oliveri'],
  ['MyCommercial',
   'Previous',
   'Next',
   'Map',
   '+61 41...',
   'Street View',
   ' Save Property',
   'Get Email Alerts',
   'Real Estate Directory']],

Upvotes: 0

Views: 125

Answers (2)

SIM
SIM

Reputation: 22440

@Shahin Thank you so much for the help. Could you please explain the use of .agentCont in the code, and whether there is any specific reason for the space between .agentCont and .agentPhone?

I have implemented your suggestion as below & it is working perfectly fine:

import requests 
from bs4 import BeautifulSoup

# Template for the sold-listings search; {0} receives the results page number.
results_page_template = "https://www.realcommercial.com.au/sold/in-luddenham%2c+nsw+2745%3bbadgerys+creek%2c+nsw+2555%3bkemps+creek%2c+nsw+2178%3bmount+vernon%2c+nsw+2178%3bcecil+park%2c+nsw+2178%3bgreendale%2c+nsw+2550%3baustral%2c+nsw+2179%3bwallacia%2c+nsw+2745%3berskine+park%2c+nsw+2759%3bbringelly%2c+nsw+2556%3brossmore%2c+nsw+2557/list-{0}?activeSort=date-desc&autoSuggest=true&includePropertiesWithin=includesurrounding&minFloorArea=10000"

# One URL for each of the result pages 1-4.
urls = [results_page_template.format(number) for number in range(1, 5)]

# Accumulates one list of scraped fields per listing.
Data = []
for results_url in urls:
    # Parse a search-results page and gather the links to its listings.
    results_soup = BeautifulSoup(requests.get(results_url).content, 'html.parser')
    listing_urls = [
        anchor['href']
        for anchor in results_soup.find_all('a', attrs={'class': 'details-panel'})
    ]

    for listing_url in listing_urls:
        listing = BeautifulSoup(requests.get(listing_url).content, 'html.parser')

        address = [
            tag.text.strip()
            for tag in listing.find_all('p', attrs={'class': 'full-address'})
        ]
        sold_date = [
            tag.text.strip()
            for tag in listing.find_all('li', attrs={'class': 'sold-date'})
        ]

        # Split the summary texts into land-area and floor-area items.
        summaries = [
            tag.text.strip()
            for tag in listing.find_all('ul', attrs={'class': 'summaryList'})
        ]
        land_area = [text for text in summaries if text.startswith('Land Area')]
        floor_area = [text for text in summaries if text.startswith('Floor Area')]

        # Flatten the direct children of every element found under the
        # property-types container into a single list.
        type_tags = listing.find('div', attrs={'class': 'propTypes ellipsis'}).findChildren()
        property_type = [child for tag in type_tags for child in tag]

        agency_name = [
            tag.text.strip()
            for tag in listing.find_all('div', attrs={'class': 'agencyName ellipsis'})
        ]
        agent_name = [
            tag.text.strip()
            for tag in listing.find_all('div', attrs={'class': 'agentName ellipsis'})
        ]

        # The numbers are read from the data-value attribute of the matched
        # elements; one value per match.
        phone = [
            reveal['data-value']
            for reveal in listing.select(".agentCont .agentPhone [rel='showContactNumber']")
        ]

        # The phone list is stored wrapped in an outer one-element list.
        Data.append([
            address,
            sold_date,
            land_area,
            floor_area,
            property_type,
            agency_name,
            agent_name,
            [phone],
        ])

Thanks a lot!!

I have taken three different links. The first one contains the number of one agent, the second contains the numbers of two agents, and the last one contains the numbers of three agents. To deal with them all at once, see the script below:

import requests
from bs4 import BeautifulSoup

# Sample listing pages to scrape agent phone numbers from.
main_links = (
    "https://www.realcommercial.com.au/property-land+development-nsw-badgerys+creek-500502195",
    "https://www.realcommercial.com.au/property-land+development-nsw-austral-500468083",
    "https://www.realcommercial.com.au/property-industrial+warehouse-nsw-minchinbury-502343342",
)

def phone_parser(link):
    """Print the agent phone numbers found on the listing page at *link*.

    The page is downloaded and parsed with the lxml parser. Each element
    matching ``.agentCont .agentPhone [rel='showContactNumber']`` contributes
    its ``data-value`` attribute; the values are printed on one line,
    separated by single spaces.
    """
    response = requests.get(link)
    soup = BeautifulSoup(response.text, "lxml")
    reveal_links = soup.select(".agentCont .agentPhone [rel='showContactNumber']")
    numbers = []
    for reveal in reveal_links:
        numbers.append(reveal['data-value'])
    print(' '.join(numbers))

if __name__ == '__main__':
    # Process the listing pages in order, one phone_parser call per link.
    for listing_url in main_links:
        phone_parser(listing_url)

Result:

+61 419 018 356
0412 549 766 0407 506 010
+61 414 836 817 +61 401 146 051 +61 412 992 830

Upvotes: 0

SIM
SIM

Reputation: 22440

Try the code below. The link ending with # is misleading: it doesn't reveal the number if you make another request to it. The phone numbers are actually stored in the data-value attribute, and you can get them like this:

import requests
from bs4 import BeautifulSoup

# Search-results URL to start from (the `list-1` page of the sold listings).
main_link = "https://www.realcommercial.com.au/sold/in-luddenham%2c+nsw+2745%3bbadgerys+creek%2c+nsw+2555%3bkemps+creek%2c+nsw+2178%3bmount+vernon%2c+nsw+2178%3bcecil+park%2c+nsw+2178%3bgreendale%2c+nsw+2550%3baustral%2c+nsw+2179%3bwallacia%2c+nsw+2745%3berskine+park%2c+nsw+2759%3bbringelly%2c+nsw+2556%3brossmore%2c+nsw+2557/list-1?activeSort=date-desc&autoSuggest=true&includePropertiesWithin=includesurrounding&minFloorArea=10000"

def phone_parser(main_link):
    """Visit every listing linked from the search-results page *main_link*.

    The results page is downloaded and parsed with the lxml parser; the
    ``href`` of each ``.listing-card .details-panel`` element is passed to
    ``target_page``.
    """
    results_html = requests.get(main_link).text
    soup = BeautifulSoup(results_html, "lxml")
    listing_anchors = soup.select(".listing-card .details-panel")
    for anchor in listing_anchors:
        target_page(anchor['href'])

def target_page(link):
    """Print the first agent phone number found on the listing page at *link*.

    The number is read from the ``data-value`` attribute of the first element
    matching ``.agentPhone [rel='showContactNumber']``. A page with no such
    element is skipped without printing anything, so one listing without a
    phone link does not abort the whole crawl with an IndexError.
    """
    broth = BeautifulSoup(requests.get(link).text, "lxml")
    matches = broth.select(".agentPhone [rel='showContactNumber']")
    if not matches:
        # Nothing to report for this listing.
        return
    # matches[0].get('data-value') is an equivalent, more lenient spelling.
    phone = matches[0]['data-value']
    print(phone)

# Start the scrape from the search-results page stored in main_link.
phone_parser(main_link)

Partial results:

0410 687 866
+61 419 018 356
0407 506 010

Thank you so much Andersson, I have implemented your suggestion as below:

import requests 
from bs4 import BeautifulSoup

# URL pattern of the sold-listings search; {0} is replaced by the page number.
page_url_pattern = "https://www.realcommercial.com.au/sold/in-luddenham%2c+nsw+2745%3bbadgerys+creek%2c+nsw+2555%3bkemps+creek%2c+nsw+2178%3bmount+vernon%2c+nsw+2178%3bcecil+park%2c+nsw+2178%3bgreendale%2c+nsw+2550%3baustral%2c+nsw+2179%3bwallacia%2c+nsw+2745%3berskine+park%2c+nsw+2759%3bbringelly%2c+nsw+2556%3brossmore%2c+nsw+2557/list-{0}?activeSort=date-desc&autoSuggest=true&includePropertiesWithin=includesurrounding&minFloorArea=10000"

# URLs of result pages 1, 2, 3 and 4.
urls = [page_url_pattern.format(index) for index in range(1, 5)]

# One entry per listing; each entry is a list of the scraped field lists.
Data = []
for info in urls:
    # Fetch one search-results page and collect the links to its listings.
    page = requests.get(info)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', attrs={'class': 'details-panel'})
    hrefs = [link['href'] for link in links]

    for href in hrefs:
        entry = []
        # Fetch and parse the individual listing page.
        pages = requests.get(href)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')

        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [tag.text.strip() for tag in Address_1]

        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [tag.text.strip() for tag in Date]

        # Split the summary texts into land-area and floor-area items.
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area_2 = [tag.text.strip() for tag in Area_1]
        Land_Area = [x for x in Area_2 if x.startswith('Land Area')]
        Floor_Area = [y for y in Area_2 if y.startswith('Floor Area')]

        # `+=` with a tag extends the list with that tag's direct children.
        Prop_Type = soup_2.find('div', attrs={'class': 'propTypes ellipsis'}).findChildren()
        Property_Type = []
        for span in Prop_Type:
            Property_Type += span

        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [tag.text.strip() for tag in Agency_1]

        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [tag.text.strip() for tag in Agent_1]

        # The number is read from the data-value attribute of the first
        # matching element. A listing with no match gets an empty contact
        # list instead of stopping the whole run with an IndexError.
        phone_links = soup_2.select(".agentPhone [rel='showContactNumber']")
        Contacts = [phone_links[0]['data-value']] if phone_links else []

        entry.append(Address)
        entry.append(Sold_Date)
        entry.append(Land_Area)
        entry.append(Floor_Area)
        entry.append(Property_Type)
        entry.append(Agency_Name)
        entry.append(Agent_Name)
        entry.append(Contacts)
        Data.append(entry)

Thanks a lot for your help !!!

Upvotes: 1

Related Questions