Reputation: 83
I've seen several solutions for scraping multiple pages from a website, but couldn't make any of them work with my code.
At the moment I have this code, which works to scrape the first page, and I would like to create a loop to scrape all the pages of the website (from page 1 to 5).
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time

options = Options()
options.add_argument("window-size=1400,600")

ua = UserAgent()
user_agent = ua.random
print(user_agent)
options.add_argument(f'user-agent={user_agent}')

driver = webdriver.Chrome('/Users/raduulea/Documents/chromedriver', options=options)
driver.get('https://www.immoweb.be/fr/recherche/immeuble-de-rapport/a-vendre/liege/4000?page=1')

time.sleep(10)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

results = soup.find_all("div", {"class": "result-xl"})

title = []
address = []
price = []
surface = []
desc = []

for result in results:
    title.append(result.find("div", {"class": "title-bar-left"}).get_text().strip())
    address.append(result.find("span", {"class": "result-adress"}).get_text().strip())
    price.append(result.find("div", {"class": "xl-price rangePrice"}).get_text().strip())
    surface.append(result.find("div", {"class": "xl-surface-ch"}).get_text().strip())
    desc.append(result.find("div", {"class": "xl-desc"}).get_text().strip())

df = pd.DataFrame({"Title": title, "Address": address, "Price:": price, "Surface": surface, "Description": desc})
df.to_csv("output.csv")
Upvotes: 4
Views: 15589
Reputation: 31
I haven't checked your code, but based on your question I hope this is useful for readers: you can use driver.back().
urls = ['https://example1.com', 'https://example2.com']
driver = webdriver.Chrome("chromedriver")

driver.get(urls[0])
html_example0 = driver.page_source
driver.back()

driver.get(urls[1])
html_example1 = driver.page_source
In the code above, when I don't use driver.back(), I get the same response from driver.page_source even though the websites are not the same. Using the driver's back() method, I get the desired result. In fact, this method seems to press the browser's back button to return to the previous page.
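Generalized into a loop, the same pattern looks like this (a minimal sketch; the URLs are placeholders):
from selenium import webdriver

urls = ['https://example1.com', 'https://example2.com']
driver = webdriver.Chrome('chromedriver')

pages = []
for url in urls:
    driver.get(url)
    # Keep the rendered HTML of the current page.
    pages.append(driver.page_source)
    # Return to the previous page before the next get(), as described above.
    driver.back()

driver.quit()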
Upvotes: 0
Reputation: 4315
Using the pandas library to save the data into a CSV file:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time

def main():
    options = Options()
    options.add_argument("window-size=1400,600")
    ua = UserAgent()
    user_agent = ua.random
    options.add_argument(f'user-agent={user_agent}')
    driver = webdriver.Chrome("/Users/raduulea/Documents/chromedriver", options=options)
    title = []
    address = []
    price = []
    surface = []
    desc = []
    for i in range(1, 6):
        url = 'https://www.immoweb.be/fr/recherche/immeuble-de-rapport/a-vendre/liege/4000?page=' + str(i)
        driver.get(url)
        scrap_data(driver, title, address, price, surface, desc)
    df = pd.DataFrame({"Title": title, "Address": address, "Price:": price, "Surface": surface, "Description": desc})
    df.to_csv("output.csv")

def scrap_data(driver, title, address, price, surface, desc):
    time.sleep(10)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all("div", {"class": "result-xl"})
    for result in results:
        title.append(result.find("div", {"class": "title-bar-left"}).get_text().strip())
        address.append(result.find("span", {"class": "result-adress"}).get_text().strip())
        price.append(result.find("div", {"class": "xl-price rangePrice"}).get_text().strip())
        surface.append(result.find("div", {"class": "xl-surface-ch"}).get_text().strip())
        desc.append(result.find("div", {"class": "xl-desc"}).get_text().strip())

if __name__ == '__main__':
    main()
CSV file output:
,Title,Address,Price:,Surface,Description
0,Appartement,4000 Liège,279.000 €,180 m² 3 ch.,Appartement au 8ème étage avec vue sur Meuse
1,Immeuble à appartements,4000 Liège (Hognoul),645.000 €,345 m² 6 ch.,Hognoul - immeuble à appartements !
2,Immeuble à appartements,4000 Liège,195.000 €,100 m² 2 ch.,Immeuble à appartement
3,Immeuble à appartements,4000 Liège,320.000 €,229 m² 4 ch.,"immeuble de rapport , 1 commerce et 3 logements"
4,Immeuble mixte,4000 Liège,670.000 €,324 m² 3 ch.,"Immeuble atypique : triplex, centre de bien-être, ch. hôtes"
5,Immeuble à appartements,4000 Liege,635.000 €,360 m² 9 ch.,LIEGE - Immeuble à appartements remis à neuf !
6,Maison,4000 Liège,245.000 €,225 m² 5 ch.,Opportunité - rendement de 6% NET
7,Immeuble mixte,4000 Liège,339.000 €,348 m² 2 ch.,Ensemble de 2 immeubles mixtes
8,Immeuble à appartements,4000 Liege,1.250.000 €,1000 m² 14 ch.,LIEGE - Ensemble immobilier de rapport !
9,Immeuble à appartements,4000 Liège,310.000 €,3 ch.,Maison de commerce et logements Hyper centre
10,Immeuble à appartements,4000 Liège,660.000 €,285 m² 5 ch.,Hôtel de Maître avec terrasse à prox. rue de la Casquette
11,Immeuble à appartements,4000 Liège,1.100.000 €,722 m²,!!!!! OPTION !!!! Immeuble de rapport hyper centre
12,Immeuble mixte,4000 Liège,925.000 €,390 m² 6 ch.,Ensemble immobilier dans le Parc de Cointe
13,Immeuble mixte,4000 Liège,299.000 €,550 m²,batiment commercial de 550m2 environ . dispose d'une vitrine a ...
14,Immeuble mixte,4000 Liège,211.500 €,220 m²,"AGENCE S'ABSTENIR
Lot de 3 bureaux (dont deux loués pour ..."
15,Immeuble mixte,4000 LIEGE,86.000 €,1 ch.,CENTRE-VILLE: MAISON DE COM.+HABITATION
16,Immeuble à appartements,4000 Liège,70.000 €,3 ch.,"Maison de rapport sise à Liège rue Chéri, 49"
17,Immeuble à appartements,4000 LIEGE,399.000 €,6 ch.,"IMM. DE RAPPORT: 2 REZ COM., 4 STUDIOS+1 APPART."
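As a side note, the fixed time.sleep(10) in scrap_data could be replaced with one of Selenium's explicit waits, so the script continues as soon as the results are rendered. A minimal sketch, assuming the listing cards keep the result-xl class used above:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def wait_for_results(driver, timeout=10):
    # Block until at least one result card is present in the DOM,
    # or raise TimeoutException after `timeout` seconds.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.result-xl"))
    )
Calling wait_for_results(driver) at the top of scrap_data instead of time.sleep(10) would let each page proceed as soon as the site has rendered its results.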
Upvotes: 0
Reputation: 33384
Try the code below. It will loop through all pages, not only 5. It checks whether the next button is available; if it is, it moves to the next page, otherwise it breaks out of the while loop.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time

options = Options()
options.add_argument("window-size=1400,600")

ua = UserAgent()
user_agent = ua.random
print(user_agent)
options.add_argument(f'user-agent={user_agent}')

driver = webdriver.Chrome('/Users/raduulea/Documents/chromedriver', options=options)
driver.get('https://www.immoweb.be/fr/recherche/immeuble-de-rapport/a-vendre')
time.sleep(10)

Title = []
address = []
price = []
surface = []
desc = []
page = 2

while True:
    time.sleep(10)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    results = soup.find_all("div", {"class": "result-xl"})
    for result in results:
        Title.append(result.find("div", {"class": "title-bar-left"}).get_text().strip())
        address.append(result.find("span", {"class": "result-adress"}).get_text().strip())
        price.append(result.find("div", {"class": "xl-price rangePrice"}).get_text().strip())
        surface.append(result.find("div", {"class": "xl-surface-ch"}).get_text().strip())
        desc.append(result.find("div", {"class": "xl-desc"}).get_text().strip())
    if len(driver.find_elements_by_css_selector("a.next")) > 0:
        url = "https://www.immoweb.be/fr/recherche/immeuble-de-rapport/a-vendre/?page={}".format(page)
        driver.get(url)
        page += 1
        # It traverses only 5 pages, as you asked; if you want more pages, just comment out the if block below.
        if page > 5:
            break
    else:
        break

df = pd.DataFrame({"Title": Title, "Address": address, "Price:": price, "Surface": surface, "Description": desc})
df.to_csv("output.csv")
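If the numbered ?page= URLs ever become unreliable, an alternative is to follow the href of the next link itself instead of rebuilding the URL by hand. A minimal sketch, assuming a.next carries an absolute href as on the page above (the scraping part is elided):
from selenium import webdriver

driver = webdriver.Chrome('/Users/raduulea/Documents/chromedriver')
driver.get('https://www.immoweb.be/fr/recherche/immeuble-de-rapport/a-vendre')

while True:
    # ... scrape the current page as in the code above ...
    next_links = driver.find_elements_by_css_selector("a.next")
    if not next_links:
        break  # last page reached
    # Navigate directly to wherever the next link points.
    driver.get(next_links[0].get_attribute("href"))

driver.quit()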
Upvotes: 3