Reputation: 43
I'm trying to scrape this page
https://www.vivareal.com.br/venda/pernambuco/recife/#onde=BR-Pernambuco-NULL-Recife
I scraped the first page this website and click with selenium to next page, but I only can get the first page content, when I scrape the second, it came the same content from first page. I dunno how to fix this or if the webpage has some protection to scraping.
Could someone help me?
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from selenium import webdriver
def scrape():
cont = [True,True,True,True,False]
for times in cont:
if times != True:
driver = webdriver.Firefox(executable_path = 'geckodriver')
page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
sleep(15)
titles = []
addresses = []
areas = []
rooms = []
bathes = []
values = []
start_time = time()
request = 0
soup = BeautifulSoup(page,'html.parser')
imov = soup.find_all('div', class_='property-card__main-content')
sleep(randint(8,15))
# Monitor
request += 1
elapsed_time = time() - start_time
print('Request: {}; Frequency: {} requests/s'.format(request, request/elapsed_time))
clear_output(wait = True)
# Throw a warning for non-200 status codes
if page.status_code != 200:
warn('Request: {}; Status code: {}'.format(requests, page.status_code))
# Break the loop if the number of requests is greater than expected
if request > 72:
warn('Number of requests was greater than expected.')
break
for container in imov:
# Título
title = container.h2.a.get_text()
t2 = title.strip()
titles.append(t2)
# Título
# Endereço
address = container.h2.span.get_text()
a2 = address.strip()
addresses.append(a2)
# Endereço
# Área
area = container.li.span.get_text()
ar2 = area.strip()
areas.append(ar2)
# Área
# Quartos
room = container.find(class_= "property-card__detail-item property-card__detail-room js-property-detail-rooms")
room2 = room.find('span', class_="property-card__detail-value js-property-card-value").get_text()
r2 = room2.strip()
rooms.append(r2)
# Quartos
# Banheiros
bath = container.find(class_= "property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom")
bath2 = bath.find('span', class_="property-card__detail-value js-property-card-value").get_text()
b2 = bath2.strip()
bathes.append(b2)
# Banheiros
# Valor
value = container.section.div.get_text()
v2 = value.strip()
values.append(v2)
# Valor
# Dataframe e salvar
vivareal = pd.DataFrame({
"title": titles,
"address": addresses,
"area": areas,
"rooms":rooms,
"baths":bathes,
"value":values
})
vivareal.to_csv(r'output.csv')
prox = driver.find_element_by_xpath('//*[@title="Próxima página"]')
prox.click()
else:
print('Done!')
scrape()```
Upvotes: 0
Views: 160
Reputation: 738
Your code is not working as espected, even with the fixes provided by @MarceloBaliu. Here is my code that (finally!) worked for me. I'm sharing because it can help someone, like I was helped by this website.
from selenium import webdriver
from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
class ScraperVivaReal:
wait_time = 5
def __init__(self, url):
# Initializing the webdriver
options = webdriver.FirefoxOptions()
options.add_argument('-headless')
self.driver = webdriver.Firefox(options=options)
self.driver.maximize_window()
self.driver.get(url)
time.sleep(self.wait_time)
# Handling cookies acception
WebDriverWait(self.driver, self.wait_time).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="cookie-notifier-cta"]'))).click()
time.sleep(self.wait_time/2)
def __scrape_page__(self):
result = []
# Extracting data from the page
try:
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
except WebDriverException:
print('Webdriver was manually quit by the user!') # I configure this exception before adding the option -headless to webdriver
return result
# Finding property cards containing search results
div_list = soup.find_all('div', {'class':'property-card__content'})
# Iterating each card
for d in div_list:
# Extracting info from card
title = d.find('span', {'class': 'property-card__title js-cardLink js-card-title'}).get_text().strip()
complete_address = d.find('span', {'class': 'property-card__address'}).get_text().strip()
area = d.find('span', {'class': 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area'}).get_text().strip()
rooms = d.find('li', {'class': 'property-card__detail-item property-card__detail-room js-property-detail-rooms'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
baths = d.find('li', {'class': 'property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
garage = d.find('li', {'class': 'property-card__detail-item property-card__detail-garage js-property-detail-garages'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
# Extracting the price
try:
price = d.find('div', {'class':'property-card__price js-property-card-prices js-property-card__price-small'}).find('p').get_text().strip()
except AttributeError:
price = "N/I"
# Splitting the address
add_list = re.split(',|-', complete_address)
add_list = [ item.strip() for item in add_list ]
if len(add_list) == 2:
city, st = add_list
neibhood = 'N/I'
address = 'N/I'
number = 'N/I'
if len(add_list) == 3:
neibhood, city, st = add_list
address = 'N/I'
number = 'N/I'
if len(add_list) == 4:
address, neibhood, city, st = add_list
number = 'N/I'
elif len(add_list) == 5:
address, number, neibhood, city, st = add_list
# Adding the result into a dicionary and appending the dict to a result list
row = { 'Título': title, 'Endereço': address, 'Número': number, 'Bairro': neibhood, 'Cidade': city, 'Estado': st, 'Área': area, 'Quartos': rooms, 'Banheiros': baths, 'Vagas': garage, 'Preço': price }
result.append(row)
return result
def __next_page__(self):
# Finding the "Next Page" button element
next_element = self.driver.find_element_by_xpath('//*[@title="Próxima página"]')
try:
# Trying to click it
next_element.click()
time.sleep(self.wait_time)
return True
# Treating some exceptions (element not found and element not clickable)
except ElementClickInterceptedException:
print('"Próxima Página" element is not clickable!')
except NoSuchElementException:
print('"Próxima Página" element not found!')
return False
def run(self, output):
has_next = True
final_result = []
# Getting the information!
while has_next:
results = self.__scrape_page__()
final_result.extend(results)
print('Got {} results! Total Found: {}'.format(len(results), len(final_result)))
if len(results) == 0:
break
has_next = self.__next_page__()
# Quitting Firefox
self.driver.quit()
# Exporting results to CSV
df = pd.DataFrame(final_result)
df.to_csv(output, sep=',')
S = ScraperVivaReal('https://www.vivareal.com.br/venda/sp/paulinia/')
S.run('output.csv')
Upvotes: 1
Reputation: 230
Although you put the click command at the end, when it goes to the next loop, the first command is to create a new driver and then is called the command to get the main page of Viva Real to Pernambuco. This is unwanted. Instead of this you could do:
def scrape():
cont = [True,True,True,True,False]
# You create the driver and access the main page only once
driver = webdriver.Firefox(executable_path = 'geckodriver')
page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
for times in cont:
if times != True:
# Wait to load every page
sleep(15)
Upvotes: 1