Alessander

Reputation: 43

Scraping returns the same first-page content for every page

I'm trying to scrape this page

https://www.vivareal.com.br/venda/pernambuco/recife/#onde=BR-Pernambuco-NULL-Recife

I scraped the first page of this website and used Selenium to click through to the next page, but I only ever get the first page's content: when I scrape the second page, it comes back with the same content as the first. I don't know how to fix this, or whether the page has some protection against scraping.

Could someone help me?

from bs4 import BeautifulSoup
import pandas as pd 
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from selenium import webdriver

def scrape():
    cont = [True,True,True,True,False]

    for times in cont:

        if times != True:
            driver = webdriver.Firefox(executable_path = 'geckodriver')
            
            # driver.get() navigates the browser; it returns None, not a response
            driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
            
            sleep(15)

            titles = []
            addresses = []
            areas = []
            rooms = []
            bathes = []
            values = []

            start_time = time()
            request = 0
            
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            imov = soup.find_all('div', class_='property-card__main-content')

            sleep(randint(8,15))

            # Monitor
            request += 1
            elapsed_time = time() - start_time
            print('Request: {}; Frequency: {} requests/s'.format(request, request/elapsed_time))
            clear_output(wait = True)

            # Note: Selenium's driver.get() returns None, so there is no
            # HTTP status code to check here

            # Break the loop if the number of requests is greater than expected
            if request > 72:
                warn('Number of requests was greater than expected.')
                break
                
            for container in imov:
                # Título
                title = container.h2.a.get_text()
                t2 = title.strip()
                titles.append(t2)
                # Título

                # Endereço
                address = container.h2.span.get_text()
                a2 = address.strip()
                addresses.append(a2)
                # Endereço

                # Área
                area = container.li.span.get_text()
                ar2 = area.strip()
                areas.append(ar2)
                # Área

                # Quartos
                room = container.find(class_= "property-card__detail-item property-card__detail-room js-property-detail-rooms")
                room2 = room.find('span', class_="property-card__detail-value js-property-card-value").get_text()
                r2 = room2.strip()
                rooms.append(r2)
                # Quartos

                # Banheiros
                bath = container.find(class_= "property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom")
                bath2 = bath.find('span', class_="property-card__detail-value js-property-card-value").get_text()
                b2 = bath2.strip()
                bathes.append(b2)
                # Banheiros

                # Valor
                value = container.section.div.get_text()
                v2 = value.strip()
                values.append(v2)
                # Valor

                # Dataframe e salvar
                vivareal = pd.DataFrame({
                    "title": titles, 
                    "address": addresses, 
                    "area": areas, 
                    "rooms":rooms,
                    "baths":bathes,
                    "value":values
                    })
                                
                vivareal.to_csv(r'output.csv')

            prox = driver.find_element_by_xpath('//*[@title="Próxima página"]')
            prox.click()
        else:
            print('Done!')
            
scrape()

Upvotes: 0

Views: 160

Answers (2)

Vinícius A. Jorge

Reputation: 738

Your code is not working as expected, even with the fixes suggested by @MarceloBaliu. Here is the code that (finally!) worked for me. I'm sharing it because it may help someone, just as I was helped by this website.

from selenium import webdriver
from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, NoSuchElementException
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import time
import pandas as pd

class ScraperVivaReal:
    wait_time = 5
    
    def __init__(self, url):
        # Initializing the webdriver
        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')
        self.driver = webdriver.Firefox(options=options)
        self.driver.maximize_window()
        self.driver.get(url)
        time.sleep(self.wait_time)
        # Accepting the cookie notice
        WebDriverWait(self.driver, self.wait_time).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="cookie-notifier-cta"]'))).click()
        time.sleep(self.wait_time/2)
        
    def __scrape_page__(self):
        result = []
        
        # Extracting data from the page
        try:
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        except WebDriverException:
            print('Webdriver was manually quit by the user!') # I configured this exception before adding the -headless option to the webdriver
            return result
        
        # Finding property cards containing search results
        div_list = soup.find_all('div', {'class':'property-card__content'})
        
        # Iterating each card
        for d in div_list:

            # Extracting info from card
            title = d.find('span', {'class': 'property-card__title js-cardLink js-card-title'}).get_text().strip()
            complete_address = d.find('span', {'class': 'property-card__address'}).get_text().strip()
            area = d.find('span', {'class': 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area'}).get_text().strip()
            rooms = d.find('li', {'class': 'property-card__detail-item property-card__detail-room js-property-detail-rooms'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
            baths = d.find('li', {'class': 'property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
            garage = d.find('li', {'class': 'property-card__detail-item property-card__detail-garage js-property-detail-garages'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
            
            # Extracting the price
            try:
                price = d.find('div', {'class':'property-card__price js-property-card-prices js-property-card__price-small'}).find('p').get_text().strip()
            except AttributeError:
                price = "N/I"
            
            # Splitting the address
            add_list = re.split(',|-', complete_address)
            add_list = [ item.strip() for item in add_list ]
            if len(add_list) == 2:
                city, st = add_list
                neighborhood = 'N/I'
                address = 'N/I'
                number = 'N/I'
            elif len(add_list) == 3:
                neighborhood, city, st = add_list
                address = 'N/I'
                number = 'N/I'
            elif len(add_list) == 4:
                address, neighborhood, city, st = add_list
                number = 'N/I'
            elif len(add_list) == 5:
                address, number, neighborhood, city, st = add_list
            else:
                # Fallback so every field is defined for unexpected address formats
                address = number = neighborhood = city = st = 'N/I'

            # Adding the result to a dictionary and appending it to the result list
            row = { 'Título': title, 'Endereço': address, 'Número': number, 'Bairro': neighborhood, 'Cidade': city, 'Estado': st, 'Área': area, 'Quartos': rooms, 'Banheiros': baths, 'Vagas': garage, 'Preço': price }
            result.append(row)
        return result
        
    def __next_page__(self):
        try:
            # Finding the "Next Page" button inside the try block, so a missing
            # element is caught below, and trying to click it
            next_element = self.driver.find_element(By.XPATH, '//*[@title="Próxima página"]')
            next_element.click()
            time.sleep(self.wait_time)
            return True
        # Treating some exceptions (element not found and element not clickable)
        except ElementClickInterceptedException:
            print('"Próxima Página" element is not clickable!')
        except NoSuchElementException:
            print('"Próxima Página" element not found!')
        return False
        
    def run(self, output):
        has_next = True
        final_result = []
        # Getting the information!
        while has_next:
            results = self.__scrape_page__()
            final_result.extend(results)
            print('Got {} results! Total Found: {}'.format(len(results), len(final_result)))
            if len(results) == 0:
                break
            has_next = self.__next_page__()
        # Quitting Firefox
        self.driver.quit()
        # Exporting results to CSV
        df = pd.DataFrame(final_result)
        df.to_csv(output, sep=',')

S = ScraperVivaReal('https://www.vivareal.com.br/venda/sp/paulinia/')
S.run('output.csv')
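A side note on the design: the fixed time.sleep(self.wait_time) after each click works, but an explicit wait lets the scraper continue as soon as the page has actually been replaced. A minimal sketch, assuming a driver and Selenium's built-in staleness_of condition (the class name is taken from the cards above; everything else is illustrative):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Keep a handle to a card from the current page before clicking
old_card = driver.find_element(By.CLASS_NAME, 'property-card__content')
driver.find_element(By.XPATH, '//*[@title="Próxima página"]').click()
# Block until the old card is detached from the DOM, i.e. the next page loaded
WebDriverWait(driver, 10).until(EC.staleness_of(old_card))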

Upvotes: 1

MarceloBaliu

Reputation: 230

Although you put the click command at the end of the loop, the next iteration starts by creating a brand-new driver and then requesting the main Viva Real page for Pernambuco again, which throws away the navigation to the next page. Instead, you could create the driver and load the page only once, before the loop:

def scrape():
    cont = [True,True,True,True,False]

    # You create the driver and access the main page only once
    driver = webdriver.Firefox(executable_path = 'geckodriver')
    driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')

    for times in cont:

        if times != True:
            # Wait to load every page
            sleep(15)
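
From there, each iteration re-reads whatever page the previous click navigated to. A sketch of how the rest of the loop could continue under this fix, reusing the question's imports and parsing (only the source of the HTML changes, from the unused return value of driver.get() to driver.page_source):

            # Parse the page the previous click navigated to
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            imov = soup.find_all('div', class_='property-card__main-content')

            # ... extract titles, addresses, areas, rooms, baths, values as before ...

            # Click through to the next page for the following iteration
            prox = driver.find_element_by_xpath('//*[@title="Próxima página"]')
            prox.click()
        else:
            print('Done!')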

Upvotes: 1
