masouduut94

Reputation: 1122

How can I make the PhantomJS webdriver wait until a specific HTML element has loaded, and then return the page.source?

I have developed the code below for a web crawling object.

It takes two dates as inputs. Then it creates a list of dates between those two dates and attaches each one to a webpage URL that contains the weather information of a location. It then converts the HTML tables of data into a DataFrame and after that stores the data as a csv file in storage (the base link is: https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/2019-1-3 and as you can see in this example the date is 2019-1-3):

from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from furl import furl
import os
import time

class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)

        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))

        return dates

    def create_link(self, attachment):
        # Attach dates to base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()

        return f.url

    def open_link(self, link):
        # Opens link and visits page and returns html source code of page
        self.driver.get(link)
        html = self.driver.page_source

        return html

    def table_to_df(self, html):
        # Finds table of weather data and converts it into pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table",{"class":"tablesaw-sortable"})

        dfs = pd.read_html(str(table))
        df = dfs[0]

        return df

    def to_csv(self, name, df):
        # Save the dataframe as csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path,filename), index=False)

This is the way I want to use the WebCrawler object:

date1 = date(2018, 12, 29)
date2 = date(2019, 1, 1)

# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()

for day in dates:
    print('**************************')
    print('PROCESSING : ', day)
    link = crawler.create_link(day)
    print('WAITING... ')
    time.sleep(3)
    print('VISIT WEBPAGE ... ')
    html = crawler.open_link(link)
    print('DATA RETRIEVED ... ')
    df = crawler.table_to_df(html)
    print(df.head(3))
    crawler.to_csv(day, df)
    print('DATA SAVED ...')

The problem is that the first iteration of the loop runs perfectly, but the second one stops with an error that says No tables were found (it occurs in the table = soup.find("table",{"class":"tablesaw-sortable"}) line). That happens because the page source is returned by WebCrawler.open_link before the webpage has fully loaded its contents, including the table (containing the weather information). There is also a chance that the website rejects the request because it is making the servers too busy.

Is there any way to build a loop that keeps trying to open the link until it finds the table, or at least waits until the table has loaded and then returns the table?

Upvotes: 1

Views: 234

Answers (2)

masouduut94

Reputation: 1122

I rewrote the code using the https://stackoverflow.com/a/26567563/4159473 solution suggested by @mildmelon, and I also added some delays between sending each request to the server and asking for the page source:

from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
from furl import furl
import os
import time

class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.delay_for_page = 7
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)

        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))

        return dates

    def create_link(self, attachment):
        # Attach dates to base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()

        return f.url

    def open_link(self, link):
        # Opens the link and waits until the weather table is present;
        # raises TimeoutException if it does not appear within the delay
        self.driver.get(link)
        WebDriverWait(self.driver, self.delay_for_page).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable'))
        )


    def table_to_df(self, html):
        # Finds table of weather data and converts it into pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table",{"class":"tablesaw-sortable"})

        dfs = pd.read_html(str(table))
        df = dfs[0]

        return df

    def to_csv(self, name, df):
        # Save the dataframe as csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path,filename), index=False)

date1 = date(2019, 2, 1)
date2 = date(2019, 3, 5)


# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()
for day in dates:
    print('**************************')
    print('DATE : ', day)
    link = crawler.create_link(day)
    print('WAITING ....')
    print('')
    time.sleep(12)
    print('OPENING LINK ... ')

    try:
        crawler.open_link(link)
        html = crawler.driver.page_source
        print( "DATA IS FETCHED")
        df = crawler.table_to_df(html)
        print(df.head(3))
        crawler.to_csv(day, df)
        print('DATA SAVED ...')
    except TimeoutException:
        print( "NOT FETCHED ...!!!")

The weather information is now fetched without problems. I guess the delays between each request resulted in better performance. The WebDriverWait(self.driver, self.delay_for_page).until(EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable'))) line has also improved speed.
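For the keep-trying part of the question, the open_link call can also be wrapped in a small retry loop. A minimal sketch (the retry count and back-off delay here are illustrative choices, not tuned values):

for attempt in range(3):  # illustrative retry budget
    try:
        crawler.open_link(link)
        break  # the table appeared, stop retrying
    except TimeoutException:
        print('TIMEOUT, RETRYING ...')
        time.sleep(5)  # back off a little before the next attempt
else:
    print('NOT FETCHED ...!!!')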

Upvotes: 1

mildmelon

Reputation: 83

You can have Selenium wait for a specific element. In your case it will be the table with the class name tablesaw-sortable. I highly recommend that you use CSS selectors to find this element, as it's fast and less error prone than getting all table elements.

Here is the CSS selector, premade for you: table.tablesaw-sortable. Set Selenium to wait until that element has loaded.
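For example, a minimal sketch of that wait, assuming a driver instance is already in scope (the 10-second timeout is an arbitrary choice):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 seconds until the weather table is present;
# raises TimeoutException if it never appears
table = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'table.tablesaw-sortable'))
)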

Source: https://stackoverflow.com/a/26567563/4159473

Upvotes: 1
