Thai Nguyen
Thai Nguyen

Reputation: 1

Scraping data from multiple tooltips using Python and Selenium

I am trying to scrape the Hydrogen Sulfide data from this website using Python and Selenium. What I have been struggling with so far is that I do not know how to get the data for each tooltip (site id, site name, date, value, unit, etc.). As you can see, there are seven monitoring points, labeled A to G, and each point has its own data. I have done a lot of research and am still stuck. I put together the following code to scrape the data for a specific date, but it raises errors. Please see my code below.

from selenium import webdriver
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Imported here so the snippet stays self-contained: these are the failures the
# tooltip look-ups below are expected to raise when a tooltip is missing or is
# re-rendered while it is being read.
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)

driver = webdriver.Edge(EdgeChromiumDriverManager(log_level=20).install())
driver.maximize_window()
driver.get("https://marathonlosangelesrefineryfencelinemonitoring.com/index.html")

# Every interaction goes through an explicit wait instead of an immediate
# look-up, so an element that is not rendered yet is polled for (up to 10 s)
# rather than failing straight away.
wait = WebDriverWait(driver, 10)

# Navigate to monitors
wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='nav-link-text']"))).click()

# Navigate to dropdown button
wait.until(EC.element_to_be_clickable((By.XPATH, "//i[@class='arrow-down parameter-arrow']"))).click()

# Select Hydrogen Sulfide (12th entry of the parameter dropdown) and click
wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[@class='dropdown-menu' and @role='menu' and @aria-labelledby='ParameterDropdown']//li[12]"))).click()

# Wait until the map markers are visible before hovering over them
markers = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='leaflet-pane leaflet-marker-pane']//div[contains(@class, 'leaflet-marker-icon')]")))

# Shorter wait used only for the tooltip that should appear after each hover
tooltip_wait = WebDriverWait(driver, 2)

res = []
for marker in markers:
    ActionChains(driver).move_to_element(marker).perform()
    try:
        site_id = tooltip_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".LAR-tooltip-site-id > p")))
        site_name = driver.find_element(By.CSS_SELECTOR, ".LAR-tooltip-site-name")
        date = driver.find_element(By.CSS_SELECTOR, ".LAR-tooltip-localtime")
        value = driver.find_element(By.CSS_SELECTOR, ".LAR-tooltip-data-value")
        unit = driver.find_element(By.CSS_SELECTOR, ".LAR-tooltip-data-unit")
        para_mdl = driver.find_element(By.CSS_SELECTOR, ".tooltip-parameter-mdl")
        res.append((site_id.text, site_name.text, date.text, value.text, unit.text, para_mdl.text))
    except (NoSuchElementException, StaleElementReferenceException, TimeoutException):
        # Best effort: a marker whose tooltip cannot be read is skipped, but
        # unrelated errors (and Ctrl+C) are no longer swallowed.
        continue

I would really appreciate it if anyone could help me resolve this issue. Also, I want to scrape data over a time window (let's say from Aug 1st, 2021 to Jan 1st, 2022) by building on the code above, so any feedback is greatly appreciated.

Upvotes: 0

Views: 812

Answers (3)

Thai Nguyen
Thai Nguyen

Reputation: 1

@AnandGautam, I realized that whenever I wanted to scrape the data for a whole month (let's say Sep 2021), everything went fine until I reached the 29th, where Aug 29th and Sep 29th both appear on the same calendar. So, to make the XPath unique, I modified it a bit by adding an @data-title= condition. But I encountered some errors. I verified the XPath and found it was valid, so I still have no idea why the errors occurred. Please see the code below.

from selenium import webdriver
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Module-level Edge session used by every function below. The result of
# EdgeChromiumDriverManager(...).install() is handed to webdriver.Edge as its
# first argument.
# NOTE(review): log_level=20 presumably corresponds to logging.INFO - confirm.
driver = webdriver.Edge(EdgeChromiumDriverManager(log_level=20).install())
driver.maximize_window()
def h2s_selection():
    """Load the monitoring site and switch the map to Hydrogen Sulfide.

    Clicks, in order, the monitors navigation link, the parameter dropdown
    arrow and the 12th dropdown entry (Hydrogen Sulfide), waiting up to 10
    seconds for each to become clickable, then waits for the map markers to
    become visible.
    """
    wait = WebDriverWait(driver, 10)
    click_targets = (
        # Navigate to monitors
        "//div[@class='nav-link-text']",
        # Navigate to dropdown button
        "//i[@class='arrow-down parameter-arrow']",
        # Select Hydrogen Sulfide and click
        "//ul[@class='dropdown-menu' and @role='menu' and @aria-labelledby='ParameterDropdown']//li[12]",
    )

    driver.get("https://marathonlosangelesrefineryfencelinemonitoring.com/index.html")
    for xpath in click_targets:
        wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()

    # Block until the marker icons are visible.
    wait.until(
        EC.visibility_of_all_elements_located(
            (By.XPATH, "//div[@class='leaflet-pane leaflet-marker-pane']//div[contains(@class, 'leaflet-marker-icon')]")
        )
    )

def month_data(req_month, req_year, data):
    """Scrape the marker tooltips for selected days of one month.

    Opens the date picker, steps back (at most 11 times) until the calendar
    header equals "<req_month> <req_year>", then for each requested day
    selects it, applies it, calls ``tooltips()`` and reopens the picker.

    Args:
        req_month: Month text as it appears in the calendar header, e.g. 'Sep'.
        req_year: Year text as it appears in the calendar header, e.g. '2021'.
        data: Mapping of day-of-month text to the ``data-title`` attribute of
            the calendar cell to click, e.g. ``{'29': 'r4c3'}``. The
            ``data-title`` distinguishes cells that show the same day number.
    """
    date_arrow = (By.CSS_SELECTOR, ".arrow-down.date-arrow")

    driver.find_element(*date_arrow).click()
    req_timeline = req_month + " " + req_year
    print(f"Timeline Selected is: {req_timeline}")
    for _ in range(11):
        month = driver.find_element(By.XPATH, "//th[@class='month']").text
        if month == req_timeline:
            break
        driver.find_element(By.XPATH, "//th[@class='prev available']").click()

    for day, cell_title in data.items():
        time.sleep(5)
        # The data-title value must be a quoted XPath string literal. Unquoted,
        # XPath reads r4c3 as the name of a child element, the comparison is
        # made against an empty node-set and the predicate never matches.
        each_date = driver.find_element(
            By.XPATH,
            f"//*[@class='table-condensed']//td[text()={day} and @data-title='{cell_title}']",
        )
        each_date.click()
        driver.find_element(By.XPATH, "//*[text()='Apply']").click()
        time.sleep(10)  # fixed pause after Apply before the tooltips are read
        tooltips()
        time.sleep(5)
        # Reopen the date picker for the next day.
        driver.find_element(*date_arrow).click()

def tooltips():
    """Hover over every map marker and collect the text of its tooltip.

    The collected rows are printed and also returned so that callers can
    accumulate them across dates.

    Returns:
        A list of ``(site_id, site_name, local_time, value, unit, mdl)``
        tuples of strings, one per marker whose tooltip could be read.
        Markers whose tooltip is missing or goes stale are skipped.
    """
    # Function-scope import keeps this function self-contained.
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    # CSS selectors of the tooltip fields, in output order.
    field_selectors = (
        ".LAR-tooltip-site-id > p",
        ".LAR-tooltip-site-name",
        ".LAR-tooltip-localtime",
        ".LAR-tooltip-data-value",
        ".LAR-tooltip-data-unit",
        ".tooltip-parameter-mdl",
    )

    res = []
    markers = driver.find_elements(By.XPATH, "//div[@class='leaflet-pane leaflet-marker-pane']//div[contains(@class, 'leaflet-marker-icon')]")
    for marker in markers:
        ActionChains(driver).move_to_element(marker).perform()
        time.sleep(1)  # pause between the hover and reading the tooltip
        try:
            row = tuple(
                driver.find_element(By.CSS_SELECTOR, selector).text
                for selector in field_selectors
            )
        except (NoSuchElementException, StaleElementReferenceException):
            # Best effort: skip this marker, but do not hide unrelated errors
            # or Ctrl+C the way a bare ``except`` would.
            continue
        res.append(row)
    print(res)
    return res


if __name__ == "__main__":
    # Select Hydrogen Sulfide first, then scrape the two requested days.
    # Each key is a day of the month; each value is the data-title of the
    # calendar cell that month_data() should click for that day.
    h2s_selection()
    month_data(
        req_month='Sep',
        req_year='2021',
        data={'29': 'r4c3', '30': 'r4c4'},
    )

I would really appreciate it if you could give me some pointers/feedback on how to resolve that. Thank you!

Upvotes: 0

Anand Gautam
Anand Gautam

Reputation: 2101

@ThaiNguyen, I am adding another answer to preserve the earlier one. I tried some crude ways to get the work done and succeeded after a lot of attempts, but take it with a pinch of salt, as I iterated over only 3 dates in Aug. The refactored code is pasted below, but before you look at it, let me explain what I ran into so that you can watch out for it. I had to add a lot of sleeps so that the DOM could settle down after each action (and, as you know, time.sleep is pretty unreliable when it comes to async). Even with the waits, I still see the code fail on stale elements, and adding more sleep time helped me work around them (temporarily). Another thing, which to me is a big concern: even though this code succeeded in fetching the results, I cannot assure you that it will do so for all the dates in Aug (let alone for all the required months), because the code behaves in a pretty flaky way against the rendered DOM. I don't want to blame the code at this point (with the limited knowledge that I have of Selenium), but the DOM is heavily async, if I am not wrong. So, with this code you cannot hope to get everything in one shot; rather, you may have to invest your time either in refactoring and improving the code or in fetching the data in chunks by running it multiple times for a few dates at a time for each month, which is very frustrating given the flakiness.

from selenium import webdriver
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Module-level Edge session used by every function below. The result of
# EdgeChromiumDriverManager(...).install() is handed to webdriver.Edge as its
# first argument.
# NOTE(review): log_level=20 presumably corresponds to logging.INFO - confirm.
driver = webdriver.Edge(EdgeChromiumDriverManager(log_level=20).install())
driver.maximize_window()
def h2s_selection():
    """Open the monitoring page and select Hydrogen Sulfide on the map.

    Each click waits up to 10 seconds for its target to become clickable;
    the function returns once the map markers are visible.
    """
    monitors_link = (By.XPATH, "//div[@class='nav-link-text']")
    parameter_arrow = (By.XPATH, "//i[@class='arrow-down parameter-arrow']")
    h2s_entry = (By.XPATH, "//ul[@class='dropdown-menu' and @role='menu' and @aria-labelledby='ParameterDropdown']//li[12]")
    marker_icons = (By.XPATH, "//div[@class='leaflet-pane leaflet-marker-pane']//div[contains(@class, 'leaflet-marker-icon')]")

    driver.get("https://marathonlosangelesrefineryfencelinemonitoring.com/index.html")
    wait = WebDriverWait(driver, 10)

    # Navigate to monitors
    wait.until(EC.element_to_be_clickable(monitors_link)).click()
    # Navigate to dropdown button
    wait.until(EC.element_to_be_clickable(parameter_arrow)).click()
    # Select Hydrogen Sulfide and click
    wait.until(EC.element_to_be_clickable(h2s_entry)).click()
    # Wait for the markers to be visible
    wait.until(EC.visibility_of_all_elements_located(marker_icons))

def aug_date(req_month='Aug', req_year='2021', days=('1', '2', '3')):
    """Scrape the marker tooltips for a few days of one month.

    Called without arguments it behaves as before (1-3 Aug 2021); the
    parameters let the same routine be reused for other months and days.

    Opens the date picker, steps back (at most 11 times) until the calendar
    header equals "<req_month> <req_year>", then for each day selects it,
    applies it, calls ``tooltips()`` and reopens the picker.

    Args:
        req_month: Month text as it appears in the calendar header.
        req_year: Year text as it appears in the calendar header.
        days: Day-of-month numbers, as strings, to select in order.

    Note:
        The day cell is matched by its number only and the first match is
        clicked. NOTE(review): a number that also appears in a neighbouring
        month's cells on the same calendar may therefore hit the wrong cell -
        confirm before using late-month days.
    """
    date_arrow = (By.CSS_SELECTOR, ".arrow-down.date-arrow")

    driver.find_element(*date_arrow).click()
    req_timeline = req_month + " " + req_year
    print(f"Timeline Selected is: {req_timeline}")
    for _ in range(11):
        month = driver.find_element(By.XPATH, "//th[@class='month']").text
        if month == req_timeline:
            break
        driver.find_element(By.XPATH, "//th[@class='prev available']").click()

    for day in days:
        time.sleep(5)
        each_date = driver.find_element(By.XPATH, f"//*[@class='table-condensed']//td[text()={day}]")
        print(f"Date is {each_date.text}")
        each_date.click()
        driver.find_element(By.XPATH, "//*[text()='Apply']").click()
        time.sleep(10)  # fixed pause after Apply before the tooltips are read
        tooltips()
        time.sleep(5)
        # Reopen the date picker for the next day.
        driver.find_element(*date_arrow).click()

def tooltips():
    """Hover over every map marker and collect the text of its tooltip.

    The collected rows are printed and also returned so that callers can
    accumulate them across dates.

    Returns:
        A list of ``(site_id, site_name, local_time, value, unit, mdl)``
        tuples of strings, one per marker whose tooltip could be read.
        Markers whose tooltip is missing or goes stale are skipped.
    """
    # Function-scope import keeps this function self-contained.
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    # CSS selectors of the tooltip fields, in output order.
    field_selectors = (
        ".LAR-tooltip-site-id > p",
        ".LAR-tooltip-site-name",
        ".LAR-tooltip-localtime",
        ".LAR-tooltip-data-value",
        ".LAR-tooltip-data-unit",
        ".tooltip-parameter-mdl",
    )

    res = []
    markers = driver.find_elements(By.XPATH, "//div[@class='leaflet-pane leaflet-marker-pane']//div[contains(@class, 'leaflet-marker-icon')]")
    for marker in markers:
        ActionChains(driver).move_to_element(marker).perform()
        time.sleep(1)  # pause between the hover and reading the tooltip
        try:
            row = tuple(
                driver.find_element(By.CSS_SELECTOR, selector).text
                for selector in field_selectors
            )
        except (NoSuchElementException, StaleElementReferenceException):
            # Best effort: skip this marker, but do not hide unrelated errors
            # or Ctrl+C the way a bare ``except`` would.
            continue
        res.append(row)
    print(res)
    return res


if __name__ == "__main__":
    # Select Hydrogen Sulfide first, then run the per-day scrape.
    h2s_selection()
    aug_date()

Output:

Timeline Selected is: Aug 2021
Date is 1
[('F', 'Point Monitor', '10:55 AM', '0.90', 'ppb', 'MDL: 0.40 ppb'), ('B', 'Point Monitor', '10:55 AM', '1.20', 'ppb', 'MDL: 0.40 ppb'), ('E', 'Point Monitor', '10:55 AM', '1.30', 'ppb', 'MDL: 0.40 ppb'), ('A', 'Point Monitor', '10:55 AM', '0.60', 'ppb', 'MDL: 0.40 ppb')]
Date is 2
[('B', 'Point Monitor', '10:25 PM', '1.70', 'ppb', 'MDL: 0.40 ppb'), ('E', 'Point Monitor', '10:25 PM', '1.90', 'ppb', 'MDL: 0.40 ppb')]
Date is 3
[('F', 'Point Monitor', '9:55 AM', '1.20', 'ppb', 'MDL: 0.40 ppb'), ('B', 'Point Monitor', '9:55 AM', '1.20', 'ppb', 'MDL: 0.40 ppb'), ('E', 'Point Monitor', '9:55 AM', '1.90', 'ppb', 'MDL: 0.40 ppb'), ('A', 'Point Monitor', '9:55 AM', '0.50', 'ppb', 'MDL: 0.40 ppb')]

Process finished with exit code 0

Upvotes: 1

Anand Gautam
Anand Gautam

Reputation: 2101

Looks like all your code needed was some WebDriverWaits. React-based websites are a bit difficult to automate because of a lot of async behavior, if I am not wrong, and because of the virtual DOM. I have refactored your code with WebDriverWaits as required (and also eliminated multiple lines, although you may retain them if you want better readability). Here is the code:

from selenium import webdriver
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Imported here so the snippet stays self-contained: these are the failures the
# tooltip look-ups below are expected to raise when a tooltip is missing or is
# re-rendered while it is being read.
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)

driver = webdriver.Edge(EdgeChromiumDriverManager(log_level=20).install())
driver.maximize_window()
driver.get("https://marathonlosangelesrefineryfencelinemonitoring.com/index.html")

# One explicit wait (10 s timeout) reused for every interaction below.
wait = WebDriverWait(driver, 10)

# Navigate to monitors
wait.until(EC.element_to_be_clickable((By.XPATH, "//div[@class='nav-link-text']"))).click()
# Navigate to dropdown button
wait.until(EC.element_to_be_clickable((By.XPATH, "//i[@class='arrow-down parameter-arrow']"))).click()
# Select Hydrogen Sulfide and click
wait.until(EC.element_to_be_clickable((By.XPATH, "//ul[@class='dropdown-menu' and @role='menu' and @aria-labelledby='ParameterDropdown']//li[12]"))).click()
# Wait until the map markers are visible
wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='leaflet-pane leaflet-marker-pane']//div[contains(@class, 'leaflet-marker-icon')]")))

# Open the date picker and step back (at most 11 times) to the requested month
driver.find_element(By.CSS_SELECTOR, ".arrow-down.date-arrow").click()
req_month = 'Aug'
req_year = '2021'
req_timeline = req_month + " " + req_year
print(f"Timeline Selected is: {req_timeline}")
for _ in range(11):
    month = driver.find_element(By.XPATH, "//th[@class='month']").text
    if month == req_timeline:
        break
    driver.find_element(By.XPATH, "//th[@class='prev available']").click()

# Pick day 1 and apply it
driver.find_element(By.XPATH, "//*[@class='table-condensed']//td[text()='1']").click()
driver.find_element(By.XPATH, "//*[text()='Apply']").click()
time.sleep(8)  # fixed pause after Apply before the markers are read

res = []
markers = driver.find_elements(By.XPATH, "//div[@class='leaflet-pane leaflet-marker-pane']//div[contains(@class, 'leaflet-marker-icon')]")
for marker in markers:
    ActionChains(driver).move_to_element(marker).perform()
    time.sleep(1)  # pause between the hover and reading the tooltip
    try:
        site_id = driver.find_element(By.CSS_SELECTOR, ".LAR-tooltip-site-id > p")
        site_name = driver.find_element(By.CSS_SELECTOR, ".LAR-tooltip-site-name")
        date = driver.find_element(By.CSS_SELECTOR, ".LAR-tooltip-localtime")
        value = driver.find_element(By.CSS_SELECTOR, ".LAR-tooltip-data-value")
        unit = driver.find_element(By.CSS_SELECTOR, ".LAR-tooltip-data-unit")
        para_mdl = driver.find_element(By.CSS_SELECTOR, ".tooltip-parameter-mdl")
        res.append((site_id.text, site_name.text, date.text, value.text, unit.text, para_mdl.text))
    except (NoSuchElementException, StaleElementReferenceException):
        # Best effort: a marker whose tooltip cannot be read is skipped, but
        # unrelated errors (and Ctrl+C) are no longer swallowed.
        continue
print(res)

Here is the result:

Timeline Selected is: Aug 2021
[('F', 'Point Monitor', '7:55 AM', '1.80', 'ppb', 'MDL: 0.40 ppb'), ('B', 'Point Monitor', '7:55 AM', '1.20', 'ppb', 'MDL: 0.40 ppb'), ('E', 'Point Monitor', '7:55 AM', '1.10', 'ppb', 'MDL: 0.40 ppb'), ('A', 'Point Monitor', '7:55 AM', '0.40', 'ppb', 'MDL: 0.40 ppb')]

Process finished with exit code 0

As you can see, even with WebDriverWaits introduced, some places still needed a hard stop with time.sleep; otherwise the tests get flaky.

Upvotes: 1

Related Questions