Reputation: 23
I am trying to scrape a table from the web using Selenium in Python. But the website is very slow and there are network issues most of the time, so I would like the code to keep retrying even if the page takes time to load. I have 941 entries to scrape. I tried a module named retry that I found online, but it does not seem to work. A sample of the code is given below. Is there any other way to make the code keep retrying until the website loads?
```python
import io
import time

import pandas as pd
from bs4 import BeautifulSoup  # needed for parsing driver.page_source below
from retry import retry
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from webdriver_manager.firefox import GeckoDriverManager

driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())

# Web page url
driver.get("http://mnregaweb4.nic.in/netnrega/dynamic_work_details.aspx?page=S&lflag=eng&state_name=KERALA&state_code=16&fin_year=2020-2021&source=national&Digest=s5wXOIOkT98cNVkcwF6NQA")

@retry()
def make_trouble():
    '''Retry until succeed'''
    driver.implicitly_wait(5)
    # Find District of option
    x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
    drop = Select(x)
    # Select by value
    drop.select_by_value("1613")

@retry()
def make_trouble():
    '''Retry until succeed'''
    time.sleep(6)
    # Find Block of option
    x = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk')
    drop = Select(x)
    # Select by value
    drop.select_by_value("1613001")

@retry()
def make_trouble():
    '''Retry until succeed'''
    time.sleep(4)
    # Find GP of option
    x = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan')
    drop = Select(x)
    # Select by value
    drop.select_by_value("1613001001")

@retry()
def make_trouble():
    '''Retry until succeed'''
    time.sleep(4)
    search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
    search_button.click()

time.sleep(8)
soup = BeautifulSoup(driver.page_source, 'lxml')
tables = soup.find_all('table')
dfs = pd.read_html(str(tables))
print(dfs[4])
df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0,1])
df1.to_csv("test with pandas V3.csv", index=False)
driver.close()
```
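Note: the retry decorator only retries when the decorated function is actually called and raises an exception; in the snippet above the four functions all share the name make_trouble and none of them is ever invoked. A minimal sketch of the intended usage for the first dropdown (the decorator arguments are illustrative values, not from the original code):

```python
from retry import retry
from selenium.webdriver.support.ui import Select

@retry(tries=10, delay=5)  # retry up to 10 times, 5 s apart (illustrative values)
def select_district():
    '''Retry until the district dropdown is present and selectable.'''
    x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
    Select(x).select_by_value("1613")

select_district()  # the decorator only takes effect on an actual call
```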
Upvotes: 0
Views: 2757
Reputation: 888
This is not my code; as requested by ABC Admin, it is a modification of the code by Sangun Devkota.
This way it prints an error message only once every 5 failed attempts.
```python
attempts = 0  # counter renamed from x so it is not overwritten by the element lookups below
while True:
    try:
        driver.implicitly_wait(5)
        # Find District of option
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613")
        time.sleep(6)
        # Find Block of option
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001")
        time.sleep(4)
        # Find GP of option
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001001")
        time.sleep(4)
        search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
        search_button.click()
        time.sleep(8)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        tables = soup.find_all('table')
        dfs = pd.read_html(str(tables))
        print(dfs[4])
        df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0,1])
        df1.to_csv("test with pandas V3.csv", index=False)
        driver.close()
        break  # stop retrying once the scrape has succeeded
    except Exception:
        if attempts % 5 == 0:
            print("Error")
        attempts += 1
```
If you want it to print the error only once, you can change it to this:

```python
first_error = True  # again renamed from x so the flag is not clobbered by the element lookups

# ... Other code ...

    except Exception:
        if first_error:
            print('Error')
            first_error = False
```
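Rather than sleeping for fixed intervals inside the loop, Selenium can also poll for each element explicitly. This is not part of the answer above, just a sketch using Selenium's built-in WebDriverWait with the same element ids:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 60)  # poll up to 60 s before giving up

# Block until the district dropdown is present, then select it
district = wait.until(
    EC.presence_of_element_located((By.ID, 'ContentPlaceHolder1_ddl_dist'))
)
Select(district).select_by_value("1613")

# The same pattern works for the block/GP dropdowns and the search button,
# where EC.element_to_be_clickable is the usual condition.
```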
Upvotes: 1
Reputation: 505
```python
while True:
    try:
        driver.implicitly_wait(5)
        # Find District of option
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_dist')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613")
        time.sleep(6)
        # Find Block of option
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_blk')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001")
        time.sleep(4)
        # Find GP of option
        x = driver.find_element_by_id('ContentPlaceHolder1_ddl_pan')
        drop = Select(x)
        # Select by value
        drop.select_by_value("1613001001")
        time.sleep(4)
        search_button = driver.find_element_by_id("ContentPlaceHolder1_Button1")
        search_button.click()
        time.sleep(8)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        tables = soup.find_all('table')
        dfs = pd.read_html(str(tables))
        print(dfs[4])
        df1 = pd.read_csv(io.StringIO(dfs[4].to_csv(index=False)), skiprows=1, header=[0,1])
        df1.to_csv("test with pandas V3.csv", index=False)
        driver.close()
        break  # exit the loop once everything has succeeded
    except Exception:
        print("Error")
```
Upvotes: 1