Reputation: 761
At the moment, my script checks multiple URLs for the presence of 5 different types of keywords in each webpage. Depending on which keywords are found, it outputs "ok" or "no" for each type.
I use set_page_load_timeout(30)
to avoid a URL loading forever.
Problem: some webpages don't finish loading before the timeout (even when it's a "very" long timeout), yet I can see visually (I'm not running headless) that the page has loaded. The script could at least check the keywords in what did load, but it doesn't: after the timeout it prints "Fail" and the row of "no" results never reaches the final output.
So I don't want to raise an exception after 30 seconds; I want to stop loading the page after 30 seconds and scrape whatever has been loaded.
My code:
# coding=utf-8
import csv
import sys
from datetime import date, datetime

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Lists filled from the input CSV files
sites = []
exc = []
keywords_1 = []
keywords_2 = []
keywords_3 = []
keywords_4 = []
keywords_5 = []
def reader3(filename):
    # Read one column of sites from a CSV file.
    with open(filename, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            sites.append(str(row[0]).lower())

try:
    reader3("data/script/filter_domain_OUTPUT.csv")
except Exception as e:
    print(e)
    sys.exit()
# Note: this redefines reader3; the first call above has already run.
def reader3(filename):
    # Read one column of excluded domains from a CSV file.
    with open(filename, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            exc.append(str(row[0]).lower())

try:
    reader3("data/script/checking_EXCLUDE.csv")
except Exception as e:
    print(e)
    sys.exit()
def reader2(filename):
    # Read the five keyword columns from a CSV file.
    with open(filename, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            keywords_1.append(str(row[0]).lower())
            keywords_2.append(str(row[1]).lower())
            keywords_3.append(str(row[2]).lower())
            keywords_4.append(str(row[3]).lower())
            keywords_5.append(str(row[4]).lower())

try:
    reader2("data/script/checking_KEYWORD.csv")
except Exception as e:
    print(e)
    sys.exit()
chrome_options = Options()
chrome_options.page_load_strategy = 'none'
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--lang=en')
chrome_options.add_argument('--disable-notifications')
#chrome_options.headless = True
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('enable-automation')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-browser-side-navigation')
chrome_options.add_argument('--disable-gpu')

driver = webdriver.Chrome(options=chrome_options)
driver.set_page_load_timeout(30)  # set once, before any driver.get()
for site in sites:
    try:
        status_1 = "no"
        status_2 = "no"
        status_3 = "no"
        status_4 = "no"
        status_5 = "no"
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        today = date.today()
        print("[" + current_time + "] " + str(site))
        if site.startswith('http'):
            driver.get(site)
        else:
            driver.get("http://" + site)
        r = str(driver.page_source).lower()
        for keyword_1 in keywords_1:
            if keyword_1 in r:
                status_1 = "ok"
                print("home -> " + str(keyword_1))
                break
        for keyword_2 in keywords_2:
            if keyword_2 in r:
                status_2 = "ok"
                print("home -> " + str(keyword_2))
                break
        for keyword_3 in keywords_3:
            if keyword_3 in r:
                status_3 = "ok"
                print("home -> " + str(keyword_3))
                break
        for keyword_4 in keywords_4:
            if keyword_4 in r:
                status_4 = "ok"
                print("home -> " + str(keyword_4))
                break
        for keyword_5 in keywords_5:
            if keyword_5 in r:
                status_5 = "ok"
                print("home -> " + str(keyword_5))
                break
        with open('data/script/checking_OUTPUT.csv', mode='a') as output_file:
            output_writer = csv.writer(output_file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
            output_writer.writerow([site, status_1, status_2, status_3, status_4, status_5])
    except Exception as e:
        #driver.delete_all_cookies()
        print("Fail")
driver.quit()
Upvotes: 0
Views: 1194
Reputation: 193308
To start with, ideally set_page_load_timeout()
and page_load_strategy = 'none'
shouldn't be combined.
set_page_load_timeout() sets the amount of time to wait for a page load to complete before a TimeoutException is thrown.
You can find a detailed discussion in How to set the timeout of 'driver.get' for python selenium 3.8.0?
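If your goal is to keep the 30-second cap but still scrape whatever has loaded by then, one common workaround (a minimal sketch, not from this answer; get_with_cap is a hypothetical helper name) is to catch the TimeoutException, halt loading with window.stop(), and read page_source anyway:

from selenium import webdriver
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.set_page_load_timeout(30)

def get_with_cap(driver, url):
    # Hypothetical helper: returns the page source even if the load times out.
    try:
        driver.get(url)
    except TimeoutException:
        # Stop outstanding requests so page_source reflects what rendered so far.
        driver.execute_script("window.stop();")
    return str(driver.page_source).lower()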
page_load_strategy = 'none'
causes Selenium to return as soon as the initial page content is received (the HTML has been downloaded), without waiting for subresources to finish loading.
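With the 'none' strategy you are responsible for your own synchronization before reading the page. A minimal sketch (the 20-second wait and the example URL are arbitrary choices, not from this answer):

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

options = Options()
options.page_load_strategy = 'none'
driver = webdriver.Chrome(options=options)

driver.get("http://example.com")
try:
    # Wait up to 20 s for a <body> element, then scrape whatever is there.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )
except TimeoutException:
    pass  # scrape whatever rendered anyway
r = str(driver.page_source).lower()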
Upvotes: 0
Reputation: 19989
chromeOptions.setPageLoadStrategy(PageLoadStrategy.EAGER);
WebDriver driver = new ChromeDriver(chromeOptions);
Use the EAGER page load strategy to wait only until the initial HTML document is loaded and parsed; you can also use NONE, but then make sure you have explicit/implicit waits for the elements you need in case timing issues come up. The snippet above is Java; a rough Python equivalent is sketched below.
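A minimal Python sketch of the eager setup (assuming Selenium 4's options API; not part of the original answer):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.page_load_strategy = 'eager'  # return once the DOM is ready
driver = webdriver.Chrome(options=options)

For the NONE strategy, the capabilities-based example: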
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

caps = DesiredCapabilities().CHROME
# caps["pageLoadStrategy"] = "normal"  # waits for full page load
caps["pageLoadStrategy"] = "none"

options = Options()
driver = webdriver.Chrome(desired_capabilities=caps, options=options)

url = 'https://www.gm-trucks.com/'
driver.get(url)
print(driver.title)
print("hi")
input()
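Note that with pageLoadStrategy set to "none", driver.get() returns almost immediately, so driver.title may still be empty when it is printed; the input() call simply keeps the browser open so you can watch the page keep loading.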
Or:
options = Options()
options.set_capability("pageLoadStrategy", "none")
driver = webdriver.Chrome(options=options)
The documentation was updated as of Selenium 4.0.0-alpha-7, so use the solution above or update to Selenium 4 to be future-proof:
pip install selenium==4.0.0a7
Related documentation bug: https://github.com/SeleniumHQ/seleniumhq.github.io/issues/627
Upvotes: 2