Ankit Srivastava
Ankit Srivastava

Reputation: 21

Multiprocessing selenium in Python 3 using Chrome issues

I'm trying to scrape data from a website by running multiple Chrome browsers in parallel, so that the files are downloaded simultaneously and the process is faster. With a single browser window the script runs fine. With multiple windows, however, I run into two issues:

a) Many of the browser windows do not close. b) The program runs and downloads files for a while, but then stops after some time with the error message 'ERROR:shader_disk_cache.cc(238)] Failed to create shader cache entry -2'.

My chromedriver is in 'D:\401\401k'

Script -

'''Downloading 5500 forms from ERISA'''

#Import Library
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time, os, timeit, shutil
import pandas as pd
from multiprocessing import Pool

#Clean up download folder
# Only the process that was started as the script may reset the folder.
# With the "spawn" start method (the default on Windows) every Pool worker
# re-imports this module, and an unguarded cleanup would make each worker
# delete the PDFs that have already been downloaded.
if __name__ == '__main__':
    if os.path.exists('D:/Form5500_Downloads'):
        shutil.rmtree('D:/Form5500_Downloads')
    os.makedirs('D:/Form5500_Downloads')

def download_form(ackid, timeout=300):
    """Download a single Form 5500 filing, identified by its ACK ID.

    A dedicated Chrome instance is started, the ACK ID is searched on the
    EFAST2 dissemination page and, if a filing is listed, its PDF is
    downloaded and stored as ``D:/Form5500_Downloads/<ackid>.pdf``.

    Args:
        ackid: ACK ID of the filing; converted to ``str`` before use.
        timeout: Maximum number of seconds to wait for the PDF to appear
            after the download link was clicked.

    Returns:
        None. When no filing is found, or the download does not show up
        within ``timeout`` seconds, a message is printed and nothing is
        saved for this ACK ID.
    """
    # Local import so that this function does not depend on a change to the
    # file-level import list.
    from selenium.common.exceptions import NoSuchElementException

    ackid = str(ackid)
    download_root = 'D:/Form5500_Downloads'
    # Every filing is saved by Chrome under the same name ("filing.pdf").
    # Parallel workers therefore each get their own download folder;
    # otherwise one worker could pick up and rename another worker's file.
    # abspath hands Chrome an absolute path with the platform's separators.
    work_dir = os.path.abspath(os.path.join(download_root, ackid))
    os.makedirs(work_dir, exist_ok=True)

    #Setting Chrome preferences
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"download.default_directory": work_dir}  #Download folder
    chromeOptions.add_experimental_option("prefs", prefs)

    browser = None
    try:
        browser = webdriver.Chrome(executable_path=r'D:\401\401k\chromedriver_2.35.exe', chrome_options=chromeOptions)
        browser.implicitly_wait(3)  #Implicit wait until the element appears

        # Open ERISA website
        url = 'https://www.efast.dol.gov/portal/app/disseminate?execution=e1s4#'
        browser.get(url)
        # Search for a form using ACK ID
        browser.find_element_by_css_selector('#ackId').send_keys(ackid)
        browser.find_element_by_css_selector('.ui-icon-search').click()

        #Check if the form exists - if NOT, exit the function
        try:
            # Raw string: the backslashes escape the colons for the CSS
            # selector and must reach Selenium unchanged.
            browser.find_element_by_css_selector(r'#form\:filingTreeTable\:0\:einLnk').click()
        except NoSuchElementException:
            # Return here; falling through would wait forever for a file
            # that is never going to be downloaded.
            print(ackid + ': no filing found')
            return

        #Wait until downloaded and rename using ackid
        print(ackid)
        downloaded = os.path.join(work_dir, 'filing.pdf')
        deadline = time.time() + timeout
        while not os.path.exists(downloaded):
            if time.time() > deadline:
                print(ackid + ': download timed out')
                return
            time.sleep(5)
        # os.replace overwrites a leftover file from an earlier run, where
        # os.rename would raise on Windows.
        os.replace(downloaded, os.path.join(download_root, ackid + '.pdf'))
    finally:
        # Always shut the browser down, on success, early return or error,
        # so that no Chrome window is left behind.
        try:
            if browser is not None:
                browser.quit()
        finally:
            shutil.rmtree(work_dir, ignore_errors=True)

def main():
    """Download the filings for the ACK IDs listed in the CSV file.

    Reads the first column (up to 10000 rows) of the CSV file, takes its
    'ACK_ID' values and hands them to a pool of 10 worker processes, each
    of which calls download_form for one ACK ID at a time.
    """
    # Get list of ackids from csv file
    df = pd.read_csv('D:/401/401k/F_SCH_H_2015_latest.csv', usecols=[0], nrows=10000)
    ackid_list = df['ACK_ID'].tolist()

    with Pool(10) as p:
        p.map(download_form, ackid_list)


# The guard belongs around the call, not inside main(): worker processes
# started with the "spawn" method re-import this module, and they must not
# run main() (and re-read the CSV) themselves.
if __name__ == '__main__':
    main()

Upvotes: 0

Views: 853

Answers (1)

Ankit Srivastava
Ankit Srivastava

Reputation: 21

This issue was resolved using the following -

    browser.stop_client()
    browser.close()

instead of

    browser.quit()

I still do not understand why this hack works though.

Upvotes: 1

Related Questions