Reputation: 1
I'm trying to scrape links from the website https://www.usyouthsoccer.org/clubs/club-directory/. Initially, the code broke at the 30th link, so I tried to handle the exception with urllib's HTTPError. Now the script just stops running at the 30th link. I checked that specific URL and it is a bad link. I just want to skip it and move on in the loop, but I'm having trouble with the workaround. Any suggestions would be greatly appreciated...
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
import pandas as pd
from urllib.request import Request, urlopen
from urllib.error import HTTPError
executable_path = {"executable_path": "chromedriver"}
browser = Browser("chrome", **executable_path, headless=True)
url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
zipcode_input = 'CT_Main_0$txtLocation'
search_button = '//*[@id="CT_Main_0_btnSearch"]'
dropdown = '//*[@id="CT_Main_0_drpMiles"]/option[5]'
zip_codes = [64015]
team_df = pd.DataFrame()
for x in zip_codes:
    try:
        print(f'\n{x}\n')
        url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
        browser.visit(url)
        browser.fill(zipcode_input, x)
        browser.find_by_xpath(dropdown).click()
        browser.find_by_xpath(search_button).click()
        html = browser.html
        soup = bs(html, 'html.parser')
        dallas_urls = soup.find_all(class_="more")
        counter = 1
        for url in dallas_urls:
            print(f'Link {counter} of {len(dallas_urls)}')
            counter += 1
            back_url = url['href']
            front_url = 'https://www.usyouthsoccer.org'
            total_url = front_url + back_url
            browser.visit(total_url)
            my_html = pd.read_html(total_url)
            details_pd = pd.DataFrame(my_html[0])
            details_pd.columns = ['Cols', 'Vals']
            df = details_pd.T
            df.columns = df.iloc[0]
            df.drop('Cols', inplace=True)
            contacts_pd = pd.DataFrame(my_html[1])
            if len(contacts_pd.index) == 1:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
            elif len(contacts_pd.index) == 2:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
                df['Contact_Title2'] = contacts_pd.iloc[1, 0]
                df['Contact_Name2'] = contacts_pd.iloc[1, 1]
                df['Contact_Email2'] = contacts_pd.iloc[1, 2]
            elif len(contacts_pd.index) == 3:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
                df['Contact_Title2'] = contacts_pd.iloc[1, 0]
                df['Contact_Name2'] = contacts_pd.iloc[1, 1]
                df['Contact_Email2'] = contacts_pd.iloc[1, 2]
                df['Contact_Title3'] = contacts_pd.iloc[2, 0]
                df['Contact_Name3'] = contacts_pd.iloc[2, 1]
                df['Contact_Email3'] = contacts_pd.iloc[2, 2]
            team_df = pd.concat([team_df, df])
    except HTTPError as err:
        continue
Upvotes: 0
Views: 99
Reputation: 909
Put your try statement inside the nested for loop. Right now, when an HTTPError is raised, it is caught outside the inner for loop, so the whole loop over links stops instead of continuing to the next link.
# note: the dotted names below need "import urllib.request" and
# "import urllib.error" (or just catch the HTTPError you already
# import at the top of your script)
for url in dallas_urls:
    try:
        print(f'Link {counter} of {len(dallas_urls)}')
        counter += 1
        back_url = url['href']
        front_url = 'https://www.usyouthsoccer.org'
        total_url = front_url + back_url
        urllib.request.urlretrieve(total_url)
    except urllib.error.HTTPError:
        print('Error')
        continue
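Applied to your own inner loop (keeping pd.read_html and the HTTPError you already import with from urllib.error import HTTPError), a minimal sketch might look like the following. Only the network call goes inside the try; the table parsing and pd.concat follow after it, unchanged:

for url in dallas_urls:
    print(f'Link {counter} of {len(dallas_urls)}')
    counter += 1
    back_url = url['href']
    total_url = 'https://www.usyouthsoccer.org' + back_url
    browser.visit(total_url)
    try:
        # pd.read_html fetches the URL itself, so a dead club page
        # raises HTTPError here instead of killing the whole loop
        my_html = pd.read_html(total_url)
    except HTTPError:
        print(f'Skipping bad link: {total_url}')
        continue
    # build details_pd / contacts_pd from my_html and concat into
    # team_df exactly as in your original code

That way only the failing link is skipped and the loop keeps going; the outer try/except around the zip-code loop can stay as a last-resort catch-all if you like.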
Upvotes: 2