William Bernard

Reputation: 357

Error when scraping in Python, need to bypass

import csv
import requests
from bs4 import BeautifulSoup

outfile = open("./battingall.csv", "wb")
writer = csv.writer(outfile)
base_url = 'http://www.baseball-reference.com'
player_url = 'http://www.baseball-reference.com/players/'
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
players = 'shtml'  # substring that marks a player-page URL
gamel = '&t=b&year='  # batting game-log query suffix
game_logs = 'http://www.baseball-reference.com/players/gl.cgi?id='
years = ['2015','2014','2013','2012','2011','2010','2009','2008']

# One player-index page per letter of the alphabet.
drounders = []
for dround in alphabet:
    drounders.append(player_url + dround)

urlz = []
for ab in drounders:
    data = requests.get(ab)
    soup = BeautifulSoup(data.content, 'html.parser')
    # Collect every player-page link from the letter index page.
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            urlz.append(base_url + link['href'])

yent = []
for ant in urlz:
    for d in drounders:
        for y in years:
            # Keep only short player-page URLs under the right letter;
            # ant[44:-6] slices the player id out of the URL.
            if players in ant and len(ant) < 60 and d in ant:
                yent.append(game_logs + ant[44:-6] + gamel + y)

for j in yent:
    try:
        data = requests.get(j)
        soup = BeautifulSoup(data.content, 'html.parser')
        table = soup.find('table', attrs={'id': 'batting_gamelogs'})
        tablea = j[52:59]  # player id sliced out of the game-log URL
        tableb = soup.find("b", text='Throws:').next_sibling.strip()
        tablec = soup.find("b", text='Height:').next_sibling.strip()
        tabled = soup.find("b", text='Weight:').next_sibling.strip()
        list_of_rows = []
        for row in table.findAll('tr'):
            list_of_cells = []
            list_of_cells.append(tablea)
            list_of_cells.append(j[-4:])  # the year: the URL's last four characters
            list_of_cells.append(tableb)
            list_of_cells.append(tablec)
            list_of_cells.append(tabled)
            for cell in row.findAll('td'):
                text = cell.text.replace('&nbsp;', '').encode("utf-8")
                list_of_cells.append(text)
            list_of_rows.append(list_of_cells)
        print list_of_rows
        writer.writerows(list_of_rows)
    except (AttributeError,NameError):
        pass

When I run this code to get game-log batting data, I keep getting an error:

Traceback (most recent call last):
  File "battinggamelogs.py", line 44, in <module>
    data = requests.get(j)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-      packages/requests/api.py", line 65, in get
    return request('get', url, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-    packages/requests/api.py", line 49, in request
    response = session.request(method=method, url=url, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 461, in request
    resp = self.send(prep, **send_kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 573, in send
    r = adapter.send(request, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/adapters.py", line 415, in send
    raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))

I need a way to bypass this error so the loop keeps going. I think the error comes up because there is no table to get data from.

Upvotes: 0

Views: 486

Answers (1)

Andy

Reputation: 50550

You can wrap your requests.get() block in a try/except. You need to catch the requests.exceptions.ConnectionError that is being generated.

for ab in drounders:
    try:
        data = requests.get(ab)
        soup = BeautifulSoup(data.content, 'html.parser')
        for link in soup.find_all('a'):
            if link.has_attr('href'):
                urlz.append(base_url + link['href'])
    except requests.exceptions.ConnectionError:
        pass

This is occurring because the connection itself has a problem, not because there is no data in the table. The request fails before you ever get that far.
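
If the failures are transient (the connection being dropped mid-scrape rather than the site being down), another option is to let requests retry automatically before raising. A minimal sketch using urllib3's Retry through an HTTPAdapter, assuming a requests version new enough to accept a Retry object for max_retries; the retry count and backoff_factor are arbitrary choices, not values from your code:

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Retry each request up to 3 times, backing off between attempts,
# before the ConnectionError is finally raised.
session = requests.Session()
retries = Retry(total=3, backoff_factor=1)
session.mount('http://', HTTPAdapter(max_retries=retries))

data = session.get('http://www.baseball-reference.com/players/a')

You would then call session.get(ab) inside your loops instead of requests.get(ab).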

Note: This is completely eating the exception by simply using pass (as you are also doing later in the code block). It may be better to do something like this:

except requests.exceptions.ConnectionError:
    print("Failed to open {}".format(ab))

This will provide you with a message on the console of what URL is failing.
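
If you would rather not lose those URLs entirely, you could also collect the failures and retry them after the first pass. A short sketch; failed_urls is a hypothetical list that is not in your original code:

failed_urls = []  # hypothetical: URLs that raised ConnectionError, for a later retry pass

for ab in drounders:
    try:
        data = requests.get(ab)
        soup = BeautifulSoup(data.content, 'html.parser')
        for link in soup.find_all('a'):
            if link.has_attr('href'):
                urlz.append(base_url + link['href'])
    except requests.exceptions.ConnectionError:
        print("Failed to open {}".format(ab))
        failed_urls.append(ab)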

Upvotes: 2
