Reputation: 15
I am a beginner in Python and programming in general.
I have a Python script that I use to scrape data from a specific website into a CSV file. It generally works fine for me. I usually leave it running through the night, when the website's response is faster and more stable.
The problem is: sometimes my own connection fails, or some instability happens on the website, and the script raises an error, making me lose a lot of time.
I want to improve the code with some error handling so it keeps checking whether the internet connection is working and moves on to the next link once it does, instead of crashing. Does anyone know how to implement this?
This is my python code:
# -*- coding: utf-8 -*-
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
from tqdm import tqdm
import datetime
import requests
import pandas
import os


class SigefRequests:
    """Class responsible for accessing, extracting and parsing sigef
    information into a csv file.

    The output file will be at ./data/outputs
    """

    def __init__(self, path):
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        self.url_list = self.reading_url_file(path)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) "
                          "Gecko/20100101 Firefox/54.0",
            "Connection": "close",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*"
                      ";q=0.8",
            "Upgrade-Insecure-Requests": "1"
        }
        self.session = requests.session()
        self.data = {
            'código': [],
            'denominação': [],
            'área': [],
            'data de entrada': [],
            'situação': [],
            'responsável técnico': [],
            'ART': [],
            'envio': [],
            'requerimento': [],
            'status': [],
            'data': [],
            'nome': [],
            'cpf/cnpj': [],
            'situação - georreferência': [],
            'natureza': [],
            'número de parcelas': [],
            'municípios': [],
            'código do imóvel': [],
            'shp - polígono': [],
            'shp - vértices': [],
            'shp - limites': [],
            'kml - polígono': [],
            'kml - vértices': [],
            'kml - limites': [],
            'csv - polígono': [],
            'csv - vértices': [],
            'csv - limites': [],
        }
        self.export_list = [
            "https://sigef.incra.gov.br/geo/exportar/parcela/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/parcela/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/parcela/csv/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/csv/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/csv/{}"
        ]

    # Used in __init__
    @staticmethod
    def reading_url_file(path):
        """This function reads the links.txt file and returns a list of links.

        Parameters
        ----------
        path : str
            The path to the links.txt file.
            (By default this file is in the data folder).

        Returns
        -------
        url_list : iterator
            The list of links.
        """
        return open(
            os.path.abspath('../' + path)
        ).readlines()

    # Used in __call__
    def requesting(self, url):
        """This function makes a GET request to the given sigef url.

        Parameters
        ----------
        url : str
            Sigef's URL.

        Returns
        -------
        response : requests.models.Response
            The GET request response.
        """
        return self.session.get(url, verify=False, headers=self.headers)

    # Used in __call__
    @staticmethod
    def soup(html):
        """This function parses the html.

        Parameters
        ----------
        html : requests.models.Response
            Unparsed html.

        Returns
        -------
        parsed_html : bs4.BeautifulSoup
            Parsed html.
        """
        return BeautifulSoup(html.content, 'html5lib')

    # Used in __call__
    def filtering_content(self, html):
        """This function filters the page content and looks for the relevant
        data.

        Parameters
        ----------
        html : bs4.BeautifulSoup
            Parsed html.

        Returns
        -------
        """
        tables = html.find_all('table', {
            'class': 'table table-hover tabela-atributos'
        })

        tables_ = [tables[0], tables[1], tables[2], tables[-1]]

        content_list = []
        for table in tables_:
            for row in table.find_all('td'):
                content_list.append((row.text.strip()))

        content_list.pop(content_list.index('Envio'))

        if 'Nenhum requerimento' in content_list:
            content_list.insert(9, '-')
            content_list.insert(9, '-')

        names = []
        for row in tables[3].find_all('th'):
            names.append(row.text)

        table_3_content = []
        for row in tables[3].find_all('td'):
            table_3_content.append(row.text.strip())

        content_list.append(table_3_content[1])
        content_list.append(table_3_content[2])
        content_list.append(table_3_content[names.index('Número parcelas')])
        content_list.append(table_3_content[-1])

        try:
            content_list.append(table_3_content[names.index(
                'Código do Imóvel (SNCR/INCRA)')])
        except ValueError:
            content_list.append('-')

        for elem in self.export_list:
            content_list.append(elem.format(content_list[0]))

        for elem in content_list:
            if u'\u2013' in elem:
                content_list[content_list.index(elem)] = \
                    elem.replace(u'\u2013', '-')

        for key, value in zip(self.data.keys(), content_list):
            self.data.get(key).append(value)

        self.parsing_to_csv()

    # Used in filtering_content
    def parsing_to_csv(self):
        """This function parses the acquired data into a csv file.

        Returns
        -------
        """
        pandas.DataFrame(self.data).set_index('código').to_csv(os.path.abspath(
            '../data/outputs/sigef-{}.csv'.format(datetime.date.today())),
            encoding='latin-1', sep=';'
        )

    def __call__(self, *args, **kwargs):
        for url in tqdm(self.url_list):
            self.filtering_content(self.soup(self.requesting(url)))


if __name__ == '__main__':
    SigefRequests(r'data\links.txt').__call__()
Here is an example of the error I get when it stops working:
(env) D:\Documentos\LAGESA\Programas\Scraper\up3\sigef-crawler\src>python crawler.py
12%|█████████▎ | 543/4493 [1:59:07<14:26:33, 13.16s/it]
Traceback (most recent call last):
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
conn = connection.create_connection(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\connection.py", line 61, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Users\joaop\AppData\Local\Programs\Python\Python38\lib\socket.py", line 918, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
self._validate_conn(conn)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
conn.connect()
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 308, in connect
conn = self._new_conn()
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 171, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 724, in urlopen
retries = retries.increment(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\retry.py", line 439, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "crawler.py", line 212, in <module>
SigefRequests(r'data\links.txt').__call__()
File "crawler.py", line 208, in __call__
self.filtering_content(self.soup(self.requesting(url)))
File "crawler.py", line 110, in requesting
return self.session.get(url, verify=False, headers=self.headers)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 543, in get
return self.request('GET', url, **kwargs)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Thanks for your help in advance!
Upvotes: 1
Views: 268
Reputation: 26
Hey joao, in Python you use the try/except statement to keep the program running when it hits a specific error.
Here's an example.
string = "string"
try:
    print(int(string))
except ValueError:
    print("it didn't work")
Without try and except you get:
Traceback (most recent call last):
File "C:\Users\jojop\OneDrive\Desktop\python.py", line 4, in <module>
print(int(string))
ValueError: invalid literal for int() with base 10: 'string'
The traceback gives you the name of the error to catch, in this case ValueError.
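In your script the error being raised is requests.exceptions.ConnectionError, so you can do the same thing around the request itself. Here is a rough sketch of one way to retry a few times and then skip the link; the helper name requesting_with_retry and the retry/wait values are just example choices, not something from your existing code:

import time
import requests

def requesting_with_retry(session, url, headers, retries=3, wait=60):
    # Try the same URL a few times, sleeping between attempts so a short
    # connection drop doesn't crash the whole run.
    for attempt in range(retries):
        try:
            return session.get(url, verify=False, headers=headers)
        except requests.exceptions.ConnectionError:
            print("Connection failed, retrying in {} seconds...".format(wait))
            time.sleep(wait)
    return None  # still failing after all retries, give up on this link

Then in __call__ you would check the result and move on when a link could not be fetched:

    def __call__(self, *args, **kwargs):
        for url in tqdm(self.url_list):
            response = requesting_with_retry(self.session, url, self.headers)
            if response is None:
                continue  # skip this link and go to the next one
            self.filtering_content(self.soup(response))

If you also want to treat timeouts and other request problems the same way, you can catch the broader requests.exceptions.RequestException instead.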
Upvotes: 1