Joao Penido

Reputation: 15

Creating exceptions for errors in a python webscraping script

I am a beginner in Python and programming in general.

I have a Python script that I use to scrape data from a specific website into a CSV file. It works fine for me in general. I usually leave it running overnight, when the website's response is faster and more stable.

The problem is: sometimes my own connection fails or the website becomes unstable, and the script raises an error, making me lose a lot of time.

I want to improve the code with some error handling so that, instead of crashing, it keeps checking whether the internet connection is working and moves on to the next link once it is. Does anyone know how to implement this?

This is my Python code:

# -*- coding: utf-8 -*-
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
from tqdm import tqdm

import datetime
import requests
import pandas
import os


class SigefRequests:
    """Class responsible for accessing, extracting and parsing sigef
    information into a csv file.

    The output file will be at ./data/outputs

    """
    def __init__(self, path):
        requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
        self.url_list = self.reading_url_file(path)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) "
                          "Gecko/20100101 Firefox/54.0",
            "Connection": "close",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,/"
                      ";q=0.8",
            "Upgrade-Insecure-Requests": "1"
        }
        self.session = requests.session()
        self.data = {
            'código': [],
            'denominação': [],
            'área': [],
            'data de entrada': [],
            'situação': [],
            'responsável técnico': [],
            'ART': [],
            'envio': [],
            'requerimento': [],
            'status': [],
            'data': [],
            'nome': [],
            'cpf/cnpj': [],
            'situação - georreferência': [],
            'natureza': [],
            'número de parcelas': [],
            'municípios': [],
            'código do imóvel': [],
            'shp - polígono': [],
            'shp - vértices': [],
            'shp - limites': [],
            'kml - polígono': [],
            'kml - vértices': [],
            'kml - limites': [],
            'csv - polígono': [],
            'csv - vértices': [],
            'csv - limites': [],
        }

        self.export_list = [
            "https://sigef.incra.gov.br/geo/exportar/parcela/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/shp/{}",
            "https://sigef.incra.gov.br/geo/exportar/parcela/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/kml/{}",
            "https://sigef.incra.gov.br/geo/exportar/parcela/csv/{}",
            "https://sigef.incra.gov.br/geo/exportar/vertice/csv/{}",
            "https://sigef.incra.gov.br/geo/exportar/limite/csv/{}"
        ]

    # Used in __init__
    @staticmethod
    def reading_url_file(path):
        """This function reads the links.txt file and return a links list.

        Parameters
        ----------
        path : str
            The path to links.txt file.
            (By default this file is in data folder).

        Returns
        -------
        url_list : iterator
            The links list.

        """
        return open(
            os.path.abspath('../' + path)
        ).readlines()

    # Used in __call__
    def requesting(self, url):
        """This function makes a GET requisition into the given sigef url.

        Parameters
        ----------
        url : str
            Sigef's URL.

        Returns
        -------
        response : requests.models.Response
            The GET request response.

        """
        return self.session.get(url, verify=False, headers=self.headers)

    # Used in __call__
    @staticmethod
    def soup(html):
        """This function parses the html.

        Parameters
        ----------
        html : requests.models.Response
            Unparsed html.

        Returns
        -------
        parsed_html : bs4.BeautifulSoup
            Parsed html.

        """
        return BeautifulSoup(html.content, 'html5lib')

    # Used in __call__
    def filtering_content(self, html):
        """This function filters the page content and looks for the relevant
        data.

        Parameters
        ----------
        html : bs4.BeautifulSoup
            Parsed html.

        Returns
        -------

        """
        tables = html.find_all('table', {
            'class': 'table table-hover tabela-atributos'
        })

        tables_ = [tables[0], tables[1], tables[2], tables[-1]]

        content_list = []
        for table in tables_:
            for row in table.find_all('td'):
                content_list.append((row.text.strip()))

        content_list.pop(content_list.index('Envio'))

        if 'Nenhum requerimento' in content_list:
            content_list.insert(9, '-')
            content_list.insert(9, '-')

        names = []
        for row in tables[3].find_all('th'):
            names.append(row.text)

        table_3_content = []
        for row in tables[3].find_all('td'):
            table_3_content.append(row.text.strip())

        content_list.append(table_3_content[1])
        content_list.append(table_3_content[2])
        content_list.append(table_3_content[names.index('Número parcelas')])
        content_list.append(table_3_content[-1])

        try:
            content_list.append(table_3_content[names.index(
                'Código do Imóvel (SNCR/INCRA)')])
        except ValueError:
            content_list.append('-')

        for elem in self.export_list:
            content_list.append(elem.format(content_list[0]))

        for elem in content_list:
            if u'\u2013' in elem:
                content_list[content_list.index(elem)] = \
                    elem.replace(u'\u2013', '-')

        for key, value in zip(self.data.keys(), content_list):
            self.data.get(key).append(value)

        self.parsing_to_csv()

    # Used in filtering_content
    def parsing_to_csv(self):
        """This function parses the acquired data into a csv file.

        Returns
        -------

        """
        pandas.DataFrame(self.data).set_index('código').to_csv(os.path.abspath(
            '../data/outputs/sigef-{}.csv'.format(datetime.date.today())),
            encoding='latin-1', sep=';'
        )

    def __call__(self, *args, **kwargs):
        for url in tqdm(self.url_list):
            self.filtering_content(self.soup(self.requesting(url)))


if __name__ == '__main__':
    SigefRequests(r'data\links.txt').__call__()

Here is an example of the error I get when it stops working:

(env) D:\Documentos\LAGESA\Programas\Scraper\up3\sigef-crawler\src>python crawler.py
 12%|█████████▎                                                                   | 543/4493 [1:59:07<14:26:33, 13.16s/it]
Traceback (most recent call last):
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    conn = connection.create_connection(
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\connection.py", line 61, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "C:\Users\joaop\AppData\Local\Programs\Python\Python38\lib\socket.py", line 918, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
    httplib_response = self._make_request(
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
    conn.connect()
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 308, in connect
    conn = self._new_conn()
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 171, in _new_conn
    raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 439, in send
    resp = conn.urlopen(
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 724, in urlopen
    retries = retries.increment(
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\retry.py", line 439, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "crawler.py", line 212, in <module>
    SigefRequests(r'data\links.txt').__call__()
  File "crawler.py", line 208, in __call__
    self.filtering_content(self.soup(self.requesting(url)))
  File "crawler.py", line 110, in requesting
    return self.session.get(url, verify=False, headers=self.headers)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 543, in get
    return self.request('GET', url, **kwargs)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 530, in request
    resp = self.send(prep, **send_kwargs)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 643, in send
    r = adapter.send(request, **kwargs)
  File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 516, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

Thanks for your help in advance!

Upvotes: 1

Views: 268

Answers (1)

60robo

Reputation: 26

Hey Joao, in Python you use the try statement to continue running if the program raises a specific error.

Here's an example.

string = "string"

try:
    print(int(string))
except ValueError:
    print("it didn't work")

Without try and except you get:

Traceback (most recent call last):
  File "C:\Users\jojop\OneDrive\Desktop\python.py", line 4, in <module>
    print(int(string))
ValueError: invalid literal for int() with base 10: 'string'

The message gives you the exception type you can catch, in this case ValueError.
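
Applied to your crawler, a minimal sketch could look like the one below, replacing your current __call__ method. It catches the requests.exceptions.ConnectionError shown at the bottom of your traceback, waits, retries a few times and then skips to the next link. The retry count of 3 and the 30-second wait are just placeholder values you would tune yourself:

import time  # add this import at the top of crawler.py
import requests

# inside the SigefRequests class, replacing the current __call__
def __call__(self, *args, **kwargs):
    for url in tqdm(self.url_list):
        for attempt in range(3):  # placeholder retry count
            try:
                self.filtering_content(self.soup(self.requesting(url)))
                break  # success, move on to the next link
            except requests.exceptions.ConnectionError:
                # connection dropped or DNS lookup failed; wait and retry
                time.sleep(30)  # placeholder wait time
        else:
            # every retry failed, skip this link instead of crashing
            print("Skipping {} after repeated connection errors".format(url))

If you also want to cover timeouts and other network problems, you can catch the broader requests.exceptions.RequestException instead, since it is the base class of ConnectionError, Timeout and the other requests exceptions.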

Upvotes: 1
