dev
dev

Reputation: 1179

Python/Twisted - How to print more detailed error message

I run this code:

import argparse

from tqdm import tqdm
from sys import argv
from pprint import pformat

from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
from twisted.internet.task import cooperate
from twisted.internet.defer import gatherResults

import sys
from twisted.python import log

log.startLogging(sys.stdout)

import lxml.html

from geoip import geolite2
import pycountry

from tld import get_tld
import json
import socket

poweredby = ""
server = ""
ip = ""

f = open("errors.txt", "w")


def error(response, url):
    f.write("Error: "+url+"\n") 


def cbRequest(response, url):
    global poweredby, server, ip
    # print 'Response version:', response.version
    # print 'Response code:', response.code
    # print 'Response phrase:', response.phrase
    # print 'Response headers:'
    # print pformat(list(response.headers.getAllRawHeaders()))
    poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
    server = response.headers.getRawHeaders("Server")[0]

    #print poweredby
    #print server

    d = readBody(response)
    d.addCallback(cbBody, url)
    return d


def cbBody(body, ourl):
    global poweredby, server,ip

    #print body
    html_element = lxml.html.fromstring(body)
    generator = html_element.xpath("//meta[@name='generator']/@content")

    ip = socket.gethostbyname(ourl)

    try:
        match = geolite2.lookup(ip)
        if match is not None:
            country = match.country
            try:

                c = pycountry.countries.lookup(country)
                country = c.name
            except:
                country = ""

    except:
        country = ""
    try:
        res = get_tld("http://www" + ourl, as_object=True)
        tld = res.suffix
    except:
        tld = ""

    try:
        match = re.search(r'[\w\.-]+@[\w\.-]+', body)
        email = match.group(0)
    except:
        email = ""

    permalink=ourl.rstrip().replace(".","-")

    try:
        item = generator[0]
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
                str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
                    email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"
    except:
        val = "{ \"Domain\":" + json.dumps(
            "http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
            str(server)) + ",\"PoweredBy\":" + json.dumps(
                str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
                    email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"


    print val


def main(reactor, url_path):
    urls = open(url_path)
    return mainjob(reactor, (url.strip() for url in urls))

def mainjob(reactor, urls=argv[2:]):
    #for url in urls:
    #  print url
    agent = Agent(reactor)
    work = (process(agent, url) for url in tqdm(urls))
    tasks = list(cooperate(work) for i in range(100))
    return gatherResults(list(task.whenDone() for task in tasks))



def process(agent, url):
    d = agent.request(
        'GET', "http://" + url,
        Headers({'User-Agent': ['crawler']}),
        None)
    d.debug=1
    d.addCallback(cbRequest, url)
    d.addErrback(error, url)  
    return d

react(main, ["./test.txt"])

f.close()

I enabled the debug and somehow for those 2 entries addErrback is always (4 out of 4 attempts) called, hence Error entries:

 user@laptop:~/crawler$ python scanner.py 
2017-02-25 20:35:36+0100 [-] Log opened.
0it [00:00, ?it/s]2017-02-25 20:35:36+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1248>
2017-02-25 20:35:36+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac16c8>
2it [00:00, 660.31it/s]
2017-02-25 20:35:37+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1248>
2017-02-25 20:35:37+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac16c8>
2017-02-25 20:35:37+0100 [-] Main loop terminated.
user@laptop:~/crawler$ cat errors.txt 
Error: google.al
Error: fau.edu.al

I am confused, since while capturing, I see those requests got sent out and response was received.

enter image description here

How to print the exact error cause in addErrback() to understand the behavior more?

This is related to this question:

Twisted/Python - processing a large file line by line

I appreciate any help. New to Python and Twisted.

Thanks,

Update 1:

I modified the error function to this:

def error(failure, url):
    f.write("Error: "+url+"\n") 
    print type(failure.value), failure # catch error here
    print failure.value.reasons[0].printTraceback()

and here is the output, seem to be the same on each run:

user@laptop:~/crawler$ python scanner.py 
2017-02-25 21:24:48+0100 [-] Log opened.
0it [00:00, ?it/s]2017-02-25 21:24:48+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1200>
2017-02-25 21:24:48+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1680>
2it [00:00, 788.33it/s]
2017-02-25 21:24:48+0100 [-] <type 'exceptions.TypeError'> [Failure instance: Traceback: <type 'exceptions.TypeError'>: 'NoneType' object has no attribute '__getitem__'
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:565:_startRunCallbacks
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:651:_runCallbacks
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:457:callback
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:565:_startRunCallbacks
2017-02-25 21:24:48+0100 [-] --- <exception caught here> ---
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:651:_runCallbacks
2017-02-25 21:24:48+0100 [-] scanner.py:47:cbRequest
2017-02-25 21:24:48+0100 [-] ]
2017-02-25 21:24:48+0100 [HTTP11ClientProtocol,client] main function encountered error
    Traceback (most recent call last):
    Failure: twisted.internet.defer.FirstError: FirstError[#0, [Failure instance: Traceback: <type 'exceptions.AttributeError'>: 'exceptions.TypeError' object has no attribute 'reasons'
    /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:565:_startRunCallbacks
    /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:651:_runCallbacks
    /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:457:callback
    /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:565:_startRunCallbacks
    --- <exception caught here> ---
    /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:651:_runCallbacks
    scanner.py:37:error
    ]]

2017-02-25 21:24:48+0100 [-] <class 'twisted.internet.error.ConnectError'> [Failure instance: Traceback (failure with no frames): <class 'twisted.internet.error.ConnectError'>: An error occurred while connecting: [Failure instance: Traceback (failure with no frames): <class 'twisted.internet.error.ConnectionLost'>: Connection to the other side was lost in a non-clean fashion: Connection lost.
2017-02-25 21:24:48+0100 [-] ].
2017-02-25 21:24:48+0100 [-] ]
2017-02-25 21:24:48+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1680>
2017-02-25 21:24:48+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1200>
2017-02-25 21:24:48+0100 [-] Main loop terminated.
2017-02-25 21:24:48+0100 [-] Unhandled error in Deferred:
2017-02-25 21:24:48+0100 [-] Unhandled Error
    Traceback (most recent call last):
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 1084, in connectionFailed
        self.factory.clientConnectionFailed(self, reason)
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/endpoints.py", line 246, in clientConnectionFailed
        self._onConnection.errback(reason)
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 498, in errback
        self._startRunCallbacks(fail)
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
        self._runCallbacks()
    --- <exception caught here> ---
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
        current.result = callback(current.result, *args, **kw)
      File "scanner.py", line 37, in error
        print failure.value.reasons[0].printTraceback()
    exceptions.AttributeError: 'ConnectError' object has no attribute 'reasons'

2017-02-25 21:24:48+0100 [-] Unhandled error in Deferred:
2017-02-25 21:24:48+0100 [-] Unhandled Error
    Traceback (most recent call last):
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
        self._runCallbacks()
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
        current.result = callback(current.result, *args, **kw)
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 457, in callback
        self._startRunCallbacks(result)
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
        self._runCallbacks()
    --- <exception caught here> ---
      File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
        current.result = callback(current.result, *args, **kw)
      File "scanner.py", line 37, in error
        print failure.value.reasons[0].printTraceback()
    exceptions.AttributeError: 'exceptions.TypeError' object has no attribute 'reasons'

Update 2:

After modifying the error function to example given by @Jean-Paul Calderone

I get this output:

2017-02-27 17:46:57+0100 [-] Log opened.
0it [00:00, ?it/s]2017-02-27 17:46:57+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac0320>
2017-02-27 17:46:57+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac07a0>
2it [00:00, 763.36it/s]
2017-02-27 17:46:57+0100 [-] Traceback (most recent call last):
2017-02-27 17:46:57+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
2017-02-27 17:46:57+0100 [-]     self._runCallbacks()
2017-02-27 17:46:57+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
2017-02-27 17:46:57+0100 [-]     current.result = callback(current.result, *args, **kw)
2017-02-27 17:46:57+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 457, in callback
2017-02-27 17:46:57+0100 [-]     self._startRunCallbacks(result)
2017-02-27 17:46:57+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
2017-02-27 17:46:57+0100 [-]     self._runCallbacks()
2017-02-27 17:46:57+0100 [-] --- <exception caught here> ---
2017-02-27 17:46:57+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
2017-02-27 17:46:57+0100 [-]     current.result = callback(current.result, *args, **kw)
2017-02-27 17:46:57+0100 [-]   File "scan.py", line 59, in cbRequest
2017-02-27 17:46:57+0100 [-]     poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
2017-02-27 17:46:57+0100 [-] exceptions.TypeError: 'NoneType' object has no attribute '__getitem__'
2017-02-27 17:46:57+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac0320>
2017-02-27 17:46:59+0100 [-] Traceback (most recent call last):
2017-02-27 17:46:59+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
2017-02-27 17:46:59+0100 [-]     self._runCallbacks()
2017-02-27 17:46:59+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
2017-02-27 17:46:59+0100 [-]     current.result = callback(current.result, *args, **kw)
2017-02-27 17:46:59+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 457, in callback
2017-02-27 17:46:59+0100 [-]     self._startRunCallbacks(result)
2017-02-27 17:46:59+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
2017-02-27 17:46:59+0100 [-]     self._runCallbacks()
2017-02-27 17:46:59+0100 [-] --- <exception caught here> ---
2017-02-27 17:46:59+0100 [-]   File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
2017-02-27 17:46:59+0100 [-]     current.result = callback(current.result, *args, **kw)
2017-02-27 17:46:59+0100 [-]   File "scan.py", line 59, in cbRequest
2017-02-27 17:46:59+0100 [-]     poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
2017-02-27 17:46:59+0100 [-] exceptions.TypeError: 'NoneType' object has no attribute '__getitem__'
2017-02-27 17:46:59+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac07a0>
2017-02-27 17:46:59+0100 [-] Main loop terminated.

As you can see, it complaints about response being NoneType object, so no real cause is mentioned there.

However, when I run the old version of error function, it prints what you can see in Update 1:

<class 'twisted.internet.error.ConnectError'> [Failure instance: Traceback (failure with no frames): <class 'twisted.internet.error.ConnectError'>: An error occurred while connecting: [Failure instance: Traceback (failure with no frames): <class 'twisted.internet.error.ConnectionLost'>: Connection to the other side was lost in a non-clean fashion: Connection lost.

Why it does not print it in the first (answer suggested by @Jean-Paul Calderone) version? Any why it get this error in general?

I verified it works when I connect to those domains, also works via threaded version of the crawler (also in Python), as you can see it is also on the wire (Wireshark screenshot). It just seems the Twisted does not see it (sees it as a lost connection?)

Upvotes: 1

Views: 1649

Answers (1)

Jean-Paul Calderone
Jean-Paul Calderone

Reputation: 48325

There's a bug in your error function. The traceback is telling you this:

  File "scanner.py", line 37, in error
    print failure.value.reasons[0].printTraceback()
exceptions.AttributeError: 'exceptions.TypeError' object has no attribute 'reasons'

scanner.py, line 37, in a function named error, the line print failure.value.reasons[0].printTraceback() provokes an AttributeError because a TypeError instance has no reasons attribute.

I think the reasons attribute you're looking belongs to RequestGenerationFailed, RequestTransmissionFailed, or ResponseFailed.

So, define error like this, instead:

from twisted.web._newclient import (
    RequestGenerationFailed, 
    RequestTransmissionFailed,
    ResponseFailed,
)

def error(failure, url):
    f.write("Error: "+url+"\n") 
    if failure.check(
        RequestGenerationFailed,
        RequestTransmissionFailed,
        ResponseFailed,
    ):
        failure.value.reasons[0].printTraceback()
    else:
        failure.printTraceback()

And if this proves useful, file a ticket against Twisted to make those exception types public (since there's no guarantee the _newclient import will continue to work in the future).

Upvotes: 2

Related Questions