Reputation: 1179
I run this code:
import argparse
from tqdm import tqdm
from sys import argv
from pprint import pformat
from twisted.internet.task import react
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers
from twisted.internet.task import cooperate
from twisted.internet.defer import gatherResults
import sys
from twisted.python import log
log.startLogging(sys.stdout)
import lxml.html
from geoip import geolite2
import pycountry
from tld import get_tld
import json
import socket
poweredby = ""
server = ""
ip = ""
f = open("errors.txt", "w")
def error(response, url):
f.write("Error: "+url+"\n")
def cbRequest(response, url):
global poweredby, server, ip
# print 'Response version:', response.version
# print 'Response code:', response.code
# print 'Response phrase:', response.phrase
# print 'Response headers:'
# print pformat(list(response.headers.getAllRawHeaders()))
poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
server = response.headers.getRawHeaders("Server")[0]
#print poweredby
#print server
d = readBody(response)
d.addCallback(cbBody, url)
return d
def cbBody(body, ourl):
global poweredby, server,ip
#print body
html_element = lxml.html.fromstring(body)
generator = html_element.xpath("//meta[@name='generator']/@content")
ip = socket.gethostbyname(ourl)
try:
match = geolite2.lookup(ip)
if match is not None:
country = match.country
try:
c = pycountry.countries.lookup(country)
country = c.name
except:
country = ""
except:
country = ""
try:
res = get_tld("http://www" + ourl, as_object=True)
tld = res.suffix
except:
tld = ""
try:
match = re.search(r'[\w\.-]+@[\w\.-]+', body)
email = match.group(0)
except:
email = ""
permalink=ourl.rstrip().replace(".","-")
try:
item = generator[0]
val = "{ \"Domain\":" + json.dumps(
"http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\",\"Server\":" + json.dumps(
str(server)) + ",\"PoweredBy\":" + json.dumps(
str(poweredby)) + ",\"MetaGenerator\":" + json.dumps(item) + ",\"Email\":" + json.dumps(
email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"
except:
val = "{ \"Domain\":" + json.dumps(
"http://" + ourl.rstrip()) + ",\"IP\":\"" + ip + "\"," + "\"Server\":" + json.dumps(
str(server)) + ",\"PoweredBy\":" + json.dumps(
str(poweredby)) + ",\"MetaGenerator\":\"\",\"Email\":" + json.dumps(
email) + ",\"Suffix\":\"" + tld + "\",\"CountryHosted\":\"" + country+"\",\"permalink\":\""+permalink+"\" }"
print val
def main(reactor, url_path):
urls = open(url_path)
return mainjob(reactor, (url.strip() for url in urls))
def mainjob(reactor, urls=argv[2:]):
#for url in urls:
# print url
agent = Agent(reactor)
work = (process(agent, url) for url in tqdm(urls))
tasks = list(cooperate(work) for i in range(100))
return gatherResults(list(task.whenDone() for task in tasks))
def process(agent, url):
d = agent.request(
'GET', "http://" + url,
Headers({'User-Agent': ['crawler']}),
None)
d.debug=1
d.addCallback(cbRequest, url)
d.addErrback(error, url)
return d
react(main, ["./test.txt"])
f.close()
I enabled the debug and somehow for those 2 entries addErrback is always (4 out of 4 attempts) called, hence Error entries:
user@laptop:~/crawler$ python scanner.py
2017-02-25 20:35:36+0100 [-] Log opened.
0it [00:00, ?it/s]2017-02-25 20:35:36+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1248>
2017-02-25 20:35:36+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac16c8>
2it [00:00, 660.31it/s]
2017-02-25 20:35:37+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1248>
2017-02-25 20:35:37+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac16c8>
2017-02-25 20:35:37+0100 [-] Main loop terminated.
user@laptop:~/crawler$ cat errors.txt
Error: google.al
Error: fau.edu.al
I am confused, since while capturing, I see those requests got sent out and response was received.
How to print the exact error cause in addErrback() to understand the behavior more?
This is related to this question:
Twisted/Python - processing a large file line by line
I appreciate any help. New to Python and Twisted.
Thanks,
Update 1:
I modified the error function to this:
def error(failure, url):
f.write("Error: "+url+"\n")
print type(failure.value), failure # catch error here
print failure.value.reasons[0].printTraceback()
and here is the output, seem to be the same on each run:
user@laptop:~/crawler$ python scanner.py
2017-02-25 21:24:48+0100 [-] Log opened.
0it [00:00, ?it/s]2017-02-25 21:24:48+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1200>
2017-02-25 21:24:48+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1680>
2it [00:00, 788.33it/s]
2017-02-25 21:24:48+0100 [-] <type 'exceptions.TypeError'> [Failure instance: Traceback: <type 'exceptions.TypeError'>: 'NoneType' object has no attribute '__getitem__'
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:565:_startRunCallbacks
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:651:_runCallbacks
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:457:callback
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:565:_startRunCallbacks
2017-02-25 21:24:48+0100 [-] --- <exception caught here> ---
2017-02-25 21:24:48+0100 [-] /usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:651:_runCallbacks
2017-02-25 21:24:48+0100 [-] scanner.py:47:cbRequest
2017-02-25 21:24:48+0100 [-] ]
2017-02-25 21:24:48+0100 [HTTP11ClientProtocol,client] main function encountered error
Traceback (most recent call last):
Failure: twisted.internet.defer.FirstError: FirstError[#0, [Failure instance: Traceback: <type 'exceptions.AttributeError'>: 'exceptions.TypeError' object has no attribute 'reasons'
/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:565:_startRunCallbacks
/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:651:_runCallbacks
/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:457:callback
/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:565:_startRunCallbacks
--- <exception caught here> ---
/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py:651:_runCallbacks
scanner.py:37:error
]]
2017-02-25 21:24:48+0100 [-] <class 'twisted.internet.error.ConnectError'> [Failure instance: Traceback (failure with no frames): <class 'twisted.internet.error.ConnectError'>: An error occurred while connecting: [Failure instance: Traceback (failure with no frames): <class 'twisted.internet.error.ConnectionLost'>: Connection to the other side was lost in a non-clean fashion: Connection lost.
2017-02-25 21:24:48+0100 [-] ].
2017-02-25 21:24:48+0100 [-] ]
2017-02-25 21:24:48+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1680>
2017-02-25 21:24:48+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac1200>
2017-02-25 21:24:48+0100 [-] Main loop terminated.
2017-02-25 21:24:48+0100 [-] Unhandled error in Deferred:
2017-02-25 21:24:48+0100 [-] Unhandled Error
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/base.py", line 1084, in connectionFailed
self.factory.clientConnectionFailed(self, reason)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/endpoints.py", line 246, in clientConnectionFailed
self._onConnection.errback(reason)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 498, in errback
self._startRunCallbacks(fail)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "scanner.py", line 37, in error
print failure.value.reasons[0].printTraceback()
exceptions.AttributeError: 'ConnectError' object has no attribute 'reasons'
2017-02-25 21:24:48+0100 [-] Unhandled error in Deferred:
2017-02-25 21:24:48+0100 [-] Unhandled Error
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
self._runCallbacks()
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 457, in callback
self._startRunCallbacks(result)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "scanner.py", line 37, in error
print failure.value.reasons[0].printTraceback()
exceptions.AttributeError: 'exceptions.TypeError' object has no attribute 'reasons'
Update 2:
After modifying the error function to example given by @Jean-Paul Calderone
I get this output:
2017-02-27 17:46:57+0100 [-] Log opened.
0it [00:00, ?it/s]2017-02-27 17:46:57+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac0320>
2017-02-27 17:46:57+0100 [-] Starting factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac07a0>
2it [00:00, 763.36it/s]
2017-02-27 17:46:57+0100 [-] Traceback (most recent call last):
2017-02-27 17:46:57+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
2017-02-27 17:46:57+0100 [-] self._runCallbacks()
2017-02-27 17:46:57+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
2017-02-27 17:46:57+0100 [-] current.result = callback(current.result, *args, **kw)
2017-02-27 17:46:57+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 457, in callback
2017-02-27 17:46:57+0100 [-] self._startRunCallbacks(result)
2017-02-27 17:46:57+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
2017-02-27 17:46:57+0100 [-] self._runCallbacks()
2017-02-27 17:46:57+0100 [-] --- <exception caught here> ---
2017-02-27 17:46:57+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
2017-02-27 17:46:57+0100 [-] current.result = callback(current.result, *args, **kw)
2017-02-27 17:46:57+0100 [-] File "scan.py", line 59, in cbRequest
2017-02-27 17:46:57+0100 [-] poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
2017-02-27 17:46:57+0100 [-] exceptions.TypeError: 'NoneType' object has no attribute '__getitem__'
2017-02-27 17:46:57+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac0320>
2017-02-27 17:46:59+0100 [-] Traceback (most recent call last):
2017-02-27 17:46:59+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
2017-02-27 17:46:59+0100 [-] self._runCallbacks()
2017-02-27 17:46:59+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
2017-02-27 17:46:59+0100 [-] current.result = callback(current.result, *args, **kw)
2017-02-27 17:46:59+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 457, in callback
2017-02-27 17:46:59+0100 [-] self._startRunCallbacks(result)
2017-02-27 17:46:59+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 565, in _startRunCallbacks
2017-02-27 17:46:59+0100 [-] self._runCallbacks()
2017-02-27 17:46:59+0100 [-] --- <exception caught here> ---
2017-02-27 17:46:59+0100 [-] File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 651, in _runCallbacks
2017-02-27 17:46:59+0100 [-] current.result = callback(current.result, *args, **kw)
2017-02-27 17:46:59+0100 [-] File "scan.py", line 59, in cbRequest
2017-02-27 17:46:59+0100 [-] poweredby = response.headers.getRawHeaders("X-Powered-By")[0]
2017-02-27 17:46:59+0100 [-] exceptions.TypeError: 'NoneType' object has no attribute '__getitem__'
2017-02-27 17:46:59+0100 [-] Stopping factory <twisted.web.client._HTTP11ClientFactory instance at 0x7fffefac07a0>
2017-02-27 17:46:59+0100 [-] Main loop terminated.
As you can see, it complaints about response being NoneType object, so no real cause is mentioned there.
However, when I run the old version of error function, it prints what you can see in Update 1:
<class 'twisted.internet.error.ConnectError'> [Failure instance: Traceback (failure with no frames): <class 'twisted.internet.error.ConnectError'>: An error occurred while connecting: [Failure instance: Traceback (failure with no frames): <class 'twisted.internet.error.ConnectionLost'>: Connection to the other side was lost in a non-clean fashion: Connection lost.
Why it does not print it in the first (answer suggested by @Jean-Paul Calderone) version? Any why it get this error in general?
I verified it works when I connect to those domains, also works via threaded version of the crawler (also in Python), as you can see it is also on the wire (Wireshark screenshot). It just seems the Twisted does not see it (sees it as a lost connection?)
Upvotes: 1
Views: 1649
Reputation: 48325
There's a bug in your error
function. The traceback is telling you this:
File "scanner.py", line 37, in error
print failure.value.reasons[0].printTraceback()
exceptions.AttributeError: 'exceptions.TypeError' object has no attribute 'reasons'
scanner.py, line 37, in a function named error
, the line print failure.value.reasons[0].printTraceback()
provokes an AttributeError
because a TypeError
instance has no reasons
attribute.
I think the reasons
attribute you're looking belongs to RequestGenerationFailed
, RequestTransmissionFailed
, or ResponseFailed
.
So, define error
like this, instead:
from twisted.web._newclient import (
RequestGenerationFailed,
RequestTransmissionFailed,
ResponseFailed,
)
def error(failure, url):
f.write("Error: "+url+"\n")
if failure.check(
RequestGenerationFailed,
RequestTransmissionFailed,
ResponseFailed,
):
failure.value.reasons[0].printTraceback()
else:
failure.printTraceback()
And if this proves useful, file a ticket against Twisted to make those exception types public (since there's no guarantee the _newclient
import will continue to work in the future).
Upvotes: 2