Reputation: 21
I'm trying to check 22,800+ urls from a 2012 database to find out which ones are still valid. I'm using urllib in Python 3.8 in PyCharm. It makes it through the first 47 urls which are in a text file that I read in. Then it crashes when the host can't be found.
Here's the error output:
Traceback (most recent call last): File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 1350, in do_open h.request(req.get_method(), req.selector, req.data, headers, File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 1255, in request self._send_request(method, url, body, headers, encode_chunked) File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 1301, in _send_request self.endheaders(body, encode_chunked=encode_chunked) File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 1250, in endheaders self._send_output(message_body, encode_chunked=encode_chunked) File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 1010, in _send_output self.send(msg) File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 950, in send self.connect() File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\http\client.py", line 921, in connect self.sock = self._create_connection( File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\socket.py", line 787, in create_connection for res in getaddrinfo(host, port, 0, SOCK_STREAM): File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\socket.py", line 918, in getaddrinfo for res in _socket.getaddrinfo(host, port, family, type, proto, flags): socket.gaierror: [Errno 11002] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/rmcape/PycharmProjects/first/venv/validateURLs.py", line 19, in resp=urllib.request.urlopen(req) File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 222, in urlopen return opener.open(url, data, timeout) File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 525, in open response = self._open(req, data) File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 542, in _open result = self._call_chain(self.handle_open, protocol, protocol + File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 502, in _call_chain result = func(*args) File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 1379, in http_open return self.do_open(http.client.HTTPConnection, req) File "C:\Users\rmcape\AppData\Local\Programs\Python\Python38-32\lib\urllib\request.py", line 1353, in do_open raise URLError(err) urllib.error.URLError: <urlopen error [Errno 11002] getaddrinfo failed>
How can I detect the DNS lookup failure and recover from it and continue on to the next URL in the file? Is there some other library that I should be using? I've googled about everything I can think of. Thanks for any help.
Here's the code:
#!/bin/python
#
#validateURLs.py
import urllib
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
import responses
import socket
f = open("updatedURLs.txt", "r")
site=f.readline()
siteCount=1
errorCount=0
while site:
site=site.strip()
req = urllib.request.Request(site)
try:
resp=urllib.request.urlopen(req)
respo=str(resp.getcode())
result = "("+str(siteCount)+") "+respo+" ==> "+site
print(result)
#print(siteCount, site, resp.getcode())
except urllib.error.HTTPError as e:
errorCount=errorCount+1
result="("+str(siteCount)+") "+str(e.code)+" ==> "+site
print(result)
print("errorCount = "+str(errorCount))
site=f.readline()
siteCount=siteCount+1
print(errorCount)
print("Done")
Upvotes: 1
Views: 1383
Reputation: 483
Will this work for you?:
#!/bin/python
#
#validateURLs.py
import urllib
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
import responses
import socket
f = open("updatedURLs.txt", "r")
site=f.readline()
siteCount=1
errorCount=0
while site:
site=site.strip()
req = urllib.request.Request(site)
try:
resp=urllib.request.urlopen(req)
respo=str(resp.getcode())
result = "("+str(siteCount)+") "+respo+" ==> "+site
print(result)
#print(siteCount, site, resp.getcode())
except Exception as e:
errorCount=errorCount+1
result="("+str(siteCount)+") "+str(e)+" ==> "+site
print(result)
print("errorCount = "+str(errorCount))
else:
site=f.readline()
siteCount=siteCount+1
print(errorCount)
print("Done")
Upvotes: 0