Reputation: 3
I am trying to check whether the response of a URL matches the domain's record from the WHOIS database.
The code:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse
import whois
def abnormal_url(url):
    response = requests.get(url, verify=False)
    domainname = urlparse(url).netloc
    domain = whois.whois(domainname)
    try:
        if response.text == domain:
            return 0  # legitimate
        else:
            return 1  # phishing
    except:
        return 1  # phishing
Append to dataframe:
df['abnormal url'] = df['url'].apply(lambda i: abnormal_url(i))
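As an aside, the Retry and HTTPAdapter imports above are never used. If the intent was to retry flaky requests, this is a minimal sketch of wiring them up through a Session (the retry counts and status codes here are illustrative assumptions, and note that no amount of retrying fixes a hostname that does not resolve):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session = requests.Session()
# mount the adapter for both schemes so every request goes through the retry logic
session.mount("http://", adapter)
session.mount("https://", adapter)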
The error I get:
gaierror Traceback (most recent call last)
File D:\anaconda3\lib\site-packages\urllib3\connection.py:174, in HTTPConnection._new_conn(self)
173 try:
--> 174 conn = connection.create_connection(
175 (self._dns_host, self.port), self.timeout, **extra_kw
176 )
178 except SocketTimeout:
File D:\anaconda3\lib\site-packages\urllib3\util\connection.py:72, in create_connection(address, timeout, source_address, socket_options)
68 return six.raise_from(
69 LocationParseError(u"'%s', label empty or too long" % host), None
70 )
---> 72 for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
73 af, socktype, proto, canonname, sa = res
File D:\anaconda3\lib\socket.py:954, in getaddrinfo(host, port, family, type, proto, flags)
953 addrlist = []
--> 954 for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
955 af, socktype, proto, canonname, sa = res
gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
NewConnectionError Traceback (most recent call last)
File D:\anaconda3\lib\site-packages\urllib3\connectionpool.py:703, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
705 method,
706 url,
707 timeout=timeout_obj,
708 body=body,
709 headers=headers,
710 chunked=chunked,
711 )
713 # If we're going to release the connection in ``finally:``, then
714 # the response doesn't need to know about the connection. Otherwise
715 # it will also try to release it and we'll have a double-release
716 # mess.
File D:\anaconda3\lib\site-packages\urllib3\connectionpool.py:386, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
385 try:
--> 386 self._validate_conn(conn)
387 except (SocketTimeout, BaseSSLError) as e:
388 # Py2 raises this as a BaseSSLError, Py3 raises it as socket timeout.
File D:\anaconda3\lib\site-packages\urllib3\connectionpool.py:1040, in HTTPSConnectionPool._validate_conn(self, conn)
1039 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
-> 1040 conn.connect()
1042 if not conn.is_verified:
File D:\anaconda3\lib\site-packages\urllib3\connection.py:358, in HTTPSConnection.connect(self)
356 def connect(self):
357 # Add certificate verification
--> 358 self.sock = conn = self._new_conn()
359 hostname = self.host
File D:\anaconda3\lib\site-packages\urllib3\connection.py:186, in HTTPConnection._new_conn(self)
185 except SocketError as e:
--> 186 raise NewConnectionError(
187 self, "Failed to establish a new connection: %s" % e
188 )
190 return conn
NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x0000023F7EA21520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
MaxRetryError Traceback (most recent call last)
File D:\anaconda3\lib\site-packages\requests\adapters.py:440, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
439 if not chunked:
--> 440 resp = conn.urlopen(
441 method=request.method,
442 url=url,
443 body=request.body,
444 headers=request.headers,
445 redirect=False,
446 assert_same_host=False,
447 preload_content=False,
448 decode_content=False,
449 retries=self.max_retries,
450 timeout=timeout
451 )
453 # Send the request.
454 else:
File D:\anaconda3\lib\site-packages\urllib3\connectionpool.py:785, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
783 e = ProtocolError("Connection aborted.", e)
--> 785 retries = retries.increment(
786 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
787 )
788 retries.sleep()
File D:\anaconda3\lib\site-packages\urllib3\util\retry.py:592, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
591 if new_retry.is_exhausted():
--> 592 raise MaxRetryError(_pool, url, error or ResponseError(cause))
594 log.debug("Incremented Retry for (url='%s'): %r", url, new_retry)
MaxRetryError: HTTPSConnectionPool(host='www.list.tmall.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000023F7EA21520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
Input In [16], in <cell line: 1>()
----> 1 df['abnormal url'] = df['url'].apply(lambda i: abnormal_url(i))
File D:\anaconda3\lib\site-packages\pandas\core\series.py:4433, in Series.apply(self, func, convert_dtype, args, **kwargs)
4323 def apply(
4324 self,
4325 func: AggFuncType,
(...)
4328 **kwargs,
4329 ) -> DataFrame | Series:
4330 """
4331 Invoke function on values of Series.
4332
(...)
4431 dtype: float64
4432 """
-> 4433 return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
File D:\anaconda3\lib\site-packages\pandas\core\apply.py:1082, in SeriesApply.apply(self)
1078 if isinstance(self.f, str):
1079 # if we are a string, try to dispatch
1080 return self.apply_str()
-> 1082 return self.apply_standard()
File D:\anaconda3\lib\site-packages\pandas\core\apply.py:1137, in SeriesApply.apply_standard(self)
1131 values = obj.astype(object)._values
1132 # error: Argument 2 to "map_infer" has incompatible type
1133 # "Union[Callable[..., Any], str, List[Union[Callable[..., Any], str]],
1134 # Dict[Hashable, Union[Union[Callable[..., Any], str],
1135 # List[Union[Callable[..., Any], str]]]]]"; expected
1136 # "Callable[[Any], Any]"
-> 1137 mapped = lib.map_infer(
1138 values,
1139 f, # type: ignore[arg-type]
1140 convert=self.convert_dtype,
1141 )
1143 if len(mapped) and isinstance(mapped[0], ABCSeries):
1144 # GH#43986 Need to do list(mapped) in order to get treated as nested
1145 # See also GH#25959 regarding EA support
1146 return obj._constructor_expanddim(list(mapped), index=obj.index)
File D:\anaconda3\lib\site-packages\pandas\_libs\lib.pyx:2870, in pandas._libs.lib.map_infer()
Input In [16], in <lambda>(i)
----> 1 df['abnormal url'] = df['url'].apply(lambda i: abnormal_url(i))
Input In [15], in abnormal_url(url)
1 def abnormal_url(url):
----> 2 response = requests.get(url,verify=False)
3 domainname = urlparse(url).netloc
4 domain = whois.whois(domainname)
File D:\anaconda3\lib\site-packages\requests\api.py:75, in get(url, params, **kwargs)
64 def get(url, params=None, **kwargs):
65 r"""Sends a GET request.
66
67 :param url: URL for the new :class:`Request` object.
(...)
72 :rtype: requests.Response
73 """
---> 75 return request('get', url, params=params, **kwargs)
File D:\anaconda3\lib\site-packages\requests\api.py:61, in request(method, url, **kwargs)
57 # By using the 'with' statement we are sure the session is closed, thus we
58 # avoid leaving sockets open which can trigger a ResourceWarning in some
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
File D:\anaconda3\lib\site-packages\requests\sessions.py:529, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
524 send_kwargs = {
525 'timeout': timeout,
526 'allow_redirects': allow_redirects,
527 }
528 send_kwargs.update(settings)
--> 529 resp = self.send(prep, **send_kwargs)
531 return resp
File D:\anaconda3\lib\site-packages\requests\sessions.py:645, in Session.send(self, request, **kwargs)
642 start = preferred_clock()
644 # Send the request
--> 645 r = adapter.send(request, **kwargs)
647 # Total elapsed time of the request (approximately)
648 elapsed = preferred_clock() - start
File D:\anaconda3\lib\site-packages\requests\adapters.py:519, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
515 if isinstance(e.reason, _SSLError):
516 # This branch is for urllib3 v1.22 and later.
517 raise SSLError(e, request=request)
--> 519 raise ConnectionError(e, request=request)
521 except ClosedPoolError as e:
522 raise ConnectionError(e, request=request)
ConnectionError: HTTPSConnectionPool(host='www.list.tmall.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000023F7EA21520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Upvotes: 0
Views: 451
Reputation: 97
The connection could not be established because the hostname cannot be resolved: [Errno 11001] getaddrinfo failed is a DNS lookup failure, and www.list.tmall.com does not resolve.
Execute the GET request inside your try/except and the exception will be caught, so an unreachable URL is classified as phishing instead of crashing the whole apply.
def abnormal_url(url):
    domainname = urlparse(url).netloc
    domain = whois.whois(domainname)
    try:
        response = requests.get(url, verify=False)
        if response.text == domain:
            return 0  # legitimate
        else:
            return 1  # phishing
    except:
        return 1  # phishing
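If you want something slightly more defensive, here is a sketch (the 10-second timeout and the broad except Exception are my assumptions, not part of the question):

import requests
import whois
from urllib.parse import urlparse

def abnormal_url(url):
    try:
        domainname = urlparse(url).netloc
        # the WHOIS lookup can also raise (e.g. for unregistered domains),
        # so it sits inside the try as well
        domain = whois.whois(domainname)
        # a timeout keeps one dead host from stalling the whole DataFrame apply
        response = requests.get(url, verify=False, timeout=10)
    except Exception:
        return 1  # unreachable or no WHOIS record -> treat as phishing
    if response.text == domain:
        return 0  # legitimate
    return 1  # phishing

Note also that response.text == domain compares a page's HTML string against the dict-like record returned by whois.whois(), so it will essentially never be equal and reachable sites will all be labeled phishing too; a more common form of this feature checks whether the registered domain name appears somewhere in the WHOIS text.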
Upvotes: 1