Reputation: 9569
I am playing with Python and trying to parse internet page in order to automate my football watching process on Amazon Fire Tv.
I produced follow code to read HTML pages by URL:
from httplib import BadStatusLine
import urllib2
import logging
htmlWorker = html_worker.HtmlWorkerLiveFootball()
htmlWorker.get_list_of_matches(htmlWorker.URL)
class HtmlWorkerLiveFootball:
URL = 'http://livefootball.ws/'
def get_list_of_matches(self, url):
opener = urllib2.OpenerDirector()
for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler, HTTPMethodFallback, HEADRedirectHandler,
urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
opener.add_handler(handler())
opener.addheaders = [('User-agent',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36'
)]
urllib2.install_opener(opener)
try:
logging.warning("request = %s" % opener.addheaders)
page = urllib2.urlopen(url)
logging.warning("result = %s" % page.read())
except urllib2.HTTPError, error:
logging.error("error code = %d" % error.code)
except BadStatusLine:
logging.error("could not fetch %s" % url)
class HeadRequest(urllib2.Request):
def get_method(self):
return "HEAD"
class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
def redirect_request(self, req, fp, code, msg, headers, newurl):
if code in (301, 302, 303, 307):
logging.warning("redirect_request = %d" % code)
newurl = newurl.replace(' ', '%20')
logging.warning("new url = %s" % newurl)
logging.warning("headers = %s" % headers)
newheaders = dict((k, v) for k, v in req.headers.items()
if k.lower() not in ("content-length", "content-type"))
logging.debug("newheaders = %s" % newheaders)
request = HeadRequest(newurl, headers=newheaders, origin_req_host=req.get_origin_req_host(),
unverifiable=True)
request.add_header('User-agent',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36')
request.add_header('Cookie', headers.dict['set-cookie'])
request.add_header('Host', "livefootball.ws")
request.add_header('Accept-Encoding', "gzip,deflate,sdch")
request.add_header('Accept', "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
request.add_header('Cache-Control', "max-age=0")
request.add_header('Accept-Language', "en-US,en;q=0.8,ru;q=0.6")
logging.warning("request = %s" % request.headers)
return request
else:
raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
class HTTPMethodFallback(urllib2.BaseHandler):
def http_error_405(self, req, fp, code, msg, headers):
logging.warning("http_error_405. Headers = %s" % headers)
fp.read()
fp.close()
newheaders = dict((k, v) for k, v in req.headers.items()
if k.lower() not in ("content-length", "content-type"))
return self.parent.open(urllib2.Request(req.get_full_url(),
headers=newheaders,
origin_req_host=req.get_origin_req_host(),
unverifiable=True))
It works for major amount of site all over internet, but unfortunately looks like site that I need trying to avoid DDOS attack with some unfamiliar for me mechanism (redirect + some stuff with cookies). I trying to emulate browser behavior, but have empty string at the end.
Here log that I have after executing this code:
WARNING:root:request = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36')]
WARNING:root:redirect_request = 307
WARNING:root:new url = http://livefootball.ws/?dos=1
WARNING:root:headers = Server: nginx
Date: Sun, 15 Jun 2014 14:11:03 GMT
Content-Type: text/html
Content-Length: 180
Connection: close
Set-Cookie: antid=6abeccafd9ac44951b4acc7f642649b7; path=/
Location: http://livefootball.ws/?dos=1
WARNING:root:request = {'Accept-language': 'en-US,en;q=0.8,ru;q=0.6', 'Accept-encoding': 'gzip,deflate,sdch', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36', 'Host': 'livefootball.ws', 'Cookie': 'antid=6abeccafd9ac44951b4acc7f642649b7; path=/', 'Cache-control': 'max-age=0'}
WARNING:root:result =
How to read this page with python? Thanks.
Upvotes: 1
Views: 210
Reputation: 9937
If you want to read HTML pages by URL, you can use requests library instead urllib2
. It is so easy to use:
import requests
session = requests.Session()
index_url = 'http://livefootball.ws/'
index_request = session.get(index_url)
#change encoding of the response
index_request.encoding = 'CP1251'
#print page content
print index_request.text
Upvotes: 1