Reputation: 163
The purpose of this code is to fetch the HTML page titles of the URLs listed in urls.txt.
I am getting the error below on some hosts. Also, after the script finishes, I have to force-close the program to make it exit. Please help me solve these issues.
httperror_seek_wrapper: HTTP Error 404: /
#!/usr/bin/python
import os
import urllib
import workerpool
from BeautifulSoup import BeautifulSoup
from mechanize import Browser
import sys
def titleprint(url):
    """Fetch ``url`` and print it together with the page's <title> text.

    ``url`` is one line of the URL list, so it may still carry the trailing
    newline left by ``readlines()``; surrounding whitespace is stripped
    before the request is made and blank lines are ignored.

    Network failures (HTTP error statuses such as 404, unreachable hosts,
    socket errors and timeouts) are reported on stdout instead of being
    raised, so one bad host does not propagate an exception into the worker
    thread that runs this job.

    Returns None; all output goes to stdout.
    """
    # Local imports: this script only imports ``urllib`` at file level, but
    # the errors raised while opening a URL are urllib2/socket exceptions.
    import socket
    import urllib2

    url = url.strip()
    if not url:
        return

    browser = Browser()
    browser.set_handle_robots(False)
    try:
        # Third positional argument is the timeout in seconds.
        response = browser.open(url, None, 2.5)
        markup = response.get_data()
    except (urllib2.URLError, socket.error) as err:
        # urllib2.HTTPError subclasses URLError and socket.timeout
        # subclasses socket.error, so this covers 404s and timeouts too.
        print("%s failed: %s \n" % (url, err))
        return

    title = BeautifulSoup(markup).find('title')
    if title is not None:
        # Single formatted string keeps the output identical under the
        # Python 2 print statement: "<url> <title> \n" plus a newline.
        print("%s %s \n" % (url, title.renderContents()))
# Initialize a pool, 5 threads in this case.
pool = workerpool.WorkerPool(size=5)

# Read the URL list up front so the file is closed before any work starts.
# Each line is stripped of its trailing newline and blank lines are dropped,
# so only real URLs are handed to the workers.
with open("urls.txt") as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

# ``titleprint`` is called once per URL in the list.
pool.map(titleprint, urls)

# Send shutdown jobs to all threads, and wait until all the jobs have been
# completed.
pool.shutdown()
pool.wait()
Upvotes: 1
Views: 79
Reputation: 163
Found the issue: I was using urllib instead of urllib2. The corrected code is below, but I still couldn't get the SIGINT (Ctrl+C) break to work. :(
import os
import urllib2
import socket
import workerpool
from BeautifulSoup import BeautifulSoup
from mechanize import Browser
import signal
import time
import sys
def titleprint(url):
    """Fetch ``url`` and print it together with the page's <title> text.

    ``url`` is one line of the URL list, so it may still carry the trailing
    newline left by ``readlines()``; surrounding whitespace is stripped
    before the request is made and blank lines are ignored.

    Prints "<url> <title>" when the page has a title and "No Title Found"
    when it does not. Network failures (HTTP error statuses, unreachable
    hosts, socket errors, timeouts) are reported on stdout rather than
    raised. Returns None.
    """
    url = url.strip()
    if not url:
        return

    try:
        browser = Browser()
        browser.set_handle_robots(False)
        # Third positional argument is the timeout in seconds.
        response = browser.open(url, None, 2.5)
        markup = response.get_data()
    except (urllib2.URLError, socket.error):
        # One handler replaces three identical ones: socket.timeout is a
        # subclass of socket.error, so its separate clause could never run.
        print("%s Oops, timed out? \n" % url)
        return

    title = BeautifulSoup(markup).find('title')
    if title is not None:
        # Single formatted string keeps the output identical under the
        # Python 2 print statement: "<url> <title> \n" plus a newline.
        print("%s %s \n" % (url, title.renderContents()))
    else:
        print("No Title Found")
def signal_handler(signal, frame):
    """Announce the interrupt and end the program with exit status 0.

    Intended to be registered with ``signal.signal`` for SIGINT. Both
    arguments are supplied by the interpreter (signal number and current
    stack frame) and are not used. Note that the ``signal`` parameter
    shadows the ``signal`` module inside this function.

    Raises:
        SystemExit: always, with code 0.
    """
    print('You pressed Ctrl+C!')
    # Equivalent to sys.exit(0), which raises SystemExit itself.
    raise SystemExit(0)
# Install the Ctrl+C handler before any work is queued.
signal.signal(signal.SIGINT, signal_handler)

# Show the hint before the run starts; in the original it came after
# sys.exit() and was never printed.
print('Stop the script using Ctrl+C')
# NOTE(review): the poster reports that Ctrl+C still does not interrupt the
# run. Presumably the main thread is blocked inside pool.map()/pool.wait()
# and the handler only runs once it regains control -- confirm against
# workerpool before relying on the handler.

pool = workerpool.WorkerPool(size=20)

# Read the URL list up front so the file is closed before any work starts;
# strip trailing newlines and drop blank lines so only real URLs are queued.
with open("urls.txt") as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

# ``titleprint`` is called once per URL in the list.
pool.map(titleprint, urls)

# Ask every worker to shut down, then wait for all queued jobs to finish.
pool.shutdown()
pool.wait()

print('Processing of list completed, Cheers!!')
# Exit status 0: the run finished normally (the original exited with 1,
# which signals failure to the calling shell). The trailing signal.pause()
# was unreachable after the exit call and has been dropped.
sys.exit(0)
Upvotes: 1