iqzer0
iqzer0

Reputation: 163

Python getting weird HTTP errors

The purpose of this code is to fetch the page title for each URL in the provided URL list.

I am getting the error below for some hosts. Also, after the script finishes, I have to force-close the program to make it exit. Please help me solve these issues.

httperror_seek_wrapper: HTTP Error 404: /

#!/usr/bin/python

import os
import urllib
import workerpool
from BeautifulSoup import BeautifulSoup
from mechanize import Browser
import sys

def titleprint(url):
    br = Browser()
    br.set_handle_robots(False)
    res = br.open(url, None, 2.5)
    data = res.get_data()
    soup = BeautifulSoup(data)
    title = soup.find('title')
    if soup.title != None:
        print url, title.renderContents(), '\n'
# Initialize a pool, 5 threads in this case
pool = workerpool.WorkerPool(size=5)

# Read the whole URL list first so the file is closed before the jobs run.
# Each entry keeps its trailing newline, exactly as readlines() returns it.
with open("urls.txt") as url_file:
    urls = url_file.readlines()

# ``titleprint`` is called once for each line of the URL list.
pool.map(titleprint, urls)

# Send shutdown jobs to all threads, and wait until all the jobs have been completed
pool.shutdown()
pool.wait()

Upvotes: 1

Views: 79

Answers (1)

iqzer0
iqzer0

Reputation: 163

Found the issue: I was using urllib instead of urllib2. The corrected code is below, but I couldn't fix the SIGINT (Ctrl+C) handling. :(

import os
import urllib2
import socket
import workerpool
from BeautifulSoup import BeautifulSoup
from mechanize import Browser
import signal
import time
import sys

def titleprint(url):
    try:
        br = Browser()
        br.set_handle_robots(False)
        res = br.open(url, None, 2.5)
        data = res.get_data()
        soup = BeautifulSoup(data)
        title = soup.find('title')
        if soup.title != None:
                print url, title.renderContents(), '\n'
        else:
                print "No Title Found"
    except urllib2.URLError, e:
        print url,"Oops, timed out?", '\n'
    except socket.error,e:
        print url,"Oops, timed out?", '\n'
    except socket.timeout:
        print url,"Oops, timed out?", '\n'


def signal_handler(signal, frame):
    """Print a notice and terminate the interpreter with exit status 0.

    Both arguments are accepted so the function can be registered as a
    signal handler, but neither is used.
    """
    print('You pressed Ctrl+C!')
    raise SystemExit(0)
signal.signal(signal.SIGINT, signal_handler)
pool = workerpool.WorkerPool(size=20)
pool.map(titleprint, open("urls.txt").readlines())
pool.shutdown()
pool.wait()
print 'Processing of list completed, Cheers!!'
sys.exit(1)
print('Stop the script using Ctrl+C')
signal.pause()

Upvotes: 1

Related Questions