snehoozle
snehoozle

Reputation: 273

File Storage Problem with Python Web Crawler

I am screen scraping data using a web crawler and storing the results - (tweets from a twitter page) as separate html files for each user I'm crawling. I intend to later parse the html files and store the data into a database for analysis. However, I am having a bizarre problem.

When I run the following program - a small snippet from the overall crawler - I am able to get a separate html file for each follower:

import re
import urllib2
import twitter

# Seed account and maximum recursion depth for the crawl.
start_follower = "NYTimesKrugman"
depth = 3

# Screen names already visited, so the same account is never crawled twice.
searched = set()

# Unauthenticated python-twitter client (rate-limited by Twitter).
api = twitter.Api()

def crawl(follower, in_depth):
    """Recursively crawl Twitter accounts, appending one HTML file per user.

    Args:
        follower: screen name of the account to crawl.
        in_depth: remaining recursion depth; crawling stops when it reaches 0.

    Side effects: adds each visited name to the module-level ``searched`` set
    and appends to ``C:\\Python28\\Followertest1\\<follower>.html``.
    """
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
        # Bug fix: the original opened the file and never closed it, leaking
        # one handle per user; `with` guarantees flush + close.
        with open(directory, 'a') as output:
            output.write(follower)
            output.write('\n\n')
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched  # never revisit accounts already crawled
        # Limit fan-out to the first five unseen friends per level.
        for name in list(names)[0:5]:
            crawl(name, in_depth - 1)

# Kick off the crawl from the seed account.
crawl(start_follower, depth)

# Report every account that was visited (Python 2 print statements).
for x in searched:
    print x
print "Program is completed."

However, when I run the full crawler, I do not get a separate file for each follower:

import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time

# Seed account for the crawl.
# NOTE(review): depth is 2 here but 3 in the smaller test snippet — with
# depth=2 only the seed plus its first five friends get files (6 total),
# which matches the "only about five files appear" symptom.
start_follower = "NYTimeskrugman" 
depth = 2
# Screen names already visited, so no account is crawled twice.
searched = set()

# Unauthenticated python-twitter client.
api = twitter.Api()


def add_to_U(user):
    # NOTE(review): `U` is not defined anywhere in this snippet, so calling
    # this raises NameError unless a module-level list `U` exists elsewhere —
    # confirm, or delete this helper (nothing shown here calls it).
    U.append(user)

def site(follower):
    """Return the mobile-Twitter profile URL for *follower* as a string."""
    return "http://mobile.twitter.com/" + follower

def getPage(follower):
    """Open *follower*'s mobile-Twitter page and return the response object.

    The caller is responsible for reading and closing the response.
    """
    return urllib.urlopen(site(follower))

def getSoup(response):
    """Read *response* fully and return the HTML parsed as a BeautifulSoup tree.

    Args:
        response: a file-like HTTP response (e.g. from urllib.urlopen).

    The response is closed after reading — the original leaked one open
    connection per page fetched.
    """
    try:
        html = response.read()
    finally:
        response.close()  # urllib responses expose close(); release the socket
    return BeautifulSoup(html)

def gettweets(soup, output):
    """Write every tweet found in *soup* to *output*, blank-line separated.

    Tweets on mobile.twitter.com live in <div class="list-tweet"> elements.
    """
    tweet_divs = soup.findAll('div', {'class' : "list-tweet"})
    for tweet_div in tweet_divs:
        rendered = str(tweet_div.renderContents())
        output.write(rendered)
        output.write('\n\n')

def are_more_tweets(soup):
    """Return True if the page has a 'more' link to a further page of tweets.

    Args:
        soup: BeautifulSoup tree of a mobile-twitter page.
    """
    # Bug fix: the original passed {id: 'more_link'} as findAll's THIRD
    # positional argument, which is the `recursive` flag, not an attrs
    # filter — and used the builtin `id` as the key instead of the string
    # 'id'. The id filter was therefore silently never applied.
    links = soup.findAll('a', {'href': True, 'id': 'more_link'})
    for link in links:
        text = str(link.renderContents())
        if text.find('more') != -1:
            return True
    return False

def getnewlink(soup):
    """Return the absolute URL of the next page of tweets, or None if absent.

    Args:
        soup: BeautifulSoup tree of a mobile-twitter page.
    """
    # Same bug fix as are_more_tweets: quote 'id' and merge it into the attrs
    # dict — the original passed {id: 'more_link'} as findAll's `recursive`
    # positional argument, so the id filter never took effect.
    links = soup.findAll('a', {'href': True, 'id': 'more_link'})
    for link in links:
        if str(link.renderContents()) == 'more':
            return 'http://mobile.twitter.com' + link['href']
    return None  # made explicit: no further page found

def crawl(follower, in_depth): #main method of sorts
    """Recursively crawl accounts, writing each user's tweets to an HTML file.

    Args:
        follower: screen name of the account to crawl.
        in_depth: remaining recursion depth; crawling stops when it reaches 0.

    Side effects: adds each visited name to the module-level ``searched`` set
    and appends to ``C:\\Python28\\Followertest2\\<follower>.html``.
    """
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
        # Bug fix: the original never closed `output`. With many recursive
        # calls the unflushed, unclosed handles pile up, which is the most
        # likely cause of files not appearing on disk.
        with open(directory, 'a') as output:
            output.write(follower)
            output.write('\n\n')
            soup = getSoup(getPage(follower))
            gettweets(soup, output)
            # Follow the "more" links until the last page of tweets.
            while are_more_tweets(soup):
                next_url = getnewlink(soup)
                page = urllib.urlopen(next_url)
                try:
                    soup = BeautifulSoup(page.read())
                finally:
                    page.close()  # original leaked one connection per page
                gettweets(soup, output)
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched  # never revisit accounts already crawled
        # Limit fan-out to the first five unseen friends per level.
        for name in list(names)[0:5]:
            print(name)
            crawl(name, in_depth - 1)

# Run the crawler from the seed account; writes one HTML file per user.
crawl(start_follower, depth)
print("Program done. Look at output file.")

More specifically, I seem to get a separate html file for about the first five followers and then no new files appear to be created. Any help would be appreciated!

Upvotes: 2

Views: 712

Answers (1)

eric
eric

Reputation: 516

The depth value differs between the snippet and the full code: it is 3 in the snippet but only 2 in the full crawler, so the full crawler gets just one extra level of recursion. You also take only the first five names from each followers list (`for name in list(names)[0:5]:`). With depth 2, that means six files total: the starting follower plus their first five friends — which matches what you're seeing.

Upvotes: 1

Related Questions