Reputation: 273
Below is a small snippet of code I have for my twitter crawler mechanism:
from BeautifulSoup import BeautifulSoup
import re
import urllib2
url = 'http://mobile.twitter.com/NYTimesKrugman'
def gettweets(soup):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
print tag.renderContents()
print ('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more'):
return True
else:
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
if test_stamp == '3 months ago':
print test_stamp
return True
else:
return False
response = urllib2.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'stamp' + str(stamp)
print 'tweets' +str (tweets)
while (stamp is False) and (tweets is True):
b = getnewlink(soup)
print b
red = urllib2.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'done'
The problem is, after my twitter crawler hits about 3 months of tweets, I would like it to stop going to the next page of a user. However, it does not appear to be doing that. It seems to continually going searching for the next page of tweets. I believe this is due to the fact that checkstamp keeps evaluating to False. Does anyone have any suggestions as to how I can modify the code so that the crawler keeps looking for the next page of tweets as long as there are more tweets (verified by are_more_tweets mechanism) and it hasn't hit 3 months of tweets yet??? Thanks!
EDIT - Please see below:
from BeautifulSoup import BeautifulSoup
import re
import urllib
url = 'http://mobile.twitter.com/cleversallie'
output = open(r'C:\Python28\testrecursion.txt', 'a')
def gettweets(soup):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
a = tag.renderContents()
b = str (a)
print(b)
print('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more'):
return True
else:
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
if not (test_stamp[0]) in '0123456789':
continue
if test_stamp == '3 months ago':
print test_stamp
return True
else:
return False
response = urllib.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
while (not stamp) and (tweets):
b = getnewlink(soup)
print b
red = urllib.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'done'
Upvotes: 0
Views: 901
Reputation: 176730
Your soup.findall()
is picking up an image tag in a link that matches your pattern (has an href
attribute and class
status-link
).
Instead of always return
ing on the very first link, try:
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
print test_stamp
if not test_stamp[0] in '0123456789':
continue
if test_stamp == '3 months ago':
return True
else:
return False
Which will skip the link if it doesn't start with a number, so you might actually get to the right link. Keep that print
statement in there so you can see if you hit some other kind of link that starts with a number that you also need to filter out.
Edit: What you were doing was always returning on the very first item in times
. I changed it so it ignored any links that didn't start with a number.
However, this would cause it to return None
if it didn't find any links with a number. This would work fine, except you changed while not stamp and tweets
to while stamp is False and tweets is True
. Change it back to while not stamp and tweets
and it will correctly treat None
and False
as the same, and it should work.
Upvotes: 1