Reputation: 4419
I am calling a web crawling function from a handler in GAE; it retrieves a few images and then displays them. It works just fine on the first call, but the next time it displays all the same images and the crawler picks up from where the last run left off. I think the problem is that my global variables are not being reset correctly.
Every time I redeploy the app it works correctly the first time, but then the problem begins.
Here is my code; please let me know if you need me to clarify anything, but I think it should make sense.
Here is the scraper function
from collections import deque   # the visit queue below needs this import

#soupify_url, categorize_links, find_pages, add_pics and scrape_bing are helpers defined elsewhere in the project

visited_pages = []
visit_queue = deque([])
collected_pages = []
collected_pics = []
count = 0
pic_count = 0

def scrape_pages(url, root_url, keywords=[], recurse=True):
    #variables
    max_count = 16
    pic_num = 100
    global count
    global pic_count
    global collected_pics
    global collected_pages
    print 'the keywords and url are'
    print keywords
    print url

    #this is all of the links that have been scraped
    the_links = []
    soup = soupify_url(url)

    #only add new pages onto the queue if the recursion argument is true
    if recurse:
        #find all the links on the page
        try:
            for tag in soup.findAll('a'):
                the_links.append(tag.get('href'))
        except AttributeError:
            return
        try:
            external_links, internal_links, root_links, primary_links = categorize_links(the_links, url, root_url)
        except TypeError:
            return

        #change it so this depends on the input
        links_to_visit = external_links + internal_links + root_links

        #build the queue
        for link in links_to_visit:
            if link not in visited_pages and link not in visit_queue:
                visit_queue.append(link)

    visited_pages.append(url)
    count = count + 1
    # print 'number of pages visited'
    # print count

    #add pages to collected_pages depending on the criteria given, if any keywords are given
    if keywords:
        page_to_add = find_pages(url, soup, keywords)
        # print 'page to add'
        # print page_to_add
        if page_to_add and page_to_add not in collected_pages:
            collected_pages.append(page_to_add)

    pics_to_add = add_pics(url, soup)
    # print 'pics to add'
    # print pics_to_add
    if pics_to_add:
        collected_pics.extend(pics_to_add)

    #here is where the actual recursion happens, by finishing the queue
    while visit_queue:
        if count >= max_count:
            return
        if pic_count > pic_num:
            return
        link = visit_queue.popleft()
        # print link
        scrape_pages(link, root_url, keywords)
    # print '***done***'
###done with the recursive scraping function here
#here I just get a list of links from Bing, add them to the queue and go through them then reset all the global variables
def scrape_bing_src(keywords):
    global collected_pics
    global pic_count
    global count
    global visited_pages
    global visit_queue

    visit_queue, the_url = scrape_bing.get_links(keywords, a_list=False)
    scrape_pages(visit_queue.popleft(), the_url, keywords, recurse=True)

    #reset all the global variables for the next call
    pic_count = 0
    count = 0
    visited_pages = []
    visit_queue = deque([])

    pics_to_return = collected_pics
    collected_pics = []
    return pics_to_return
Here is the handler that calls the scraper function
#this just simply displays the images
class Try(BlogHandler):
    def get(self, keyword):
        keyword = str(keyword)
        keyword_list = keyword.split()
        img_list = scraper.scrape_bing_src(keyword_list)
        for img in img_list:
            self.response.write("""<br><img src='""" + img + """'>""")
        self.response.write('we are done here')
Upvotes: 0
Views: 547
Reputation: 1538
Your code isn't run inside only one "server" and one instance; you have probably already noticed the Instances tab in the admin console. So there is a chance that even between calls you will be switched to a different server, or that the process will be "restarted" (you can read more here). During warmup your application is loaded from disk into memory and then starts handling requests. So each time you may be getting a fresh pre-cached Python instance with its own values for the global variables.
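As a rough sketch of what that means in practice (the counter and handler name here are made up, not part of your code), any module-level state behaves like this:

#hypothetical handler: each instance keeps its own copy of this module-level
#counter, so the number you see depends on which instance served the request
request_count = 0

class CounterHandler(BlogHandler):
    def get(self):
        global request_count
        request_count += 1
        #instance A may answer 1, 2, 3... while a newly started instance B
        #answers 1 again for the same URL
        self.response.write('requests seen by this instance: %d' % request_count)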
In your case it is better to use memcache.
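For example, a minimal sketch of that idea (the get_cached_pics helper, the key scheme and the 10-minute timeout are illustrative assumptions, not part of your code):

from google.appengine.api import memcache

def get_cached_pics(keyword_list):
    #one cache entry per keyword combination, shared by every instance
    cache_key = 'pics:' + ' '.join(keyword_list)
    pics = memcache.get(cache_key)
    if pics is None:
        #fall back to the scraper and cache the result for 10 minutes;
        #memcache may still evict the entry earlier than that
        pics = scraper.scrape_bing_src(keyword_list)
        memcache.set(cache_key, pics, time=600)
    return pics

Your Try handler would then call get_cached_pics(keyword_list) instead of scraper.scrape_bing_src(keyword_list) directly, so each request starts from whatever is in memcache rather than from whatever a particular instance's globals happen to hold.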
Upvotes: 1