Reputation: 31
Here's the Problem:
I'm writing a Python program whose purpose is to continuously collect news from RSS feeds. I want the program to collect the data for 1 week. The problem is that the program never makes it to the end of the week. Sometimes it freezes after running for several days, sometimes after several hours, and sometimes after only a few minutes. It always freezes, with no errors. By freezing I mean that the interpreter still appears to be running, but I can't give it any additional commands. How can I solve this problem?
I'll post the code below. Thanks guys!!
from goose import Goose
from requests import get
import urllib2
import feedparser
from urllib2 import urlopen
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import datetime as dt
import time
import os
Symbols=['AAPL','T','BA','XOM','GOOG','JPM','PG','WMT']
url='http://finance.yahoo.com/rss/headline?s='
for t in xrange(7):
    AAPL=pd.DataFrame()
    AAPL['Published']=""
    AAPL['Title']=""
    AAPL['link']=""
    AAPL['ID']=""
    AAPL['News']=""
    T=pd.DataFrame()
    T['Published']=""
    T['Title']=""
    T['link']=""
    T['ID']=""
    T['News']=""
    BA=pd.DataFrame()
    BA['Published']=""
    BA['Title']=""
    BA['link']=""
    BA['ID']=""
    BA['News']=""
    XOM=pd.DataFrame()
    XOM['Published']=""
    XOM['Title']=""
    XOM['link']=""
    XOM['ID']=""
    XOM['News']=""
    GOOG=pd.DataFrame()
    GOOG['Published']=""
    GOOG['Title']=""
    GOOG['link']=""
    GOOG['ID']=""
    GOOG['News']=""
    JPM=pd.DataFrame()
    JPM['Published']=""
    JPM['Title']=""
    JPM['link']=""
    JPM['ID']=""
    JPM['News']=""
    PG=pd.DataFrame()
    PG['Published']=""
    PG['Title']=""
    PG['link']=""
    PG['ID']=""
    PG['News']=""
    WMT=pd.DataFrame()
    WMT['Published']=""
    WMT['Title']=""
    WMT['link']=""
    WMT['ID']=""
    WMT['News']=""
    DaysIDsAAPL=[]
    DaysIDsT=[]
    DaysIDsBA=[]
    DaysIDsXOM=[]
    DaysIDsGOOG=[]
    DaysIDsJPM=[]
    DaysIDsPG=[]
    DaysIDsWMT=[]
    count=0
    AAPLCount=0
    TCount=0
    BACount=0
    XOMCount=0
    GOOGCount=0
    JPMCount=0
    PGCount=0
    WMTCount=0
    date=dt.date.today()
    newpathAAPL = r'D:\News Data\AAPL\\'+str(t)
    newpathT = r'D:\News Data\T\\'+str(t)
    newpathBA = r'D:\News Data\BA\\'+str(t)
    newpathXOM = r'D:\News Data\XOM\\'+str(t)
    newpathGOOG = r'D:\News Data\GOOG\\'+str(t)
    newpathJPM = r'D:\News Data\JPM\\'+str(t)
    newpathPG = r'D:\News Data\PG\\'+str(t)
    newpathWMT = r'D:\News Data\WMT\\'+str(t)
    os.makedirs(newpathAAPL)
    os.makedirs(newpathT)
    os.makedirs(newpathBA)
    os.makedirs(newpathXOM)
    os.makedirs(newpathGOOG)
    os.makedirs(newpathJPM)
    os.makedirs(newpathPG)
    os.makedirs(newpathWMT)
    while dt.date.today()==date:
        print "Loop"
        try:
            #AAPL inner most loop
            d1=feedparser.parse(url+Symbols[0])
            for x in xrange(len(d1['entries'])):
                if int(d1.entries[x]['id'][14:]) not in DaysIDsAAPL:
                    DaysIDsAAPL.append(int(d1.entries[x]['id'][14:]))
                    y = len(AAPL.index.tolist())
                    m=re.search(r'\*(.*)',d1.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    AAPL.loc[y,'Title'] =d1.entries[x]['title'].encode('utf8')
                    AAPL.loc[y,'link'] =m.encode('utf8')
                    AAPL.loc[y,'Published'] =d1.entries[x]['published'].encode('utf8')
                    AAPL.loc[y,'ID'] =int(d1.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            AAPL.loc[y,'News'] = AAPLCount
                            AAPLCount+=1
                            AAPL=AAPL.fillna("")
                            AAPL.to_csv(newpathAAPL+r'\Key.csv')
                        except:
                            print m
                            print "AAPL"
                    else:
                        Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        AAPL.loc[y,'News'] =AAPLCount
                        AAPLCount+=1
                        AAPL=AAPL.fillna("")
                        AAPL.to_csv(newpathAAPL+r'\Key.csv')
                        print "AAPL"
            #T inner most loop
            d2=feedparser.parse(url+Symbols[1])
            for x in xrange(len(d2['entries'])):
                if int(d2.entries[x]['id'][14:]) not in DaysIDsT:
                    DaysIDsT.append(int(d2.entries[x]['id'][14:]))
                    y = len(T.index.tolist())
                    m=re.search(r'\*(.*)',d2.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    T.loc[y,'Title'] =d2.entries[x]['title'].encode('utf8')
                    T.loc[y,'link'] =m.encode('utf8')
                    T.loc[y,'Published'] =d2.entries[x]['published'].encode('utf8')
                    T.loc[y,'ID'] =int(d2.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            T.loc[y,'News'] = TCount
                            TCount+=1
                            T=T.fillna("")
                            T.to_csv(newpathT+r'\Key.csv')
                        except:
                            print m
                            print "T"
                    else:
                        Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        T.loc[y,'News'] =TCount
                        TCount+=1
                        T=T.fillna("")
                        T.to_csv(newpathT+r'\Key.csv')
                        print "T"
            #BA inner most loop
            d3=feedparser.parse(url+Symbols[2])
            for x in xrange(len(d3['entries'])):
                if int(d3.entries[x]['id'][14:]) not in DaysIDsBA:
                    DaysIDsBA.append(int(d3.entries[x]['id'][14:]))
                    y = len(BA.index.tolist())
                    m=re.search(r'\*(.*)',d3.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    BA.loc[y,'Title'] =d3.entries[x]['title'].encode('utf8')
                    BA.loc[y,'link'] =m.encode('utf8')
                    BA.loc[y,'Published'] =d3.entries[x]['published'].encode('utf8')
                    BA.loc[y,'ID'] =int(d3.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            BA.loc[y,'News'] = BACount
                            BACount+=1
                            BA=BA.fillna("")
                            BA.to_csv(newpathBA+r'\Key.csv')
                        except:
                            print m
                            print "BA"
                    else:
                        Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        BA.loc[y,'News'] =BACount
                        BACount+=1
                        BA=BA.fillna("")
                        BA.to_csv(newpathBA+r'\Key.csv')
                        print "BA"
            #XOM inner most loop
            d4=feedparser.parse(url+Symbols[3])
            for x in xrange(len(d4['entries'])):
                if int(d4.entries[x]['id'][14:]) not in DaysIDsXOM:
                    DaysIDsXOM.append(int(d4.entries[x]['id'][14:]))
                    y = len(XOM.index.tolist())
                    m=re.search(r'\*(.*)',d4.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    XOM.loc[y,'Title'] =d4.entries[x]['title'].encode('utf8')
                    XOM.loc[y,'link'] =m.encode('utf8')
                    XOM.loc[y,'Published'] =d4.entries[x]['published'].encode('utf8')
                    XOM.loc[y,'ID'] =int(d4.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            XOM.loc[y,'News'] = XOMCount
                            XOMCount+=1
                            XOM=XOM.fillna("")
                            XOM.to_csv(newpathXOM+r'\Key.csv')
                        except:
                            print m
                            print "XOM"
                    else:
                        Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        XOM.loc[y,'News'] =XOMCount
                        XOMCount+=1
                        XOM=XOM.fillna("")
                        XOM.to_csv(newpathXOM+r'\Key.csv')
            #GOOG inner most loop
            d5=feedparser.parse(url+Symbols[4])
            for x in xrange(len(d5['entries'])):
                if int(d5.entries[x]['id'][14:]) not in DaysIDsGOOG:
                    DaysIDsGOOG.append(int(d5.entries[x]['id'][14:]))
                    y = len(GOOG.index.tolist())
                    m=re.search(r'\*(.*)',d5.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    GOOG.loc[y,'Title'] =d5.entries[x]['title'].encode('utf8')
                    GOOG.loc[y,'link'] =m.encode('utf8')
                    GOOG.loc[y,'Published'] =d5.entries[x]['published'].encode('utf8')
                    GOOG.loc[y,'ID'] =int(d5.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            GOOG.loc[y,'News'] = GOOGCount
                            GOOGCount+=1
                            GOOG=GOOG.fillna("")
                            GOOG.to_csv(newpathGOOG+r'\Key.csv')
                        except:
                            print m
                            print "GOOG"
                    else:
                        Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        GOOG.loc[y,'News'] =GOOGCount
                        GOOGCount+=1
                        GOOG=GOOG.fillna("")
                        GOOG.to_csv(newpathGOOG+r'\Key.csv')
                        print "GOOG"
            #JPM inner most loop
            d6=feedparser.parse(url+Symbols[5])
            for x in xrange(len(d6['entries'])):
                if int(d6.entries[x]['id'][14:]) not in DaysIDsJPM:
                    DaysIDsJPM.append(int(d6.entries[x]['id'][14:]))
                    y = len(JPM.index.tolist())
                    m=re.search(r'\*(.*)',d6.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    JPM.loc[y,'Title'] =d6.entries[x]['title'].encode('utf8')
                    JPM.loc[y,'link'] =m.encode('utf8')
                    JPM.loc[y,'Published'] =d6.entries[x]['published'].encode('utf8')
                    JPM.loc[y,'ID'] =int(d6.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            JPM.loc[y,'News'] = JPMCount
                            JPMCount+=1
                            JPM=JPM.fillna("")
                            JPM.to_csv(newpathJPM+r'\Key.csv')
                        except:
                            print m
                            print "JPM"
                    else:
                        Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        JPM.loc[y,'News'] =JPMCount
                        JPMCount+=1
                        JPM=JPM.fillna("")
                        JPM.to_csv(newpathJPM+r'\Key.csv')
                        print "JPM"
            #PG inner most loop
            d7=feedparser.parse(url+Symbols[6])
            for x in xrange(len(d7['entries'])):
                if int(d7.entries[x]['id'][14:]) not in DaysIDsPG:
                    DaysIDsPG.append(int(d7.entries[x]['id'][14:]))
                    y = len(PG.index.tolist())
                    m=re.search(r'\*(.*)',d7.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    PG.loc[y,'Title'] =d7.entries[x]['title'].encode('utf8')
                    PG.loc[y,'link'] =m.encode('utf8')
                    PG.loc[y,'Published'] =d7.entries[x]['published'].encode('utf8')
                    PG.loc[y,'ID'] =int(d7.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == "":
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            PG.loc[y,'News'] = PGCount
                            PGCount+=1
                            PG=PG.fillna("")
                            PG.to_csv(newpathPG+r'\Key.csv')
                        except:
                            print m
                            print "PG"
                    else:
                        Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        PG.loc[y,'News'] =PGCount
                        PGCount+=1
                        PG=PG.fillna("")
                        PG.to_csv(newpathPG+r'\Key.csv')
                        print "PG"
            #WMT inner most loop
            d8=feedparser.parse(url+Symbols[7])
            for x in xrange(len(d8['entries'])):
                if int(d8.entries[x]['id'][14:]) not in DaysIDsWMT:
                    DaysIDsWMT.append(int(d8.entries[x]['id'][14:]))
                    y = len(WMT.index.tolist())
                    m=re.search(r'\*(.*)',d8.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    WMT.loc[y,'Title'] =d8.entries[x]['title'].encode('utf8')
                    WMT.loc[y,'link'] =m.encode('utf8')
                    WMT.loc[y,'Published'] =d8.entries[x]['published'].encode('utf8')
                    WMT.loc[y,'ID'] =int(d8.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == "":
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            WMT.loc[y,'News'] = WMTCount
                            WMTCount+=1
                            WMT=WMT.fillna("")
                            WMT.to_csv(newpathWMT+r'\Key.csv')
                        except:
                            print m
                            print "WMT"
                    else:
                        Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        WMT.loc[y,'News'] =WMTCount
                        WMTCount+=1
                        WMT=WMT.fillna("")
                        WMT.to_csv(newpathWMT+r'\Key.csv')
                        print "WMT"
            count+=1
            print count
            time.sleep(1)
        except:
            print "Error"
    AAPL=AAPL.fillna("")
    AAPL.to_csv(newpathAAPL+r'\Key.csv')
    T=T.fillna("")
    T.to_csv(newpathT+r'\Key.csv')
    BA=BA.fillna("")
    BA.to_csv(newpathBA+r'\Key.csv')
    XOM=XOM.fillna("")
    XOM.to_csv(newpathXOM+r'\Key.csv')
    GOOG=GOOG.fillna("")
    GOOG.to_csv(newpathGOOG+r'\Key.csv')
    JPM=JPM.fillna("")
    JPM.to_csv(newpathJPM+r'\Key.csv')
    PG=PG.fillna("")
    PG.to_csv(newpathPG+r'\Key.csv')
    WMT=WMT.fillna("")
    WMT.to_csv(newpathWMT+r'\Key.csv')
Upvotes: 2
Views: 1356
Reputation: 9422
The program consumes too much RAM when it collects too large a number of feeds, or when other active processes on your system compete for memory (this is why the time to freeze differs); see Why does a simple python script crash my system.
The process your program runs in stores its arrays and variables in process memory, which is RAM.
You can fix this by forcing the program to keep its data on the hard disk instead.
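For example, instead of growing a DataFrame like AAPL in memory and rewriting Key.csv after every entry, you could append each collected entry straight to the CSV on disk and then discard it. A minimal sketch, not your code: the path and column names are taken from the question, append_entry is a made-up helper, and the open mode shown is for Python 2.

import csv
import os

def append_entry(csv_path, row, fieldnames):
    # Write one collected feed entry straight to disk so nothing
    # accumulates in RAM between entries.
    new_file = not os.path.exists(csv_path)
    f = open(csv_path, "ab")  # Python 2; on Python 3 use open(csv_path, "a", newline="")
    try:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if new_file:
            writer.writeheader()
        writer.writerow(row)
    finally:
        f.close()

# usage, in place of the AAPL.loc[y, ...] assignments plus AAPL.to_csv(...):
# append_entry(newpathAAPL + r'\Key.csv',
#              {'Published': published, 'Title': title, 'link': m, 'ID': entry_id, 'News': AAPLCount},
#              ['Published', 'Title', 'link', 'ID', 'News'])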
For workarounds (shelve, periodically saving the collected feeds to a text file so the data is moved from RAM to disk and the RAM is freed, ...) see the following links:
memory usage, how to free memory
Python large variable RAM usage
I need to free up RAM by storing a Python dictionary on the hard drive, not in RAM. Is it possible?
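To illustrate the shelve workaround mentioned above (a minimal sketch, not code from the question; the path and the entry values are made up), the already-seen IDs and entries can live in a disk-backed dictionary instead of in-memory lists and DataFrames:

import shelve

# shelve gives a dictionary-like object whose contents are stored on disk,
# so seen IDs and collected entries do not pile up in RAM the way the
# DaysIDs lists and per-symbol DataFrames do.
db = shelve.open(r'D:\News Data\AAPL\seen')  # hypothetical path
try:
    entry_id = '12345'  # hypothetical feed entry id
    if entry_id not in db:  # membership test reads the disk file, not a RAM list
        db[entry_id] = {'Title': 'Example headline', 'Published': 'Mon, 01 Jan 2016'}
        db.sync()  # flush pending writes to disk
finally:
    db.close()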
Upvotes: 1