Reputation: 1
I am fairly new to programming, and this is my first project after reading various guides. I am trying to scrape data from the Yahoo Finance Key Statistics page and the Financial Statements pages (e.g. http://finance.yahoo.com/q/ks?s=GOOG+Key+Statistics). The links to the financial statements are at the bottom of the Key Statistics page. The code for the key statistics function seems to work.
But for the statement function, the entry variable used in pattern3 does not obtain negative values. The problem is especially apparent for the cash flow statement. For negative values, entry should look like
entry = '<td align="right">(.+?)</td>'
Am I approaching this correctly? Is there a simple way to obtain all the values of the financial statements and put them into a list?
My Code in Python 2.7:
import urllib
import re
# Regex sources used by the scraping functions below. Each one captures the
# text between a specific opening tag and its closing tag.

# Value cells on the Key Statistics page.
keystat = r'<td class="yfnc_tabledata1">(.+?)</td>'

# Column-header dates; per the original author this only works for the
# income statement.
# NOTE(review): the space in "font- weight" looks like a paste artifact --
# confirm against the actual page markup.
date = r'<th scope="col" style="border-top:2px solid #000;text-align:right; font- weight:bold">(.+?)</th>'

# Totals rows on the statements: text inside <strong>, followed by one space.
total = r'<strong>(.+?) </strong>'

# Non-total statement entries. The cell text must be followed by exactly one
# space before the closing tag for this pattern to match.
entry = r'<td align="right">(.+?) </td>'
def keystatfunc(symbol):
    """Scrape the Yahoo Finance Key Statistics page for ``symbol``.

    Downloads ``http://finance.yahoo.com/q/ks?s=<symbol>+Key+Statistics`` and
    returns one flat list made of:

    * every capture of the ``<span id="yfs_j10_<symbol>">`` element (the
      value the original code labels as market cap), followed by
    * captures 1 through 30 of the module-level ``keystat`` pattern.

    The list is shorter (possibly empty) when the page yields fewer matches.
    """
    url = 'http://finance.yahoo.com/q/ks?s=' + symbol + '+Key+Statistics'
    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Always release the connection, even if the read fails.
        htmlfile.close()

    # Escape the symbol so regex metacharacters in a ticker (e.g. the dot in
    # "BRK.B") are matched literally rather than interpreted as a pattern.
    marketcap_pattern = re.compile(
        '<span id="yfs_j10_' + re.escape(symbol) + '">(.+?)</span>')
    keystat_pattern = re.compile(keystat)

    marketcap = marketcap_pattern.findall(htmltext)
    keystats = keystat_pattern.findall(htmltext)
    # Index 0 of the keystat matches is skipped, as in the original code.
    return marketcap + keystats[1:31]
def statement(symbol, period, statementtype):
    """Scrape a Yahoo Finance financial statement page for ``symbol``.

    Parameters:
        symbol: ticker symbol, appended to the URL as given.
        period: "quarterly" or "annual".
        statementtype: "is" (income statement), "bs" (balance sheet) or
            "cf" (cash flow statement).

    Returns one flat list: captures of the module-level ``date`` pattern,
    then of ``total``, then of ``entry``.

    Raises:
        ValueError: if ``period`` or ``statementtype`` is not one of the
            supported values (previously this surfaced as an unbound
            ``url`` variable).
    """
    if period not in ("quarterly", "annual"):
        raise ValueError(
            'period must be "quarterly" or "annual", got %r' % (period,))
    if statementtype not in ("is", "bs", "cf"):
        raise ValueError(
            'statementtype must be "is", "bs" or "cf", got %r'
            % (statementtype,))

    # The bare URL is used for quarterly data and '&annual' is appended for
    # annual data, matching the original balance-sheet branches. The original
    # "is" and "cf" quarterly branches appended '&annual' by mistake.
    url = 'http://finance.yahoo.com/q/' + statementtype + '?s=' + symbol
    if period == "annual":
        url += '&annual'

    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Always release the connection, even if the read fails.
        htmlfile.close()

    dates = re.findall(date, htmltext)
    totals = re.findall(total, htmltext)
    entries = re.findall(entry, htmltext)
    return dates + totals + entries
# Example run: fetch and print the key statistics, then the annual cash flow
# statement, for the "goog" symbol.
print(keystatfunc("goog"))
print(statement("goog", "annual", "cf"))
Upvotes: 0
Views: 3657
Reputation: 814
I don't believe the method you are using to extract the info is the most reliable way, but I changed your code a little bit to capture the info you need. I updated the regular expression to check for parentheses and added a section at the end to strip the leftover spaces from each captured entry.
import urllib
import re
# Regex sources used by the scraping functions below. They are raw strings so
# that backslash escapes such as \( reach the regex engine unchanged instead
# of being treated as (invalid) string escape sequences.

# Value cells on the Key Statistics page.
keystat = r'<td class="yfnc_tabledata1">(.+?)</td>'

# Column-header dates; per the original author this only works for the
# income statement.
# NOTE(review): the space in "font- weight" looks like a paste artifact --
# confirm against the actual page markup.
date = r'<th scope="col" style="border-top:2px solid #000;text-align:right; font- weight:bold">(.+?)</th>'

# Totals rows on the statements: text inside <strong>, followed by one space.
total = r'<strong>(.+?) </strong>'

# Non-total statement entries. Optional literal parentheses are allowed
# around the value so entries written as "(1,234)" are captured whole.
entry = r'<td align="right">(\(?.+?\)?)</td>'
def keystatfunc(symbol):
    """Scrape the Yahoo Finance Key Statistics page for ``symbol``.

    Downloads ``http://finance.yahoo.com/q/ks?s=<symbol>+Key+Statistics`` and
    returns one flat list made of:

    * every capture of the ``<span id="yfs_j10_<symbol>">`` element (the
      value the original code labels as market cap), followed by
    * captures 1 through 30 of the module-level ``keystat`` pattern.

    The list is shorter (possibly empty) when the page yields fewer matches.
    """
    url = 'http://finance.yahoo.com/q/ks?s=' + symbol + '+Key+Statistics'
    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Always release the connection, even if the read fails.
        htmlfile.close()

    # Escape the symbol so regex metacharacters in a ticker (e.g. the dot in
    # "BRK.B") are matched literally rather than interpreted as a pattern.
    marketcap_pattern = re.compile(
        '<span id="yfs_j10_' + re.escape(symbol) + '">(.+?)</span>')
    keystat_pattern = re.compile(keystat)

    marketcap = marketcap_pattern.findall(htmltext)
    keystats = keystat_pattern.findall(htmltext)
    # Index 0 of the keystat matches is skipped, as in the original code.
    return marketcap + keystats[1:31]
def statement(symbol, period, statementtype):
    """Scrape a Yahoo Finance financial statement page for ``symbol``.

    Parameters:
        symbol: ticker symbol, appended to the URL as given.
        period: "quarterly" or "annual".
        statementtype: "is" (income statement), "bs" (balance sheet) or
            "cf" (cash flow statement).

    Returns one flat list: captures of the module-level ``date`` pattern,
    then of ``total``, then of ``entry`` with every space character removed
    from each entry.

    Raises:
        ValueError: if ``period`` or ``statementtype`` is not one of the
            supported values (previously this surfaced as an unbound
            ``url`` variable).
    """
    if period not in ("quarterly", "annual"):
        raise ValueError(
            'period must be "quarterly" or "annual", got %r' % (period,))
    if statementtype not in ("is", "bs", "cf"):
        raise ValueError(
            'statementtype must be "is", "bs" or "cf", got %r'
            % (statementtype,))

    # The bare URL is used for quarterly data and '&annual' is appended for
    # annual data, matching the original balance-sheet branches. The original
    # "is" and "cf" quarterly branches appended '&annual' by mistake.
    url = 'http://finance.yahoo.com/q/' + statementtype + '?s=' + symbol
    if period == "annual":
        url += '&annual'

    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Always release the connection, even if the read fails.
        htmlfile.close()

    dates = re.findall(date, htmltext)
    totals = re.findall(total, htmltext)
    entries = re.findall(entry, htmltext)
    # The entry pattern captures the whole cell text, so strip the spaces
    # that come along with each value.
    entriesFixed = [e.replace(' ', '') for e in entries]
    return dates + totals + entriesFixed
# Example run: fetch and print the key statistics for the "goog" symbol.
print(keystatfunc("goog"))
Upvotes: 2