Reputation: 37
I'm trying to change the scraped results in a column called "Outstanding". Currently, the numbers being scraped are coming out like 297.5M and I want them to be 297,500,000. I'm not sure quite how to do it but I know that if you put e5 instead of M, it would come out as 297500000. I tried this below but no luck. Any ideas how to get what I'm looking for. Thanks
data.replace({
'Outstanding': {
'M': 'e5'
}
})
import pandas as pd
import requests
import bs4
import time
import random
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
def get_screener(version):
url = 'https://finviz.com/screener.ashx?v={version}&r={page}&f=all&c=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70&f=ind_stocksonly&o=-marketcap'
page = 1
screen = requests.get(url.format(version=version, page=page), headers=headers)
soup = bs4.BeautifulSoup(screen.text, features='lxml')
pages = int(soup.find_all('a', {'class': 'screener-pages'})[-1].text)
data = []
for page in range(1, 20 * pages, 20):
print(version, page)
screen = requests.get(url.format(version=version, page=page), headers=headers).text
tables = pd.read_html(screen)
tables = tables[-2]
tables.columns = tables.iloc[0]
tables = tables[1:]
data.append(tables)
time.sleep(random.random())
return pd.concat(data).reset_index(drop=True).rename_axis(columns=None)
df = get_screener('152')
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter("pandas_column_formats.xlsx", engine='xlsxwriter')
# Convert the dataframe to an XlsxWriter Excel object.
df.to_excel(writer, sheet_name='Sheet1', index=False)
# Get the xlsxwriter workbook and worksheet objects.
workbook = writer.book
worksheet = writer.sheets['Sheet1']
header_format = workbook.add_format()
header_format.set_font_name('Calibri')
header_format.set_font_color('green')
header_format.set_font_size(8)
header_format.set_italic()
header_format.set_underline()
# Write the column headers with the defined format.
for col_num, value in enumerate(df.columns.values):
worksheet.write(0, col_num, value, header_format)
# Add some cell formats.
format1 = workbook.add_format({'num_format': '#,##0.00'})
format2 = workbook.add_format({'num_format': '0%'})
format3 = workbook.add_format({'bold': True, 'font_color': 'red'})
data.replace({
'Outstanding': {
'M': 'e5'
}
})
# Note: It isn't possible to format any cells that already have a format such
# as the index or headers or any cells that contain dates or datetimes.
# Set the column width and format.
worksheet.set_column('AA:AA', 18, format1)
# Set the format but not the column width.
worksheet.set_column('B:C', None, format3)
# Close the Pandas Excel writer and output the Excel file.
writer.save()
**Updated Code**
import pandas as pd
import requests
import bs4
import time
import random
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
def get_screener(version):
url = 'https://finviz.com/screener.ashx?v=131&f=sh_outstanding_o1'
page = 1
screen = requests.get(url.format(version=version, page=page), headers=headers)
soup = bs4.BeautifulSoup(screen.text, features='lxml')
pages = int(soup.find_all('a', {'class': 'screener-pages'})[-1].text)
data = []
for page in range(1, 20 * pages, 20):
print(version, page)
screen = requests.get(url.format(version=version, page=page), headers=headers).text
tables = pd.read_html(screen)
tables = tables[-2]
tables.columns = tables.iloc[0]
tables = tables[1:]
data.append(tables)
time.sleep(random.random())
return pd.concat(data).reset_index(drop=True).rename_axis(columns=None)
df = get_screener('131')
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter("pandas_column_formats.xlsx", engine='xlsxwriter')
# Convert the dataframe to an XlsxWriter Excel object.
df.to_excel(writer, sheet_name='Sheet1', index=False)
# Get the xlsxwriter workbook and worksheet objects.
workbook = writer.book
worksheet = writer.sheets['Sheet1']
header_format = workbook.add_format()
header_format.set_font_name('Calibri')
header_format.set_font_color('green')
header_format.set_font_size(8)
header_format.set_italic()
header_format.set_underline()
# Write the column headers with the defined format.
for col_num, value in enumerate(df.columns.values):
worksheet.write(0, col_num, value, header_format)
# Add some cell formats.
format1 = workbook.add_format({'num_format': '#,##0.00'})
format2 = workbook.add_format({'num_format': '0%'})
format3 = workbook.add_format({'bold': True, 'font_color': 'red'})
get_screener('131').Outstanding = get_screener('131').Outstanding.apply(lambda x: f"{(float(x.strip('MB'))*1000000):,.2f}")
# Note: It isn't possible to format any cells that already have a format such
# as the index or headers or any cells that contain dates or datetimes.
# Set the column width and format.
worksheet.set_column('AA:AA', 18, format1)
# Set the format but not the column width.
worksheet.set_column('B:C', None, format3)
# Close the Pandas Excel writer and output the Excel file.
writer.save()
No. Ticker Market Cap Outstanding ... Avg Volume Price Change Volume
0 1 A 48.27B 304.70M ... 1.53M 160.88 0.14% 756185
1 2 AA 11.07B 187.00M ... 8.65M 59.21 -0.70% 4378041
2 3 AAC 1.22B 125.00M ... 271.23K 9.74 0.00% 60427
3 4 AACG 32.13M 31.19M ... 45.85K 1.04 -1.89% 116352
4 5 AADI 499.64M 9.51M ... 181.05K 24.87 4.36% 66884
... .. ... ... ... ... ... ... ... ...
6035 16 AATC 33.53M 5.34M ... 19.10K 6.28 -2.40% 12875
6036 17 AAU 41.57M 137.22M ... 663.16K 0.30 3.26% 665062
6037 18 AAWW 2.69B 29.02M ... 522.66K 93.25 -0.54% 364687
6038 19 AB 4.72B 99.41M ... 275.59K 49.01 0.51% 176291
6039 20 ABB 76.14B 2.00B ... 1.43M 38.14 -1.32% 1367408
Edit
import pandas as pd
import requests
import bs4
import time
import random
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
def testDf(version):
url = 'https://finviz.com/screener.ashx?v={version}&r={page}&f=sh_outstanding_o1000&c=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70&f=ind_stocksonly&o=-marketcap'
page = 1
screen = requests.get(url.format(version=version, page=page), headers=headers)
soup = bs4.BeautifulSoup(screen.text, features='lxml')
pages = int(soup.find_all('a', {'class': 'screener-pages'})[-1].text)
data = []
for page in range(1, 20 * pages, 20):
print(version, page)
screen = requests.get(url.format(version=version, page=page), headers=headers).text
tables = pd.read_html(screen)
tables = tables[-2]
tables.columns = tables.iloc[0]
tables = tables[1:]
data.append(tables)
time.sleep(random.random())
return pd.concat(data).reset_index(drop=True).rename_axis(columns=None)
df = testDf('152')
def converter(x):
if 'M' in x:
return f"{(float(x.strip('M'))*1000000):,.2f}"
elif 'B' in x:
return f"{(float(x.strip('B'))*1000000000):,.2f}"
# Print first 20 rows of head to see current values
print(testDf('152').Outstanding.head(20))
# Call the apply method
testDf('152').Outstanding = testDf('152').Outstanding.apply(converter)
# Print first 20 rows of head to see modified values
print(testDf('152').Outstanding.head(20))
Upvotes: 0
Views: 1501
Reputation: 169
Example dataframe:
data = pd.DataFrame([['297.5M']], columns=['Outstanding'])
>>> data
Convert:
data.Outstanding = data.Outstanding.apply(lambda x: float(x.strip('M'))*1000000)
>>> data
*** Edit ***
Further to the comment below, if you want commas:
data.Outstanding = data.Outstanding.apply(lambda x: f"{(float(x.strip('M'))*1000000):,.2f}")
>>> data
However, your value will now be a string.
*** Edit ***
*** Edit ***
If you also want to remove possible B's (as in 260B), use this code:
data.Outstanding = data.Outstanding.apply(lambda x: f"{(float(x.strip('MB'))*1000000):,.2f}
***** Edit *****
# Create the conversion function
def converter(x):
if 'M' in x:
return f"{(float(x.strip('M'))*1000000):,.2f}"
elif 'B' in x:
return f"{(float(x.strip('B'))*1000000000):,.2f}"
# Print first 20 rows of head to see current values
print(testDf.Outstanding.head(20))
# Call the apply method
testDf.Outstanding = testDf.Outstanding.apply(converter)
# Print first 20 rows of head to see modified values
print(testDf.Outstanding.head(20))
***** EDIT *****
Replace the portion of your code that begins with the first statement in the code below:
# Copy from here on below
df = testDf('152')
def converter(x):
if 'M' in x:
return f"{(float(x.strip('M'))*1000000):,.2f}"
elif 'B' in x:
return f"{(float(x.strip('B'))*1000000000):,.2f}"
# Print first 20 rows of the Outstanding column of
# the dataframe to see current values
print(df.Outstanding.head(20))
# Call the apply method on the Outstanding column
df.Outstanding = df.Outstanding.apply(converter)
# Print first 20 rows of the Outstanding column of
# the dataframe to see values after being converted
print(df.Outstanding.head(20))
# The newly converted dataframe is in the df variable
print(df)
Little plus: there are four columns in your dataframe that are formatted with "999M|B". If you wanted to change all of them to the long format, replace df.Outstanding = df.Outstanding.apply(converter)
with:
# Conveert all M/B columns in one go:
listToConvert = ['Outstanding', 'Market Cap', 'Float', 'Avg Volume']
for column in listToConvert:
df[column] = df[column].apply(converter)
print(df)
Upvotes: 2