vinny russo
vinny russo

Reputation: 37

Convert Million to integer in Pandas

I'm trying to change the scraped results in a column called "Outstanding". Currently, the numbers being scraped are coming out like 297.5M and I want them to be 297,500,000. I'm not sure quite how to do it but I know that if you put e5 instead of M, it would come out as 297500000. I tried this below but no luck. Any ideas how to get what I'm looking for. Thanks

 data.replace({
            'Outstanding': {
                'M': 'e5'
            }
        })

    import pandas as pd
    import requests
    import bs4
    import time
    import random
    
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    
    def get_screener(version):
        url = 'https://finviz.com/screener.ashx?v={version}&r={page}&f=all&c=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70&f=ind_stocksonly&o=-marketcap'
        
        page = 1
       
        screen = requests.get(url.format(version=version, page=page), headers=headers)
        soup = bs4.BeautifulSoup(screen.text, features='lxml')
        pages = int(soup.find_all('a', {'class': 'screener-pages'})[-1].text)
          
        data = []
        for page in range(1, 20 * pages, 20):
            print(version, page)
            screen = requests.get(url.format(version=version, page=page), headers=headers).text
            tables = pd.read_html(screen)
            tables = tables[-2]
            tables.columns = tables.iloc[0]
            tables = tables[1:]
            data.append(tables)
            time.sleep(random.random())
        return pd.concat(data).reset_index(drop=True).rename_axis(columns=None)
           
    df = get_screener('152')
    
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pd.ExcelWriter("pandas_column_formats.xlsx", engine='xlsxwriter')
    
    
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    
    # Get the xlsxwriter workbook and worksheet objects.
    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']
    
    header_format = workbook.add_format()
    header_format.set_font_name('Calibri')
    header_format.set_font_color('green')
    header_format.set_font_size(8)
    header_format.set_italic()
    header_format.set_underline()
    
    # Write the column headers with the defined format.
    for col_num, value in enumerate(df.columns.values):
        worksheet.write(0, col_num, value, header_format)
    
    # Add some cell formats.
    format1 = workbook.add_format({'num_format': '#,##0.00'})
    format2 = workbook.add_format({'num_format': '0%'})
    format3 = workbook.add_format({'bold': True, 'font_color': 'red'})
    
    data.replace({
        'Outstanding': {
            'M': 'e5'
        }
    })
    
    # Note: It isn't possible to format any cells that already have a format such
    # as the index or headers or any cells that contain dates or datetimes.
    
    # Set the column width and format.
    worksheet.set_column('AA:AA', 18, format1)
    
    # Set the format but not the column width.
    worksheet.set_column('B:C', None, format3)    
    
    # Close the Pandas Excel writer and output the Excel file.
    writer.save()

**Updated Code**
import pandas as pd
import requests
import bs4
import time
import random

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

def get_screener(version):
    url = 'https://finviz.com/screener.ashx?v=131&f=sh_outstanding_o1'
    
    page = 1
   
    screen = requests.get(url.format(version=version, page=page), headers=headers)
    soup = bs4.BeautifulSoup(screen.text, features='lxml')
    pages = int(soup.find_all('a', {'class': 'screener-pages'})[-1].text)
      
    data = []
    for page in range(1, 20 * pages, 20):
        print(version, page)
        screen = requests.get(url.format(version=version, page=page), headers=headers).text
        tables = pd.read_html(screen)
        tables = tables[-2]
        tables.columns = tables.iloc[0]
        tables = tables[1:]
        data.append(tables)
        time.sleep(random.random())
    return pd.concat(data).reset_index(drop=True).rename_axis(columns=None)
       
df = get_screener('131')

# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter("pandas_column_formats.xlsx", engine='xlsxwriter')


# Convert the dataframe to an XlsxWriter Excel object.
df.to_excel(writer, sheet_name='Sheet1', index=False)

# Get the xlsxwriter workbook and worksheet objects.
workbook  = writer.book
worksheet = writer.sheets['Sheet1']

header_format = workbook.add_format()
header_format.set_font_name('Calibri')
header_format.set_font_color('green')
header_format.set_font_size(8)
header_format.set_italic()
header_format.set_underline()

# Write the column headers with the defined format.
for col_num, value in enumerate(df.columns.values):
    worksheet.write(0, col_num, value, header_format)

# Add some cell formats.
format1 = workbook.add_format({'num_format': '#,##0.00'})
format2 = workbook.add_format({'num_format': '0%'})
format3 = workbook.add_format({'bold': True, 'font_color': 'red'})

get_screener('131').Outstanding = get_screener('131').Outstanding.apply(lambda x: f"{(float(x.strip('MB'))*1000000):,.2f}")

# Note: It isn't possible to format any cells that already have a format such
# as the index or headers or any cells that contain dates or datetimes.

# Set the column width and format.
worksheet.set_column('AA:AA', 18, format1)

# Set the format but not the column width.
worksheet.set_column('B:C', None, format3)

# Close the Pandas Excel writer and output the Excel file.
writer.save()



    No. Ticker Market Cap Outstanding  ... Avg Volume   Price  Change   Volume
0      1      A     48.27B     304.70M  ...      1.53M  160.88   0.14%   756185
1      2     AA     11.07B     187.00M  ...      8.65M   59.21  -0.70%  4378041
2      3    AAC      1.22B     125.00M  ...    271.23K    9.74   0.00%    60427
3      4   AACG     32.13M      31.19M  ...     45.85K    1.04  -1.89%   116352
4      5   AADI    499.64M       9.51M  ...    181.05K   24.87   4.36%    66884
...   ..    ...        ...         ...  ...        ...     ...     ...      ...
6035  16   AATC     33.53M       5.34M  ...     19.10K    6.28  -2.40%    12875
6036  17    AAU     41.57M     137.22M  ...    663.16K    0.30   3.26%   665062
6037  18   AAWW      2.69B      29.02M  ...    522.66K   93.25  -0.54%   364687
6038  19     AB      4.72B      99.41M  ...    275.59K   49.01   0.51%   176291
6039  20    ABB     76.14B       2.00B  ...      1.43M   38.14  -1.32%  1367408

Edit

import pandas as pd
import requests
import bs4
import time
import random

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

def testDf(version):
    url = 'https://finviz.com/screener.ashx?v={version}&r={page}&f=sh_outstanding_o1000&c=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70&f=ind_stocksonly&o=-marketcap'

    page = 1

    screen = requests.get(url.format(version=version, page=page), headers=headers)
    soup = bs4.BeautifulSoup(screen.text, features='lxml')
    pages = int(soup.find_all('a', {'class': 'screener-pages'})[-1].text)
    
    data = []
    for page in range(1, 20 * pages, 20):
        print(version, page)
        screen = requests.get(url.format(version=version, page=page), headers=headers).text
        tables = pd.read_html(screen)
        tables = tables[-2]
        tables.columns = tables.iloc[0]
        tables = tables[1:]
        data.append(tables)
        time.sleep(random.random())
    return pd.concat(data).reset_index(drop=True).rename_axis(columns=None)

df = testDf('152')
def converter(x):
    if 'M' in x:
        return f"{(float(x.strip('M'))*1000000):,.2f}"
    elif 'B' in x:
        return f"{(float(x.strip('B'))*1000000000):,.2f}"

# Print first 20 rows of head to see current values
print(testDf('152').Outstanding.head(20))
# Call the apply method
testDf('152').Outstanding = testDf('152').Outstanding.apply(converter)

# Print first 20 rows of head to see modified values
print(testDf('152').Outstanding.head(20))

Upvotes: 0

Views: 1501

Answers (1)

Joey Miths
Joey Miths

Reputation: 169

Example dataframe:

data = pd.DataFrame([['297.5M']], columns=['Outstanding'])
>>> data

enter image description here

Convert:

data.Outstanding = data.Outstanding.apply(lambda x: float(x.strip('M'))*1000000)
>>> data

enter image description here

*** Edit ***

Further to the comment below, if you want commas:

data.Outstanding = data.Outstanding.apply(lambda x: f"{(float(x.strip('M'))*1000000):,.2f}")
>>> data

enter image description here

However, your value will now be a string.

*** Edit ***

enter image description here

*** Edit ***

If you also want to remove possible B's (as in 260B), use this code:

data.Outstanding = data.Outstanding.apply(lambda x: f"{(float(x.strip('MB'))*1000000):,.2f}

***** Edit *****

# Create the conversion function
def converter(x):
    if 'M' in x:
        return f"{(float(x.strip('M'))*1000000):,.2f}"
    elif 'B' in x:
        return f"{(float(x.strip('B'))*1000000000):,.2f}"

# Print first 20 rows of head to see current values
print(testDf.Outstanding.head(20))

enter image description here

# Call the apply method
testDf.Outstanding = testDf.Outstanding.apply(converter)

# Print first 20 rows of head to see modified values
print(testDf.Outstanding.head(20))

enter image description here

***** EDIT *****

Replace the portion of your code that begins with the first statement in the code below:

# Copy from here on below

df = testDf('152')

def converter(x):
    if 'M' in x:
        return f"{(float(x.strip('M'))*1000000):,.2f}"
    elif 'B' in x:
        return f"{(float(x.strip('B'))*1000000000):,.2f}"

# Print first 20 rows of the Outstanding column of 
# the dataframe to see current values
print(df.Outstanding.head(20))

# Call the apply method on the Outstanding column
df.Outstanding = df.Outstanding.apply(converter)

# Print first 20 rows of the Outstanding column of 
# the dataframe to see values after being converted
print(df.Outstanding.head(20))

# The newly converted dataframe is in the df variable
print(df)

Little plus: there are four columns in your dataframe that are formatted with "999M|B". If you wanted to change all of them to the long format, replace df.Outstanding = df.Outstanding.apply(converter) with:

# Conveert all M/B columns in one go:
listToConvert = ['Outstanding', 'Market Cap', 'Float', 'Avg Volume']
for column in listToConvert:
    df[column] = df[column].apply(converter)

print(df)

enter image description here

Upvotes: 2

Related Questions