Problem parsing list of companies with BeautifulSoup

Question

I could parse the list of S&P500 companies with following codes:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import xlwings as xw

def get_sp500_info():
    resp = requests.get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
    soup = BeautifulSoup(resp.text, 'lxml')
    stocks_info = []
    tickers = []
    securities = []
    gics_industries = []
    gics_sub_industries = []
    table = soup.find('table', {'class': 'wikitable sortable'})
    
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        security = row.findAll('td')[1].text
        gics_industry = row.findAll('td')[3].text
        gics_sub_industry = row.findAll('td')[4].text
    
        tickers.append(ticker.lower().replace(r"
", " "))
        securities.append(security)
        gics_industries.append(gics_industry.lower())
        gics_sub_industries.append(gics_sub_industry.lower())
    
    stocks_info.append(tickers)
    stocks_info.append(securities)
    stocks_info.append(gics_industries)
    stocks_info.append(gics_sub_industries)
    
    stocks_info_df = pd.DataFrame(stocks_info).T
    stocks_info_df.columns=['tickers','security','gics_industry','gics_sub_industry']
    stocks_info_df['seclabels'] = 'SP500'
    return stocks_info_df

def open_in_excel(dataframe):
    xw.view(dataframe)

if __name__ == "__main__":
    open_in_excel(get_sp500_info())

Now I would like to parse list of Russel3000 companies with basically the same codes as above. And it doesn't work.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import xlwings as xw

def get_russel3000_info():
    resp = requests.get("https://www.ishares.com/us/products/239714/ishares-russell-3000-etf#holdings")
    soup = BeautifulSoup(resp.text, "lxml")
    stocks_info = []
    tickers = []
    securities = []
    gics_industries = []
    
    table = soup.find('table', {'class': 'display product-table border-row dataTable no-footer'})

    for row in table.findAll('tr')[1:]:           #Line A
        ticker = row.findAll('td')[0].text
        security = row.findAll('td')[1].text
        gics_industry = row.findAll('td')[2].text

        tickers.append(ticker.lower().replace(r"
", " "))
        securities.append(security)
        gics_industries.append(gics_industry.lower())
        
    stocks_info.append(tickers)
    stocks_info.append(securities)
    stocks_info.append(gics_industries)
    
    stocks_info_df = pd.DataFrame(stocks_info).T
    stocks_info_df.columns=['tickers','security','gics_industry']
    stocks_info_df['seclabels'] = 'Russel3000'
    return stocks_info_df

def open_in_excel(dataframe):
    xw.view(dataframe)

if __name__ == "__main__":
    open_in_excel(get_russel3000_info())

I can't understand why it works with S&P500 but not with Russel3000. At "Line A" I would get the following error:

Exception has occurred: AttributeError
'NoneType' object has no attribute 'findAll'

It shouldn't return "None". I am thankful for any pointer:-)

RJ Adriaansen · Accepted Answer

You can load tables directly into pandas:

df = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")

You can access the tables on the page with df[0], df[1], etc. In the case of ishares.com the particular table doesn't load because it is loaded locally through javascript. One solution is to use Selenium to do the job:

from selenium import webdriver
import pandas as pd
import time

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

url="https://www.ishares.com/us/products/239714/ishares-russell-3000-etf#holdings"
wd = webdriver.Chrome('chromedriver',options=options)
wd.get(url)
time.sleep(5) # sleep for a few seconds to allow loading the data
df = pd.read_html(wd.page_source)

df[7] is the table you're looking for:

	Ticker	Name	Sector	Asset Class	Market Value	Weight (%)	Notional Value	Shares	CUSIP	ISIN	SEDOL	Accrual Date
0	AAPL	APPLE INC	Information Technology	Equity	$560,367,328.56	5.16	5.60367e+08	4.38506e+06	037833100	US0378331005	2046251	-
1	MSFT	MICROSOFT CORP	Information Technology	Equity	$482,112,717.24	4.44	4.82113e+08	2.03475e+06	594918104	US5949181045	2588173	-
2	AMZN	AMAZON COM INC	Consumer Discretionary	Equity	$362,479,373.96	3.34	3.62479e+08	115214	023135106	US0231351067	2000019	-
3	FB	FACEBOOK CLASS A INC	Communication	Equity	$172,844,238.24	1.59	1.72844e+08	652464	30303M102	US30303M1027	B7TL820	-
4	GOOGL	ALPHABET INC CLASS A	Communication	Equity	$168,815,957.22	1.55	1.68816e+08	81567	02079K305	US02079K3059	BYVY8G0	-

A better solution is to load the json file directly. As you can see when you inspect the website in Firefox or Chrome, the table data is loaded from this json url: https://www.ishares.com/us/products/239714/ishares-russell-3000-etf/1467271812596.ajax?tab=all&fileType=json. Loading this into pandas has the advantage of having the full 2866 entries in your dataframe in one go. We cannot load it directly into pandas because the file contains a UTF-8 BOM header, but this will work:

import requests
import json
import pandas as pd

url = "https://www.ishares.com/us/products/239714/ishares-russell-3000-etf/1467271812596.ajax?tab=all&fileType=json"
r = requests.get(url)
json = json.loads(r.content.decode('utf-8-sig'))
df = pd.DataFrame(json['aaData'])

Output:

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17
0	AAPL	APPLE INC	Information Technology	Equity	{'display': '$560,367,328.56', 'raw': 560367328.56}	{'display': '5.16', 'raw': 5.15741}	{'display': '560,367,328.56', 'raw': 560367328.56}	{'display': '4,385,064.00', 'raw': 4385064}	037833100	US0378331005	2046251	{'display': '127.79', 'raw': 127.79}	United States	NASDAQ	USD	1	USD	-
1	MSFT	MICROSOFT CORP	Information Technology	Equity	{'display': '$482,112,717.24', 'raw': 482112717.24}	{'display': '4.44', 'raw': 4.43718}	{'display': '482,112,717.24', 'raw': 482112717.24}	{'display': '2,034,746.00', 'raw': 2034746}	594918104	US5949181045	2588173	{'display': '236.94', 'raw': 236.94}	United States	NASDAQ	USD	1	USD	-
2	AMZN	AMAZON COM INC	Consumer Discretionary	Equity	{'display': '$362,479,373.96', 'raw': 362479373.96}	{'display': '3.34', 'raw': 3.33612}	{'display': '362,479,373.96', 'raw': 362479373.96}	{'display': '115,214.00', 'raw': 115214}	023135106	US0231351067	2000019	{'display': '3,146.14', 'raw': 3146.14}	United States	NASDAQ	USD	1	USD	-
3	FB	FACEBOOK CLASS A INC	Communication	Equity	{'display': '$172,844,238.24', 'raw': 172844238.24}	{'display': '1.59', 'raw': 1.59079}	{'display': '172,844,238.24', 'raw': 172844238.24}	{'display': '652,464.00', 'raw': 652464}	30303M102	US30303M1027	B7TL820	{'display': '264.91', 'raw': 264.91}	United States	NASDAQ	USD	1	USD	-
4	GOOGL	ALPHABET INC CLASS A	Communication	Equity	{'display': '$168,815,957.22', 'raw': 168815957.22}	{'display': '1.55', 'raw': 1.55372}	{'display': '168,815,957.22', 'raw': 168815957.22}	{'display': '81,567.00', 'raw': 81567}	02079K305	US02079K3059	BYVY8G0	{'display': '2,069.66', 'raw': 2069.66}	United States	NASDAQ	USD	1	USD	-

Problem parsing list of companies with BeautifulSoup

Answers (1)

Related Questions