Reputation: 67
First, to be clear: my goal is to scrape data from ~100 URLs monthly using the code below. I need the data from each URL exported to the same XLSX file, but into different sheets with predetermined names. Example from the code below: workbook name = "data.xlsx", sheet name = "FEUR". Also: all of the links have the exact same layout and XPaths, so simply inserting a new link works perfectly.
The only working solution I have found so far is to copy-paste the code from the ####### marker down, changing the URL in driver.get() and the sheet_name="XX" in df.to_excel() for each fund.
Instead, I am looking for a more efficient way to add links and make the code less heavy. Is this possible with Selenium?
See the code below:
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
from openpyxl import load_workbook
opts = Options()
opts.add_argument("--headless")
chrome_driver = os.getcwd() + "/chromedriver"
driver = webdriver.Chrome(options=opts, executable_path=chrome_driver)
driver.implicitly_wait(10)
############
#FEUR
driver.get("https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000ZG2F&tab=3")
driver.switch_to.frame(1)
driver.find_element_by_xpath("//button[contains(@class,'show-table')]//span").click()
table = driver.find_elements_by_xpath("//div[contains(@class,'sal-mip-factor-profile__value-table')]/table//tr/th")
header = []
for tab in table:
    header.append(tab.text)
#print(header)
tablebody = driver.find_elements_by_xpath("//div[contains(@class,'sal-mip-factor-profile__value-table')]/table//tbody/tr")
data = []
data.append(header)
for tab in tablebody:
    row = []
    content = tab.find_elements_by_tag_name("td")
    for con in content:
        row.append(con.text)
    data.append(row)
df = pd.DataFrame(data)
path= r'/Users/karlemilthulstrup/Downloads/data.xlsx'
book = load_workbook(path)
writer = pd.ExcelWriter(path, engine='openpyxl')
writer.book = book
df.to_excel(writer, sheet_name="FEUR")
writer.save()
writer.close()
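Conceptually, I imagine replacing the copy-pasted blocks with a loop over a sheet-name/URL mapping, something like this rough sketch (untested; the SHEETS dict is just made up for illustration, and unlike my current code it writes a fresh workbook instead of appending to an existing one):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os
import pandas as pd

# Sheet name -> URL mapping; made up here for illustration,
# in practice this would hold all ~100 funds.
SHEETS = {
    "FEUR": "https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000ZG2F&tab=3",
    # "XX": "https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=...&tab=3",
}

opts = Options()
opts.add_argument("--headless")
driver = webdriver.Chrome(options=opts, executable_path=os.getcwd() + "/chromedriver")
driver.implicitly_wait(10)

def scrape_factor_table(url):
    # Scrape the factor-profile table from one snapshot page into a DataFrame
    driver.get(url)
    driver.switch_to.frame(1)
    driver.find_element_by_xpath("//button[contains(@class,'show-table')]//span").click()
    header = [th.text for th in driver.find_elements_by_xpath(
        "//div[contains(@class,'sal-mip-factor-profile__value-table')]/table//tr/th")]
    data = [header]
    for tr in driver.find_elements_by_xpath(
            "//div[contains(@class,'sal-mip-factor-profile__value-table')]/table//tbody/tr"):
        data.append([td.text for td in tr.find_elements_by_tag_name("td")])
    return pd.DataFrame(data)

with pd.ExcelWriter('/Users/karlemilthulstrup/Downloads/data.xlsx', engine='openpyxl') as writer:
    for sheet_name, url in SHEETS.items():
        scrape_factor_table(url).to_excel(writer, sheet_name=sheet_name)

driver.quit()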
Upvotes: 0
Views: 97
Reputation: 28630
As long as you have the ID (which you can extract from the links if that's all you have; see the small snippet after the code), you can feed those into the API directly. You may need to tweak it a bit to fit your needs, but see how this goes.
import requests
import re
import pandas as pd
auth = 'https://www.morningstar.dk/Common/funds/snapshot/PortfolioSAL.aspx'
# Create a Pandas Excel writer using openpyxl as the engine.
writer = pd.ExcelWriter('C:/Users/karlemilthulstrup/Downloads/data.xlsx', engine='openpyxl')
ids = ['F00000ZG2F', 'F0000025UI', 'F00000Z1MD', 'F00000Z4AE','F00000ZG2F']
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Mobile Safari/537.36'}

# Parameters for the factor profile API request
api_payload = {
    'languageId': 'da-DK',
    'locale': 'da-DK',
    'clientId': 'MDC_intl',
    'benchmarkId': 'category',
    'component': 'sal-components-mip-factor-profile',
    'version': '3.40.1'}

for api_id in ids:
    # Request the snapshot page to obtain a bearer token for the API
    auth_payload = {
        'Site': 'dk',
        'FC': api_id,
        'IT': 'FO',
        'LANG': 'da-DK'}

    response = requests.get(auth, params=auth_payload)
    search = re.search(r'(tokenMaaS:[\w\s]*\")(.*)(\")', response.text, re.IGNORECASE)
    bearer = 'Bearer ' + search.group(2)
    headers.update({'Authorization': bearer})

    # Call the factor profile endpoint for this fund id
    url = 'https://www.us-api.morningstar.com/sal/sal-service/fund/factorProfile/%s/data' % api_id
    jsonData = requests.get(url, headers=headers, params=api_payload).json()

    # Flatten the nested factor data into one row per factor
    rows = []
    for k, v in jsonData['factors'].items():
        row = {}
        row['factor'] = k
        historicRange = v.pop('historicRange')
        row.update(v)
        for each in historicRange:
            row.update(each)
        rows.append(row.copy())

    df = pd.DataFrame(rows)
    sheetName = jsonData['id']
    df.to_excel(writer, sheet_name=sheetName, index=False)
    print('Finished: %s' % sheetName)

writer.save()
writer.close()
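If you only have the full snapshot links rather than the bare ids, the id query parameter can be pulled out of each link first, along these lines (a small sketch; the example links are just for illustration):
from urllib.parse import urlparse, parse_qs

links = [
    'https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000ZG2F&tab=3',
    'https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F0000025UI&tab=3',
]

# Pull the "id" query parameter out of each link to build the ids list
ids = [parse_qs(urlparse(link).query)['id'][0] for link in links]
print(ids)  # ['F00000ZG2F', 'F0000025UI']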
Upvotes: 2