Reputation: 20302
I have made a few attempts to get my code to navigate to a web page, import data from a table into a data frame, then move to the next page and do the same thing again. Below is the sample code that I tested. Now I am stuck and not sure how to proceed.
# first attempt
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from time import sleep

lst = []
url = "https://www.nasdaq.com/market-activity/stocks/screener"

for numb in (1, 10):
    url = "https://www.nasdaq.com/market-activity/stocks/screener"
    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find_all('table')
    df = pd.DataFrame(table)
    lst.append(df)

def get_cpf():
    driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
    driver.get(url)
    driver.find_element_by_class('pagination__page" data-page="'' + numb + ''').click()
    sleep(10)
    text = driver.find_element_by_id('texto_cpf').text
    print(text)

get_cpf()
get_cpf.click
### second attempt
#import BeautifulSoup
from bs4 import BeautifulSoup
import pandas as pd
import requests
from selenium import webdriver
from time import sleep

lst = []

for numb in (1, 10):
    r = requests.get('https://www.nasdaq.com/market-activity/stocks/screener')
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    table = soup.find("table", {"class": "nasdaq-screener__table"})
    for row in table.findAll("tr"):
        for cell in row("td"):
            data = cell.get_text().strip()
    df = pd.DataFrame(data)
    lst.append(df)

def get_cpf():
    driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
    driver.get(url)
    driver.find_element_by_class('pagination__page" data-page="'' + numb + ''').click()
    sleep(10)
    text = driver.find_element_by_id('texto_cpf').text
    print(text)

get_cpf()
get_cpf.click
### third attempt
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time
import requests
import pandas as pd

lst = []
url = "https://www.nasdaq.com/market-activity/stocks/screener"
driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
wait = WebDriverWait(driver, 10)
driver.get(url)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#_evh-ric-c"))).click()

for pages in range(1, 9):
    try:
        print(pages)
        r = requests.get(url)
        html = r.text
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find_all('table')
        df = pd.DataFrame(table)
        lst.append(df)
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.pagination__next"))).click()
        time.sleep(1)
    except:
        break
Here is a screen shot of the HTML behind the table that I am trying to scrape.
So, on the first page, I want to scrape everything from:
AAPL Apple Inc. Common Stock $127.79 6.53 5.385% 2,215,538,678,600
To:
ASML ASML Holding N.V. New York Registry Shares $583.55 16.46 2.903% 243,056,764,541
Then, move to page 2, do the same, move to page 3, do the same, and so on. I'm not sure if this is doable using only BeautifulSoup, or whether I need Selenium for the button-click event. I'm open to doing whatever is easiest here. Thanks!
Upvotes: 2
Views: 134
Reputation: 9969
I'm not going to deal with the API, since Nuran just wants to go with what the user asked.
Here's an example of going through the first 10 pages. First we remove the notification, then wait for the next button to become clickable and click it.
driver = webdriver.Chrome("C:/Utility/chromedriver.exe")  # set up as in the question
wait = WebDriverWait(driver, 10)
driver.get("https://www.nasdaq.com/market-activity/stocks/screener")
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#_evh-ric-c"))).click()
# You start on the 1st page, so click 9 times to reach the 10th page
for pages in range(1, 10):
    try:
        print(pages)
        # Get your data from this page
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.pagination__next"))).click()
        # This is just here to slow everything down, so it may be removable.
        time.sleep(5)
    except:
        break
Imports:
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
You can do something like this:
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
table_tag = soup.select_one("table.nasdaq-screener__table")
table = pd.read_html(str(table_tag))
print(table[0])
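If you want to put the two pieces together and collect every page into one DataFrame, here is a minimal sketch (the chromedriver path and the short pause are assumptions carried over from the question):

import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
wait = WebDriverWait(driver, 10)
driver.get("https://www.nasdaq.com/market-activity/stocks/screener")
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#_evh-ric-c"))).click()

frames = []
for page in range(10):
    # Parse the table that is currently rendered in the browser
    soup = BeautifulSoup(driver.page_source, "html.parser")
    table_tag = soup.select_one("table.nasdaq-screener__table")
    frames.append(pd.read_html(str(table_tag))[0])
    # Move on to the next page, except after the last one we want
    if page < 9:
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.pagination__next"))).click()
        time.sleep(1)

df = pd.concat(frames, ignore_index=True)
print(df)
driver.quit()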
Upvotes: 0
Reputation: 11515
Attention please: you don't need to use selenium for such a task, as it will only slow your process down. In real-world scenarios, we only use selenium to bypass browser detection; we then pass its cookies to whatever HTTP module we like to continue the operation.
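As a loose illustration of that selenium-then-requests pattern (not needed for this particular task), here is a sketch that opens the page once with Selenium and hands its cookies to a requests.Session; the chromedriver path is simply the one from the question:

import requests
from selenium import webdriver

driver = webdriver.Chrome("C:/Utility/chromedriver.exe")
driver.get("https://www.nasdaq.com/market-activity/stocks/screener")

# Copy the browser's cookies into a plain requests session
session = requests.Session()
for cookie in driver.get_cookies():
    session.cookies.set(cookie['name'], cookie['value'])
driver.quit()

# Subsequent requests reuse those cookies without the browser
r = session.get("https://www.nasdaq.com/market-activity/stocks/screener")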
Regarding your task, I noticed there's an API which actually feeds the HTML source. Here's a quick call for it:
import pandas as pd
import requests


def main(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
    }
    params = {
        'tableonly': 'true',
        'limit': 1000
    }
    r = requests.get(url, params=params, headers=headers)
    goal = pd.DataFrame(r.json()['data']['table']['rows'])
    print(goal)
    goal.to_csv('data.csv', index=False)


if __name__ == "__main__":
    main('https://api.nasdaq.com/api/screener/stocks')
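If you only want the fields shown in the question's sample rows, you can select them from goal inside main(). The exact column names used here ('lastsale', 'netchange', 'pctchange', 'marketCap') are an assumption, so check goal.columns against the real response first:

# Hypothetical column names -- inspect goal.columns before relying on these
subset = goal[['symbol', 'name', 'lastsale', 'netchange', 'pctchange', 'marketCap']]
print(subset.head())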
Note that each page contains 25 tickers. Within my code, I've fetched 1000 / 25 = 40 pages. You don't need to loop over the pages here, as you can simply increase the limit.
But in case you would like to use a for loop, you have to loop over the following URL and keep increasing the offset:
https://api.nasdaq.com/api/screener/stocks?tableonly=true&limit=25&offset=0
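For example, a minimal sketch of that offset loop (assuming 25 rows per page and stopping after the first 10 pages):

import pandas as pd
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
}

frames = []
# offset 0, 25, 50, ... 225 -> pages 1 through 10
for offset in range(0, 250, 25):
    params = {'tableonly': 'true', 'limit': 25, 'offset': offset}
    r = requests.get('https://api.nasdaq.com/api/screener/stocks',
                     params=params, headers=headers)
    frames.append(pd.DataFrame(r.json()['data']['table']['rows']))

goal = pd.concat(frames, ignore_index=True)
print(goal)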
Upvotes: 2