Reputation: 1190
I would like to scrape the whole table in the middle of this site: https://www.brilliantearth.com/lab-diamonds-search/
I tried it with the following code, but with that I only get the first 200 rows of the table:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import os, sys
import xlwings as xw
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
if __name__ == '__main__':
    WAIT = 3
    ua = UserAgent()
    userAgent = ua.random
    options = Options()
    # options.add_argument('--headless')
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument("start-maximized")
    options.add_argument('window-size=1920x1080')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    options.add_argument(f'user-agent={userAgent}')
    srv = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=srv, options=options)
    waitWebDriver = WebDriverWait(driver, 10)
    lElems = []
    link = "https://www.brilliantearth.com/lab-diamonds-search/"
    # driver.minimize_window()  # optional
    driver.get(link)
    time.sleep(WAIT)
    driver.find_element(By.XPATH, "(//button[@title='Accept All'])[1]").click()
    time.sleep(WAIT)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    time.sleep(WAIT)
    tmpSearch = soup.find("div", {"id": "diamond_search_wrapper"})
    tmpDIVs = tmpSearch.select("div.inner.item")
    for idx, elem in enumerate(tmpDIVs):
        tmpTD = elem.find_all("td")
        row = []
        for e2 in tmpTD:
            row.append(e2.text)
        print(idx, row)
I would like to scroll down with Selenium to the very bottom of this table. But when I scroll down, only the overall page scrolls, not the table inside.
How can I scroll down to the bottom of the table (so that I can then scrape all of its rows)?
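I assume I need to scroll an inner container rather than the window, roughly like the snippet below, but I don't know which element to target (the selector here is just a guess on my part):

    # Guess: scroll some inner scrollable div instead of the window.
    container = driver.find_element(By.CSS_SELECTOR, "#diamond_search_wrapper")  # selector is a guess
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", container)
    time.sleep(WAIT)  # give lazily loaded rows a chance to render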
Upvotes: 0
Views: 203
Reputation: 1411
It would be quicker and easier to scrape their backend network calls. To explore them in your browser, open Developer Tools -> Network -> Fetch/XHR, then refresh the page or scroll through the data you want and you will see the network calls happening. I've recreated them below and dumped the data into a CSV:
import requests
import pandas as pd
headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-encoding': 'gzip, deflate, br',
    'referer': 'https://www.brilliantearth.com/lab-diamonds-search/',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}
final = []
for page in range(1, 10):
    print(f'Scraping page {page}')
    new_url = f'https://www.brilliantearth.com/lab-diamonds/list/?page={page}&shapes=Round&cuts=Fair%2CGood%2CVery%20Good%2CIdeal%2CSuper%20Ideal&colors=J%2CI%2CH%2CG%2CF%2CE%2CD&clarities=SI2%2CSI1%2CVS2%2CVS1%2CVVS2%2CVVS1%2CIF%2CFL&polishes=Good%2CVery%20Good%2CExcellent&symmetries=Good%2CVery%20Good%2CExcellent&fluorescences=Very%20Strong%2CStrong%2CMedium%2CFaint%2CNone&min_carat=0.30&max_carat=8.18&min_table=45.00&max_table=82.50&min_depth=5.00&max_depth=85.80&min_price=350&max_price=128290&stock_number=&row=0&requestedDataSize=200&order_by=price&order_method=asc&currency=%24&has_v360_video=&dedicated=&min_ratio=1.00&max_ratio=2.75&exclude_quick_ship_suppliers=&MIN_PRICE=350&MAX_PRICE=128290&MIN_CARAT=0.3&MAX_CARAT=8.18&MIN_TABLE=45&MAX_TABLE=82.5&MIN_DEPTH=5&MAX_DEPTH=85.8'
    resp = requests.get(new_url, headers=headers).json()
    for diamond in resp['diamonds']:
        diamond.pop('v360_src', None)  # remove long video and image links to clean up the csv
        diamond.pop('images', None)
        final.append(diamond)
df = pd.DataFrame(final)
df.to_csv('diamonds.csv', encoding='utf-8', index=False)
print('Saved to diamonds.csv')
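Note that range(1, 10) only covers the first 9 pages of 200 rows each. If you don't know how many pages there are, you could keep paging until nothing comes back. This is only a sketch (reusing the headers dict from above) and assumes the endpoint simply returns an empty 'diamonds' list once you go past the last page:

    # Sketch: keep paging until the API stops returning diamonds.
    # Assumption (untested): past the last page the 'diamonds' list comes back empty.
    final = []
    page = 1
    while True:
        new_url = f'https://www.brilliantearth.com/lab-diamonds/list/?page={page}&shapes=Round&cuts=Fair%2CGood%2CVery%20Good%2CIdeal%2CSuper%20Ideal&colors=J%2CI%2CH%2CG%2CF%2CE%2CD&clarities=SI2%2CSI1%2CVS2%2CVS1%2CVVS2%2CVVS1%2CIF%2CFL&polishes=Good%2CVery%20Good%2CExcellent&symmetries=Good%2CVery%20Good%2CExcellent&fluorescences=Very%20Strong%2CStrong%2CMedium%2CFaint%2CNone&min_carat=0.30&max_carat=8.18&min_table=45.00&max_table=82.50&min_depth=5.00&max_depth=85.80&min_price=350&max_price=128290&stock_number=&row=0&requestedDataSize=200&order_by=price&order_method=asc&currency=%24&has_v360_video=&dedicated=&min_ratio=1.00&max_ratio=2.75&exclude_quick_ship_suppliers=&MIN_PRICE=350&MAX_PRICE=128290&MIN_CARAT=0.3&MAX_CARAT=8.18&MIN_TABLE=45&MAX_TABLE=82.5&MIN_DEPTH=5&MAX_DEPTH=85.8'
        resp = requests.get(new_url, headers=headers).json()
        diamonds = resp.get('diamonds', [])
        if not diamonds:  # assumption: empty list means we are past the last page
            break
        for diamond in diamonds:
            diamond.pop('v360_src', None)
            diamond.pop('images', None)
        final.extend(diamonds)
        page += 1
    pd.DataFrame(final).to_csv('diamonds.csv', encoding='utf-8', index=False)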
Upvotes: 1
Reputation: 33361
You can scroll the inner table with this code:
rows = driver.find_elements(By.CSS_SELECTOR, "#diamond_search_wrapper div.inner.item")
for row in rows:
    driver.execute_script("arguments[0].scrollIntoView();", row)
    # scrape the data etc..
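If the table only adds rows as you scroll (which would explain why you only see 200 rows at first), you may have to repeat this until no new rows appear. A rough sketch, assuming the site keeps appending div.inner.item elements as the last one scrolls into view:

    import time
    from selenium.webdriver.common.by import By

    previous = 0
    while True:
        rows = driver.find_elements(By.CSS_SELECTOR, "#diamond_search_wrapper div.inner.item")
        if len(rows) == previous:  # no new rows were appended, we reached the bottom
            break
        previous = len(rows)
        # scroll the last currently loaded row into view to trigger loading of the next batch
        driver.execute_script("arguments[0].scrollIntoView();", rows[-1])
        time.sleep(2)  # give the site time to append the next rows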
Upvotes: 0