Reputation: 25
I am trying to scrape market data from dextools using Selenium and Django, and so far I am not able to scrape all of it. You will notice the dextools market data is lazy-loaded, meaning new data appears once you scroll down or press "next page"; since all the data cannot be displayed on a single web page, it is divided across 35 pages. At the moment I am only able to scrape the first page that appears on screen — the code below does not scrape all the data. How can I improve the code to scrape the data from all 35 web pages?
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
def getData(url):
    """Scrape the visible trade rows from a dextools pair-explorer page.

    Opens the page in Chrome, waits for the Angular ``ngx-datatable``
    component to render, scrolls it into view and prints one line per
    rendered row (date, trade type, prices, amounts and maker link).

    NOTE(review): the table is lazy-loaded and paginated; this version
    only reads the rows of the first rendered page.

    :param url: pair-explorer URL to open; when ``None``, falls back to
                the original hard-coded pair so existing callers that
                pass ``None`` keep working (previously the parameter was
                silently ignored).
    """
    if url is None:
        # backward-compatible default: the URL that used to be hard-coded
        url = ('https://www.dextools.io/app/uniswap/pair-explorer/'
               '0xa29fe6ef9592b5d408cca961d0fb9b1faf497d6d')
    driver = webdriver.Chrome(
        executable_path='C:/Users/denni/OneDrive/Desktop/DextoolScrapper/app/chromedriver.exe'
    )
    try:
        driver.get(url)
        # wait until the data table exists in the DOM
        tableElement = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'ngx-datatable'))
        )
        # scroll the table into the viewport so its rows get rendered
        driver.execute_script("arguments[0].scrollIntoView();", tableElement)
        # scroll through the table body to the bottom to trigger lazy rows
        tableBodyelement = tableElement.find_element_by_tag_name('datatable-body-cell')
        driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight)", tableBodyelement)
        rowWrapper = tableElement.find_elements_by_tag_name('datatable-row-wrapper')
        for row in rowWrapper:
            cells = row.find_elements_by_tag_name('datatable-body-cell')
            date = cells[0].text
            trade_type = cells[1].text  # renamed: `type` shadowed the builtin
            price_usd = cells[2].text
            price_eth = cells[3].text
            amount_cuminu = cells[4].text
            total_eth = cells[5].text
            maker = cells[6].find_element_by_tag_name('a').get_attribute('href')
            print(date, trade_type, price_usd, price_eth, amount_cuminu, total_eth, maker)
            print('----')
    finally:
        # always release the browser, even if the scrape fails mid-way
        driver.quit()
This is the result of the above code (the scraped data from the first page):
Upvotes: 0
Views: 1102
Reputation: 142859
Simply put the code in a `while True` loop and click "next" at the end of each iteration. You can use `try/except` to catch the error raised when there is no more "next" page, and break out of the loop there. It may also need a `sleep()` after `click()` so JavaScript has time to replace the values in the already existing `ngx-datatable` table.
EDIT: the code now uses a `pandas.DataFrame` to save everything to an Excel file.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
def getData(url):
    """Scrape every page of a dextools pair-explorer trade table.

    Opens the page in Chrome and repeatedly: waits for the
    ``ngx-datatable`` component, scrolls it into view, collects one
    record per rendered row, then clicks the "go to next page" button.
    When that button can no longer be found the loop ends and all
    collected rows are written to ``results.xlsx``.

    NOTE(review): a fixed 0.5 s sleep after each click is assumed to be
    long enough for JavaScript to replace the table rows — confirm on a
    slow connection, otherwise rows may be read twice.

    :param url: pair-explorer URL to open; when ``None``, falls back to
                the original hard-coded pair so the existing
                ``getData(None)`` call keeps working (previously the
                parameter was silently ignored).
    """
    if url is None:
        # backward-compatible default: the URL that used to be hard-coded
        url = ('https://www.dextools.io/app/uniswap/pair-explorer/'
               '0xa29fe6ef9592b5d408cca961d0fb9b1faf497d6d')
    driver = webdriver.Chrome(
        executable_path='C:/Users/denni/OneDrive/Desktop/DextoolScrapper/app/chromedriver.exe'
    )
    #driver = webdriver.Chrome()
    #driver = webdriver.Firefox()

    page = 0
    all_results = []  # one [date, type, ...] list per scraped row

    try:
        driver.get(url)
        while True:
            page += 1
            print('--- page:', page, '---')
            # wait until the data table exists in the DOM
            tableElement = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, 'ngx-datatable'))
            )
            # scroll the table into the viewport so its rows get rendered
            driver.execute_script("arguments[0].scrollIntoView();", tableElement)
            # scroll through the table body to the bottom to trigger lazy rows
            tableBodyelement = tableElement.find_element_by_tag_name('datatable-body-cell')
            driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight)", tableBodyelement)
            rowWrapper = tableElement.find_elements_by_tag_name('datatable-row-wrapper')
            for row in rowWrapper:
                cells = row.find_elements_by_tag_name('datatable-body-cell')
                date = cells[0].text
                trade_type = cells[1].text  # renamed: `type` shadowed the builtin
                price_usd = cells[2].text
                price_eth = cells[3].text
                amount_cuminu = cells[4].text
                total_eth = cells[5].text
                maker = cells[6].find_element_by_tag_name('a').get_attribute('href')
                print(date, trade_type, price_usd, price_eth, amount_cuminu, total_eth, maker)
                print('----')
                all_results.append([date, trade_type, price_usd, price_eth,
                                    amount_cuminu, total_eth, maker])
            try:
                # EAFP: when the "next page" link is gone we are on the last page
                next_page = driver.find_element_by_xpath('//a[@aria-label="go to next page"]')
                next_page.click()
                time.sleep(0.5)  # give JS time to swap the table contents
            except Exception:
                print("last page???")
                break
    finally:
        # always release the browser, even if the scrape fails mid-way
        driver.quit()

    # after the loop, convert to a DataFrame and write it to an Excel file
    import pandas as pd
    df = pd.DataFrame(all_results,
                      columns=['date', 'type', 'price_usd', 'price_eth',
                               'amount_cuminu', 'total_eth', 'maker'])
    df.to_excel('results.xlsx')

# ---

getData(None)
Upvotes: 2