DENNIS

Reputation: 25

Scraping Market Data

I am trying to scrape market data from DEXTools using Selenium and Django, and so far I am not able to scrape all of it. You will notice the DEXTools market data is lazy-loaded, meaning new data is loaded once you scroll down or press next page; since all the data cannot be displayed on one web page, it is split across 35 pages. At the moment I am only able to scrape the first page that appears on the screen, and the code below doesn't scrape all the data. How can I improve the code to scrape the data values from all 35 pages?

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

def getData(url):
    driver = webdriver.Chrome(
        executable_path='C:/Users/denni/OneDrive/Desktop/DextoolScrapper/app/chromedriver.exe'
        )
    driver.get('https://www.dextools.io/app/uniswap/pair-explorer/0xa29fe6ef9592b5d408cca961d0fb9b1faf497d6d')

    # get table
    tableElement = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'ngx-datatable'))
    )
    # scroll into table view
    driver.execute_script("arguments[0].scrollIntoView();", tableElement)

    # scrolling through the table body to the bottom
    tableBodyelement = tableElement.find_element_by_tag_name('datatable-body-cell')
    driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight)", tableBodyelement)

    rowWrapper = tableElement.find_elements_by_tag_name('datatable-row-wrapper')

    for row in rowWrapper:
        cells = row.find_elements_by_tag_name('datatable-body-cell')
        date = cells[0].text
        type = cells[1].text
        price_usd = cells[2].text
        price_eth = cells[3].text
        amount_cuminu = cells[4].text
        total_eth = cells[5].text
        maker = cells[6].find_element_by_tag_name('a').get_attribute('href')
        print(date, type, price_usd, price_eth, amount_cuminu, total_eth, maker)
        print('----')

This is the result of the above code: only the first page's data is scraped.

Upvotes: 0

Views: 1102

Answers (1)

furas

Reputation: 142859

Simply put the code in a while True loop and click the next-page link at the end of each iteration. You can use try/except to catch the error raised when there is no next page, and use it to exit the loop.
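
In outline, the pattern looks like this (a minimal sketch; scrape_current_page is a hypothetical placeholder for the table-reading code, and the XPath is the one used in the full code below):

while True:
    scrape_current_page()  # hypothetical helper: read every row of the currently rendered table
    try:
        # raises an exception on the last page, because the link no longer exists
        next_page = driver.find_element_by_xpath('//a[@aria-label="go to next page"]')
        next_page.click()
        time.sleep(0.5)  # give JavaScript time to swap in the new rows
    except Exception:
        break  # no more pages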

It may also need a sleep() after click() so that JavaScript has time to replace the values in the already existing ngx-datatable table.
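
If a fixed sleep() turns out to be flaky, one alternative (a sketch, not part of the original answer) is to wait for a row from the old page to go stale after the click. Note that ngx-datatable may update cells in place instead of replacing DOM nodes, in which case staleness never fires and a short sleep() remains the pragmatic choice:

old_row = tableElement.find_element_by_tag_name('datatable-row-wrapper')  # a row from the current page
next_page.click()
# wait until the old row is detached from the DOM, i.e. the table re-rendered
WebDriverWait(driver, 10).until(EC.staleness_of(old_row))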

EDIT: the code now uses pandas.DataFrame to save everything to an Excel file.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

def getData(url):
    
    driver = webdriver.Chrome(
        executable_path='C:/Users/denni/OneDrive/Desktop/DextoolScrapper/app/chromedriver.exe'
        )
    
    #driver = webdriver.Chrome()
    #driver = webdriver.Firefox()
    
    driver.get('https://www.dextools.io/app/uniswap/pair-explorer/0xa29fe6ef9592b5d408cca961d0fb9b1faf497d6d')

    page = 0

    all_results = []  # list for all rows
    
    while True:

        page += 1
        print('--- page:', page, '---')
        
        # get table
        tableElement = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'ngx-datatable'))
        )
        # scroll into table view
        driver.execute_script("arguments[0].scrollIntoView();", tableElement)
    
        # scrolling through the table body to the bottom
        tableBodyelement = tableElement.find_element_by_tag_name('datatable-body-cell')
        driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight)", tableBodyelement)
    
        rowWrapper = tableElement.find_elements_by_tag_name('datatable-row-wrapper')
    
        for row in rowWrapper:
            cells = row.find_elements_by_tag_name('datatable-body-cell')
            date = cells[0].text
            type = cells[1].text
            price_usd = cells[2].text
            price_eth = cells[3].text
            amount_cuminu = cells[4].text
            total_eth = cells[5].text
            maker = cells[6].find_element_by_tag_name('a').get_attribute('href')
            print(date, type, price_usd, price_eth, amount_cuminu, total_eth, maker)
            print('----')
            
            # add row to list
            all_results.append( [date, type, price_usd, price_eth, amount_cuminu, total_eth, maker] )
                                 
    
        try:
            next_page = driver.find_element_by_xpath('//a[@aria-label="go to next page"]')
            next_page.click()
            time.sleep(0.5)
        except Exception as ex:
            print("last page???")
            break
        
    # after the loop, convert to a DataFrame and write it to an Excel file
    
    df = pd.DataFrame(all_results, columns=['date', 'type', 'price_usd', 'price_eth', 'amount_cuminu', 'total_eth', 'maker'])
    df.to_excel('results.xlsx')

    driver.quit()  # close the browser once the data has been saved
    
# ---
 
getData(None)
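
One practical note: pandas writes .xlsx files through an optional engine (openpyxl or xlsxwriter), so one of those packages has to be installed. If neither is available, plain CSV avoids the dependency:

df.to_csv('results.csv', index=False)  # plain-text alternative, no Excel engine needed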

Upvotes: 2
