isiaatz
isiaatz

Reputation: 1145

Puppeteer doesn't extract all elements

I'm writing a script in NodeJS to extract the top traded cryptocurrencies in the last 24 hours. I want to extract the columns for Name, ticker and 24h percentage inside an array like this:

[{ name: 'Bitcoin', ticker: 'BTC', percentage: '20.62%' },
{ name: 'Ethereum', ticker: 'ETH', percentage: '10.19%' },
...
]

My script looks like this, but when I execute it, it skips some rows. Does anybody know why it is skipping rows seemingly at random? Is there a better way to do this?

let cryptoData = []

// Name cell of the first table row; its presence is used as the signal that the table has rendered.
const firstRowNameSelector = 'tr:nth-child(1) > td > .cmc-link > .sc-16r8icm-0 > .sc-16r8icm-0 > .sc-1eb5slv-0'
// Header control that is clicked to sort the table by the 24h column.
const sortBy24hSelector = '.stickyTop:nth-child(5) > div > .sc-9dqrx-0 > .sc-9dqrx-1 > .sc-1eb5slv-0'

const browser = await puppeteer.launch({ args: ['--no-sandbox'], headless: true })

try {
  const page = await browser.newPage()

  await page.setViewport({ width: 1536, height: 850 })

  await page.goto('https://coinmarketcap.com/', { waitUntil: 'networkidle2' })

  // Wait for tickers table to fully load
  await page.waitForSelector(firstRowNameSelector)

  // Sort the list 24h descending
  await page.waitForSelector(sortBy24hSelector)
  await page.click(sortBy24hSelector)

  // Wait for tickers table to fully load
  await page.waitForSelector(firstRowNameSelector)

  // The callback runs inside the page, so it cannot see variables from this script.
  const data = await page.evaluate(() => {
    const rows = []

    for (let index = 1; index <= 100; index++) {
      const name = document.querySelector(`tr:nth-child(${index}) > td > .cmc-link > .sc-16r8icm-0 > .sc-16r8icm-0 > .sc-1eb5slv-0`)
      const ticker = document.querySelector(`tr:nth-child(${index}) > td > .cmc-link > .sc-16r8icm-0 > .sc-16r8icm-0 > .sc-1teo54s-2 > .sc-1eb5slv-0`)
      const percentage = document.querySelector(`.cmc-table > tbody > tr:nth-child(${index}) > td > .iqsl6q-0`)

      // A row is skipped when any of the three selectors matches nothing.
      if (name && ticker && percentage) {
        // Return text, not DOM elements: only serializable values survive the
        // trip back from the page to Node.
        rows.push({
          id: index,
          name: name.innerText,
          ticker: ticker.innerText,
          percentage: percentage.innerText,
        })
      }
    }

    return rows
  })

  console.log(data)
} finally {
  // Close the browser even if navigation or a selector wait throws.
  await browser.close()
}

Upvotes: 2

Views: 820

Answers (1)

isiaatz
isiaatz

Reputation: 1145

The issue was that Puppeteer wasn't loading the whole page: the data is lazy-loaded, so the script needs to scroll to the bottom of the page to load the remaining rows before extracting them.

For that I've used this reply from another user:

https://stackoverflow.com/a/53527984

const puppeteer = require('puppeteer');

// Example driver: open a page, scroll it to the bottom with autoScroll, then
// take a full-page screenshot.
(async () => {
    const browser = await puppeteer.launch({
        headless: false
    });

    try {
        const page = await browser.newPage();

        // Set the viewport before navigating so the page is laid out once at
        // the intended size instead of being re-laid-out after load.
        await page.setViewport({
            width: 1200,
            height: 800
        });
        await page.goto('https://www.yoursite.com');

        await autoScroll(page);

        await page.screenshot({
            path: 'yoursite.png',
            fullPage: true
        });
    } finally {
        // Always release the browser process, even when a step above throws.
        await browser.close();
    }
})();

/**
 * Scrolls the page down in fixed steps on a timer until the total distance
 * scrolled reaches the height of the document body.
 *
 * @param {object} page - Object exposing `evaluate(fn, ...args)`, e.g. a Puppeteer page.
 * @param {object} [options]
 * @param {number} [options.distance=100] - Pixels to scroll on every tick; must be positive.
 * @param {number} [options.delay=100] - Milliseconds between ticks.
 * @returns {Promise<void>} Resolves once the scrolled distance reaches `document.body.scrollHeight`.
 * @throws {RangeError} If `distance` is not a positive number (scrolling would never finish).
 */
async function autoScroll(page, { distance = 100, delay = 100 } = {}) {
    if (!(distance > 0)) {
        throw new RangeError(`distance must be a positive number, got ${distance}`);
    }

    // The callback runs in the page context, so the settings are passed as
    // arguments rather than captured by closure.
    await page.evaluate(
        (step, intervalMs) =>
            new Promise((resolve) => {
                let totalHeight = 0;
                const timer = setInterval(() => {
                    // Read the height on every tick so the stop condition uses
                    // the current value, which may change while scrolling.
                    const { scrollHeight } = document.body;
                    window.scrollBy(0, step);
                    totalHeight += step;

                    if (totalHeight >= scrollHeight) {
                        clearInterval(timer);
                        resolve();
                    }
                }, intervalMs);
            }),
        distance,
        delay,
    );
}

Upvotes: 3

Related Questions