MK12
MK12

Reputation: 491

Cannot get HTML with Puppeteer in Node.js

Error

Cannot read property 'querySelectorAll' of null

I am scraping this site. When I run the line below in the browser console, it gives me the HTML, but when I scrape the HTML with Puppeteer, it gives me the error above.

// Run in the browser console: evaluates to the node immediately following the first table matched by the selector.
document.querySelectorAll('#stroke-play-container > .stroke-play-leaderboard > .the-leaderboard.with-rolex > table.leaderboard.leaderboard-table.large')[0].nextSibling;

Code

'use strict';

 const puppeteer = require('puppeteer');
 /**
  * Opens the PGA Tour leaderboard page in a Puppeteer-controlled browser,
  * scrolls to the bottom, pauses via page.waitFor(5000), and then runs a
  * lookup inside the page for the node that follows the leaderboard table.
  *
  * @returns {Promise} Resolves with whatever page.evaluateHandle() yields;
  *   rejects with any error caught by the surrounding try/catch.
  */
 function run() {
 return new Promise(async (resolve, reject) => {
    try {


        // headless: false — the browser is launched with a visible window.
        const browser = await puppeteer.launch({
        headless : false
        });

        const page = await browser.newPage();

        await page.goto("https://www.pgatour.com/leaderboard.html");

        // Scroll to the bottom of the document, then pause before querying.
        await page.evaluate(`window.scrollTo(0, document.body.scrollHeight)`);
        await page.waitFor(5000);
    
        // NOTE(review): evaluateHandle (not evaluate) is used here; presumably it
        // yields a handle to the in-page object rather than the plain value — confirm
        // against the Puppeteer docs.
        let urls = await page.evaluateHandle(() => {
            // `results` is declared but never read or returned.
            let results = [];
            var parser = new DOMParser();
            
            // Take the first table matched by the selector and step to its next sibling.
            // If nothing matches, [0] is undefined and the .nextSibling access throws.
            var node = document.querySelectorAll('#stroke-play-container > .stroke-play-leaderboard > .the-leaderboard.with-rolex > table.leaderboard.leaderboard-table.large')[0].nextSibling;
           
            if(node){

            // `node` is a DOM node here, not markup text.
            // NOTE(review): parseFromString presumably expects a string — confirm.
            var $ = parser.parseFromString(node, 'text/html');
            
          
            // .innerText is read from the querySelectorAll result itself,
            // not from any individual matched element.
            return {
                name: $.querySelectorAll('table > tbody:nth-child(1) > tr > td.player-name > div > div.player-name-col').innerText
            };
            }
            else{
                // No sibling node after the table: a plain string marker is returned.
                return 'error';
            }

        })
        // close() is called without await, and only on the success path.
        browser.close();
        return resolve(urls);
    } catch (e) {
        return reject(e);
    }
})
}
 // Print the resolved value, or the rejection reason, to the console.
 run().then(console.log).catch(console.error);

Upvotes: 2

Views: 1482

Answers (2)

pguardiario
pguardiario

Reputation: 54984

Try it like this:

// Runs inside the page: gathers the text of every player-name cell in the
// <div> that directly follows the leaderboard table.
let names = await page.evaluate(() => {
  const selector = '.the-leaderboard.with-rolex > table.leaderboard.leaderboard-table.large + div div.player-name-col';
  const cells = document.querySelectorAll(selector);
  // Array.from converts the NodeList and maps each cell to its text in one pass.
  return Array.from(cells, (cell) => cell.innerText);
});

I'm not sure what you were trying to accomplish with DOMParser; you shouldn't ever need to use that.

Upvotes: 1

Giovanni Rago
Giovanni Rago

Reputation: 71

EDIT: as pointed out in the comments, please be mindful of the Terms of Service of pgatour.com, which do not allow scraping, crawling, etc. The solution below is only intended to illustrate how to solve the generic technical point behind your question.

I think this might be due to the default viewport size Puppeteer is using. The website is hiding the content you are looking for on smaller resolutions, hence the problem.

What made this work for me was specifying the viewport size explicitly, like so:

// setViewport is asynchronous: await it (before page.goto) so the page is laid out at this size.
await page.setViewport({ width: 1200, height: 1000 });

So your code would become:

'use strict';

const puppeteer = require('puppeteer');

// The leaderboard table; the player rows live in the element that follows it.
const LEADERBOARD_TABLE =
  '#stroke-play-container > .stroke-play-leaderboard > .the-leaderboard.with-rolex > table.leaderboard.leaderboard-table.large';

// Player-name cells, looked up inside the element that follows the table.
const PLAYER_NAME_CELLS =
  'table > tbody:nth-child(1) > tr > td.player-name > div > div.player-name-col';

/**
 * Loads the leaderboard page at a desktop-sized viewport and collects the
 * player names shown next to the leaderboard table.
 *
 * @returns {Promise<{name: string[]}|string>} An object whose `name` property
 *   lists the text of every player-name cell, or the string 'error' when the
 *   leaderboard table (or the element after it) is not present in the page.
 * @throws Rejects with any Puppeteer error (launch, navigation, evaluation).
 */
async function run() {
  const browser = await puppeteer.launch({
    headless: false,
  });

  try {
    const page = await browser.newPage();

    // Must complete before navigation: the site hides the leaderboard markup
    // at smaller viewport sizes.
    await page.setViewport({ width: 1200, height: 1000 });

    await page.goto('https://www.pgatour.com/leaderboard.html');

    // Scroll to the bottom and pause so the page can finish rendering.
    await page.evaluate(`window.scrollTo(0, document.body.scrollHeight)`);
    await page.waitFor(5000);

    // evaluate (not evaluateHandle) so the result comes back as a plain,
    // serialised value that can be logged directly.
    return await page.evaluate(
      (tableSelector, nameSelector) => {
        const table = document.querySelector(tableSelector);
        // nextElementSibling skips whitespace text nodes between the elements.
        const container = table ? table.nextElementSibling : null;
        if (!container) {
          return 'error';
        }

        // Query the live DOM directly; no DOMParser round-trip is needed.
        // innerText exists on elements, not on the NodeList, so map each cell.
        const cells = container.querySelectorAll(nameSelector);
        return {
          name: Array.from(cells, (cell) => cell.innerText),
        };
      },
      LEADERBOARD_TABLE,
      PLAYER_NAME_CELLS
    );
  } finally {
    // Always release the browser, even when a step above throws.
    await browser.close();
  }
}

run().then(console.log).catch(console.error);

Upvotes: 0

Related Questions