Moshe
Moshe

Reputation: 353

querySelector doesn't work with child elements in puppeteer

I'm trying to scrape this page using puppeteer: https://jcc.org/park-heights-indoor-pool-registration, and put pieces of the data into an array (time of event, title, sign-up link, etc.).

I copied the html of the page I am scraping into a local html file, and it all works fine (with the exact same code!), but with puppeteer, it returns a null error. On top of that, when I select a single element, there are no errors when gathering all of the data!

Code:

const puppeteer = require('puppeteer');

(async () => {

    let jcc_url = 'https://jcc.org/park-heights-indoor-pool-registration';


    let browser = await puppeteer.launch();
    let page = await browser.newPage();

    await page.goto(jcc_url, {waitUntil: 'networkidle0'});

    let data = await page.evaluate(() => {

        let slots_array = [];

        $(".GXPEntry").each(function (index, element) {

           slots_array[index] = {
                index: index,
                cancelled: undefined,
                time: element.querySelector(".GXPTime").textContent,
                title: element.querySelector('.GXPTitle').textContent,
                link: element.querySelector('a.signUpGXP').getAttribute("href"),
                availability: element.querySelector('div.GXPDescription span').textContent,
                dayOfWeek: element.querySelector('a').getAttribute('data-date')
            };

            if (slots_array[index].title === "CANCELED: Lap Swimming - Men's Only"
                ||
                slots_array[index].title === "CANCELED: Lap Swimming - Women's Only") {
                slots_array[index].cancelled = true;
            } else {
                slots_array[index].cancelled = false;
            }
        });

        return slots_array;

    });

    console.log(data);

    await browser.close();

    })();

This is what HTML layout of the page I am targeting looks like:

<div class="GXPEntry">
        <div class="GXPTime">8:15am-9:00am</div>
        <div class="GXPTitle"><img src="https://groupexpro.com/schedule/logos/custom/logo_53760.jpg"
         style="display: block; max-height: 30px; max-width: 120px; padding: 0px 5px 5px 0px;"
         title="">Lap Swimming - Men's Only<span
         style="position: relative; top: 2px; left: 4px;"><a class="signUpGXP removeIconGXP"
         href="https://groupexpro.com/gxp/reservations/start/index/11814665/10/05/2020?e=1"
         title="This class requires a reservation"><i
         style="background-image: url('https://groupexpro.com/gxp/design/img/glyphicons-halflings.png'); background-position: -96px -72px; background-repeat: no-repeat; display: inline-block; height: 14px; vertical-align: text-top; width: 14px; position: relative; top: 0px; left: -4px; float: left; margin-right:6px; "></i></a></span>
        </div>
        <div class="GXPInstructor">Staff</div>
        <div class="GXPStudio">Indoor Pool&nbsp;</div>
        <div class="GXPCategory">Aquatics</div>
        <div class="GXPLocation">Park Heights</div>
        <div class="GXPDescription">
            <a 11814665 alt="11814665" class="descGXP" data-date="10/05/2020" href="javascript://""="">Description</a>
            &nbsp; | &nbsp;
            <a alt="11814665" class="signUpGXP"
               href="https://groupexpro.com/gxp/reservations/start/index/11814665/10/05/2020?e=1"
               textmsg="3 SPOTS LEFT">
                Sign Up</a>
      &nbsp;      <a alt="Add to Calendar" class="addToCalendar" href="#">
                 <img alt="Add to Calendar" border="0" height="14" src="https://groupexpro.com/schedule/embed/images/ics.gif">
             </a>
            <br><br><span>3 SPOTS LEFT</span>
        </div>

I am just trying to get the href data from the link with the class of .signUpGXP, the text in the last span tag "3 SPOTS LEFT", title text from the div.GXPTitle, and the data-date attribute from the first link in the div.GXPDescription.

This works fine with jQuery if I copy the HTML into a local file, but in pupputeer it doesn't work, and gives me this error:

 (node:22638) UnhandledPromiseRejectionWarning: Error: Evaluation failed: TypeError: Cannot read property 'getAttribute' of null
    at HTMLDivElement.<anonymous> (__puppeteer_evaluation_script__:12:59)
    at Function.each (https://jcc.org/sites/default/files/js/js_POjCvph0DpQRBLbuAoUSghIegyfU_5lXHo4ESl4z0tw.js:2:2975)
    at $.fn.init.each (https://jcc.org/sites/default/files/js/js_POjCvph0DpQRBLbuAoUSghIegyfU_5lXHo4ESl4z0tw.js:2:835)
    at __puppeteer_evaluation_script__:5:24
    at ExecutionContext._evaluateInternal (/Users/moshe/coding-workspace/jcc-ph-pool-register/node_modules/puppeteer/lib/cjs/puppeteer/common/ExecutionContext.js:217:19)
    at processTicksAndRejections (internal/process/task_queues.js:97:5)
    at async ExecutionContext.evaluate (/Users/moshe/coding-workspace/jcc-ph-pool-register/node_modules/puppeteer/lib/cjs/puppeteer/common/ExecutionContext.js:106:16)
    at async /Users/moshe/coding-workspace/jcc-ph-pool-register/app.js:13:16
(node:22638) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). To terminate the node process on unhandled promise rejection, use the CLI flag `--unhandled-rejections=strict` (see https://nodejs.org/api/cli.html#cli_unhandled_rejections_mode). (rejection id: 1)
(node:22638) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.

Not exactly sure why it cannot find the property. It works totally fine if I do just this:

    const puppeteer = require('puppeteer');

(async () => {

    let jcc_url = 'https://jcc.org/park-heights-indoor-pool-registration';

    let browser = await puppeteer.launch();
    let page = await browser.newPage();

    await page.goto(jcc_url, {waitUntil: 'networkidle2'});

    let data = await page.evaluate(() => {
        let time = document.querySelector('.GXPTime').innerText;
        let title = document.querySelector('.GXPTitle').innerText;
        let availability = document.querySelector('.GXPDescription span').innerText;
        let link = document.querySelector('.signUpGXP').href;
        let dayOfWeek = document.querySelector('.GXPDescription a').getAttribute('data-date');

        return {
            time,
            title,
            availability,
            link,
            dayOfWeek
        }

    });

    console.log(data);

    debugger;

    await browser.close();


})();

I get all of the data here, but only the first section on the page.

I would appreciate help with this. Thanks!

Upvotes: 3

Views: 638

Answers (1)

vsemozhebuty
vsemozhebuty

Reputation: 13782

I have the same error if I run evaluated function in a browser. It seems the issue is that canseled events do not have sign-up links.

Upvotes: 2

Related Questions