Reputation: 353
I'm trying to scrape this page using puppeteer: https://jcc.org/park-heights-indoor-pool-registration, and put pieces of the data into an array (time of event, title, sign-up link, etc.).
I copied the HTML of the page I am scraping into a local HTML file, and it all works fine there (with the exact same code!), but with puppeteer it throws a null error. On top of that, when I select just a single element, all of the data is gathered without any errors!
Code:
// Scrapes the JCC pool registration page with puppeteer and logs an array with
// one record per `.GXPEntry` element (time, title, sign-up link, availability,
// date and a cancelled flag).
const puppeteer = require('puppeteer');
(async () => {
let jcc_url = 'https://jcc.org/park-heights-indoor-pool-registration';
let browser = await puppeteer.launch();
let page = await browser.newPage();
await page.goto(jcc_url, {waitUntil: 'networkidle0'});
// The callback passed to page.evaluate runs inside the loaded page, not in Node.
let data = await page.evaluate(() => {
let slots_array = [];
// `$` is not defined in this script; presumably it is the page's own jQuery
// (the reported stack trace shows `each` resolving to a script served by jcc.org).
$(".GXPEntry").each(function (index, element) {
slots_array[index] = {
index: index,
// Placeholder; set to true/false by the title check below.
cancelled: undefined,
time: element.querySelector(".GXPTime").textContent,
title: element.querySelector('.GXPTitle').textContent,
// NOTE(review): querySelector returns null when nothing matches, so chaining
// .getAttribute throws "Cannot read property 'getAttribute' of null". The
// reported error comes from this line or the dayOfWeek line; the posted answer
// suggests canceled entries have no sign-up link, which points at this one.
link: element.querySelector('a.signUpGXP').getAttribute("href"),
availability: element.querySelector('div.GXPDescription span').textContent,
// NOTE(review): this takes the first <a> anywhere in the entry. In the sample
// markup that is the sign-up icon link inside .GXPTitle, which has no data-date
// attribute; the data-date link is the first <a> inside .GXPDescription.
dayOfWeek: element.querySelector('a').getAttribute('data-date')
};
// Only these two exact titles are treated as cancelled.
if (slots_array[index].title === "CANCELED: Lap Swimming - Men's Only"
||
slots_array[index].title === "CANCELED: Lap Swimming - Women's Only") {
slots_array[index].cancelled = true;
} else {
slots_array[index].cancelled = false;
}
});
return slots_array;
});
console.log(data);
// NOTE(review): there is no try/finally or catch here, so if evaluate rejects,
// this close() is never reached and the rejection is unhandled.
await browser.close();
})();
This is what the HTML layout of the page I am targeting looks like:
<div class="GXPEntry">
<div class="GXPTime">8:15am-9:00am</div>
<div class="GXPTitle"><img src="https://groupexpro.com/schedule/logos/custom/logo_53760.jpg"
style="display: block; max-height: 30px; max-width: 120px; padding: 0px 5px 5px 0px;"
title="">Lap Swimming - Men's Only<span
style="position: relative; top: 2px; left: 4px;"><a class="signUpGXP removeIconGXP"
href="https://groupexpro.com/gxp/reservations/start/index/11814665/10/05/2020?e=1"
title="This class requires a reservation"><i
style="background-image: url('https://groupexpro.com/gxp/design/img/glyphicons-halflings.png'); background-position: -96px -72px; background-repeat: no-repeat; display: inline-block; height: 14px; vertical-align: text-top; width: 14px; position: relative; top: 0px; left: -4px; float: left; margin-right:6px; "></i></a></span>
</div>
<div class="GXPInstructor">Staff</div>
<div class="GXPStudio">Indoor Pool </div>
<div class="GXPCategory">Aquatics</div>
<div class="GXPLocation">Park Heights</div>
<div class="GXPDescription">
<a 11814665 alt="11814665" class="descGXP" data-date="10/05/2020" href="javascript://""="">Description</a>
|
<a alt="11814665" class="signUpGXP"
href="https://groupexpro.com/gxp/reservations/start/index/11814665/10/05/2020?e=1"
textmsg="3 SPOTS LEFT">
Sign Up</a>
<a alt="Add to Calendar" class="addToCalendar" href="#">
<img alt="Add to Calendar" border="0" height="14" src="https://groupexpro.com/schedule/embed/images/ics.gif">
</a>
<br><br><span>3 SPOTS LEFT</span>
</div>
</div>
I am just trying to get the href data from the link with the class of .signUpGXP, the text in the last span tag ("3 SPOTS LEFT"), the title text from the div.GXPTitle, and the data-date attribute from the first link in the div.GXPDescription.
This works fine with jQuery if I copy the HTML into a local file, but in puppeteer it doesn't work, and gives me this error:
(node:22638) UnhandledPromiseRejectionWarning: Error: Evaluation failed: TypeError: Cannot read property 'getAttribute' of null
at HTMLDivElement.<anonymous> (__puppeteer_evaluation_script__:12:59)
at Function.each (https://jcc.org/sites/default/files/js/js_POjCvph0DpQRBLbuAoUSghIegyfU_5lXHo4ESl4z0tw.js:2:2975)
at $.fn.init.each (https://jcc.org/sites/default/files/js/js_POjCvph0DpQRBLbuAoUSghIegyfU_5lXHo4ESl4z0tw.js:2:835)
at __puppeteer_evaluation_script__:5:24
at ExecutionContext._evaluateInternal (/Users/moshe/coding-workspace/jcc-ph-pool-register/node_modules/puppeteer/lib/cjs/puppeteer/common/ExecutionContext.js:217:19)
at processTicksAndRejections (internal/process/task_queues.js:97:5)
at async ExecutionContext.evaluate (/Users/moshe/coding-workspace/jcc-ph-pool-register/node_modules/puppeteer/lib/cjs/puppeteer/common/ExecutionContext.js:106:16)
at async /Users/moshe/coding-workspace/jcc-ph-pool-register/app.js:13:16
(node:22638) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). To terminate the node process on unhandled promise rejection, use the CLI flag `--unhandled-rejections=strict` (see https://nodejs.org/api/cli.html#cli_unhandled_rejections_mode). (rejection id: 1)
(node:22638) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
Not exactly sure why it cannot find the property. It works totally fine if I do just this:
// Standalone variant: reads the fields of a single schedule entry from the page
// and logs them as one object.
const puppeteer = require('puppeteer');
(async () => {
let jcc_url = 'https://jcc.org/park-heights-indoor-pool-registration';
let browser = await puppeteer.launch();
let page = await browser.newPage();
await page.goto(jcc_url, {waitUntil: 'networkidle2'});
// The callback runs inside the loaded page. Each document.querySelector call
// returns only the first matching element in the whole document, which is why
// the result describes just the first entry on the page.
let data = await page.evaluate(() => {
let time = document.querySelector('.GXPTime').innerText;
let title = document.querySelector('.GXPTitle').innerText;
let availability = document.querySelector('.GXPDescription span').innerText;
let link = document.querySelector('.signUpGXP').href;
// Scoped to .GXPDescription, so this picks the first link inside that div.
let dayOfWeek = document.querySelector('.GXPDescription a').getAttribute('data-date');
return {
time,
title,
availability,
link,
dayOfWeek
}
});
console.log(data);
// NOTE(review): leftover debugging statement.
debugger;
await browser.close();
})();
I get all of the data here, but only for the first section on the page.
I would appreciate help with this. Thanks!
Upvotes: 3
Views: 638
Reputation: 13782
I get the same error if I run the evaluated function in a browser. It seems the issue is that canceled events do not have sign-up links.
Upvotes: 2