Reputation: 87
So I'm attempting to scrape a website (it is a website with public information) for some basic company data. I'm using node and Puppeteer to do so. the working code below successfully scrapes the first page, but when it comes to clicking to the second page, I was getting Error: Execution context was destroyed, most likely because of a navigation.
, and now I'm getting an error saying my function is not a function.
Can anyone point at what I'm doing wrong and what the best approach to scrape all 28 pages would be?
SCRAPES FIRST PAGE SUCCESSFULLY
const puppeteer = require("puppeteer");
// var fs = require("fs");
const fsp = require("fs").promises;
const fs = require("fs");
let pageCount = 1; // 21 full pages of content
let companyRows;
function delay(time) {
return new Promise(function(resolve) {
setTimeout(resolve, time);
});
}
(async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
page.on("console", msg => {
for (let i = 0; i < msg.args().length; ++i)
console.log(`${i}: ${msg.args()[i]}`);
});
await page.goto(
"http://dpsstnet.state.or.us/IRIS_PublicInquiry/PrivateSecurity/SMSAgcyTable.aspx"
);
//Clicks a tag by id
await page.click("#btnNaLL");
await page.waitFor(1000);
const result = await page.evaluate(() => {
let row = document.querySelectorAll("tr");
let companyData = [];
row.forEach(el => {
let company = {};
let count = 0;
for (data of el.cells) {
switch (count) {
case 0:
company.name = data.innerText.trim();
case 1:
company.primaryContact = data.innerText.trim();
case 2:
company.address = data.innerText.trim();
case 3:
company.phone = data.innerText.trim();
case 4:
company.county = data.innerText.trim();
case 5:
company.status = data.innerText.trim();
default:
company.default = data.innerText.trim();
}
count++;
companyData.push(company);
//GOT SOME STUUFFFF
console.log(JSON.stringify(companyData));
}
});
// await page.waitFor(3000);
// await fsp.writeFile("./json/file.json", result.stringify());
companyData = companyData.filter((a, b) => companyData.indexOf(a) === b);
companyData = companyData.filter(e => e.status === "Active");
return companyData;
});
// fsp.writeFile(
// "./json/file.json",
// JSON.stringify(companyData, null, 2),
// err =>
// err
// ? console.error("Data not written!", err)
// : console.log("Data Written")
// );
await fsp.writeFile(
"./json/file.json",
JSON.stringify(result, null, 2),
err =>
err
? console.error("Data not written!", err)
: console.log("Data Written")
);
await page.screenshot({
path: "./screenshots/page1.png"
});
await page.pdf({ path: "./pdfs/page1.pdf" });
await browser.close();
return result;
} catch (error) {
console.log(error);
}
})();
CODE REWRITTEN TO NAVIGATE THROUGH PAGES(not working) currently, I'm getting "clickLink is not a function" when running this.
const puppeteer = require("puppeteer");
const fsp = require("fs").promises;
const fs = require("fs");
let pageCount = 1; // 21 full pages of content
let companyRows;
let pageToClick;
function delay(time) {
return new Promise(function(resolve) {
setTimeout(resolve, time);
});
}
(async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const clickLink = link => {
page.click(link);
page.waitFor(1000);
};
page.on("console", msg => {
for (let i = 0; i < msg.args().length; ++i)
console.log(`${i}: ${msg.args()[i]}`);
});
await page.goto(
"http://dpsstnet.state.or.us/IRIS_PublicInquiry/PrivateSecurity/SMSAgcyTable.aspx"
);
//Clicks a tag by id
await page.click("#btnNaLL");
await page.waitFor(1000);
let fullResult = [];
let result;
result = await page.evaluate(
(fullResult, clickLink => {
let row = document.querySelectorAll("tr");
let companyData = [];
let pageList = document.querySelectorAll("b > a");
for (let step = 0; step < 2; step++) {
row.forEach(el => {
let company = {};
let count = 0;
for (data of el.cells) {
switch (count) {
case 0:
company.name = data.innerText.trim();
case 1:
company.primaryContact = data.innerText.trim();
case 2:
company.address = data.innerText.trim();
case 3:
company.phone = data.innerText.trim();
case 4:
company.county = data.innerText.trim();
case 5:
company.status = data.innerText.trim();
default:
company.default = data.innerText.trim();
}
count++;
companyData.push(company);
//GOT SOME STUUFFFF
console.log(JSON.stringify(companyData));
}
});
companyData = companyData.filter(
(a, b) => companyData.indexOf(a) === b
);
companyData = companyData.filter(e => e.status === "Active");
fullResult = [...fullResult, ...companyData];
// console.log(JSON.stringify(pageList[step].innerText));
clickLink(pageList[step]);
}
return fullResult;
},
fullResult,
clickLink
);
await fsp.writeFile(
"./json/file.json",
JSON.stringify(result, null, 2),
err =>
err
? console.error("Data not written!", err)
: console.log("Data Written")
);
//*
await page.screenshot({
path: "./screenshots/page1.png"
});
await page.pdf({ path: "./pdfs/page1.pdf" });
await browser.close();
return result;
} catch (error) {
console.log(error);
}
})();
I imagine I'm just not understanding some concepts behind best practices with puppeteer.
The code above is just my most recent attempt to paginate and scrape. I also tried using pageList[step].click() instead of the clickLink function. I also tried moving the for loop outside the evaluate and rerunning the code a bunch of times, with an 'await page.click(nextPageNode)' but that was messy and also didn't work.
if you'd like to test the project: It's uploaded to https://github.com/jIrwinCline/scrapeDPSST. there's no README FYI but its pretty straitforward. Just pull down the code, and run "node index.js".
Please Help! I've been going at this pagination thing for the last day.
Upvotes: 0
Views: 1312
Reputation: 3033
First problem is you can't pass a function to evaluate
like that. For exposing functions to the web page context you should use exposeFunction
.
Even if you expose clickLink, it'll throw an error for page
. It's important to understand different execution contexts. Puppteer's execution context is different from the web page's. page
is an object in puppeteer context and evaluate
runs your function in the web page's execution context. so you can't pass page
to evaluate.
For pagination, I'd suggest taking out that pageList to puppeteer context since the execution context will get destroyed when you navigate:
const pageList = await page.$$('b > a');
Now loop through the links in puppeteer context and click on them and run your page.evaluate
to get the data.
for (let link of pageList){
await Promise.all([
page.waitForNavigation(),
link.click()
]);
//get result
await page.evaluate(() => ...);
}
Make sure links are not opened in new tabs since page is referring to the current page/tab.
Upvotes: 1