Reputation: 57
I'm trying to scrape https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1 with Puppeteer and Node.js
In order to do this, I first get url to each job with function scrapeJobsInIndexPage(url)
and then run function scrapeDescriptionPage(url, page)
which iterates through each job url and scrapes job description.
Problem: The code gets url to each job and successfully paginates, however, it does not execute scrapeDescriptionPage(url, page)
function and I cannot get job description.
How can I open each job url and fetch job description for each job?
This part of the code works - it fetches url of each job.
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
async function scrapeJobsInIndexPage(url) {
try {
const [page] = await browser.pages();
await page.goto("https://www.ventureloop.com/ventureloop/login.php", {
waitUntil: "networkidle0",
});
await page.click("#close-cookies", {
delay: 200,
});
await page.type("[name='email_1']", "[email protected]", {
delay: 200,
});
await page.type("[name='pass']", "Aw8rbJ!9bXt*dpb", { delay: 200 });
await page.click("#formContainer > form > div:nth-child(5) > input", {
delay: 200,
});
await page.waitForNavigation();
await page.goto(url, { waitUntil: "networkidle0" });
const totalPagesSelector = ".pag_txt_tot";
const currentPageSelector = ".pag_txt_current";
await page.waitForSelector(totalPagesSelector);
const totalPages = await page.$eval(totalPagesSelector, (el) =>
Number(el.innerText)
);
for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
await page.waitForFunction(
(sel, page) => document.querySelector(sel)?.innerText === String(page),
{},
currentPageSelector,
currentPage
);
const html = await page.evaluate(() => document.body.innerHTML);
const $ = await cheerio.load(html);
const jobs = $(".tsize a:even")
.map(
(i, element) =>
"https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
)
.get();
console.log(jobs);
const data = await page.evaluate(() => {
const firstDataCell =
document.querySelector("#news_tbl tr td")?.innerText;
return firstDataCell;
});
console.log(`${currentPage}: ${data}`);
await page.evaluate(() => {
document
.querySelector("span.current")
.nextElementSibling?.querySelector("a")
.click();
});
}
} catch (err) {
console.error(err);
}
}
This part is supposed to fetch job details once the url is opened, however, I don't know how to connect it to previous function.
async function scrapeDescriptionPage(url, page) {
let jobText;
try {
jobText = $("#formContainer").text();
const companyImage = await page.$eval(
".cs-media img",
(img) => img.src
);
const applyLinkRedirect = $(".ltp-btn").attr("href");
const jobDescription = $(
"#formContainer > form > div > div > div.company-detail > div:nth-child(3)"
).html();
await page.goto(applyLinkRedirect, { waitUntil: "networkidle0" });
const applyLink = await page.url();
let ventureLoopResult = new testVentureLoopDB({
url,
applyLink,
jobDescription,
companyImage,
});
ventureLoopResults.push(ventureLoopResult);
console.log(ventureLoopResults);
ventureLoopResult.save();
return ventureLoopResults;
} catch (err) {
console.log(err);
}
}
let browser;
This is last function which connects the previous two together (but scrapeDescriptionPage
doesn't work)
async function main() {
browser = await puppeteer.launch({ headless: false });
const descriptionPage = await browser.newPage();
const jobs = await scrapeJobsInIndexPage(
"https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1"
);
for (var i = 1; i < jobs.length; i++) {
const result = await scrapeDescriptionPage(jobs[i], descriptionPage);
console.log(result);
}
}
main();
Upvotes: 0
Views: 793
Reputation: 13772
I do not know cheerio
, so these are just some guesses.
jobs
outside the loop and return it after the loop:async function scrapeJobsInIndexPage(url) {
try {
//...
const jobs = [];
for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
// ...
const currentJobs = $(".tsize a:even")
.map(
(i, element) =>
"https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
)
.get();
console.log(currentJobs);
jobs.push(...currentJobs);
// ...
}
return jobs;
} catch (err) {
console.error(err);
}
}
scrapeDescriptionPage
seems like a function adopted from a different context. But if you need to use cheerio
with each job page, you need to add something like you've already used before:async function scrapeDescriptionPage(url, page) {
await page.goto(url, { waitUntil: "networkidle0" });
const html = await page.evaluate(() => document.body.innerHTML);
const $ = await cheerio.load(html);
let jobText;
// ...
Upvotes: 1