Christopher

Reputation: 57

Where should I implement a for loop with pagination?

I'm trying to scrape https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1 with Puppeteer and Node.js.

To do this, I first get the URL of each job with the function scrapeJobsInIndexPage(url) and then run scrapeDescriptionPage(url, page), which iterates through each job URL and scrapes the job description.

Problem: The code gets the URL of each job and successfully paginates; however, it never executes the scrapeDescriptionPage(url, page) function, so I cannot get the job description.

How can I open each job URL and fetch the job description for each job?

This part of the code works: it fetches the URL of each job.

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");

async function scrapeJobsInIndexPage(url) {
  try {
    const [page] = await browser.pages();

    await page.goto("https://www.ventureloop.com/ventureloop/login.php", {
      waitUntil: "networkidle0",
    });
    await page.click("#close-cookies", {
      delay: 200,
    });
    await page.type("[name='email_1']", "[email protected]", {
      delay: 200,
    });
    await page.type("[name='pass']", "Aw8rbJ!9bXt*dpb", { delay: 200 });
    await page.click("#formContainer > form > div:nth-child(5) > input", {
      delay: 200,
    });

    await page.waitForNavigation();
    await page.goto(url, { waitUntil: "networkidle0" });

    const totalPagesSelector = ".pag_txt_tot";
    const currentPageSelector = ".pag_txt_current";

    await page.waitForSelector(totalPagesSelector);

    const totalPages = await page.$eval(totalPagesSelector, (el) =>
      Number(el.innerText)
    );

    for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
      await page.waitForFunction(
        (sel, page) => document.querySelector(sel)?.innerText === String(page),
        {},
        currentPageSelector,
        currentPage
      );
      const html = await page.evaluate(() => document.body.innerHTML);
      const $ = await cheerio.load(html);

      const jobs = $(".tsize a:even")
        .map(
          (i, element) =>
            "https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
        )
        .get();
      console.log(jobs);

      const data = await page.evaluate(() => {
        const firstDataCell =
          document.querySelector("#news_tbl tr td")?.innerText;
        return firstDataCell;
      });
      console.log(`${currentPage}: ${data}`);

      await page.evaluate(() => {
        document
          .querySelector("span.current")
          .nextElementSibling?.querySelector("a")
          .click();
      });
    }
  } catch (err) {
    console.error(err);
  }
}

This part is supposed to fetch the job details once the URL is opened; however, I don't know how to connect it to the previous function.

async function scrapeDescriptionPage(url, page) {
  let jobText;

  try {
    jobText = $("#formContainer").text();

    const companyImage = await page.$eval(".cs-media img", (img) => img.src);

    const applyLinkRedirect = $(".ltp-btn").attr("href");
    const jobDescription = $(
      "#formContainer > form > div > div > div.company-detail > div:nth-child(3)"
    ).html();
    await page.goto(applyLinkRedirect, { waitUntil: "networkidle0" });
    const applyLink = await page.url();

    let ventureLoopResult = new testVentureLoopDB({
      url,
      applyLink,
      jobDescription,
      companyImage,
    });
    ventureLoopResults.push(ventureLoopResult);
    console.log(ventureLoopResults);
    ventureLoopResult.save();
    return ventureLoopResults;
  } catch (err) {
    console.log(err);
  }
}


let browser;

This is the last function, which connects the previous two together (but scrapeDescriptionPage doesn't work).

async function main() {
  browser = await puppeteer.launch({ headless: false });
  const descriptionPage = await browser.newPage();
  const jobs = await scrapeJobsInIndexPage(
    "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1"
  );
  for (var i = 1; i < jobs.length; i++) {
    const result = await scrapeDescriptionPage(jobs[i], descriptionPage);
    console.log(result);
  }
}

main();

Upvotes: 0

Views: 793

Answers (1)

vsemozhebuty

Reputation: 13772

I do not know cheerio, so these are just some guesses.

  1. To collect all job URLs, you need to declare jobs outside the loop and return it after the loop:
async function scrapeJobsInIndexPage(url) {
  try {
    //...
    const jobs = [];

    for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
      // ...
      const currentJobs = $(".tsize a:even")
        .map(
          (i, element) =>
            "https://www.ventureloop.com/ventureloop/" + $(element).attr("href")
        )
        .get();
      console.log(currentJobs);
      jobs.push(...currentJobs);
      // ...
    }

    return jobs;
  } catch (err) {
    console.error(err);
  }
}
  2. Then, scrapeDescriptionPage seems like a function adapted from a different context. But if you need to use cheerio with each job page, you need to add something like what you've already used before:
async function scrapeDescriptionPage(url, page) {
    await page.goto(url, { waitUntil: "networkidle0" });
    const html = await page.evaluate(() => document.body.innerHTML);
    const $ = await cheerio.load(html);

    let jobText;

    // ...
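
For completeness, here is a minimal sketch of how main() could then consume the returned array. It assumes scrapeJobsInIndexPage now returns jobs as shown above; also note that your original loop started at i = 1, which skips the first job:

async function main() {
  browser = await puppeteer.launch({ headless: false });
  const descriptionPage = await browser.newPage();
  const jobs = await scrapeJobsInIndexPage(
    "https://www.ventureloop.com/ventureloop/job_search.php?g=0&jcat=46&dc=all&ldata=%&jt=1&jc=1&jd=1&d=5&btn=1"
  );
  // Guard against an undefined result and start at index 0
  // so the first job is not skipped.
  for (let i = 0; i < (jobs || []).length; i++) {
    const result = await scrapeDescriptionPage(jobs[i], descriptionPage);
    console.log(result);
  }
  await browser.close();
}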

Upvotes: 1
