bischrob
bischrob

Reputation: 584

Clicking on internal JavaScript links and returning URLs using Puppeteer

My goal is to click on each link (called a footnote) on this page and return the footnote's link, its text, and all of the URLs that appear in the sidebar. I'm stuck on accessing the sidebar values when they appear, and after a few weeks of failure I'm looking for some pointers on what I'm doing wrong (I'm very new to both JavaScript and Puppeteer).

// Attempt to list every footnote on the page and, for each one, the links
// shown after clicking it.
const puppeteer = require('puppeteer');
const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';
(async function () {
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto(url);
    // Runs in the page context: for each element matching `selector`, record
    // its href (with the site origin stripped) and its `text` property.
    const footnotes = await page.$$eval(selector, nodes => {
        return nodes.map(node => {
            const ref = node.href.replace('https://www.churchofjesuschrist.org', '');
            const txt = node.text;
            return {
                ref,
                txt
            };
        });
    });
    for (const a of footnotes) {
        // NOTE(review): the promise returned by page.click is not awaited, and
        // a.ref is the stripped href string, whereas page.click expects a
        // selector — confirm against the Puppeteer docs.
        page.click(a.ref);
        // NOTE(review): `links` is a fresh block-scoped binding on every
        // iteration; nothing accumulates the results across footnotes.
        const links = await page.$$eval('.scripture-ref', nodes => {
            return nodes.map(node => {
                return node.href
            })
        })
    }
    console.log(footnotes);
    // NOTE(review): `links` was declared with const inside the loop body
    // above, so it is not in scope on the next line.
    console.log(links);
    // const fs = require('fs');
    // fs.writeFile('./footnotes.json', JSON.stringify(footnotes), err => err ? console.log(err) : null);
    await browser.close();
})();

Upvotes: 2

Views: 61

Answers (1)

vsemozhebuty
vsemozhebuty

Reputation: 13812

Maybe something like this:

const puppeteer = require('puppeteer');

const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
const selector = '.study-note-ref';

/**
 * Clicks every footnote marker on the page, waits for the matching side panel
 * to appear, and prints a JSON object keyed by footnote id. Each entry holds
 * the footnote text and the links found in its panel.
 *
 * The browser is closed in a `finally` block so that a navigation failure or
 * a wait timeout does not leave the launched browser running.
 */
(async function main() {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const [page] = await browser.pages();
    await page.goto(url);

    const data = {};

    for (const footnote of await page.$$(selector)) {
      // Click inside the page context and read the id and text in the same
      // round trip. The id is the href attribute with '/#note' removed; the
      // text drops the first character of innerText (presumably the footnote
      // letter marker — confirm against the page markup).
      const [href, text] = await page.evaluate(
        (a) => {
          a.click();
          return [a.getAttribute('href').replace('/#note', ''), a.innerText.slice(1)];
        },
        footnote
      );
      data[href] = { text };

      // Wait until the panel for *this* footnote is present: its header span
      // text is expected to equal "<id> <text>". Matching on the header avoids
      // reading the previous footnote's panel.
      // NOTE(review): waitForXPath is believed to be removed in newer
      // Puppeteer releases; there, page.waitForSelector with an `xpath/`
      // prefixed selector is the replacement — confirm against the installed
      // version.
      const header = await page.waitForXPath(`//aside/div/header/span[text()="${href} ${text}"]`);

      // Collect every link inside the enclosing <aside> as { linkText: url }.
      data[href].links = await page.evaluate(
        (span) => {
          const aside = span.closest('aside');
          return [...aside.querySelectorAll('a[href]')].map(
            a => ({ [a.innerText]: a.href })
          );
        },
        header
      );

      console.log(`Done: ${href} ${text}`);
    }
    console.log(JSON.stringify(data, null, 2));
  } finally {
    // Always release the browser, even when a step above rejects.
    await browser.close();
  }
})().catch((err) => {
  // Surface the failure and signal it through the exit code instead of
  // leaving the promise rejection unhandled.
  console.error(err);
  process.exitCode = 1;
});

Part of the output:

{
  "1a": {
    "text": "pondering",
    "links": [
      {
        "D&C 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19"
      },
      {
        "TG Meditation": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
      },
      {
        "Doctrine and Covenants 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19#19"
      },
      {
        "Meditation, Meditate": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
      }
    ]
  },
}

Upvotes: 2

Related Questions