Charles
Charles

Reputation: 3

How to retrieve attribute values from n number of child elements using puppeteer?

This code retrieves the attribute value of the first element I selected. If I put /html/body/section/div[3]/img[2] or img[3] in the XPath, I can retrieve the data for the subsequent img elements.

However, on the site I am scraping, the parent element can contain any number of img elements, and I want to get the attribute values for all of them.

Is there some way for me to retrieve all of them?

const puppeteer = require("puppeteer");

/**
 * Opens `url` in a Puppeteer-launched browser and logs the
 * `data-original-name` attribute of the first <img> matched by the XPath
 * below.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>} Resolves after the value has been logged and the
 *   browser has been closed; rejects with the underlying error if any
 *   Puppeteer step fails.
 */
async function scrapeProduct(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Single source of truth for the selector used by both the wait and the query.
    const imgXPath = "/html/body/section/div[3]/img";

    await page.waitForXPath(imgXPath);
    // Only the first matching handle is kept; any further matches are ignored.
    const [el] = await page.$x(imgXPath);
    const attr = await page.evaluate(
      (ele) => ele.getAttribute("data-original-name"),
      el
    );

    console.log({ attr });
  } finally {
    // Always release the browser process, even when navigation, the XPath
    // wait, or the evaluation throws; otherwise it is left running.
    await browser.close();
  }
}

Upvotes: 0

Views: 401

Answers (1)

vsemozhebuty
vsemozhebuty

Reputation: 13772

You can get them all with the XPath Web API (`document.evaluate`) inside `page.evaluate`:

'use strict';

// Minimal inline HTML fixture: a single <div> inside <section> holding three
// <img> elements, each carrying a `data-original-name` attribute.
const html = `
<html><body>
  <section><div>
    <img data-original-name="foo"></img>
    <img data-original-name="bar"></img>
    <img data-original-name="baz"></img>
  </div></section>
</body></html>`;

const puppeteer = require('puppeteer');

/**
 * Demo entry point: loads the inline `html` fixture into a browser page,
 * collects the `data-original-name` value of every <img> matched by the
 * XPath, and logs the resulting array. Any failure is logged via
 * `console.error` rather than rethrown.
 */
(async function main() {
  try {
    const browser = await puppeteer.launch();
    try {
      const [page] = await browser.pages();

      // Percent-encode the markup so characters that are special in a URL
      // (e.g. `#` or `%`) cannot truncate or corrupt the data: document.
      await page.goto(`data:text/html,${encodeURIComponent(html)}`);

      // Runs inside the page context, so only serializable values come back.
      const arrayOfAttributes = await page.evaluate(() => {
        const snapshot = document.evaluate(
          '/html/body/section/div/img',
          document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null,
        );
        // A snapshot result is not iterable; walk it by index.
        return Array.from(
          { length: snapshot.snapshotLength },
          (_, i) => snapshot.snapshotItem(i).dataset.originalName,
        );
      });

      console.log(arrayOfAttributes);
    } finally {
      // Close the browser even when a step above throws, so the browser
      // process is not left running.
      await browser.close();
    }
  } catch (err) {
    console.error(err);
  }
})();

Result:

[ 'foo', 'bar', 'baz' ]

Upvotes: 1

Related Questions