cerrach
cerrach

Reputation: 197

Cannot print puppeteer response on node.js console

I'm scraping a website for practice, using Puppeteer as a headless browser. My issue is logging the Puppeteer result to the Node.js console.

I am targeting a parent element in the DOM for the products and retrieving certain information from each element. Using Google Dev Tools, all my querySelectors are correct and I can print the data I want without issue. However, when trying to print to my node.js console, I always get a blank array as a response.

const puppeteer = require('puppeteer');



/**
 * Opens the Wallcoverings catalog page in a headless browser, attempts to read
 * the thumbnail, name, color, SKU and price of every `.product-preview`
 * element, and collects one object per product.
 *
 * @returns {Promise<Array>} resolves with `masterList`, the array of collected
 *   product objects (keys: thumbnail, product_name, product_color,
 *   product_sku, product_price).
 */
let scrape = async () => {

  // Accumulator for the per-product objects built in the loop below.
  const masterList = [];

  // Start a headless browser and open a fresh tab.
  const browser = await puppeteer.launch({headless: true});
  const page = await browser.newPage();

  await page.goto('https://www.fschumacher.com/catalog/Wallcoverings?sid=0.07316907031133635');

  // Select every `.product-preview` under the first `.search-results-parent`
  // and hand the selection back as the result of page.evaluate().
  const result = await page.evaluate(() => {
    return document.querySelector('.search-results-parent').querySelectorAll('.product-preview');
  });


  // Walk `result` array-style; the callback is written against the DOM
  // Element API (querySelector, innerText, src).
  // NOTE(review): `result` is whatever page.evaluate() handed back to this
  // script — confirm it still holds usable DOM elements at this point.
  [].forEach.call(result, (el) =>{
    // Image URL: .product-thumb > a > img
    let thumbnail = el.querySelector('.product-thumb')
                      .querySelector('a')
                      .querySelector('img')
                        .src;

    // First <div> inside .product-info; the remaining fields are looked up under it.
    let product_info = el.querySelector('.product-info').querySelector('div');

    let product_name = product_info.querySelector('a').querySelector('div').innerText;

    // Container that holds both the color and the SKU.
    let product_id = product_info.querySelector('.product-id');

    let product_color = product_id.querySelector('.product-color').innerText;
    let product_sku = product_id.querySelector('.product-sku').innerText;

    let product_price = product_info.querySelector('.product-price')
                                    .querySelector('span')
                                      .innerText;

    // Bundle the extracted fields into one record for this product.
    let whole = {
      thumbnail,
      product_name,
      product_color,
      product_sku,
      product_price
    };

    masterList.push(whole);

  });



  // Close the browser; note that this call is not awaited.
  browser.close();

  return masterList;
};

// Run the scraper and print whatever it resolved with.
scrape().then((res) => {
  console.log(res);
});

I am expecting the data seen in Google Dev Tools to appear on my node.js console.

Upvotes: 0

Views: 437

Answers (1)

hardkoded
hardkoded

Reputation: 21607

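The problem is that everything you are running after [].forEach.call(result, (el) =>{ is running in your Node process, not in Chromium. The DOM elements you return from page.evaluate cannot be serialized and sent back to Node, so result does not contain real elements there. So, things like el.querySelector('.product-thumb') won't work because you are "disconnected" from Chromium at that point.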

Good news is that you can solve this by moving more code to the Chromium side:

const mainResult = await page.evaluate(() => {
    const masterList = [];
    var result = document.querySelector('.search-results-parent').querySelectorAll('.product-preview');

    [].forEach.call(result, (el) =>{
    let thumbnail = el.querySelector('.product-thumb')
                        .querySelector('a')
                        .querySelector('img')
                        .src;

    let product_info = el.querySelector('.product-info').querySelector('div');

    let product_name = product_info.querySelector('a').querySelector('div').innerText;

    let product_id = product_info.querySelector('.product-id');

    let product_color = product_id.querySelector('.product-color').innerText;
    let product_sku = product_id.querySelector('.product-sku').innerText;

    let product_price = product_info.querySelector('.product-price')
                                    .querySelector('span')
                                        .innerText;

    let whole = {
        thumbnail,
        product_name,
        product_color,
        product_sku,
        product_price
    };

    masterList.push(whole);

    });

    return masterList;
});
browser.close();  
return mainResult;

Upvotes: 1

Related Questions