margarita
margarita

Reputation: 894

Selector with Cheerio fails to retrieve children

I believe this one is a bug. I am trying to write a simple web scraper with request and cheerio.

How I tried to solve it:

  1. Yes, I played with other ways to define a selector.
  2. Yes, I have investigated other stackoverflow questions.
  3. Yes, I have created an issue on cheerio github, here is the link: https://github.com/cheeriojs/cheerio/issues/1252
  4. Yes, I am a professional web developer, and this is not my first time working with Node.js.

Update: As some people pointed out, the issue was that the DOM nodes I needed were created after my page had been parsed and traversed by cheerio. So the part of the page I requested simply was not there. Any ideas on how to work around that?

I use versions:

{
  "name": "discont",
  "version": "1.0.0",
  "description": "Find when the item is on sale",
  "main": "index.js",
  "license": "MIT",
  "devDependencies": {
    "express": "^4.16.4"
  },
  "dependencies": {
    "cheerio": "^1.0.0-rc.2",
    "ejs": "^2.6.1",
    "request": "^2.88.0"
  }
}

This is the HTML I am trying to scrape:

enter image description here

The link is here: https://www.asos.com/new-look-wide-fit/new-look-wide-fit-court-shoe/prd/10675413?clr=oatmeal&SearchQuery=&cid=6461&gridcolumn=1&gridrow=9&gridsize=4&pge=1&pgesize=72&totalstyles=826

This is my code:

// Fetch the page at `url` and log the <div> elements that are direct children
// of #product-price, as found by cheerio in the HTML string the request returned.
request(url, options, function(error, response, html) {
  // Report a failed request instead of silently printing nothing, so a network
  // failure is not mistaken for a selector that matched no elements.
  if (error) {
    console.error("request failed:", error);
    return;
  }

  // cheerio parses only the `html` string handed to load().
  const $ = cheerio.load(html, { withDomLvl1: false });
  // An earlier attempt selected by class ("div.product-price") and read the
  // first match's attribs instead.
  console.log("product-price", $("div#product-price > div"));
});

The console.log prints an empty array (it is unable to find the nested div).

This is what I get in return:

initialize {
  options: 
   { withDomLvl1: false,
     normalizeWhitespace: false,
     xml: false,
     decodeEntities: true },
  _root: 
   initialize {
     '0': 
      { type: 'root',
        name: 'root',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: {},
        'x-attribsNamespace': {},
        'x-attribsPrefix': {},
        children: [Array],
        parent: null,
        prev: null,
        next: null },
     options: 
      { withDomLvl1: false,
        normalizeWhitespace: false,
        xml: false,
        decodeEntities: true },
     length: 1,
     _root: [Circular] },
  length: 0,
  prevObject: 
   initialize {
     '0': 
      { type: 'root',
        name: 'root',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: {},
        'x-attribsNamespace': {},
        'x-attribsPrefix': {},
        children: [Array],
        parent: null,
        prev: null,
        next: null },
     options: 
      { withDomLvl1: false,
        normalizeWhitespace: false,
        xml: false,
        decodeEntities: true },
     length: 1,
     _root: [Circular] } }

but if I change my code to

// Fetch the page at `url` and log the #product-price <div> itself, as found by
// cheerio in the HTML string the request returned.
request(url, options, function(error, response, html) {
  // Report a failed request instead of silently printing nothing, so a network
  // failure is not mistaken for a selector that matched no elements.
  if (error) {
    console.error("request failed:", error);
    return;
  }

  // cheerio parses only the `html` string handed to load().
  const $ = cheerio.load(html, { withDomLvl1: false });
  // An earlier attempt selected by class ("div.product-price") and read the
  // first match's attribs instead.
  console.log("product-price", $("div#product-price"));
});

I get an array with a single element:

initialize {
  '0': 
   { type: 'tag',
     name: 'div',
     namespace: 'http://www.w3.org/1999/xhtml',
     attribs: 
      { class: 'product-price',
        id: 'product-price',
        'data-bind': 'component: { name: "product-price", params: {state: state, showGermanVatMessage: false }}' },
     'x-attribsNamespace': { class: undefined, id: undefined, 'data-bind': undefined },
     'x-attribsPrefix': { class: undefined, id: undefined, 'data-bind': undefined },
     children: [],
     parent: 
      { type: 'tag',
        name: 'div',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: [Object],
        'x-attribsNamespace': [Object],
        'x-attribsPrefix': [Object],
        children: [Array],
        parent: [Object],
        prev: [Object],
        next: [Object] },
     prev: 
      { type: 'text',
        data: '\n    ',
        parent: [Object],
        prev: [Object],
        next: [Circular] },
     next: 
      { type: 'text',
        data: '\n    ',
        parent: [Object],
        prev: [Circular],
        next: [Object] } },
  options: 
   { withDomLvl1: false,
     normalizeWhitespace: false,
     xml: false,
     decodeEntities: true },
  _root: 
   initialize {
     '0': 
      { type: 'root',
        name: 'root',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: {},
        'x-attribsNamespace': {},
        'x-attribsPrefix': {},
        children: [Array],
        parent: null,
        prev: null,
        next: null },
     options: 
      { withDomLvl1: false,
        normalizeWhitespace: false,
        xml: false,
        decodeEntities: true },
     length: 1,
     _root: [Circular] },
  length: 1,
  prevObject: 
   initialize {
     '0': 
      { type: 'root',
        name: 'root',
        namespace: 'http://www.w3.org/1999/xhtml',
        attribs: {},
        'x-attribsNamespace': {},
        'x-attribsPrefix': {},
        children: [Array],
        parent: null,
        prev: null,
        next: null },
     options: 
      { withDomLvl1: false,
        normalizeWhitespace: false,
        xml: false,
        decodeEntities: true },
     length: 1,
     _root: [Circular] } }

Yet I am not able to see the children of the element (the children array is empty), and I am not able to call any methods on the object, such as find() or text().

Any help is welcome!

Upvotes: 0

Views: 3284

Answers (1)

pguardiario
pguardiario

Reputation: 54987

Cheerio only has access to the DOM as it exists before anything dynamic, such as XHRs, has happened. You would need Puppeteer or Nightmare.js to get the DOM after JavaScript has rendered it.

Upvotes: 4

Related Questions