Reputation: 894
I believe this one is a bug. I am trying to write a simple web scraper with request and cheerio.
How I tried to solve it:
Update: After some people pointed out, the issue was that needed dom nodes were created after my page was parsed and traversed by cheerio. So the part of the page I requested simply was not there. Any Ideas how to bypass that?
I use versions:
{
"name": "discont",
"version": "1.0.0",
"description": "Find when the item is on sale",
"main": "index.js",
"license": "MIT",
"devDependencies": {
"express": "^4.16.4"
},
"dependencies": {
"cheerio": "^1.0.0-rc.2",
"ejs": "^2.6.1",
"request": "^2.88.0"
}
}
This is the HTML I am trying to scrape:
This is my code:
request(url, options, function(error, response, html) {
if (!error) {
var $ = cheerio.load(html, { withDomLvl1: false });
// console.log("product-price", $("div.product-price")[0].attribs);
console.log("product-price", $("div#product-price > div"));
}
});
The console.log returns an empty array(unable to find nested div).
This is what I get in return:
initialize {
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] },
length: 0,
prevObject:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] } }
but if I change my code to
request(url, options, function(error, response, html) {
if (!error) {
var $ = cheerio.load(html, { withDomLvl1: false });
// console.log("product-price", $("div.product-price")[0].attribs);
console.log("product-price", $("div#product-price"));
}
});
I get an array with a single element:
initialize {
'0':
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs:
{ class: 'product-price',
id: 'product-price',
'data-bind': 'component: { name: "product-price", params: {state: state, showGermanVatMessage: false }}' },
'x-attribsNamespace': { class: undefined, id: undefined, 'data-bind': undefined },
'x-attribsPrefix': { class: undefined, id: undefined, 'data-bind': undefined },
children: [],
parent:
{ type: 'tag',
name: 'div',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: [Object],
'x-attribsNamespace': [Object],
'x-attribsPrefix': [Object],
children: [Array],
parent: [Object],
prev: [Object],
next: [Object] },
prev:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Object],
next: [Circular] },
next:
{ type: 'text',
data: '\n ',
parent: [Object],
prev: [Circular],
next: [Object] } },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
_root:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] },
length: 1,
prevObject:
initialize {
'0':
{ type: 'root',
name: 'root',
namespace: 'http://www.w3.org/1999/xhtml',
attribs: {},
'x-attribsNamespace': {},
'x-attribsPrefix': {},
children: [Array],
parent: null,
prev: null,
next: null },
options:
{ withDomLvl1: false,
normalizeWhitespace: false,
xml: false,
decodeEntities: true },
length: 1,
_root: [Circular] } }
yet, I am not able to see children of the element (the children array is empty), and I am not able to perform any methods on the object such as find()
or text()
Any help is welcome!
Upvotes: 0
Views: 3284
Reputation: 54987
Cheerio only has access to the DOM before any special things like XHRs have happened. You would need puppeteer or nightmarejs for the post-js-rendered DOM
Upvotes: 4