Reputation: 273
I am scraping a website that is built with React components, using PhantomJS from Node.js.
With this: https://github.com/amir20/phantomjs-node
Here is the code:
// Opens `url` in a PhantomJS page and prints the page's `content` property.
// The content is read as soon as `open` settles; nothing in this chain waits
// for anything else to happen on the page first.
phantom.create().then(ph => {
  _ph = ph;
  return _ph.createPage();
}).then(page => {
  _page = page;
  return _page.open(url);
}).then(status => {
  return _page.property('content');
}).then(content => {
  console.log(content);
}).catch(e => console.log(e)).then(() => {
  // Cleanup runs after success AND after a logged failure, so a rejected step
  // no longer leaves the page and the phantom instance open.
  // `typeof` guards: this snippet does not show where `_ph` / `_page` are
  // declared, and either may never have been assigned if an early step failed.
  if (typeof _page !== 'undefined' && _page) {
    _page.close();
  }
  if (typeof _ph !== 'undefined' && _ph) {
    _ph.exit();
  }
});
The problem is that the React content is not rendered; the output only contains <!-- react-empty: 1 -->
where the actual React component should be loaded.
How can I scrape the rendered React component? I initially switched from a pure node-request solution to PhantomJS to fix this, but now I am stuck.
UPDATE:
So I don't have a real solution yet. I switched to NightmareJS (https://github.com/segmentio/nightmare), which has a nice .wait('.some-selector')
function that waits until an element matching the specified selector is present. This fixed my problems with dynamically loaded React components.
Upvotes: 0
Views: 3142
Reputation: 724
I think you should wait for the React elements to be rendered after the page has loaded. An example of such a waiting function, using Q promises, is below. It returns a promise and checks the page state every 50 ms. If the required page state is reached, the function resolves the promise; if the timeout expires first, it rejects the promise.
var phantom = require('phantom');
var Q = require('q');
var _ph, _page, _outObj;
var url = 'https://tech.yandex.ru/maps/jsbox/';
// Opens `url`, waits (up to 3 seconds) for `textPopulated` to report success,
// then prints the page's `content` property.
phantom.create().then(ph => {
  _ph = ph;
  return _ph.createPage();
}).then(page => {
  _page = page;
  return _page.open(url);
}).then(status => {
  console.log(status);
  return waitState(textPopulated, 3);
}).then(() => {
  return _page.property('content');
}).then(content => {
  console.log(content);
}).catch(e => console.log(e)).then(() => {
  // Cleanup runs after success AND after a logged failure. Previously a
  // rejection (for example the waitState timeout) skipped `_ph.exit()`,
  // leaving the phantom instance open.
  // Either variable is still unset if the step that assigns it never ran.
  if (_page) {
    _page.close();
  }
  if (_ph) {
    _ph.exit();
  }
});
/**
 * Checks whether the `.ace_text-layer` element on the page has any child elements.
 *
 * @returns {Promise<boolean>} Resolves to `true` when the reported child
 *   count is greater than zero, `false` otherwise (including when the element
 *   is missing).
 */
function textPopulated() {
  // The callback uses `document`, so it is presumably executed inside the
  // loaded page rather than in Node; it is kept in plain ES5 for that reason.
  const countPromise = _page.evaluate(function () {
    var textLayer = document.querySelector('.ace_text-layer');
    if (!textLayer) {
      return textLayer;
    }
    return textLayer.childElementCount;
  });

  return countPromise.then((count) => {
    console.log('childElementCount: ' + count);
    return count > 0;
  });
}
/**
 * Polls a predicate until it reports success or a deadline passes.
 *
 * @param {Function} state - Predicate to poll. May return a boolean or a
 *   promise of one; its `name` is used in the log and error messages.
 * @param {number} [timeout] - Deadline in seconds. Missing or non-positive
 *   values fall back to 20 seconds.
 * @param {number} [pollInterval=50] - Delay between polls, in milliseconds.
 * @returns {Promise<undefined>} Resolves once `state` yields a truthy value.
 *   Rejects with `Error('Timeout state: <name>')` when the deadline has passed
 *   after an unsuccessful poll, or with whatever `state` throws or rejects with.
 */
function waitState(state, timeout, pollInterval = 50) { // timeout in seconds is optional
  console.log('Start waiting for state: ' + state.name);
  const limitTime = timeout > 0 ? timeout * 1000 : 20000;
  const startTime = Date.now();
  return wait();

  function wait() {
    // Calling state() inside a promise executor turns a synchronous throw into
    // a rejection and lets plain (non-promise) return values pass through.
    return new Promise(function (resolve) {
      resolve(state());
    }).then(function (result) {
      if (result) {
        console.log('Reached state: ' + state.name);
        return undefined;
      }
      // The deadline is only checked after a poll has completed, so a poll
      // that never settles is not interrupted.
      if (Date.now() - startTime > limitTime) {
        const errorMessage = 'Timeout state: ' + state.name;
        console.log(errorMessage);
        throw new Error(errorMessage);
      }
      return Q.delay(pollInterval).then(wait);
    });
  }
}
Upvotes: 3