Reputation: 81
I'm trying to scrape a page as an exercise to learn PhantomJS, but I've run into an issue. Image loading on the page is deferred, so I'm trying to figure out how to get PhantomJS to scroll down and wait for the images to load. Scrolling straight to the bottom of the page doesn't work, so I was thinking of scrolling 100px every 3 seconds until it reaches the bottom of the page. How would I achieve this with the code below?
const phantom = require('phantom');
(async function() {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.info('Requesting', requestData.url);
});
await page.open(<URL>);
const js = await page.includeJs('http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js');
const data = await page.evaluate(function() {
// Do something
});
page.render('test.pdf');
await page.close();
await instance.exit();
})();
Upvotes: 1
Views: 725
Reputation: 212
const phantom = require('phantom');
/**
 * Scrolls the page to the bottom repeatedly until a scroll no longer changes
 * the length of the page's HTML content (i.e. no new content was loaded).
 *
 * @param {object} page - phantomjs-node page object; must provide
 *   `property(name)` and `evaluate(fn)`.
 * @param {object} [options]
 * @param {number} [options.minDelayMs=5000] - Minimum wait after each scroll.
 * @param {number} [options.maxDelayMs=10000] - Upper bound of the randomized
 *   wait; the actual wait is max(minDelayMs, maxDelayMs * Math.random()).
 * @param {number} [options.maxScrolls=Infinity] - Safety cap on the number of
 *   scrolls, for pages that never stop growing. At least one scroll is
 *   always performed.
 * @returns {Promise<void>} Resolves once the content stops growing or the
 *   scroll cap is reached.
 */
async function scrollPage(page, { minDelayMs = 5000, maxDelayMs = 10000, maxScrolls = Infinity } = {}) {
  // Local sleep helper: the surrounding snippet never defines `wait`.
  const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  let previousContentLength = (await page.property('content')).length;
  let scrollCount = 0;

  // Iterative rather than recursive so long pages do not build up an
  // ever-deeper chain of pending async calls.
  while (true) {
    // Runs inside the PhantomJS page context, so it must stay a plain,
    // self-contained function (no closures over Node-side variables).
    await page.evaluate(function () {
      window.document.body.scrollTop = document.body.scrollHeight;
    });
    scrollCount += 1;

    // Give deferred content time to load before measuring again.
    await wait(Math.max(minDelayMs, maxDelayMs * Math.random()));

    const nextContentLength = (await page.property('content')).length;
    if (nextContentLength === previousContentLength || scrollCount >= maxScrolls) {
      return;
    }

    console.log("Scrolling page:", await page.property('url'), "for more content");
    previousContentLength = nextContentLength;
  }
}
/**
 * Loads a URL in a fresh PhantomJS instance, optionally scrolls it with
 * scrollPage, and returns the page's HTML content.
 *
 * @param {string} pageUrl - URL to open.
 * @param {boolean} shouldScrollPage - When truthy, scrollPage is run before
 *   the content is read.
 * @returns {Promise<string>} The value of the page's `content` property.
 */
async function getPageData(pageUrl, shouldScrollPage) {
  const instance = await phantom.create();
  try {
    const page = await instance.createPage();
    try {
      await page.open(pageUrl);
      if (shouldScrollPage) {
        await scrollPage(page);
      }
      return await page.property('content');
    } finally {
      await page.close();
    }
  } finally {
    // Always exit, otherwise a failure above leaves the PhantomJS process
    // running.
    await instance.exit();
  }
}
Upvotes: 0
Reputation: 16838
PhantomJS does support "scrolling": there is a page property, scrollPosition,
which can probably be used like this:
// Set the page's scrollPosition property to { top: 300, left: 0 }.
await page.property('scrollPosition', { top: 300, left: 0 });
You can change scrollPosition
dynamically, increasing it over time, which should trigger the callbacks responsible for loading the images.
Here's an example in raw PhantomJS script showing the technique to go down Twitter's timeline.
Upvotes: 1
Reputation: 2029
You can also use node-webshot, which is based on PhantomJS, to render the PDF. It has many configuration options. The ones you need are renderDelay, to delay taking the screenshot, and shotOffset, to scroll to where you want.
Upvotes: 0