Reputation: 1591
I have been searching for a headless web browser that can run on a server so that web crawlers can index a single-page application. First I tried HtmlUnit and Selenium (HtmlUnitDriver), but it seems both of them have issues with XHR requests.
Then I discovered PhantomJS, which performs better and seems mature. PhantomJS has an internal web server, so I decided to use it behind my reverse proxy. However, I ran a benchmark and PhantomJS pegs a CPU core at 100%, and the average page load time is around 4 seconds. The reason is that I have to wait for the browser to load all resources to be able to get correct results. Here is my PhantomJS script:
var page = require('webpage');
var system = require('system');
var server = require('webserver').create();
// credit: http://backbonetutorials.com/seo-for-single-page-apps/
// Tunables that decide when a rendered page is considered complete.
var POLL_INTERVAL_MS = 50;  // how often the completion check runs; a few ms here busy-loops a CPU core
var QUIET_PERIOD_MS = 300;  // silence required after the last response before replying
var MAX_WAIT_MS = 5000;     // hard cap: reply with whatever has rendered after this long

/**
 * Returns the number of bytes `text` occupies once encoded as UTF-8.
 *
 * `String.prototype.length` counts UTF-16 code units, which under-reports
 * the body size for any non-ASCII character and yields a wrong
 * Content-Length header.
 *
 * @param {string} text - The string to measure.
 * @returns {number} Byte length of the UTF-8 encoding.
 */
function utf8ByteLength(text) {
    var bytes = 0;
    for (var i = 0; i < text.length; i++) {
        var code = text.charCodeAt(i);
        if (code < 0x80) {
            bytes += 1;
        } else if (code < 0x800) {
            bytes += 2;
        } else if (code >= 0xD800 && code <= 0xDBFF && i + 1 < text.length) {
            var next = text.charCodeAt(i + 1);
            if (next >= 0xDC00 && next <= 0xDFFF) {
                // Valid surrogate pair: one 4-byte code point.
                bytes += 4;
                i++;
            } else {
                bytes += 3;
            }
        } else {
            bytes += 3;
        }
    }
    return bytes;
}

// NOTE(review): `port` is not defined anywhere in the visible script —
// presumably it is meant to come from `system.args`; confirm before running.
/**
 * HTTP handler: opens the requested URL in a fresh headless page, waits until
 * every resource request seen so far has produced a response and the network
 * has been quiet for QUIET_PERIOD_MS (or MAX_WAIT_MS has elapsed), then
 * replies with the rendered HTML.
 */
var service = server.listen(port, { 'keepAlive': true }, function (incoming, response) {
    var webPage = page.create();
    var startTime = new Date().getTime();
    var lastReceived = startTime;
    var requestCount = 0;
    var responseCount = 0;
    // Ids of resource requests still awaiting their first response notification.
    // An object keyed by id gives O(1) lookups instead of repeated indexOf scans.
    var pendingIds = {};

    webPage.onResourceRequested = function (resourceRequest) {
        if (!pendingIds[resourceRequest.id]) {
            pendingIds[resourceRequest.id] = true;
            requestCount++;
        }
    };

    webPage.onResourceReceived = function (resourceResponse) {
        // Count each request once, on its first notification.
        if (pendingIds[resourceResponse.id]) {
            delete pendingIds[resourceResponse.id];
            lastReceived = new Date().getTime();
            responseCount++;
        }
    };

    // Assign individual keys rather than replacing the whole settings object,
    // so settings not listed here keep their default values.
    webPage.settings.loadImages = false;
    webPage.settings.javascriptEnabled = true;
    webPage.settings.loadPlugins = false;

    webPage.open(incoming.url, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address ' + incoming.url);
        }
    });

    var checkCompleteInterval = setInterval(function () {
        var now = new Date().getTime();
        var isQuiet = requestCount === responseCount && now - lastReceived > QUIET_PERIOD_MS;
        var timedOut = now - startTime > MAX_WAIT_MS;
        if (!isQuiet && !timedOut) {
            return;
        }
        clearInterval(checkCompleteInterval);

        // Read everything needed from the page before it is closed.
        var html = webPage.content;
        var pageUrl = webPage.url;

        response.statusCode = 200;
        response.headers = {
            'Cache-Control': 'no-cache',
            'Content-Type': 'text/html; charset=UTF-8',
            'Connection': 'Keep-Alive',
            'Keep-Alive': 'timeout=5, max=100',
            // Must be the byte count, not the character count, or keep-alive
            // clients will mis-frame bodies containing non-ASCII text.
            'Content-Length': utf8ByteLength(html)
        };
        response.write(html);
        response.close();

        // close() supersedes the deprecated release(); fall back for old builds.
        if (typeof webPage.close === 'function') {
            webPage.close();
        } else {
            webPage.release();
        }
        console.log(pageUrl + " -> " + (now - startTime));
    }, POLL_INTERVAL_MS);
});
Is there any improvement that can be made to speed up the script? Should I just run PhantomJS using its shell command for better performance, or is there any alternative to these browsers?
Upvotes: 2
Views: 2023
Reputation: 1746
You can use some command line switches to improve the capture performance:
First, you can ignore all images with `--load-images=no`. There's no need to load images when doing the HTML snapshots.
You can also enable the cache with `--disk-cache=yes` (use `--max-disk-cache-size` to set its size in bytes).
Finally, the `WebPage#onResourceRequested` callback may also be useful to abort some requests (trackers, media files...) with the `NetworkRequest#abort` method.
Upvotes: 3