Reputation: 339
I am just trying to get the deal items from this Amazon URL:
when I open this link in browser and write the query in console, it works:
document.querySelectorAll('div[class*="DealItem-module__dealItem_"]')
but when I try to fetch the same elements through this PhantomJS
script, it always seems to return nothing:
var page = require('webpage').create();
page.viewportSize = { height: 800, width: 1920 }; // BRODIE : CHROME
page.customHeaders = {
accept:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
// 'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
dnt: '1',
'sec-ch-ua':
'" Not A;Brand";v="99", "Chromium";v="90", "Microsoft Edge";v="90"',
'sec-ch-ua-mobile': '?0',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'none',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66',
};
page.settings.javascriptEnabled = true;
page.settings.loadImages = false;
//Script is much faster with this field set to false
phantom.cookiesEnabled = true;
phantom.javascriptEnabled = true;
// True while a page load is in flight; toggled by the load handlers below.
// Declared explicitly so the handlers do not create an implicit global
// (which would throw a ReferenceError under strict mode).
var loadInProgress = false;

// Bubble up messages from the page's console to the PhantomJS console.
page.onConsoleMessage = function (message) {
  console.log('console.log() -- ', message);
};

// Record and log that a page load has started.
page.onLoadStarted = function () {
  loadInProgress = true;
  console.log('page loading started');
};

// Record and log that a page load has finished.
page.onLoadFinished = function () {
  loadInProgress = false;
  console.log('page loading finished');
};

// Log JavaScript errors raised inside the page, followed by one line per
// stack frame (file and line number).
//   msg   - the error message
//   trace - array of stack frames with `file` and `line` fields; may be
//           missing or empty, in which case only the message is logged
page.onError = function (msg, trace) {
  console.log(msg);
  if (!trace || !trace.length) {
    return;
  }
  trace.forEach(function (item) {
    console.log(' ', item.file, ':', item.line);
  });
};
// OPEN PAGE
// Loads the deals page, then polls once per second until deal items matching
// the selector appear in the DOM or the attempt limit is reached.
console.log('page.open()');
page.open(
  'https://www.amazon.com/gp/goldbox/ref=gbps_ftr_s-5_cd34_wht_26179410?gb_f_deals1=sortOrder:BY_SCORE,includedAccessTypes:GIVEAWAY_DEAL,enforcedCategories:2617941011&pf_rd_p=fd51d8cf-b5df-4144-8086-80096db8cd34&pf_rd_s=slot-5&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=ATVPDKIKX0DER&pf_rd_r=A89BX6V6RQRQ94NFA0DP&ie=UTF8',
  function (status) {
    // A failed load has nothing to poll, so stop instead of looping forever.
    if (status !== 'success') {
      console.log('U N A B L E T O O P E N P A G E . . .');
      phantom.exit(1);
      return;
    }
    console.log(' P A G E O P E N E D . . .');

    var selector = 'div[class*="DealItem-module__dealItem_"]';
    var maxAttempts = 30; // give up after roughly 30 seconds of polling
    var attempts = 0;

    var findAll = setInterval(function () {
      attempts += 1;
      console.log('trying to fetch deals...');

      // page.evaluate can only return JSON-serializable values, so a NodeList
      // cannot cross the boundary. Count the matches inside the page context
      // and return the plain number instead.
      var dealCount = page.evaluate(function (sel) {
        return document.querySelectorAll(sel).length;
      }, selector);

      if (dealCount > 0) {
        console.log('deals.length', dealCount);
        clearInterval(findAll);
        // Extract any further deal data (as serializable values) before this
        // point; the script terminates here.
        phantom.exit(0);
      } else if (attempts >= maxAttempts) {
        console.log('no deals found after ' + attempts + ' attempts, giving up');
        clearInterval(findAll);
        phantom.exit(1);
      }
    }, 1000);
  }
);
Also, when I try to take a screenshot using page.render(), it shows the page with unloaded/unfinished JS (which is different from what we see when we open that URL in a browser):
Also, I noticed that when I run this script in the terminal, I get some JS errors from the webpage:
Any help will be greatly appreciated.
Upvotes: 4
Views: 528
Reputation: 339
Thanks for the answers, Leftium and James. I've tried waitFor.js and other suggestions on Stack Overflow, but none of them worked. I am now using Nightmare.js and it works, following "Nightmare.js - Asynchronous operations and loops" and "Looping through pages when next is available #402".
But knowing how to do it with phantom.js will be nice, though
Upvotes: 1
Reputation: 17903
The reason document.querySelectorAll('div[class*="DealItem-module__dealItem_"]')
only returns results in the browser console and not the PhantomJS script is because they are running on two different versions of the page:
document.querySelectorAll()
does not return anything in the browser if you are logged out of Amazon or using an incognito browser. (Interestingly, that Amazon URL does show a list of deals for me while logged out in incognito mode. Amazon may only show that sign-in message if it suspects an automated bot is accessing the URL...) To get the PhantomJS script to scrape the same page as the one you see in your browser, you must first sign in to Amazon on the PhantomJS headless browser. (PhantomJS probably uses a different browser executable than the one your browser uses.) There are a few different ways to do this:
Since Amazon sometimes shows the list of deals even when not signed in, you may be able to get the list of deals without signing in by making PhantomJS appear like a real browser: ensure PhantomJS sends all the cookies and the User-Agent string like a real browser.
Finally: large sites like Amazon and Google are very good at detecting and preventing automated bots from scraping their sites. You will likely face many more obstacles in the future!
update:
I just checked the Amazon URL, and there are indeed HTTP-only cookies. This type of cookie cannot be accessed (neither read nor written) from JavaScript. So there is a good chance PhantomJS cannot read/write these cookies, aside from manually logging in via the PhantomJS script:
Upvotes: 0
Reputation: 3581
According to the documentation on the evaluate method in PhantomJS
Note: The arguments and the return value to the evaluate function must be a simple primitive object. The rule of thumb: if it can be serialized via JSON, then it is fine.
Closures, functions, DOM nodes, etc. will not work!
Instead, you should perform your length calculation inside the evaluate, then return the simple primitive length.
Upvotes: 3