Reputation: 1
I'm building an eBay web scraper as a side project, and I need to know how to call a promise-based function repeatedly to collect a seller's items across multiple pages until I reach the end of their listings.
Basically, using Javascript and Node.js, if the seller has only 1 page of items, I scrape it and everything is fine.
Things get complicated for me when the seller has multiple pages and I need to call the promise function multiple times and return the links to every page. I've tried promise loops, I've tried recursion, and I've tried async/await. Nothing seems to work.
I will include a code snippet for you guys:
const request = require('request-promise');
const cheerio = require('cheerio');
// Settings handed to `request` for the first results page of the seller.
const options = {
    // Seller whose listings span several result pages.
    url: 'https://www.ebay.com/sch/i.html?_nkw=&_in_kw=1&_ex_kw=&_sacat=0&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_sadis=15&_stpos=29582&_sargn=-1%26saslc%3D1&_salic=1&_fss=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=brickearth&_sop=12&_dmd=1&_ipg=50&_fosrp=1',
    // Alternative target: a seller whose listings fit on a single page.
    //url : 'https://www.ebay.com/sch/i.html?_nkw=&_in_kw=1&_ex_kw=&_sacat=0&_udlo=&_udhi=&_ftrt=901&_ftrv=1&_sabdlo=&_sabdhi=&_samilow=&_samihi=&_sadis=15&_stpos=29582&_sargn=-1%26saslc%3D1&_salic=1&_fss=1&_fsradio=%26LH_SpecificSeller%3D1&_saslop=1&_sasl=supersavingsdeals&_sop=12&_dmd=1&_ipg=200&_fosrp=1',
    method: 'GET',
    headers: {
        // Browser-like User-Agent sent with the request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    },
};
// Module-level accumulator: every call to getPagesRecursive pushes into this same array.
const pageLinks = [];
/**
 * Requests `link`, records it in `pageLinks`, and, when the page contains a
 * "next" anchor (`td.pagn-next > a`), calls itself with that anchor's href.
 * When no such anchor is found, the collected `pageLinks` are printed.
 *
 * NOTE(review): the recursive call is returned from the request callback, not
 * from this async function, so the promise returned here is not chained to the
 * requests for later pages. Presumably it settles as soon as the first request
 * does; confirm against how `request` behaves when a callback is supplied.
 *
 * @param {*} link - value handed to `request` for this page; recursive calls
 *   pass the next page's href string.
 * @returns {Promise<undefined>} nothing is returned from the function body.
 */
async function getPagesRecursive(link) {
await request(link, (error, response, html) => {
// Only a successful 200 response is processed; in this callback any error or
// other status is neither logged nor rethrown.
if(!error && response.statusCode === 200) {
const $ = cheerio.load(html);
const nextPage = $('td.pagn-next > a').attr('href');
pageLinks.push(link)
if(nextPage) {
return getPagesRecursive(nextPage);
}
// Reached only when no next-page href was found: print everything collected.
console.log(pageLinks);
}
})
}
I'm pretty sure it's nothing for you Javascript and Node.js experts ;) I've posted it to you using async/await, but I would like to use promises with the request module, since I feel I know them better and personally, I've tried every other way unsuccessfully :)
I've provided two URLs to try in the options object for the request module (the second one is commented out). The first one is a link to a seller with multiple pages of items, and it should get all the page links. The second link is a seller with only one page, and it should get only that seller's one page link.
Thank you in advance ;)
Upvotes: 0
Views: 240
Reputation: 350365
As `request` returns a promise, you don't need to use the callback, nor the `async`/`await` syntax -- just return the promise.
Also, instead of populating a global `pageLinks` array, it is nicer to build the array of links from the values that the promises fulfill with.
/**
 * Collects the URL of every result page, starting at `link` and following
 * each page's "next" anchor (`td.pagn-next > a`) until a page has none.
 *
 * @param {string|Object} link - URL of the first page, or a `request` options
 *   object whose `uri` (or `url`) is that URL as a string. When an options
 *   object is given, its other settings (headers, method, ...) are reused for
 *   every following page instead of being dropped after the first request.
 * @returns {Promise<string[]>} Fulfills with the page URLs in the order they
 *   were visited; rejects as soon as any request rejects.
 */
function getPagesRecursive(link) {
    const hasOptions = typeof link === 'object' && link !== null;
    // Always report the page as a URL string, even when options were passed in.
    const pageUrl = hasOptions ? (link.uri || link.url) : link;

    return request(link).then(html => {
        const $ = cheerio.load(html);
        const nextPage = $('td.pagn-next > a').attr('href');
        if (!nextPage) {
            // Last page: this is where the chain of promises bottoms out.
            return [pageUrl];
        }

        let nextRequest = nextPage;
        if (hasOptions) {
            // Keep the caller's settings for the next page; only the target changes.
            nextRequest = Object.assign({}, link, { uri: nextPage });
            // Avoid leaving two competing targets on the copy.
            delete nextRequest.url;
        }
        return getPagesRecursive(nextRequest).then(links => [pageUrl, ...links]);
    });
}
// Use: start at the first results page, then print every page link once the
// whole chain has been followed (or report the failure of any request).
getPagesRecursive(firstPage)
    .then(links => {
        console.log(links);
    })
    .catch(err => {
        console.log("request failed", err);
    });
Upvotes: 1