Reputation: 3189
I am trying to scrape a website with multiple sections and pagination, in parallel. The idea is to go through each section page by page.
For instance, if there are 6 sections and each section has 6 items per page (with up to 10 pages in total), then I would like the code to run at least 6 jobs in parallel.
Below is what I have:
const cheerio = require('cheerio');
const request = require('request-promise');

const baseUrl = 'https://www.bankmega.com/';
let category = 0;
let page = 0;

(async function () {
    try {
        const homePage = baseUrl + '/promolainnya.php';
        const html = await request(homePage);
        const $ = cheerio.load(html);
        const jobs = $('div[id="subcatpromo"]').find('img').map((i, img) => scrapePerCategory({title: $(img).attr('title'), category: i + 1}));
        await Promise.all(jobs); // error TypeError: undefined is not a function
    } catch (e) {
        console.log('error in main ', e);
    }
})();
const scrapePerCategory = async (job) => {
    try {
        let pageNumber;
        let i = 1;
        let result = [];
        console.log('start scraping for category ' + job.title);
        do {
            page = i;
            category = job.category;
            const url = baseUrl + `/ajax.promolainnya.php?product=1&subcat=${category}&page=${page}`;
            const html = await request(url);
            const $ = cheerio.load(html);
            if (!pageNumber) {
                pageNumber = $('a.page_promo_lain[id]').length;
            }
            const temp = $('#promolain').find('a').map(async (i, promoElem) => {
                const title = cheerio(promoElem).find('img').attr('title');
                const detailLink = cheerio(promoElem).attr('href');
                const detailHTML = await request(baseUrl + detailLink);
                const $ = cheerio.load(detailHTML);
                const imageurl = baseUrl + $('.keteranganinside').find('img').attr('src');
                console.log('category : ' + job.category + ' with item => ' + JSON.stringify({title: title, imageurl: imageurl}));
                return {title: title, imageurl: imageurl};
            }).get();
            await Promise.all(temp).then(r => result.push(r));
            i++;
        } while (i <= pageNumber);
        await Promise.all(result).then((r) => "done scraping for category " + job.title);
        return result;
    } catch (e) {
        console.log('error in category', e);
    }
};
It prints as expected when I run it:
start scraping for category Travel
start scraping for category Lifestyle
start scraping for category Food & Beverages
start scraping for category Gadget & Entertainment
start scraping for category Daily Needs
start scraping for category Others
category : 6 with item => {"title":"Perubahan Minimum Payment","imageurl":"https://www.bankmega.com//files/images/minimum payment-lp- rev.jpg"}
category : 1 with item => {"title":"Visa Bluebird Diskon hingga 25ribu","imageurl":"https://www.bankmega.com//files/images/0-landing-page-BLUE-BIRD.jpg"}
category : 6 with item => {"title":"Aktivasi Kartu Kredit dan PIN","imageurl":"https://www.bankmega.com//files/images/AKTIVASI-CC-lp-CS5-revrainy.jpg"}
However, the caller (the main function) gives an error as follows:
error in main TypeError: undefined is not a function
at Function.all (<anonymous>)
It makes me wonder whether the code is actually running as I expect or not.
Upvotes: 0
Views: 190
Reputation: 19288
An iterative approach should work, though discovering pageNumber on the first iteration makes it somewhat messy. Recursion would make it a lot tidier (see the sketch after the code below).
Not much time right now as I need to get out for my legal exercise, so here's a version of your iteration that stands a chance of working. You may well need to fix it here and there.
const cheerio = require('cheerio');
const request = require('request-promise');

const baseUrl = 'https://www.bankmega.com/';

(async function () {
    try {
        const $ = cheerio.load(await request(baseUrl + '/promolainnya.php'));
        // map img elements to array of promises ...
        let promises = $('div[id="subcatpromo"]').find('img').get().map((img, i) => scrapePerCategory({'title': $(img).attr('title'), 'category': i + 1}));
        // ... and await the promises.
        const jobs = await Promise.all(promises);
        console.log(jobs);
    } catch (e) {
        console.log('error in main ', e);
    }
})();

const scrapePerCategory = async (job) => {
    try {
        let pageNumber;
        let page = 1; // both `page` and `i` counters seem unnecessary - one or the other?
        const results = [];
        do {
            let url = baseUrl + `/ajax.promolainnya.php?product=1&subcat=${job.category}&page=${page}`;
            let $ = cheerio.load(await request(url));
            if (!pageNumber) {
                pageNumber = $('a.page_promo_lain[id]').length;
            }
            // here compose `innerResults` in much the same way `results` is composed ...
            let innerResults = [];
            let anchors = $('#promolain').find('a');
            for (var i = 0; i < anchors.length; i++) { // for loop here allows `await` to await
                let promoElem = cheerio(anchors[i]);
                let $ = cheerio.load(await request(baseUrl + promoElem.attr('href')));
                innerResults.push({
                    'title': promoElem.find('img').attr('title'),
                    'imageurl': baseUrl + $('.keteranganinside').find('img').attr('src')
                });
            }
            // ... and aggregate `innerResults` into `results`
            results.push(innerResults); // or results = results.concat(innerResults); ?
            page++;
        } while (page <= pageNumber);
        console.log("done scraping for category " + job.title);
        return results;
    } catch (e) {
        console.log('error in category', e);
        throw e;
    }
};
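As an aside, the TypeError in your main comes from passing the cheerio object returned by .map() straight to Promise.all: it is array-like but (in the version you're running, evidently) not iterable, which is why Promise.all throws. The .get() call above converts it into a plain array of promises first.

If you do want to try the recursive route mentioned above, here's a rough, untested sketch along the same lines. scrapePage is just a helper name I've made up, and it assumes the same page markup as the code above:

const scrapePage = async (job, page, pageNumber) => {
    const url = baseUrl + `/ajax.promolainnya.php?product=1&subcat=${job.category}&page=${page}`;
    const $ = cheerio.load(await request(url));
    // discover the page count on the first call only
    if (pageNumber === undefined) {
        pageNumber = $('a.page_promo_lain[id]').length;
    }
    const results = [];
    for (const anchor of $('#promolain').find('a').get()) {
        const promoElem = $(anchor);
        const $detail = cheerio.load(await request(baseUrl + promoElem.attr('href')));
        results.push({
            title: promoElem.find('img').attr('title'),
            imageurl: baseUrl + $detail('.keteranganinside').find('img').attr('src')
        });
    }
    // recurse into the next page until the last one is reached
    return page < pageNumber
        ? results.concat(await scrapePage(job, page + 1, pageNumber))
        : results;
};

const scrapePerCategory = (job) => scrapePage(job, 1);

That way the pageNumber discovery happens once, on the first call, and the loop bookkeeping disappears.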
Upvotes: 2