Reputation: 83
I am trying to scrape http://www.ratemyprofessors.com/ for all professors, but my code fails with the following error:
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory
1: node::Abort() [node]
2: 0x10d3f9c [node]
3: v8::Utils::ReportApiFailure(char const*, char const*) [node]
4: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [node]
5: v8::internal::Factory::NewFillerObject(int, bool, v8::internal::AllocationSpace) [node]
6: v8::internal::Runtime_AllocateInTargetSpace(int, v8::internal::Object**, v8::internal::Isolate*) [node]
7: 0x292aec062bf
Aborted
I don't know what caused this error, but could it be my loop? I eventually need to loop over 10 million pages, yet I don't see why it fails with just 10 iterations. Here is the code:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');

var app = express();

var count = 1;
var url;

while (count != 10) {
    url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + count;
    request(url, function(err, resp, body) {
        var $ = cheerio.load(body);
        if ($('.error').text().substring(0, 14) == "Page Not Found") {
            console.log("hello");
            count++;
            return;
        } else {
            console.log($('.error').text().substring(0, 14));
            var pfname = $('.pfname');
            var plname = $('.plname');
            var professorName = pfname.text().replace(/\s/g, '') + " " + plname.text().replace(/\s/g, '');
            console.log(professorName);
            console.log(url);
            count++;
        }
        return;
    });
}

app.listen(3000, function() {
    console.log("server is now listening");
});
Upvotes: 1
Views: 1104
Reputation: 6037
I think Raphael is correct that you're doing way more than 10 loops, because count is not incremented until the request's callback fires. You can solve this problem with something like async.whilst, which lets you run asynchronous code serially inside a while-style loop:
const request = require('request')
const async = require('async')

let count = 1

const test = () => count < 10

const iteratee = callback => {
  const url = 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=' + count
  request(url, (error, response, body) => {
    if (error) return callback(error)
    // do other stuff here
    count++
    callback()
  })
}

const done = error => {
  // all done
}

async.whilst(test, iteratee, done)
This is probably safer and more responsible anyway, because it prevents you from making concurrent requests to their server (imagine firing off 10 million HTTP requests to the same place at the same time – not good). If you do want concurrency, consider a "parallel" method like async.map or async.each in conjunction with a rate limiter like bottleneck.
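For instance, a rough sketch of the capped-concurrency idea using async.eachLimit (the limited-concurrency sibling of async.each); the limit of 5 and the 1–10 range are placeholder choices, not anything from the question:

const request = require('request')
const cheerio = require('cheerio')
const async = require('async')

// Build the full list of URLs up front
const urls = []
for (let tid = 1; tid < 10; tid++) {
  urls.push('http://www.ratemyprofessors.com/ShowRatings.jsp?tid=' + tid)
}

// Fetch at most 5 pages at any one time
async.eachLimit(urls, 5, (url, callback) => {
  request(url, (error, response, body) => {
    if (error) return callback(error)
    const $ = cheerio.load(body)
    // scrape the page here
    callback()
  })
}, error => {
  // all done (or an error occurred)
})

bottleneck takes a similar shape but also lets you enforce a minimum delay between requests, which is friendlier to the target server.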
Upvotes: 0
Reputation: 636
You should create an array of URLs, and then iterate through that array, scraping each page with Cheerio. This code should get you started, although it could use a lot of improvements. The timeout at the end is so the URL array can finish populating.
var request = require('request');
var cheerio = require('cheerio');

var url;
var urls = [];

for (var i = 1; i < 10; i++) {
  url = 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=' + i;
  urls.push(url);
}

function done() {
  var arrayLength = urls.length;
  var promiseArray = [];
  for (let i = 0; i < arrayLength; i++) {
    request(urls[i], function(err, resp, body) {
      var $ = cheerio.load(body);
      if ($('.error').text().substring(0, 14) == 'Page Not Found') {
        console.log('hello');
        return;
      } else {
        console.log($('.error').text().substring(0, 14));
        var pfname = $('.pfname');
        var plname = $('.plname');
        var professorName =
          pfname.text().replace(/\s/g, '') + ' ' + plname.text().replace(/\s/g, '');
        console.log(professorName);
        console.log(urls[i]); // log the URL for this request, not the last one built above
      }
      return;
    });
  }
}

setTimeout(function() {
  done();
}, 3000);

console.log(urls);
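As one of the possible improvements, the declared-but-unused promiseArray hints at a promise-based version. A minimal sketch of that idea (fetchPage is a hypothetical helper, not part of the original answer) lets Promise.all tell you when every page has been scraped, instead of relying on a timeout:

var request = require('request');
var cheerio = require('cheerio');

// fetchPage is a hypothetical helper that wraps request in a Promise.
function fetchPage(url) {
  return new Promise(function(resolve, reject) {
    request(url, function(err, resp, body) {
      if (err) return reject(err);
      resolve(body);
    });
  });
}

var urls = [];
for (var i = 1; i < 10; i++) {
  urls.push('http://www.ratemyprofessors.com/ShowRatings.jsp?tid=' + i);
}

var promiseArray = urls.map(function(url) {
  return fetchPage(url).then(function(body) {
    var $ = cheerio.load(body);
    // scrape the page here
  });
});

Promise.all(promiseArray)
  .then(function() { console.log('all pages scraped'); })
  .catch(function(err) { console.error(err); });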
Upvotes: 0
Reputation: 2197
You're probably doing way more than 10 loops. You only increment count in the callback for the request, which might fire a few hundred milliseconds after the request is sent. In that time, your while loop keeps sending requests as fast as it can.
This would probably work better if you used a normal for loop instead of a while loop.
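For the 10-page case, a rough sketch of what that might look like (selectors taken from the question's code; everything else is illustrative):

var request = require('request');
var cheerio = require('cheerio');

// The loop variable drives the URL, so nothing in the callback
// needs to increment a shared counter.
for (var count = 1; count < 10; count++) {
  var url = 'http://www.ratemyprofessors.com/ShowRatings.jsp?tid=' + count;
  request(url, function(err, resp, body) {
    if (err) return console.error(err);
    var $ = cheerio.load(body);
    if ($('.error').text().substring(0, 14) == 'Page Not Found') return;
    var professorName =
      $('.pfname').text().replace(/\s/g, '') + ' ' + $('.plname').text().replace(/\s/g, '');
    console.log(professorName);
  });
}

Note that this still fires all the requests at once, which is fine for 10 pages but not for 10 million; for that you'd want to limit concurrency, as the async.whilst answer suggests.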
Upvotes: 0