Reputation: 453
I finally figured out how callbacks work in Node.js, but now I'm trying to get my code to execute in order.
The goal is to (in order):
<td>
in the <tbody>
on the page. The end goal is to go through every page (there is a separate URL for each date, so I am looping through the dates) and INSERT players that aren't in my database ONCE. The problem is that it goes through each SELECT before the INSERT queries are executed, so it's inserting them multiple times.
Here is the page I'm parsing, if it helps: http://www.basketball-reference.com/friv/dailyleaders.cgi?month=12&day=29&year=2014
Here is my code:
// Requests `url`; on an error-free 200 response the body is loaded into
// cheerio and the text of every element matched by $('td', 'tbody') is pushed
// onto `data`. `callback(data)` is invoked only if `rowsRemaining` is 0 once
// the response has been handled.
// `data` and `rowsRemaining` are not declared in this function; they are
// whatever the enclosing scope provides.
function loadPage (url, callback){
request(url, function(err, response, body){
if(!err && response.statusCode ==200){
var $ = cheerio.load(body);
// NOTE(review): `$.length` is a property of the value returned by
// cheerio.load(), not of the $('td', 'tbody') selection, so the counter
// presumably does not start at the number of cells - confirm.
rowsRemaining = $.length;
$('td', 'tbody').each(function(){
var text = $(this).text();
data.push(text);
// one decrement per matched cell
rowsRemaining -= 1;
console.log('rows left: ',rowsRemaining);
});
}
// On a failed request or non-200 status the block above is skipped, so
// `rowsRemaining` keeps whatever value it had before this call.
if (rowsRemaining == 0){
console.log('$ length: ',$.length);
callback(data);
}
});
}
// Visits every 26th entry of `data`, starting at index 1, and passes it to
// lookForPlayer(); `callback()` is called with no arguments right after the
// loop has issued all lookups.
// NOTE(review): presumably each table row contributes 26 cells and the player
// name is the second one - confirm against the parsed page.
function loopThroughData (data, callback){
// `i` is assigned without a declaration, so it is not local to this function.
for(i=1;i<data.length;i+=26){
// replace() with a string pattern removes only the first apostrophe.
lookForPlayer(data[i].replace("'",""),function(name){
/* var insertPlayer = connection.query(
'INSERT INTO player (provider_id, team_id, position_id, name) VALUES (1, (SELECT id FROM team WHERE slug = "'+data[i+1]+'"),1,"'+name+'");',function(err,result,fields){
}); */
// Reads the shared `i` at the time this callback runs, which may be after
// the loop has already advanced it.
console.log('i is currently = ',i);
});
}
// Runs as soon as the loop finishes; it does not wait for the lookForPlayer
// callbacks above.
callback();
}
/**
 * Looks a player up by name and reports the name when no row exists for it.
 *
 * The name is bound through a `?` placeholder rather than concatenated into
 * the SQL text, so quotes in a scraped name can no longer break or alter the
 * statement.
 * NOTE(review): assumes `connection` is a node `mysql`-style connection whose
 * query(sql, values, callback) form supports `?` placeholders - confirm.
 *
 * @param {string} name - Player name to search for.
 * @param {function(string)} callback - Called with `name` only when the query
 *   returns zero rows; it is not called when the player already exists.
 * @throws Rethrows, from inside the query callback, any error the query reports.
 */
function lookForPlayer(name, callback){
  console.log('Looking for Player...');
  connection.query(
    "SELECT * FROM player WHERE name = ?", [name], function(err, rows){
      if(err) throw err;
      if(rows.length === 0){
        callback(name);
      }
    });
}
// Loop through every day since the season started, up to now, requesting the
// page for each date.
// `d` is assigned from `seasonStart` without a copy and advanced in place with
// setDate(), so `seasonStart` is advanced as well.
for (d = seasonStart; d <= Date.now(); d.setDate(d.getDate() + 1)){
console.log('d = ',d);
// getMonth() is zero-based, hence the +1.
// loadPage() is started here and the loop moves straight on to the next date;
// nothing in this loop waits for the callback below.
loadPage(baseURL+(d.getMonth()+1)+'&day='+d.getDate()+'&year='+d.getFullYear(),function(data){
console.log('Page loaded...');
// The completion callback passed to loopThroughData is intentionally empty.
loopThroughData(data,function(){
});
});
}
As you can see, I tried adding a rowsRemaining variable that is meant to make sure I've parsed the whole file before calling the callback in the loadPage function, but it never gets to that point. Note that I initialize a lot of these variables before these functions (rowsRemaining, data, etc).
It also seems to loop through every date before fully loading, parsing, and INSERTing the first page, which it should not be doing.
Here is the updated code based off of @Brant's answer
/**
 * Fetches `url` and appends the text of every element matched by
 * `$('td', 'tbody')` to the shared `data` array, then passes `data` to
 * `callback`.
 *
 * A failed request or a non-200 response is reported on stderr instead of
 * being silently ignored. `callback(data)` is still invoked in that case,
 * exactly as before, so a caller waiting on it keeps going.
 *
 * NOTE(review): `data` is not declared in this function, so cells pushed for
 * earlier pages are still in it when the next page is parsed - confirm whether
 * it should be reset per page.
 *
 * @param {string} url - Page to request.
 * @param {function(Array)} callback - Receives the shared `data` array.
 */
function loadPage (url, callback){
  request(url, function(err, response, body){
    if (err || response.statusCode !== 200){
      console.error('Failed to load ' + url + ':', err || ('HTTP ' + response.statusCode));
      callback(data);
      return;
    }
    var $ = cheerio.load(body);
    console.log(url);
    $('td', 'tbody').each(function(){
      data.push($(this).text());
    });
    callback(data);
  });
}
// Visits every 26th entry of `data`, starting at index 1, asks lookForPlayer()
// about it, and issues an INSERT for each name lookForPlayer() reports back.
// `callback(data)` is called right after the loop has issued all lookups.
// NOTE(review): presumably each table row contributes 26 cells, with the
// player name second and the team slug third - confirm against the page.
function loopThroughData (data, callback){
// `i` is assigned without a declaration, so it is not local to this function.
for(i=1;i<data.length;i+=26){
// replace() with a string pattern removes only the first apostrophe.
lookForPlayer(data[i].replace("'",""),function(name){
// `data[i+1]` is evaluated when this callback runs, using the shared `i` as
// it is at that moment, which may no longer be the index of `name`.
// The statement is assembled by string concatenation from page text.
var insertPlayer = connection.query(
'INSERT INTO player (provider_id, team_id, position_id, name) VALUES (1, (SELECT id FROM team WHERE slug = "'+data[i+1]+'"),1,"'+name+'");',function(err,result,fields){
// The INSERT result and any error are ignored.
});
});
}
// Runs as soon as the loop finishes; it does not wait for the lookups or
// inserts started above.
callback(data);
}
/**
 * Looks a player up by name and reports the name when no row exists for it.
 *
 * The name is bound through a `?` placeholder rather than concatenated into
 * the SQL text, so quotes in a scraped name can no longer break or alter the
 * statement.
 * NOTE(review): assumes `connection` is a node `mysql`-style connection whose
 * query(sql, values, callback) form supports `?` placeholders - confirm.
 *
 * @param {string} name - Player name to search for.
 * @param {function(string)} callback - Called with `name` only when the query
 *   returns zero rows; it is not called when the player already exists.
 * @throws Rethrows, from inside the query callback, any error the query reports.
 */
function lookForPlayer(name, callback){
  connection.query(
    "SELECT * FROM player WHERE name = ?", [name], function(err, rows){
      if(err) throw err;
      if(rows.length === 0){
        console.log(name,' was not found in DB!');
        callback(name);
      }
    });
}
// Loop through every day since the season started, up to now, collecting the
// dates to process.
// `d` is assigned from `seasonStart` without a copy and advanced in place with
// setDate(); the same object is pushed on every iteration, so every element of
// validDatesArr refers to that one Date.
for (d = seasonStart; d <= Date.now(); d.setDate(d.getDate() + 1)){
validDatesArr.push(d);
}
// Hand the collected dates to async.eachSeries; the iterator signals completion
// of one date by calling `callback` after loopThroughData reports back.
async.eachSeries(validDatesArr,
// The iterator's first parameter reuses the name `validDatesArr`; inside this
// function it is the current element, not the array.
function(validDatesArr, callback){
// `'/month='+validDatesArr.getMonth()+1` is evaluated left to right as string
// concatenation, so the text "1" is appended after the zero-based month rather
// than being added to it.
loadPage(baseURL+'/month='+validDatesArr.getMonth()+1+'&day='+validDatesArr.getDate()+'&year='+validDatesArr.getFullYear(),function(data){
loopThroughData(data, function(){
callback();
});
});
}, function(err){
// Final callback: logs only when no error was passed; an error is ignored.
if(!err){
console.log('We processed each date requests one by one');
}
}
);
So now it's loading the pages one by one, but it isn't executing the INSERT function in the loopThroughData function on that data. I would think I would just add another function to the async list, but this particular one is calling a function as opposed to using an anonymous one.
Upvotes: 0
Views: 226
Reputation: 1788
Modify your for loop to be as follows:
// Loop through every day since the season started and collect one Date per day.
// The loop counter is advanced in place with setDate(), so each entry must be
// a fresh copy: pushing the counter itself would fill the array with
// references to a single object that ends up one day past the last date.
// Copying `seasonStart` up front also keeps the loop from modifying it.
// NOTE(review): assumes `seasonStart` is a Date (or a value `new Date()` accepts).
var validDatesArr = [];
for (var d = new Date(seasonStart); d <= Date.now(); d.setDate(d.getDate() + 1)){
  validDatesArr.push(new Date(d));
}

// Process the dates one at a time: the page for the next date is requested
// only after `callback` has been called for the current one.
async.eachSeries(validDatesArr,
  function(d, callback) {
    // getMonth() is zero-based, hence the +1.
    loadPage(baseURL+(d.getMonth()+1)+'&day='+d.getDate()+'&year='+d.getFullYear(),function(data){
      console.log('Page loaded...');
      loopThroughData(data,function(){
        callback();
      });
    });
  }, function(err) {
    // Report a failure instead of finishing silently.
    if (err) {
      console.error('Stopped processing dates:', err);
      return;
    }
    console.log('We processed each date request one by one');
  }
);
And require async which can be found here: https://github.com/caolan/async
npm install async
Upvotes: 1
Reputation: 964
You can nest the async functions to control the execution flow as in sequential programming, but beware of the pyramid of doom. The other solution is to use the sync versions of the async functions you used (if they exist). You are not forced to write async functions if you do NOT need them; Node.js uses a lot of async functions because it is a non-blocking runtime, which makes it very powerful for web development. So do NOT use the async style and callbacks in your own functions!
Upvotes: 0