Reputation: 1218
I am trying to build an algorithm to see, which words occurr the most in comments.
Therefore I came up with this (in Javascript):
var analyze = function(comments){
var detectedWords = [];
var result = {};
comments.forEach(function(comment){
var words = comment.message.split(" ");
words.forEach(function(word){
word = word.toLowerCase();
if(word !== ""){
if(detectedWords.indexOf(word) === -1){
detectedWords.push(word);
result[detectedWords.indexOf(word)] = {"name":word,"count":1};
}else{
result[detectedWords.indexOf(word)].count++;
}
}
});
});
return _.orderBy(result, ['count'], ['desc']);
}
Can the algorithm be optimized further? (toLowerCase() outside the inner loop?
In next step I would define a "blacklist" or words that are not interesting like "the, is, I, am, are,..."
Upvotes: 2
Views: 1189
Reputation: 386560
You could use a hash table for reference to the array item for a faster access to the count object. result
is now an array, which is now sortable.
var analyze = function (comments) {
var result = [],
hash = {};
comments.forEach(function (comment) {
var words = comment.message.split(" ");
words.forEach(function (word) {
word = word.toLowerCase();
if (word !== "") {
if (!hash[word]) {
hash[word] = { name: word, count: 0 };
result.push(hash[word]);
}
hash[word].count++;
}
});
});
return result.sort(function (a, b) { return b.count - a.count;});
//return _.orderBy(result, ['count'], ['desc']);
}
console.log(analyze([{ message: 'a b c d a v d e f g q' }]));
Upvotes: 1