Reputation: 41
I have a program that receives a large number of values as input and has to keep track of the n largest ones received. For example, if n is 3 and the input is 1, 6, 3, 5, 2, the output has to be 6, 5, 3 (not necessarily in this order).
At the moment I'm using a min heap implemented via an array, but that isn't quite cutting it time-wise. Are there any alternatives I can look into?
Upvotes: 2
Views: 620
Reputation: 2562
The "(not necessarily in this order)" implies that you can have a sorted numerically output. As you only have to track values greater or equal than n, with a very large input of integer values, a wise way would be then to keep track of integer ranges instead of values.
It will heavily depend of the inputs. For hughes word integer inputs with a discrete uniform distribution, it would cost less memory and would be faster.
A simple pseudo code implementation would require for each input value to check it against a min heap ordered stack of ranges :
Range is
    min as integer
    max as integer
    next as Range
    duplicates as stack of integer

    Range(pMin, pMax, pNext)
        self.min = pMin
        self.max = pMax
        self.next = pNext
        self.duplicates = empty stack of integer

Range heap_top = NULL
Range current_range = NULL
Range previous_range = NULL
boolean merge_flag
integer value

While read value from input
    If value >= n Then
        current_range = heap_top
        previous_range = NULL
        merge_flag = false
        While current_range is not NULL
            If value is current_range.min - 1 Then
                current_range.min = value
                merge_flag = true
                Break
            End If
            If value is current_range.max + 1 Then
                current_range.max = value
                merge_flag = true
                Break
            End If
            If value < current_range.min Then
                current_range = NULL
                Break
            End If
            If value is between current_range.min and current_range.max Then
                # Here we track duplicate values
                current_range.duplicates.push value
                Break
            End If
            previous_range = current_range
            current_range = current_range.next
        End While
        If current_range is not NULL Then
            If merge_flag is true Then
                If previous_range is not NULL and current_range.min - 1 is previous_range.max Then
                    # Merge the current range into the previous one
                    previous_range.max = current_range.max
                    # Here we track duplicate values
                    previous_range.duplicates.pushall current_range.duplicates
                    previous_range.next = current_range.next
                    drop current_range
                    # If we need to keep track of the range the value belongs to:
                    # current_range = previous_range
                Else
                    If current_range.next is not NULL and current_range.max + 1 is current_range.next.min Then
                        # Merge the next range into the current one
                        # We use previous_range to point to the next range
                        previous_range = current_range.next
                        current_range.max = previous_range.max
                        # Here we track duplicate values
                        current_range.duplicates.pushall previous_range.duplicates
                        current_range.next = previous_range.next
                        drop previous_range
                    End If
                End If
            End If
        Else
            If previous_range is NULL Then
                current_range = new Range(value, value, heap_top)
                heap_top = current_range
            Else
                current_range = new Range(value, value, previous_range.next)
                previous_range.next = current_range
            End If
        End If
    End If
End While
Fewer nodes means less traversal work per input value in the long run when the input is uniformly distributed, and less traversal work per value means faster overall processing: when most values merge into or fall inside existing ranges, the algorithm approaches O(N), with N the number of input values, instead of the O(N·k) cost of walking a list of k individually stored values for every input.
An example C implementation of the previous algorithm:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct s_range {
    unsigned int min;
    unsigned int max;
    struct s_range *next;
    unsigned int *duplicates;
    unsigned int duplicates_count;
};
struct s_range *new_range(unsigned int pMin, unsigned int pMax, struct s_range *pNext) {
    struct s_range *lRange = malloc(sizeof(struct s_range));
    if (lRange == NULL) {
        perror("new_range: Failed to allocate");
        return NULL;
    }
    lRange->min = pMin;
    lRange->max = pMax;
    lRange->next = pNext;
    lRange->duplicates = NULL;
    lRange->duplicates_count = 0;
    return lRange;
}
void drop_range(struct s_range *pRange) {
    if (pRange != NULL) {
        if (pRange->duplicates != NULL) {
            free(pRange->duplicates);
        }
        free(pRange);
    }
}

void drop_allranges(struct s_range *pTopRange) {
    struct s_range *lRange;
    while (pTopRange != NULL) {
        lRange = pTopRange->next;
        drop_range(pTopRange);
        pTopRange = lRange;
    }
}
/* Append one duplicate value to a range's dynamically grown array */
int push_duplicates(struct s_range *pRange, unsigned int pValue) {
    unsigned int *lDuplicates;
    if (pRange == NULL) {
        return 2;
    }
    if (pRange->duplicates == NULL) {
        lDuplicates = malloc(sizeof(unsigned int));
    } else {
        lDuplicates = realloc(pRange->duplicates, (pRange->duplicates_count + 1) * sizeof(unsigned int));
    }
    if (lDuplicates == NULL) {
        perror("push_duplicates: failed to allocate...");
        return 1;
    }
    lDuplicates[pRange->duplicates_count++] = pValue;
    pRange->duplicates = lDuplicates;
    return 0;
}
/* Append all duplicates of the source range to the destination range (used when merging) */
int pushall_duplicates(struct s_range *pRangeDst, struct s_range *pRangeSrc) {
    unsigned int *lDuplicates;
    if (pRangeDst == NULL || pRangeSrc == NULL) {
        return 2;
    }
    if (pRangeSrc->duplicates == NULL) {
        return 0;
    }
    if (pRangeDst->duplicates == NULL) {
        lDuplicates = malloc(pRangeSrc->duplicates_count * sizeof(unsigned int));
    } else {
        lDuplicates = realloc(pRangeDst->duplicates, (pRangeDst->duplicates_count + pRangeSrc->duplicates_count) * sizeof(unsigned int));
    }
    if (lDuplicates == NULL) {
        perror("pushall_duplicates: failed to allocate...");
        return 1;
    }
    memcpy(&lDuplicates[pRangeDst->duplicates_count], pRangeSrc->duplicates, pRangeSrc->duplicates_count * sizeof(unsigned int));
    pRangeDst->duplicates_count += pRangeSrc->duplicates_count;
    pRangeDst->duplicates = lDuplicates;
    return 0;
}
int main(int nbargs, char *argv[]) {
    struct s_range *lHeapTop = NULL;
    struct s_range *lCurrentRange;
    struct s_range *lPreviousRange;
    unsigned int lMergeFlag;
    unsigned int lValue;
    unsigned int lN = 3;
    unsigned int lDispFlag = 0;

    if (nbargs > 1) {
        lN = atoi(argv[1]);
    }
    if (nbargs > 2) {
        lDispFlag = atoi(argv[2]);
    }

    /* Read raw unsigned int values from stdin */
    while (fread(&lValue, sizeof(unsigned int), 1, stdin) > 0) {
        if (lValue >= lN) {
            lCurrentRange = lHeapTop;
            lPreviousRange = NULL;
            lMergeFlag = 0;
            /* Walk the ordered range list looking for a match, an extension or an insertion point */
            while (lCurrentRange != NULL) {
                if (lCurrentRange->min - 1 == lValue) {
                    /* Value extends the current range downwards */
                    lCurrentRange->min = lValue;
                    lMergeFlag = 1;
                    break;
                }
                if (lCurrentRange->max + 1 == lValue) {
                    /* Value extends the current range upwards */
                    lCurrentRange->max = lValue;
                    lMergeFlag = 1;
                    break;
                }
                if (lValue < lCurrentRange->min) {
                    /* Value belongs before the current range: a new range is needed */
                    lCurrentRange = NULL;
                    break;
                }
                if (lValue >= lCurrentRange->min && lValue <= lCurrentRange->max) {
                    /* Value already covered: track it as a duplicate */
                    if (push_duplicates(lCurrentRange, lValue) != 0) {
                        drop_allranges(lHeapTop);
                        return 1;
                    }
                    break;
                }
                lPreviousRange = lCurrentRange;
                lCurrentRange = lCurrentRange->next;
            }
            if (lCurrentRange != NULL) {
                if (lMergeFlag == 1) {
                    if (lPreviousRange != NULL && lCurrentRange->min - 1 == lPreviousRange->max) {
                        /* Merge the current range into the previous one */
                        lPreviousRange->max = lCurrentRange->max;
                        if (pushall_duplicates(lPreviousRange, lCurrentRange) != 0) {
                            drop_allranges(lHeapTop);
                            return 1;
                        }
                        lPreviousRange->next = lCurrentRange->next;
                        drop_range(lCurrentRange);
                    } else {
                        if (lCurrentRange->next != NULL && lCurrentRange->max + 1 == lCurrentRange->next->min) {
                            /* Merge the next range into the current one */
                            lPreviousRange = lCurrentRange->next;
                            lCurrentRange->max = lPreviousRange->max;
                            if (pushall_duplicates(lCurrentRange, lPreviousRange) != 0) {
                                drop_allranges(lHeapTop);
                                return 1;
                            }
                            lCurrentRange->next = lPreviousRange->next;
                            drop_range(lPreviousRange);
                        }
                    }
                }
            } else {
                if (lPreviousRange == NULL) {
                    /* Insert a new single-value range at the head of the list */
                    lCurrentRange = new_range(lValue, lValue, lHeapTop);
                    if (lCurrentRange == NULL) {
                        drop_allranges(lHeapTop);
                        return 1;
                    }
                    lHeapTop = lCurrentRange;
                } else {
                    /* Insert a new single-value range after the previous one */
                    lCurrentRange = new_range(lValue, lValue, lPreviousRange->next);
                    if (lCurrentRange == NULL) {
                        drop_allranges(lHeapTop);
                        return 1;
                    }
                    lPreviousRange->next = lCurrentRange;
                }
            }
        }
    }

    // Check the results
    if (lDispFlag == 1) {
        lCurrentRange = lHeapTop;
        while (lCurrentRange != NULL) {
            printf("From %u to %u dup:", lCurrentRange->min, lCurrentRange->max);
            for (unsigned int lInd = 0; lInd < lCurrentRange->duplicates_count; lInd++) {
                printf(" %u", lCurrentRange->duplicates[lInd]);
            }
            printf("\n");
            lCurrentRange = lCurrentRange->next;
        }
    }

    // Cleaning
    drop_allranges(lHeapTop);
    return 0;
}
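Assuming the source above is saved as range.c (the file name is just an example), it can be built with any C99 compiler, e.g.:

gcc -O2 -std=c99 -o range range.c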
With a set of 65 536 word-sized values drawn from a discrete uniform distribution, on x64 Debian Buster (4670K CPU), this code (the range executable) is three times faster than a classic min-heap version (the node executable):
bash:~$ awk 'BEGIN { for (idx=0;idx<65536;idx++) { v=rand()*256; v2=rand()*256; printf("%c%c%c%c",v,v2,0,0); }}' > data.bin
bash:~$ time cat data.bin | ./range 3
real 0m5.629s
user 0m5.516s
sys 0m0.031s
bash:~$ time cat data.bin | ./node 3
real 0m15.618s
user 0m15.328s
sys 0m0.016s
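The node program used for the comparison is not shown here. For readers who want to reproduce the timing, below is a minimal sketch of what a classic array-based min-heap filter could look like, reading the same binary stdin format; the file name minheap.c, the sift_down helper and the overall structure are assumptions of mine, not the code that was actually benchmarked. Note that it keeps the n largest values, as in the original question, rather than every value greater than or equal to n.

/* minheap.c: hypothetical stand-in for the "node" baseline, not the benchmarked code.
 * Keeps the n largest values seen so far in a fixed-size array-based min-heap. */
#include <stdio.h>
#include <stdlib.h>

static void sift_down(unsigned int *pHeap, unsigned int pSize, unsigned int pIdx) {
    /* Restore the min-heap property from pIdx downwards */
    for (;;) {
        unsigned int lSmallest = pIdx;
        unsigned int lLeft = 2 * pIdx + 1;
        unsigned int lRight = 2 * pIdx + 2;
        if (lLeft < pSize && pHeap[lLeft] < pHeap[lSmallest]) lSmallest = lLeft;
        if (lRight < pSize && pHeap[lRight] < pHeap[lSmallest]) lSmallest = lRight;
        if (lSmallest == pIdx) break;
        unsigned int lTmp = pHeap[pIdx];
        pHeap[pIdx] = pHeap[lSmallest];
        pHeap[lSmallest] = lTmp;
        pIdx = lSmallest;
    }
}

int main(int nbargs, char *argv[]) {
    unsigned int lN = 3;
    if (nbargs > 1) {
        lN = atoi(argv[1]);
    }
    unsigned int *lHeap = malloc(lN * sizeof(unsigned int));
    if (lHeap == NULL) {
        perror("minheap: failed to allocate");
        return 1;
    }
    unsigned int lSize = 0;
    unsigned int lValue;
    while (fread(&lValue, sizeof(unsigned int), 1, stdin) > 0) {
        if (lSize < lN) {
            /* Heap not full yet: append the value and sift it up */
            unsigned int lIdx = lSize++;
            lHeap[lIdx] = lValue;
            while (lIdx > 0 && lHeap[(lIdx - 1) / 2] > lHeap[lIdx]) {
                unsigned int lTmp = lHeap[lIdx];
                lHeap[lIdx] = lHeap[(lIdx - 1) / 2];
                lHeap[(lIdx - 1) / 2] = lTmp;
                lIdx = (lIdx - 1) / 2;
            }
        } else if (lValue > lHeap[0]) {
            /* New value beats the current minimum: replace the root and sift down */
            lHeap[0] = lValue;
            sift_down(lHeap, lSize, 0);
        }
    }
    /* The heap now holds the n largest values, in no particular order */
    for (unsigned int lIdx = 0; lIdx < lSize; lIdx++) {
        printf("%u\n", lHeap[lIdx]);
    }
    free(lHeap);
    return 0;
}

It can be built the same way (e.g. gcc -O2 -std=c99 -o node minheap.c) and fed the same data.bin.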
Upvotes: 1