Qwello Lee
Qwello Lee

Reputation: 41

What's the most efficient way to keep track of n largest values?

I have a program that receives in input a large number of values and has to keep track of the n largest received. For example, let's say n is 3 and the input is 1,6,3,5,2 the output would have to be 6,5,3 (not necessarily in this order).

At the moment I'm using a min heap implemented via an array, but that isn't quite cutting it time-wise. Are there any alternatives I can look into?

Upvotes: 2

Views: 620

Answers (1)

Zilog80
Zilog80

Reputation: 2562

The "(not necessarily in this order)" implies that you can have a sorted numerically output. As you only have to track values greater or equal than n, with a very large input of integer values, a wise way would be then to keep track of integer ranges instead of values.

It will heavily depend of the inputs. For hughes word integer inputs with a discrete uniform distribution, it would cost less memory and would be faster.

A simple pseudo code implementation would require for each input value to check it against a min heap ordered stack of ranges :

Range is
   min as integer
   max as integer
   next as Range
   duplicates as stack of integer
   Range(pMin, pMax, pNext)
     self.min = pMin
     self.max = pMax
     self.next = pNext
     self.duplicates = empty stack of integer

Range heap_top = NULL
Range current_range = NULL
Range previous_range = NULL

boolean merge_flag 
integer value

While read value from input

  if value >= n Then 
    current_range = heap_top
    previous_range = NULL
    merge_flag = false
    While current_range is not null 
      If value is current_range.min - 1 Then
        current_range.min = value
        merge_flag = true
        Break
      End If
      If value is current_range.max + 1 Then
        current_range.max = value
        merge_flag = true
        Break
      End If
      If value < current_range.min Then
        current_range = NULL
        Break
      End If
      If value is between current_range.min and current_range.max Then
        # Here we track duplicates value
        current_range.duplicates.push value
        Break
      End If
      previous_range = current_range
      current_range = current_range->next
    End While
    If current_range is not NULL Then
       If merge_flag is true Then
         If previous_range is not NULL and current_range.min - 1 is previous_range.max Then
           # merge current range into previous one
           previous_range.max = current_range.max
           # Here we track duplicates value
           previous_range.duplicates.pushall current_range.duplicates
           previous_range.next = current_range.next
           drop current_range
           # If we need to keep a track of the range where belong the value
           # current_range = previous_range
         Else
           If current_range.next is not NULL and current_range.max + 1 is 
current_range.next.min Then
             # merge next range into current one
             # We use previous_range to point the next range
             previous_range = current_range.next
             current_range.max = previous_range.max
             # Here we track duplicates value
             current_range.duplicates.pushall previous_range.duplicates
             current_range.next = previous_range.next
             drop previous_range
           End If
         End If 
       End If
    Else
       If previous_range is NULL Then
          current_range = new Range(value, value, heap_top)
          heap_top = current_range
       Else
          current_range = new Range(value, value, previous_range.next)
          previous_range.next = current_range
       End If
    End If
  End If
End While

Less nodes implies less node traversal processing on the long run if the input is uniformly distributed. Less node traversal processing for each input value to process means a faster global processing, as we have then an algorithm approaching O(N) instead of O(N!) with N as the number of input values.

An example of C implementation of the the previous algorithm :

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <string.h>
struct s_range {
  unsigned int min;
  unsigned int max;
  struct s_range *next;
  unsigned int *duplicates;
  unsigned int duplicates_count;
};

struct s_range *new_range(unsigned int pMin,unsigned int pMax, struct s_range *pNext) {
  struct s_range *lRange = malloc(sizeof(struct s_range));
  if (lRange == NULL) {
     perror("new_range: Failed to allocate");
     return NULL;
  }
  lRange->min = pMin;
  lRange->max = pMax;
  lRange->next = pNext;
  lRange->duplicates = NULL;
  lRange->duplicates_count = 0;
  return lRange;
}
void drop_range(struct s_range *pRange) {
   if (pRange != NULL) {
      if (pRange->duplicates != NULL) {
         free(pRange->duplicates);
      }
      free(pRange);
   }
}
void drop_allranges(struct s_range *pTopRange) {
   struct s_range *lRange;
   while (pTopRange != NULL) {
     lRange = pTopRange->next;
     drop_range(pTopRange);
     pTopRange = lRange; 
   }
}
int push_duplicates(struct s_range *pRange, unsigned int pValue) {
   unsigned int *lDuplicates;
   if (pRange == NULL) {
     return 2;
   }
   if (pRange->duplicates == NULL) {
      lDuplicates = malloc(sizeof(unsigned int));
   } else {
      lDuplicates = realloc(pRange->duplicates, (pRange->duplicates_count + 1) * sizeof(unsigned int));
   }
   if (lDuplicates == NULL) {
     perror("push_duplicates: failed to allocate...");
     return 1;
   }
   lDuplicates[pRange->duplicates_count++] = pValue;
   pRange->duplicates = lDuplicates;
   return 0;
}
int pushall_duplicates(struct s_range *pRangeDst, struct s_range *pRangeSrc) {
   unsigned int *lDuplicates;
   if (pRangeDst == NULL || pRangeSrc == NULL) {
     return 2;
   }
   if (pRangeSrc->duplicates == NULL) {
     return 0;
   }
   if (pRangeDst->duplicates == NULL) {
      lDuplicates = malloc(pRangeSrc->duplicates_count * sizeof(unsigned int));
   } else {
      lDuplicates = realloc(pRangeDst->duplicates, (pRangeDst->duplicates_count + pRangeSrc->duplicates_count) * sizeof(unsigned int));
   }
   if (lDuplicates == NULL) {
     perror("pushall_duplicates: failed to allocate...");
     return 1;
   }
   memcpy(&lDuplicates[pRangeDst->duplicates_count], pRangeSrc->duplicates, pRangeSrc->duplicates_count * sizeof(unsigned int));
   pRangeDst->duplicates_count += pRangeSrc->duplicates_count;
   pRangeDst->duplicates = lDuplicates;
   return 0;
}

      
int main(int nbargs, char *argv[]) {
  struct s_range *lHeapTop = NULL;
  struct s_range *lCurrentRange;
  struct s_range *lPreviousRange;
  unsigned int lMergeFlag;
  unsigned int lValue;
  unsigned int lN = 3;
  unsigned int lDispFlag = 0;
  if (nbargs > 1) {
    lN = atoi(argv[1]);
  }
  if (nbargs > 2) {
    lDispFlag = atoi(argv[2]);
  }
  while(fread(&lValue, sizeof(unsigned int), 1, stdin) > 0) {
    if (lValue >= lN) {
      lCurrentRange = lHeapTop;
      lPreviousRange = NULL;
      lMergeFlag = 0;
      while(lCurrentRange != NULL) {
        if (lCurrentRange->min - 1 == lValue) {
          lCurrentRange->min = lValue;
          lMergeFlag = 1;
          break;
        }
        if (lCurrentRange->max + 1 == lValue) {
          lCurrentRange->max = lValue;
          lMergeFlag = 1;
          break;
        }
        if (lValue < lCurrentRange->min) {
          lCurrentRange = NULL;
          break;
        }
        if (lValue >= lCurrentRange->min && lValue <= lCurrentRange->max) {
          if (push_duplicates(lCurrentRange, lValue) != 0) {
            drop_allranges(lHeapTop);
            return 1;
          }
          break;
        }
        lPreviousRange = lCurrentRange;
        lCurrentRange = lCurrentRange->next; 
      }
      if (lCurrentRange != NULL) {
        if (lMergeFlag == 1) {
          if (lPreviousRange != NULL && lCurrentRange->min - 1 == lPreviousRange->max) {
            lPreviousRange->max = lCurrentRange->max;
            if (pushall_duplicates(lPreviousRange, lCurrentRange) != 0) {
              drop_allranges(lHeapTop);
              return 1;
            }
            lPreviousRange->next = lCurrentRange->next; 
            drop_range(lCurrentRange);
          } else {
            if (lCurrentRange->next != NULL && lCurrentRange->max + 1 == lCurrentRange->next->min) {
              lPreviousRange = lCurrentRange->next;
              lCurrentRange->max = lPreviousRange->max;
              if (pushall_duplicates(lCurrentRange, lPreviousRange) != 0) {
                drop_allranges(lHeapTop);
                return 1;
              }
              lCurrentRange->next = lPreviousRange->next;
              drop_range(lPreviousRange);
            }
          }
        }
      } else {
        if (lPreviousRange == NULL) {
          lCurrentRange = new_range(lValue, lValue, lHeapTop);
          if (lCurrentRange == NULL) {
            drop_allranges(lHeapTop);
            return 1;
          }
          lHeapTop = lCurrentRange;
        } else {
          lCurrentRange = new_range(lValue, lValue, lPreviousRange->next);
          if (lCurrentRange == NULL) {
            drop_allranges(lHeapTop);
            return 1;
          }
          lPreviousRange->next = lCurrentRange;
        }
      }
    }
  }
  // Check the results
  if (lDispFlag == 1) {
    lCurrentRange = lHeapTop;
    while(lCurrentRange != NULL) {
      printf("From %u to %u dup:", lCurrentRange->min, lCurrentRange->max);
      for (unsigned int lInd = 0; lInd < lCurrentRange->duplicates_count; lInd++) {
        printf(" %u", lCurrentRange->duplicates[lInd]);
      }
      printf("\n"); 
      lCurrentRange = lCurrentRange->next;
    }
  }
  // Cleaning
  drop_allranges(lHeapTop);
  return 0;
}  

With a discrete uniform distribution set of 65 536 words, on a x64 Debian Buster (4670K CPU), this code (range executable) is three times faster than a classic min heap one (node executable):

bash:~$ awk 'BEGIN { for (idx=0;idx<65536;idx++) { v=rand()*256; v2=rand()*256; printf("%c%c%c%c",v,v2,0,0); }}' > data.bin
bash:~$ time cat  data.bin | ./range 3

real    0m5.629s
user    0m5.516s
sys     0m0.031s
bash:~$ time cat  data.bin | ./node 3

real    0m15.618s
user    0m15.328s
sys     0m0.016s

Upvotes: 1

Related Questions