Reputation: 5580
Is there any built-in JavaScript function to do a partial sort? If not, what is a good way to implement it?
Given an unsorted array of N elements, I would like to find K elements that are minimal with respect to some weighting function. K is much smaller than N, so it would be inefficient to sort the whole array and take the first K elements.
I would be happy even if there was something non-standard, browser-dependent. I could still fallback to the custom JavaScript implementation.
PS: This is my current custom implementation (without taking a weighting function into account, just sorting the elements as they are for simplicity):
function bisect(items, x, lo, hi) {
var mid;
if (typeof(lo) == 'undefined') lo = 0;
if (typeof(hi) == 'undefined') hi = items.length;
while (lo < hi) {
mid = Math.floor((lo + hi) / 2);
if (x < items[mid]) hi = mid;
else lo = mid + 1;
}
return lo;
}
function insort(items, x) {
items.splice(bisect(items, x), 0, x);
}
function partialSort(items, k) {
var smallest = [];
for (var i = 0, len = items.length; i < len; ++i) {
var item = items[i];
if (smallest.length < k || item < smallest[smallest.length - 1]) {
insort(smallest, item);
if (smallest.length > k)
smallest.splice(k, 1);
}
}
return smallest;
}
console.log(partialSort([5, 4, 3, 2, 1, 6, 7, 8, 1, 9], 3));
The algorithm walks through the given array one single time, keeping track of a sorted list of the k smallest items so far, using binary search to insert new elements.
Please post alternative solutions if you think they might be faster or more elegant. Timings are very welcome.
Upvotes: 15
Views: 4690
Reputation: 8923
In the code block below, the nisetamafibo
function keeps an array of the smallest items found so far. The array is sorted and truncated to length K after a certain number of new items have been added to the array, where the number is taken from the Fibonacci sequence so that it is first 1, next 1, then 2, then 3, then 5, and so on. The nisetamadouble
method doubles the interval after which the array is sorted instead, so that it is first 1, then 2, then 4, and so on. (I also tried the approach that I sorted the array each time after a fixed number of new items like 10 had been added, but it was slower. And I also tried to initialize the array at the start of the function so that I took in a fixed number of the first items of the input and sorted them, but I found that initializing the array with 1 or 0 items was the fastest, so I removed the initialization step.)
The nisetamainsertion
function uses insertion sort to sort the items. It's very slow at high K-values because insertion sort has quadratic time complexity, but it's fast at K-values of around 10 to 100 or lower, because insertion sort is fast for short arrays. The nisetamachoose
method chooses nisetamainsertion
for K-values of 100 or less but nisetamafibo
otherwise. (In the Java JDK, the file DualPivotQuicksort.java
uses insertion sort instead of quicksort for arrays with less than 47 items. A presentation about sorting algorithms in R said that "fastest for < 30 items is insert sort".)
I also tried to implement the quickselect algorithm with and without recursion. The version that didn't use recursion was a bit faster, but both versions were still slow compared to other methods especially in cases where N was high and K was low.
On another Stack Exchange site, someone came up with new variants of the Floyd-Rivest algorithm which were faster than the regular Floyd-Rivest algorithm in C: https://softwareengineering.stackexchange.com/questions/284767/kth-selection-routine-floyd-algorithm-489. I tried to implement the variant called select7MO3
in JavaScript, but it ended up being one of the slowest options in my benchmark.
function nisetamafibo(a,k=1){
let found=[],len=a.length,unsorted=0,biggestfound=Infinity,nextsort=1,prevsort=1,oldsort
for(let i=0;i<len;i++){
if(a[i]<biggestfound||i<k){
found.push(a[i])
if(++unsorted==nextsort){
found.sort((l,r)=>l<r?-1:l>r?1:0)
found=found.slice(0,k)
biggestfound=found[found.length-1]
oldsort=nextsort;nextsort+=prevsort;prevsort=oldsort
unsorted=0
}
}
}
found.sort((l,r)=>l<r?-1:l>r?1:0)
return found.slice(0,k)
}
function nisetamadouble(a,k=1){
let found=[],len=a.length,unsorted=0,biggestfound=Infinity,nextsort=1
for(let i=0;i<len;i++){
if(a[i]<biggestfound||i<k){
found.push(a[i])
if(++unsorted==nextsort){
found.sort((l,r)=>l<r?-1:l>r?1:0)
found=found.slice(0,k)
biggestfound=found[found.length-1]
nextsort*=2
unsorted=0
}
}
}
found.sort((l,r)=>l<r?-1:l>r?1:0)
return found.slice(0,k)
}
function nisetamainsertion(a,k=1){
let found=a.slice(0,k),l=a.length
found.sort((l,r)=>l<r?-1:l>r?1:0)
let biggestfound=found[k-1]
for(let i=0;i<l;i++){
let v=a[i]
if(v<biggestfound){
let insertat=k-1
for(let j=0;j<k-1;j++)if(v<found[j]||j==i){insertat=j;break}
for(let j=k-1;j>insertat;j--)found[j]=found[j-1]
found[insertat]=v
biggestfound=found[k-1]
}
}
return found
}
function nisetamachoose(a,k=1){
return k<=100?nisetamainsertion(a,k):nisetamafibo(a,k)
}
function quickselect(a,k,l,r){
l=l||0
r=r||a.length-1
while(true){
let pivot=a[r],pos=l
for(let i=l;i<=r;i++)if(a[i]<pivot){let temp=a[i];a[i]=a[pos];a[pos++]=temp}
let temp=a[r];a[r]=a[pos];a[pos]=temp
if(pos==k)break
pos<k?l=pos+1:r=pos-1
}
}
function quickselectrecursive(a,k,l,r){
l=l||0
r=r||a.length-1
let pivot=a[r],pos=l
for(let i=l;i<=r;i++)if(a[i]<pivot){let temp=a[i];a[i]=a[pos];a[pos++]=temp}
let temp=a[r];a[r]=a[pos];a[pos]=temp
if(pos<k)quickselectrecursive(a,pos+1,r,k)
if(pos>k)quickselectrecursive(a,l,pos-1,k)
}
function sortslice(a,k){
a.sort((l,r)=>l<r?-1:l>r?1:0)
return a.slice(0,k)
}
// https://softwareengineering.stackexchange.com/questions/284767/kth-selection-routine-floyd-algorithm-489
function select7MO3(a,k){
let l=0,i,r=a.length-1,rr=r,ll=l
while(r>l){
if(a[k]<a[l]){let t=a[l];a[l]=a[k];a[k]=t}
if(a[r]<a[l]){let t=a[l];a[l]=a[r];a[r]=t}
if(a[r]<a[k]){let t=a[k];a[k]=a[r];a[r]=t}
if((r-l)>k){
let n=r-l+1
i=k-l+1
let s=(2*n/3)
let div=i-n
let sd=(n*s*(n-s)/n)*(div<0?-1:div>0?1:0)
ll=Math.max(l,k-i*s/n+sd)
rr=Math.min(r,k+(n-i)*s/n+sd)
}
let pivot=a[k]
i=l
let j=r
let t=a[l];a[l]=a[k];a[k]=t
if(a[r]>pivot){t=a[r];a[r]=a[l];a[l]=t}
while(i<j){
let t=a[i];a[i]=a[j];a[j]=t
i++
j--
while(a[i]<pivot)i++
while(a[j]>pivot)j--
}
if(a[l]==pivot){i--;let t=a[l];a[l]=a[j];a[j]=t}
else{j++;let t=a[j];a[j]=a[r];a[r]=t}
if(j<=k)l=j+1
else if(k<=j)r=j-1
}
let out=a.slice(0,k)
out.sort((l,r)=>l<r?-1:l>r?1:0)
return out
}
// OP and Bergi
function bisect(items, x, lo, hi) {
var mid;
if (typeof(lo) == 'undefined') lo = 0;
if (typeof(hi) == 'undefined') hi = items.length;
while (lo < hi) {
mid = Math.floor((lo + hi) / 2);
if (x < items[mid]) hi = mid;
else lo = mid + 1;
}
return lo;
}
function insort(items, x) {
items.splice(bisect(items, x), 0, x);
}
function OP(items, k) {
var smallest = [];
for (var i = 0, len = items.length; i < len; ++i) {
var item = items[i];
if (smallest.length < k || item < smallest[smallest.length - 1]) {
insort(smallest, item);
if (smallest.length > k)
smallest.splice(k, 1);
}
}
return smallest;
}
function OP_Bergi(items, k) {
var smallest = items.slice(0, k).sort(),
max = smallest[k-1];
for (var i = k, len = items.length; i < len; ++i) {
var item = items[i];
if (item < max) {
insort(smallest, item);
smallest.length = k;
max = smallest[k-1];
}
}
return smallest;
}
// trincot
function maxSiftDown(arr, i=0, value=arr[i]) {
if (i >= arr.length) return;
while (true) {
var j = i*2+1;
if (j+1 < arr.length && arr[j] < arr[j+1]) j++;
if (j >= arr.length || value >= arr[j]) break;
arr[i] = arr[j];
i = j;
}
arr[i] = value;
}
function maxHeapify(arr) {
for (var i = arr.length>>1; i--; ) maxSiftDown(arr, i);
return arr;
}
function trincot_MaxHeap(items, k) {
var heap = maxHeapify(items.slice(0, k));
for (var i = k, len = items.length; i < len; ++i) {
var item = items[i];
if (item < heap[0]) maxSiftDown(heap, 0, item);
}
return heap.sort((a,b) => a-b);
}
// DiazJara
function DiazJara(items, k,f) {
function bisect(items, x, lo, hi) {
var mid;
if (typeof(lo) == 'undefined') lo = 0;
if (typeof(hi) == 'undefined') hi = items.length;
while (lo < hi) {
mid = Math.floor((lo + hi) / 2);
if (0>f(x,items[mid])) hi = mid;
else lo = mid + 1;
}
return lo;
}
function insort(items, x) {
items.splice(bisect(items, x), 0, x);
}
var smallest = items.slice(0, k).sort(f),
max = smallest[k-1];
for (var i = k, len = items.length; i < len; ++i) {
var item = items[i];
if (0>f(item,max)) {
insort(smallest, item);
smallest.length = k;
max = smallest[k-1];
}
}
return smallest;
}
// benchmark
for(let nk of'31 33 40 42 44 51 53 55 60 62 64 66 71 73 75'.split(' ')){
let n=parseInt(nk[0]),k0=parseInt(nk[1]),k=10**k0
let opt=[
'OP(a,k)',
'OP_Bergi(a,k)',
'trincot_MaxHeap(a,k)',
'DiazJara(a,k,(l,r)=>l-r)',
'DiazJara(a,k,(l,r)=>l<r?-1:l>r?1:0)',
'nisetamafibo(a,k)',
'nisetamadouble(a,k)',
// 'nisetamainsertion(a,k)', // this would've taken too long to run at K=1e6
'nisetamachoose(a,k)',
'quickselect(a,k);a=a.slice(0,k);a.sort((l,r)=>l<r?-1:l>r?1:0)',
'quickselectrecursive(a,k);a=a.slice(0,k);a.sort((l,r)=>l<r?-1:l>r?1:0)',
'select7MO3(a,k);a=a.slice(0,k);a.sort((l,r)=>l<r?-1:l>r?1:0)',
'sortslice(a,k)'
]
let ord=Array.from({length:100},()=>Array(opt.length).fill().map((_,i)=>i)).flat()
ord.sort(()=>Math.random()-.5)
for(let x of ord){
let o=opt[x]
let a=Array.from({length:10**n},()=>Math.random())
let t1=process.hrtime.bigint()
eval(o)
let t2=process.hrtime.bigint()-t1
console.log(n+' '+k0+' '+o+' '+t2)
}
}
This shows the median time of a hundred runs in ms and the average rank of each method (where for example 7/4 means that N was 1e7 and K was 1e4):
For most combinations of N and K, Bergi's modified version of the OP's code was actually slower than the OP's code, even though the OP's code was extremely slow in the case where N and K were both 1e6.
(l,r)=>l<r?-1:l>r?1:0
is faster than (l,r)=>l-r
as you can see by comparing the two versions of Díaz-Jara's method above.
Here's also versions of my nisetamadouble
and nisetamainsertion
methods which return the indexes of the smallest items in addition to the values:
let a=Array.from({length:1e5},()=>Math.random())
let k=10
let l=a.length
let biggestfound=Infinity,foundind=[],foundval=[]
for(let i=0;i<l;i++){
let v=a[i]
if(i<k||v<biggestfound){
let insertat=k-1
for(let j=0;j<k-1;j++)if(v<foundval[j]||j==i){insertat=j;break}
for(let j=k-1;j>insertat;j--){foundind[j]=foundind[j-1];foundval[j]=foundval[j-1]}
foundind[insertat]=i
foundval[insertat]=v
biggestfound=foundval[k-1]
}
}
console.log(foundind)
console.log(foundval)
function nisetama(a,k=1){
let found=[],len=a.length,unsorted=0,biggestfound=Infinity,nextsort=1
for(let i=0;i<len;i++){
if(a[i]<biggestfound||i<k){
found.push(a[i])
if(++unsorted==nextsort){
found.sort((l,r)=>l<r?-1:l>r?1:0)
found=found.slice(0,k)
biggestfound=found[found.length-1]
nextsort*=2
unsorted=0
}
}
}
found.sort((l,r)=>l<r?-1:l>r?1:0)
return found.slice(0,k)
}
let a2=a
nisetama(a2,k)
biggestfound=a2[k-1],foundind=[]
for(let i=0;i<l;i++)if(a[i]<=biggestfound)foundind.push(i)
foundind.sort((l,r)=>a[l]<a[r]?-1:a[l]>a[r]?1:0)
foundind=foundind.slice(0,k)
console.log(foundind)
console.log(foundval)
Upvotes: 0
Reputation: 350770
For relatively small k it can be worth it to implement a Max Heap (by lack of a native one in JavaScript):
Create a Max Heap of the first k values
For each remaining value:
Finally sort the heap and return it.
This is in fact an improvement on another idea using a Min Heap, but that one needs to heapify the whole array, and so will not run as fast. After heapifying the whole array, you just extract k times a value from that heap, and return those values.
I have added both solutions to Bergi's jsperf.com performance tests (copied to jsbench.me). For that particular test (5000 array values, k = 10), the Max Heap solution is faster. But this advantage will shrink as k is increased.
Here is the code for the Max Heap solution:
// A few Heap-functions that operate on an array
function maxSiftDown(arr, i=0, value=arr[i]) {
if (i >= arr.length) return;
while (true) {
var j = i*2+1;
if (j+1 < arr.length && arr[j] < arr[j+1]) j++;
if (j >= arr.length || value >= arr[j]) break;
arr[i] = arr[j];
i = j;
}
arr[i] = value;
}
function maxHeapify(arr) {
for (var i = arr.length>>1; i--; ) maxSiftDown(arr, i);
return arr;
}
// The main algorithm
function partialSortWithMaxHeap(items, k) {
var heap = maxHeapify(items.slice(0, k));
for (var i = k, len = items.length; i < len; ++i) {
var item = items[i];
if (item < heap[0]) maxSiftDown(heap, 0, item);
}
return heap.sort((a,b) => a-b);
}
// Sample data & call
var arr = Array.from({length:5000}, () => Math.floor(Math.random() * 1e5));
console.log(partialSortWithMaxHeap(arr, 10));
Upvotes: 4
Reputation: 1
I made a version than works with objects, like Array.sort(f):
function partialSort(items, k,f) {
function bisect(items, x, lo, hi) {
var mid;
if (typeof(lo) == 'undefined') lo = 0;
if (typeof(hi) == 'undefined') hi = items.length;
while (lo < hi) {
mid = Math.floor((lo + hi) / 2);
if (0>f(x,items[mid])) hi = mid;
else lo = mid + 1;
}
return lo;
}
function insort(items, x) {
items.splice(bisect(items, x), 0, x);
}
var smallest = items.slice(0, k).sort(f),
max = smallest[k-1];
for (var i = k, len = items.length; i < len; ++i) {
var item = items[i];
if (0>f(item,max)) {
insort(smallest, item);
smallest.length = k;
max = smallest[k-1];
}
}
return smallest;
}
// [ { e: 1 }, { e: 1 }, { e: 2 } ]
console.log(partialSort([{e:4},{e:6},{e:1},{e:8},{e:3},{e:1},{e:6},{e:2}],3,(a,b)=>a.e-b.e))
console.log()
Upvotes: 0
Reputation: 665030
No. There's only the full array sort
, so you will need to use your own implementation.
Little improvement on your code (I had thought of exactly the same algorithm :-)):
function partialSort(items, k) {
var smallest = items.slice(0, k).sort(),
max = smallest[k-1];
for (var i = k, len = items.length; i < len; ++i) {
var item = items[i];
if (item < max) {
insort(smallest, item);
smallest.length = k;
max = smallest[k-1];
}
}
return smallest;
}
(Even seems to be a little faster, I guess due to caching the max
variable)
Upvotes: 7
Reputation: 59311
There's no native partial sort function. The closest thing to what you want is Array.filter.
function isSmallEnough(element, index, array) {
return (element <= 10);
}
var filtered = [12, 5, 8, 130, 44].filter(isSmallEnough);
// filtered is [5, 8]
The example was borrowed (and slightly modified) from the above link.
Upvotes: 2