ersan
ersan

Reputation: 403

Using awk to count number of row group

I have a data set: (file.txt)

X    Y
1    a
2    b
3    c
10   d
11   e
12   f
15   g
20   h     
25   i
30   j
35   k
40   l
41   m
42   n
43   o
46   p

I want to add two columns which are Up10 and Down10,

For example:

X    Y    Up10    Down10    
35   k     3        5  

Desired Output:

X    Y    Up10    Down10
1    a     1        5
2    b     2        5
3    c     3        4
10   d     4        5
11   e     5        4
12   f     5        3
15   g     4        3
20   h     5        3
25   i     3        3
30   j     3        3
35   k     3        5
40   l     3        5
41   m     3        4
42   n     4        3
43   o     5        2
46   p     5        1

This is the Pierre François' solution: Thanks again @Pierre François

awk '
BEGIN{OFS="\t"; print "X\tY\tUp10\tDown10"}
(NR == FNR) && (FNR > 1){a[$1] = $1 + 0}
(NR > FNR) && (FNR > 1){ 
  up = 0; upl = $1 - 10
  down = 0; downl = $1 + 10
  for (i in a) { i += 0 # tricky: convert i to integer
    if ((i >= upl) && (i <= $1)) {up++}
    if ((i >= $1) && (i <= downl)) {down++}
  }
  print $1, $2, up, down;
}
' file.txt file.txt > file-2.txt

But when i use this command for 13GB data, it takes too long.

I have used this way for 13GB data again:

awk 'BEGIN{ FS=OFS="\t" }
NR==FNR{a[NR]=$1;next} {x=y=FNR;while(--x in a&&$1-10<a[x]){} while(++y in a&&$1+10>a[y]){} print $0,FNR-x,y-FNR}
' file.txt file.txt > file-2.txt

When file-2.txt reaches 1.1GB it is frozen. I am waiting several hours, but i can not see finish of command and final output file.

Note: I am working on Gogole cloud. Machine type e2-highmem-8 (8 vCPUs, 64 GB memory)

Upvotes: 0

Views: 341

Answers (2)

M. Nejat Aydin
M. Nejat Aydin

Reputation: 10123

Another single pass approach with a sliding window

awk '
NR == 1 { next } # skip the header
NR == 2 { min = max = cur = 1; X[cur] = $1; Y[cur] = $2; next }

{   X[++max] = $1; Y[max] = $2
    if (X[cur] >= $1 - 10) next
    for (; X[cur] + 10 < X[max]; ++cur) {
        for (; X[min] < X[cur] - 10; ++min) {
            delete X[min]
            delete Y[min]
        }   
        print X[cur], Y[cur], cur - min + 1, max - cur
    }
}

END {
    for (; cur <= max; ++cur) {
        for (; X[min] < X[cur] - 10; ++min);
        for (i = max; i > cur && X[cur] + 10 < X[i]; --i);
        print X[cur], Y[cur], cur - min + 1, i - cur + 1 
    }
}
' file

The script assumes the X column is ordered numerically.

Upvotes: 1

James Brown
James Brown

Reputation: 37404

A single pass awk that keeps the sliding window of 10 last records and uses that to count the ups and downs. For symmetricy's sake there should be deletes in the END but I guess a few extra array elements in memory isn't gonna make a difference:

$ awk '
BEGIN {
    FS=OFS="\t"
}
NR==1 {
    print $1,$2,"Up10","Down10"
}
NR>1 {
    a[NR]=$1
    b[NR]=$2

    for(i=NR-9;i<=NR;i++) {
        if(a[i]>=a[NR]-10&&i>=2)
            up[NR]++
        if(a[i]<=a[NR-9]+10&&i>=2)
            down[NR-9]++
    }
}
NR>10 {
    print a[NR-9],b[NR-9],up[NR-9],down[NR-9]
    delete a[NR-9]
    delete b[NR-9]
    delete up[NR-9]
    delete down[NR-9]
}
END {
    for(nr=NR+1;nr<=NR+9;nr++) {
        for(i=nr-9;i<=nr;i++)
            if(a[i]<=a[nr-9]+10&&i>=2&&i<=NR)
                down[nr-9]++
        print a[nr-9],b[nr-9],up[nr-9],down[nr-9]
   }
}' file

Output:

X       Y       Up10    Down10
1       a       1       5
2       b       2       5
...
35      k       3       5
...
43      o       5       2
46      p       5       1

Upvotes: 1

Related Questions