Reputation: 403
I have a data set: (file.txt)
X Y
1 a
2 b
3 c
10 d
11 e
12 f
15 g
20 h
25 i
30 j
35 k
40 l
41 m
42 n
43 o
46 p
I want to add two columns which are Up10 and Down10,
For example:
X Y Up10 Down10
35 k 3 5
Desired Output:
X Y Up10 Down10
1 a 1 5
2 b 2 5
3 c 3 4
10 d 4 5
11 e 5 4
12 f 5 3
15 g 4 3
20 h 5 3
25 i 3 3
30 j 3 3
35 k 3 5
40 l 3 5
41 m 3 4
42 n 4 3
43 o 5 2
46 p 5 1
This is the Pierre François' solution: Thanks again @Pierre François
awk '
BEGIN{OFS="\t"; print "X\tY\tUp10\tDown10"}
(NR == FNR) && (FNR > 1){a[$1] = $1 + 0}
(NR > FNR) && (FNR > 1){
up = 0; upl = $1 - 10
down = 0; downl = $1 + 10
for (i in a) { i += 0 # tricky: convert i to integer
if ((i >= upl) && (i <= $1)) {up++}
if ((i >= $1) && (i <= downl)) {down++}
}
print $1, $2, up, down;
}
' file.txt file.txt > file-2.txt
But when i use this command for 13GB data, it takes too long.
I have used this way for 13GB data again:
awk 'BEGIN{ FS=OFS="\t" }
NR==FNR{a[NR]=$1;next} {x=y=FNR;while(--x in a&&$1-10<a[x]){} while(++y in a&&$1+10>a[y]){} print $0,FNR-x,y-FNR}
' file.txt file.txt > file-2.txt
When file-2.txt reaches 1.1GB it is frozen. I am waiting several hours, but i can not see finish of command and final output file.
Note: I am working on Gogole cloud. Machine type e2-highmem-8 (8 vCPUs, 64 GB memory)
Upvotes: 0
Views: 341
Reputation: 10123
Another single pass approach with a sliding window
awk '
NR == 1 { next } # skip the header
NR == 2 { min = max = cur = 1; X[cur] = $1; Y[cur] = $2; next }
{ X[++max] = $1; Y[max] = $2
if (X[cur] >= $1 - 10) next
for (; X[cur] + 10 < X[max]; ++cur) {
for (; X[min] < X[cur] - 10; ++min) {
delete X[min]
delete Y[min]
}
print X[cur], Y[cur], cur - min + 1, max - cur
}
}
END {
for (; cur <= max; ++cur) {
for (; X[min] < X[cur] - 10; ++min);
for (i = max; i > cur && X[cur] + 10 < X[i]; --i);
print X[cur], Y[cur], cur - min + 1, i - cur + 1
}
}
' file
The script assumes the X
column is ordered numerically.
Upvotes: 1
Reputation: 37404
A single pass awk that keeps the sliding window of 10 last records and uses that to count the ups and downs. For symmetricy's sake there should be delete
s in the END
but I guess a few extra array elements in memory isn't gonna make a difference:
$ awk '
BEGIN {
FS=OFS="\t"
}
NR==1 {
print $1,$2,"Up10","Down10"
}
NR>1 {
a[NR]=$1
b[NR]=$2
for(i=NR-9;i<=NR;i++) {
if(a[i]>=a[NR]-10&&i>=2)
up[NR]++
if(a[i]<=a[NR-9]+10&&i>=2)
down[NR-9]++
}
}
NR>10 {
print a[NR-9],b[NR-9],up[NR-9],down[NR-9]
delete a[NR-9]
delete b[NR-9]
delete up[NR-9]
delete down[NR-9]
}
END {
for(nr=NR+1;nr<=NR+9;nr++) {
for(i=nr-9;i<=nr;i++)
if(a[i]<=a[nr-9]+10&&i>=2&&i<=NR)
down[nr-9]++
print a[nr-9],b[nr-9],up[nr-9],down[nr-9]
}
}' file
Output:
X Y Up10 Down10
1 a 1 5
2 b 2 5
...
35 k 3 5
...
43 o 5 2
46 p 5 1
Upvotes: 1