Reputation: 1071
I have
chr pos C T A G
NC_044998.1 3732 21 0 0 0
NC_044998.1 3733 22 0 2 0
NC_044998.1 3734 22 0 5 0
NC_044998.1 3735 22 0 0 0
NC_044998.1 3736 0 0 7 0
NC_044998.1 3737 0 0 0 22
NC_044998.1 3738 20 0 0 0
NC_044998.1 3739 1 0 22 0
NC_044998.1 3740 0 22 0 0
NC_044998.1 3741 22 0 0 0
I need to output the max value in $3 to $7 per line as well as the column name associated with it.
so that I have
chr pos max ref
NC_044998.1 3732 21 C
NC_044998.1 3733 22 C
NC_044998.1 3734 22 C
NC_044998.1 3735 22 C
NC_044998.1 3736 7 A
NC_044998.1 3737 22 G
NC_044998.1 3738 20 C
NC_044998.1 3739 22 A
NC_044998.1 3740 22 T
NC_044998.1 3741 22 C
I'm trying to adapt this:
awk 'NR == 1 {for (c = 3; c <= NF; i++) headers[c] = $c; next} {maxc=3;for(c=4;c<=NF;c++)if($c>$maxc){maxc=c} printf "max:%s, %s\n", $maxc, headers[maxc]}'
but it just output this max value
also have tried
awk '{maxc=3;for(c=4;c<=NF;c++)if($c>$maxc){maxc=c; $maxc = headers[c]} printf "max:%s, column:%s, column:%s\n",$maxc, maxc, headers[maxc]}'
Another issue I'm trying to figure is in cases where there's a tie between one or more columns. In that case I would like to print the max and the names of all columns associated.
Upvotes: 4
Views: 323
Reputation: 204608
Given this input file with duplicate max values on the 1st and 6th data lines:
$ cat file
chr pos C T A G
NC_044998.1 3732 21 0 21 0
NC_044998.1 3733 22 0 2 0
NC_044998.1 3734 22 0 5 0
NC_044998.1 3735 22 0 0 0
NC_044998.1 3736 0 0 7 0
NC_044998.1 3737 0 22 0 22
NC_044998.1 3738 20 0 0 0
NC_044998.1 3739 1 0 22 0
NC_044998.1 3740 0 22 0 0
NC_044998.1 3741 22 0 0 0
Using GNU awk for sorted_in
:
$ cat tst.awk
BEGIN {
PROCINFO["sorted_in"] = "@val_num_desc"
}
{
printf "%s%s%s%s", $1, OFS, $2, OFS
$1 = $2 = ""
}
NR==1 {
split($0,refs)
print "max", "ref"
next
}
{
n = split($0,vals)
for ( i in vals ) {
maxVal = vals[i]
break
}
ref = sep = ""
for ( i=1; i<=n; i++ ) {
if ( vals[i] == maxVal ) {
ref = ref sep refs[i]
sep = ","
}
}
print maxVal, ref
}
$ awk -f tst.awk file | column -t
chr pos max ref
NC_044998.1 3732 21 C,A
NC_044998.1 3733 22 C
NC_044998.1 3734 22 C
NC_044998.1 3735 22 C
NC_044998.1 3736 7 A
NC_044998.1 3737 22 T,G
NC_044998.1 3738 20 C
NC_044998.1 3739 22 A
NC_044998.1 3740 22 T
NC_044998.1 3741 22 C
The main reason I posted the above is to show how to get a min or max by using "sorted_in" along with for ( i in ... )
and then breaking after accessing the first index as that would then give you the max or min based on whatever criteria you set using PROCINFO["sorted_in"]=...
without you having to write any code to do the comparisons.
Upvotes: 2
Reputation: 786136
Let's say you have this input file:
cat file
chr pos C T A G
NC_044998.1 3732 21 0 0 0
NC_044998.1 3733 22 0 2 0
NC_044998.1 3734 22 0 5 0
NC_044998.1 3735 22 0 0 0
NC_044998.1 3736 0 0 7 7
NC_044998.1 3737 0 22 0 22
NC_044998.1 3738 20 0 0 0
NC_044998.1 3739 1 0 22 0
NC_044998.1 3740 0 22 0 0
NC_044998.1 3741 22 0 0 0
TO get your desired output, use this awk
:
awk -v b=3 'NR==1{for (i=b; i<=NF; ++i) hdr[i]=$i; print $1, $2, "max", "ref"; next} {for (i=b; i<=NF; ++i) max=($i>max?$i:max); for (i=b; i<=NF; ++i) if ($i == max) c4=(c4?c4 ";":"") hdr[i]; print $1, $2, max, c4; max=c4=""}' file
chr pos max ref
NC_044998.1 3732 21 C
NC_044998.1 3733 22 C
NC_044998.1 3734 22 C
NC_044998.1 3735 22 C
NC_044998.1 3736 7 A;G
NC_044998.1 3737 22 T;G
NC_044998.1 3738 20 C
NC_044998.1 3739 22 A
NC_044998.1 3740 22 T
NC_044998.1 3741 22 C
Here is a more readable version:
awk -v b=3 '
NR == 1 {
for (i=b; i<=NF; ++i)
hdr[i] = $i
print $1, $2, "max", "ref"
next
}
{
for (i=b; i<=NF; ++i)
max = ($i > max ? $i : max)
for (i=b; i<=NF; ++i)
if ($i == max)
c4 = (c4 ? c4 ";" : "") hdr[i]
print $1, $2, max, c4
max=c4=""
}' file
Upvotes: 3
Reputation: 8174
you can try it
awk 'BEGIN{OFS="\t";FS="[[:space:]]+"}
NR==1{print $1,$2,"max","ref";
for(i=3;i<=6;++i)BASES[i]=$(i);
}
NR>1{
basemax=3; max=$3;
for(i=4;i<=6;++i){if($i>max){basemax=i;max=$i;}}
print $1,$2,BASES[basemax],max
}
' inputfile
you get,
chr pos max ref
NC_044998.1 3732 C 21
NC_044998.1 3733 C 22
NC_044998.1 3734 C 22
NC_044998.1 3735 C 22
NC_044998.1 3736 A 7
NC_044998.1 3737 G 22
NC_044998.1 3738 C 20
NC_044998.1 3739 A 22
NC_044998.1 3740 T 22
NC_044998.1 3741 C 22
NOTE: If exist two or more bases with max value, correction
For input,
chr pos C T A G
NC_044998.1 3732 21 0 21 0
NC_044998.1 3733 22 0 2 0
NC_044998.1 3734 22 0 5 0
NC_044998.1 3735 22 0 0 0
NC_044998.1 3736 0 0 7 7
NC_044998.1 3737 0 0 0 22
NC_044998.1 3738 20 0 0 0
NC_044998.1 3739 1 0 22 0
NC_044998.1 3740 0 22 0 0
NC_044998.1 3741 22 0 0 0
you can try it
awk 'BEGIN{OFS="\t";FS="[[:space:]]+"}
NR==1{print $1,$2,"max","ref";
for(i=3;i<=6;++i)BASES[i]=$(i);
}
NR>1{basemax=BASES[3]; max=$3;
for(i=4;i<=6;++i){
if($i>max){basemax=BASES[i];max=$i;}
else if($i==max){basemax=basemax","BASES[i];}
}
print $1,$2,basemax,max
}
' fileinput
you get,
chr pos max ref
NC_044998.1 3732 C,A 21
NC_044998.1 3733 C 22
NC_044998.1 3734 C 22
NC_044998.1 3735 C 22
NC_044998.1 3736 A,G 7
NC_044998.1 3737 G 22
NC_044998.1 3738 C 20
NC_044998.1 3739 A 22
NC_044998.1 3740 T 22
NC_044998.1 3741 C 22
Upvotes: 3
Reputation: 133770
With your shown samples, please try following awk
code, written and tested in GNU awk
.
awk -v startField="3" -v endField="6" '
FNR==1{
for(i=startField;i<=endField;i++){
heading[i]=$i
}
next
}
{
max=maxInd=""
for(i=startField;i<=endField;i++){
maxInd=(max<$i?i:maxInd)
max=(max<$i?$i:max)
}
NF=(startField-1)
print $0,heading[maxInd]
}
' Input_file
Advantages of this code's approach:
startField
and endField
so we need NOT to change anything inside main awk
code.Detailed explanation: Adding detailed explanation for above.
awk -v startField="3" -v endField="6" ' ##Starting awk program and setting startField and endField to values on which user wants to look for maximum values.
FNR==1{ ##Checking condition if this is first line of Input_file.
for(i=startField;i<=endField;i++){ ##Traversing through only those fields which user needs to get max value.
heading[i]=$i ##Creating array heading whose index is i and value is current field value.
}
next ##next will skip all further statements from here.
}
{
max=maxInd="" ##Nullifying max and maxInd variables here.
for(i=startField;i<=endField;i++){ ##Traversing through only those fields which user needs to get max value.
maxInd=(max<$i?i:maxInd) ##Getting maxInd variable to current field number if current field value is greater than maxInd else keep it as maxInd itself.
max=(max<$i?$i:max) ##Getting max variable to current field value if current field value is greater than max else keep it as max itself.
}
NF=(startField-1) ##Setting NF(number of fields of current line) to startField-1 here.
print $0,heading[maxInd] ##printing current field followed by heading array value whose index is maxInd.
}
' Input_file ##Mentioning Input_file name here.
Upvotes: 4
Reputation: 26225
For your original desired outcome:
awk 'NR==1 {for (i=0; i<=NF; i++) header_array[i] = $i; } NR>1{a=$3; for (i=3;i<=6;i++) if($i >= a) {a=$i; column=header_array[i]} {printf "max:%s, %s\n", a, column}}' file
# Output
max:21, C
max:22, C
max:22, C
max:22, C
max:7, A
max:22, G
max:20, C
max:22, A
max:22, T
max:22, C
For your updated desired outcome:
awk 'NR==1 {for (i=0; i<=NF; i++) header_array[i] = $i; print "chr pos max ref"} NR>1{char=$1; pos=$2; a=$3; for (i=3;i<=6;i++) if($i >= a) {a=$i; column=header_array[i]} {printf "%s %s %s %s\n", char, pos, a, column}}' file
# Output
chr pos max ref
NC_044998.1 3732 21 C
NC_044998.1 3733 22 C
NC_044998.1 3734 22 C
NC_044998.1 3735 22 C
NC_044998.1 3736 7 A
NC_044998.1 3737 22 G
NC_044998.1 3738 20 C
NC_044998.1 3739 22 A
NC_044998.1 3740 22 T
NC_044998.1 3741 22 C
Formatted:
awk 'NR==1 {
for (i = 0; i <= NF; i++) {
header_array[i] = $i
}
print "chr pos max ref"
}
NR>1 {
char = $1
pos = $2
a = $3
for (i = 3; i <= 6; i++) {
if ($i >= a) {
a = $i
column = header_array[i]
}
} {
printf "%s %s %s %s\n", char, pos, a, column
}
}' file
Upvotes: 4