Madza Farias-Virgens
Madza Farias-Virgens

Reputation: 1071

Output name of column with max value per line

I have

chr pos C T A G
NC_044998.1     3732    21 0 0 0
NC_044998.1     3733    22 0 2 0
NC_044998.1     3734    22 0 5 0
NC_044998.1     3735    22 0 0 0
NC_044998.1     3736    0 0 7 0
NC_044998.1     3737    0 0 0 22
NC_044998.1     3738    20 0 0 0
NC_044998.1     3739    1 0 22 0
NC_044998.1     3740    0 22 0 0
NC_044998.1     3741    22 0 0 0

I need to output the max value in $3 to $7 per line as well as the column name associated with it.

so that I have

chr pos max ref
NC_044998.1     3732 21 C
NC_044998.1     3733 22 C
NC_044998.1     3734 22 C
NC_044998.1     3735 22 C
NC_044998.1     3736 7 A
NC_044998.1     3737 22 G
NC_044998.1     3738 20 C
NC_044998.1     3739 22 A
NC_044998.1     3740 22 T
NC_044998.1     3741 22 C 

I'm trying to adapt this:

awk 'NR == 1 {for (c = 3; c <= NF; i++) headers[c] = $c; next} {maxc=3;for(c=4;c<=NF;c++)if($c>$maxc){maxc=c} printf "max:%s, %s\n", $maxc, headers[maxc]}'

but it just output this max value

also have tried

awk '{maxc=3;for(c=4;c<=NF;c++)if($c>$maxc){maxc=c; $maxc = headers[c]} printf "max:%s, column:%s, column:%s\n",$maxc, maxc, headers[maxc]}'

Another issue I'm trying to figure is in cases where there's a tie between one or more columns. In that case I would like to print the max and the names of all columns associated.

Upvotes: 4

Views: 323

Answers (5)

Ed Morton
Ed Morton

Reputation: 204608

Given this input file with duplicate max values on the 1st and 6th data lines:

$ cat file
chr pos C T A G
NC_044998.1     3732    21 0 21 0
NC_044998.1     3733    22 0 2 0
NC_044998.1     3734    22 0 5 0
NC_044998.1     3735    22 0 0 0
NC_044998.1     3736    0 0 7 0
NC_044998.1     3737    0 22 0 22
NC_044998.1     3738    20 0 0 0
NC_044998.1     3739    1 0 22 0
NC_044998.1     3740    0 22 0 0
NC_044998.1     3741    22 0 0 0

Using GNU awk for sorted_in:

$ cat tst.awk
BEGIN {
    PROCINFO["sorted_in"] = "@val_num_desc"
}
{
    printf "%s%s%s%s", $1, OFS, $2, OFS
    $1 = $2 = ""
}
NR==1 {
    split($0,refs)
    print "max", "ref"
    next
}
{
    n = split($0,vals)

    for ( i in vals ) {
        maxVal = vals[i]
        break
    }

    ref = sep = ""
    for ( i=1; i<=n; i++ ) {
        if ( vals[i] == maxVal ) {
            ref = ref sep refs[i]
            sep = ","
        }
    }

    print maxVal, ref
}

$ awk -f tst.awk file | column -t
chr          pos   max  ref
NC_044998.1  3732  21   C,A
NC_044998.1  3733  22   C
NC_044998.1  3734  22   C
NC_044998.1  3735  22   C
NC_044998.1  3736  7    A
NC_044998.1  3737  22   T,G
NC_044998.1  3738  20   C
NC_044998.1  3739  22   A
NC_044998.1  3740  22   T
NC_044998.1  3741  22   C

The main reason I posted the above is to show how to get a min or max by using "sorted_in" along with for ( i in ... ) and then breaking after accessing the first index as that would then give you the max or min based on whatever criteria you set using PROCINFO["sorted_in"]=... without you having to write any code to do the comparisons.

Upvotes: 2

anubhava
anubhava

Reputation: 786136

Let's say you have this input file:

cat file

chr pos C T A G
NC_044998.1     3732    21 0 0 0
NC_044998.1     3733    22 0 2 0
NC_044998.1     3734    22 0 5 0
NC_044998.1     3735    22 0 0 0
NC_044998.1     3736    0 0 7 7
NC_044998.1     3737    0 22 0 22
NC_044998.1     3738    20 0 0 0
NC_044998.1     3739    1 0 22 0
NC_044998.1     3740    0 22 0 0
NC_044998.1     3741    22 0 0 0

TO get your desired output, use this awk:

awk -v b=3 'NR==1{for (i=b; i<=NF; ++i) hdr[i]=$i; print $1, $2, "max", "ref"; next} {for (i=b; i<=NF; ++i) max=($i>max?$i:max); for (i=b; i<=NF; ++i) if ($i == max) c4=(c4?c4 ";":"") hdr[i]; print $1, $2, max, c4; max=c4=""}' file

chr pos max ref
NC_044998.1 3732 21 C
NC_044998.1 3733 22 C
NC_044998.1 3734 22 C
NC_044998.1 3735 22 C
NC_044998.1 3736 7 A;G
NC_044998.1 3737 22 T;G
NC_044998.1 3738 20 C
NC_044998.1 3739 22 A
NC_044998.1 3740 22 T
NC_044998.1 3741 22 C

Here is a more readable version:

awk -v b=3 '
NR == 1 {
   for (i=b; i<=NF; ++i)
      hdr[i] = $i
   print $1, $2, "max", "ref"
   next
}
{
   for (i=b; i<=NF; ++i)
      max = ($i > max ? $i : max)
   for (i=b; i<=NF; ++i)
      if ($i == max)
         c4 = (c4 ? c4 ";" : "") hdr[i]
   print $1, $2, max, c4
   max=c4=""
}' file

Upvotes: 3

Jose Ricardo Bustos M.
Jose Ricardo Bustos M.

Reputation: 8174

you can try it

awk 'BEGIN{OFS="\t";FS="[[:space:]]+"}
     NR==1{print $1,$2,"max","ref"; 
           for(i=3;i<=6;++i)BASES[i]=$(i);
     } 
     NR>1{
        basemax=3; max=$3;
        for(i=4;i<=6;++i){if($i>max){basemax=i;max=$i;}}
        print $1,$2,BASES[basemax],max
     }
' inputfile

you get,

chr pos max ref
NC_044998.1 3732    C   21
NC_044998.1 3733    C   22
NC_044998.1 3734    C   22
NC_044998.1 3735    C   22
NC_044998.1 3736    A   7
NC_044998.1 3737    G   22
NC_044998.1 3738    C   20
NC_044998.1 3739    A   22
NC_044998.1 3740    T   22
NC_044998.1 3741    C   22

NOTE: If exist two or more bases with max value, correction

For input,

chr pos C T A G
NC_044998.1     3732    21 0 21 0
NC_044998.1     3733    22 0 2 0
NC_044998.1     3734    22 0 5 0
NC_044998.1     3735    22 0 0 0
NC_044998.1     3736    0 0 7 7
NC_044998.1     3737    0 0 0 22
NC_044998.1     3738    20 0 0 0
NC_044998.1     3739    1 0 22 0
NC_044998.1     3740    0 22 0 0
NC_044998.1     3741    22 0 0 0

you can try it

awk 'BEGIN{OFS="\t";FS="[[:space:]]+"}
     NR==1{print $1,$2,"max","ref"; 
         for(i=3;i<=6;++i)BASES[i]=$(i);
     } 
     NR>1{basemax=BASES[3]; max=$3;
         for(i=4;i<=6;++i){
             if($i>max){basemax=BASES[i];max=$i;}
             else if($i==max){basemax=basemax","BASES[i];}
         }
         print $1,$2,basemax,max
     }
' fileinput

you get,

chr pos max ref
NC_044998.1 3732    C,A 21
NC_044998.1 3733    C   22
NC_044998.1 3734    C   22
NC_044998.1 3735    C   22
NC_044998.1 3736    A,G 7
NC_044998.1 3737    G   22
NC_044998.1 3738    C   20
NC_044998.1 3739    A   22
NC_044998.1 3740    T   22
NC_044998.1 3741    C   22

Upvotes: 3

RavinderSingh13
RavinderSingh13

Reputation: 133770

With your shown samples, please try following awk code, written and tested in GNU awk.

awk -v startField="3" -v endField="6" '
FNR==1{
  for(i=startField;i<=endField;i++){
    heading[i]=$i
  }
  next
}
{
  max=maxInd=""
  for(i=startField;i<=endField;i++){
    maxInd=(max<$i?i:maxInd)
    max=(max<$i?$i:max)
  }
  NF=(startField-1)
  print $0,heading[maxInd]
}
'  Input_file

Advantages of this code's approach:

  • user can mention starting field number and ending field number by using variables named startField and endField so we need NOT to change anything inside main awk code.
  • 2nd advantage is since nothing is hardcoded here, so lets say User tomorrow wants to check maximum values from 10th field to 20th field then we need NOT to print or mention explicitly 9th fields to get printed since that is taken care in code itself.

Detailed explanation: Adding detailed explanation for above.

awk -v startField="3" -v endField="6" '   ##Starting awk program and setting startField and endField to values on which user wants to look for maximum values.
FNR==1{                                   ##Checking condition if this is first line of Input_file.
  for(i=startField;i<=endField;i++){      ##Traversing through only those fields which user needs to get max value.
    heading[i]=$i                         ##Creating array heading whose index is i and value is current field value.
  }
  next                                    ##next will skip all further statements from here.
}
{
  max=maxInd=""                           ##Nullifying max and maxInd variables here.
  for(i=startField;i<=endField;i++){      ##Traversing through only those fields which user needs to get max value.
    maxInd=(max<$i?i:maxInd)              ##Getting maxInd variable to current field number if current field value is greater than maxInd else keep it as maxInd itself.
    max=(max<$i?$i:max)                   ##Getting max variable to current field value if current field value is greater than max else keep it as max itself.
  }
  NF=(startField-1)                       ##Setting NF(number of fields of current line) to startField-1 here.
  print $0,heading[maxInd]                ##printing current field followed by heading array value whose index is maxInd.
}
'  Input_file                             ##Mentioning Input_file name here. 

Upvotes: 4

jared_mamrot
jared_mamrot

Reputation: 26225

For your original desired outcome:

awk 'NR==1 {for (i=0; i<=NF; i++) header_array[i] = $i; } NR>1{a=$3; for (i=3;i<=6;i++) if($i >= a) {a=$i; column=header_array[i]} {printf "max:%s, %s\n", a, column}}' file

# Output
max:21, C
max:22, C
max:22, C
max:22, C
max:7, A
max:22, G
max:20, C
max:22, A
max:22, T
max:22, C

For your updated desired outcome:

awk 'NR==1 {for (i=0; i<=NF; i++) header_array[i] = $i; print "chr pos max ref"} NR>1{char=$1; pos=$2; a=$3; for (i=3;i<=6;i++) if($i >= a) {a=$i; column=header_array[i]} {printf "%s %s %s %s\n", char, pos, a, column}}' file

# Output
chr pos max ref
NC_044998.1 3732 21 C
NC_044998.1 3733 22 C
NC_044998.1 3734 22 C
NC_044998.1 3735 22 C
NC_044998.1 3736 7 A
NC_044998.1 3737 22 G
NC_044998.1 3738 20 C
NC_044998.1 3739 22 A
NC_044998.1 3740 22 T
NC_044998.1 3741 22 C

Formatted:

awk 'NR==1 {
    for (i = 0; i <= NF; i++) {
        header_array[i] = $i
        }
    print "chr pos max ref"
    }
    NR>1 {
        char = $1
        pos = $2
        a = $3
        for (i = 3; i <= 6; i++) {
            if ($i >= a) {
                a = $i
                column = header_array[i]
            }
        } {
           printf "%s %s %s %s\n", char, pos, a, column
        }
    }' file

Upvotes: 4

Related Questions