Madza Farias-Virgens
Madza Farias-Virgens

Reputation: 1061

Counting max and min per row across columns and outputting associated column names

I'm trying to count both max and min (except 0s) per row across columns and outputting associated column names.

I'm trying this:

BEGIN{OFS="\t"}
     NR==1{print $1,$2,"ref","max","ref","min";
         for(i=3;i<=6;++i)BASES[i]=$(i);
     }
     NR>1{l=1;basemax=BASES[3];basemin=BASES[3]; max=$3; min=$3;
         for(i=4;i<=6;++i){
             if($i>max){basemax=BASES[i];max=$i;}
             else if($i==max){basemax=basemax","BASES[i];++l}
        }
         for(i=4;i<=6;++i){
             if($i<min && $i !=0){basemmin=BASES[i];mim=$i}
             else if($i==min){basemin=basemin","BASES[i];++l}
         }
         print $1,$2,basemax,max,basemin,min
     }

In a input that looks like this

chr     pos     C       T       A       G
NC_044998.1     3732    22      0       7       0
NC_044998.1     3733    22      0       0       0
NC_044998.1     3734    22      3       3       0
NC_044998.1     3735    22      0       0       3
NC_044998.1     3736    0       7       22      3
NC_044998.1     3737    0       0       0       25
NC_044998.1     3738    22      7       0       0
NC_044998.1     3739    7       3       22      25
NC_044998.1     3740    0       22      22      0
NC_044998.1     3741    22      0       0       0

The desired output is

chr          pos   ref  max  ref    min
NC_044998.1  3732  C    22   A      7
NC_044998.1  3733  C    22   C      22
NC_044998.1  3734  C    22   T,A    3
NC_044998.1  3735  C    22   G      3
NC_044998.1  3736  A    22   G      3
NC_044998.1  3737  G    25   G      25
NC_044998.1  3738  C    22   C      22
NC_044998.1  3739  G    25   C      7
NC_044998.1  3740  T,A  22   T,A    22
NC_044998.1  3741  C    22   C      22

But it outputs this instead

chr          pos   ref  max  ref    min
NC_044998.1  3732  C    22   C      22
NC_044998.1  3733  C    22   C      22
NC_044998.1  3734  C    22   C      22
NC_044998.1  3735  C    22   C      22
NC_044998.1  3736  A    22   C      0
NC_044998.1  3737  G    25   C,T,A  0
NC_044998.1  3738  C    22   C      22
NC_044998.1  3739  G    25   C      7
NC_044998.1  3740  T    22   C,A,G  0
NC_044998.1  3741  C    22   C      22

Upvotes: 2

Views: 67

Answers (3)

RavinderSingh13
RavinderSingh13

Reputation: 133458

With your shown samples, please try following awk code. Written and tested in GNU awk.

awk -v startField="3" -v endField="6" '
BEGIN{ OFS="\t"; print "chr             pos     ref     max     ref     min"}
FNR==1{
  for(i=startField;i<=endField;i++){
    heading[i]=$i
  }
  next
}
{
  min=max2=maxInd2=minInd=max=maxInd=minAllInd=maxAllInd=maxAllInd2=""
  for(i=startField;i<=endField;i++){
    if($i!=0){
      minInd=(min>$i?i:(min==$i?minInd","i:(minInd!=""?minInd:i)))
      min=(min>$i?$i:(min!=""?min:$i))
    }
    maxInd=(max<$i?i:(max==$i?maxInd","i:(maxInd!=""?maxInd:i)))
    max=(max<$i?$i:(max!=""?max:$i))
  }
  for(i=startField+1;i<=endField;i++){
    maxInd2=(max2<$i?i:(max2==$i?maxInd2","i:(maxInd2!=""?maxInd2:i)))
    max2=(max2<$i?$i:(max2!=""?max2:$i))
  }
  num1=split(maxInd,arr1,",")
  num2=split(minInd,arr2,",")
  num3=split(maxInd2,arr3,",")
  if(num1>1){
    for(k=1;k<=num1;k++){
       maxAllInd = (maxAllInd?maxAllInd ",":"") heading[arr1[k]]
    }
  }
  else{
    maxAllInd = heading[maxInd]
  }
  if(num2>1){
    for(k=1;k<=num2;k++){
       minAllInd = (minAllInd?minAllInd ",":"") heading[arr2[k]]
    }
  }
  else{
    minAllInd = heading[minInd]
  }
  if(num3>1){
    for(k=1;k<=num3;k++){
       maxAllInd2 = (maxAllInd2?maxAllInd2 ",":"") heading[arr3[k]]
    }
  }
  else{
    maxAllInd2 = heading[maxInd2]
  }
  if(startField>1){
    NF=(startField-1)
      if(min !=0 ){
         print $0,maxAllInd,max,minAllInd,min
      }
      if(min == 0 && max2 != 0){
         print $0,maxAllInd,max,maxAllInd2,max2
      }
      if(min == 0 && max2 == 0){
         print $0,maxAllInd,max,maxAllInd,max
      }  
  }
  else{
      if(min !=0 ){
         print maxAllInd,max,minAllInd,min
      }
      if(min == 0 && max2 != 0){
         print maxAllInd,max,maxAllInd2,max2
      }
      if(min == 0 && max2 == 0){
         print maxAllInd,max,maxAllInd,max
      }       
  }
}
'  Input_file

Upvotes: 3

anubhava
anubhava

Reputation: 785058

This awk script should work for you:

cat maxmin.awk

NR == 1 {
   for (i=b; i<=NF; ++i)
      hdr[i] = $i
   print $1, $2, "ref", "max", "ref", "min"
   next
}
{
   for (i=b; i<=NF; ++i) {
      max = ($i > max ? $i : max)
      min = ($i && (min == "" || $i < min) ? $i : min)
   }
   for (i=b; i<=NF; ++i) {
      if ($i == min)
         rmin = (rmin ? rmin "," : "") hdr[i]
      if ($i == max)
         rmax = (rmax ? rmax "," : "") hdr[i]
   }
   print $1, $2, rmax, max, rmin, min
   max = min = rmax = rmin = ""
}

And use it as:

awk -v b=3 -f maxmin.awk gg | column -t
chr          pos   ref  max  ref  min
NC_044998.1  3732  C    22   A    7
NC_044998.1  3733  C    22   C    22
NC_044998.1  3734  C    22   T,A  3
NC_044998.1  3735  C    22   G    3
NC_044998.1  3736  A    22   G    3
NC_044998.1  3737  G    25   G    25
NC_044998.1  3738  C    22   T    7
NC_044998.1  3739  G    25   T    3
NC_044998.1  3740  T,A  22   T,A  22
NC_044998.1  3741  C    22   C    22

column -t has been used for tabular output only.

Upvotes: 2

tshiono
tshiono

Reputation: 22012

  • You have some typos in variable names such as basemmin and mim.
  • If the count of C is 0, the min value has no chance to be updated.
  • You can combine the two for loops into one.
  • The variable l is not used.

Then would you please try the following:

awk -v OFS="\t" '
NR==1 {
    print $1, $2, "ref", "max", "ref", "min"
    for (i = 3; i <= 6; i++) bases[i] = $i
}
NR>1 {
    basemax = bases[3]; basemin = bases[3]; max = $3; min = $3
    for (i = 4; i <= 6; i++) {
        if ($i > max) {basemax = bases[i]; max = $i}
        else if ($i == max) {basemax = basemax "," bases[i]}
        if ($i < min && $i != 0 || min == 0) {basemin = bases[i]; min = $i}
        else if ($i == min) {basemin = basemin "," bases[i]}
    }
    print $1, $2, basemax, max, basemin, min
}' input_file

Output:

chr     pos     ref     max     ref     min
NC_044998.1     3732    C       22      A       7
NC_044998.1     3733    C       22      C       22
NC_044998.1     3734    C       22      T,A     3
NC_044998.1     3735    C       22      G       3
NC_044998.1     3736    A       22      G       3
NC_044998.1     3737    G       25      G       25
NC_044998.1     3738    C       22      T       7
NC_044998.1     3739    G       25      T       3
NC_044998.1     3740    T,A     22      T,A     22
NC_044998.1     3741    C       22      C       22

Please note the output slightly differs from your desired output, which may contain typos.

Upvotes: 2

Related Questions