Reputation: 1071

For each different occurrence in field, print lines with max value associated

I have

ID=exon-XM_030285750.2  LOC100221041    7895
ID=exon-XM_030285760.2  LOC100221041    8757
ID=exon-XM_030285720.2  LOC100221041    8656
ID=exon-XM_030285738.2  LOC100221041    8183
ID=exon-XM_030285728.2  LOC100221041    8402
ID=exon-XM_030285733.2  LOC100221041    7398
ID=exon-XM_030285715.2  LOC100221041    8780
ID=exon-XM_030285707.2  LOC100221041    8963
ID=exon-XM_030285694.2  DCBLD2  5838
ID=exon-XM_030285774.2  CMSS1   1440
ID=exon-XM_012570107.3  CMSS1   1502
ID=exon-XM_012570104.3  FILIP1L 6371
ID=exon-XM_030285654.2  FILIP1L 6456
ID=exon-XM_030285647.2  FILIP1L 6488
ID=exon-XM_032751000.1  FILIP1L 5886
ID=exon-XM_030285671.2  FILIP1L 5622
ID=exon-XM_030285682.2  FILIP1L 5395
ID=exon-XR_004369230.1  LOC116808959    2289

For each distinct value in $2, I want to print the line that has the highest value in $3:

ID=exon-XM_030285707.2  LOC100221041    8963
ID=exon-XM_030285694.2  DCBLD2  5838
ID=exon-XM_012570107.3  CMSS1   1502
ID=exon-XM_030285647.2  FILIP1L 6488
ID=exon-XR_004369230.1  LOC116808959    2289

I tried this

# Attempt: remember the largest $3 seen for each $2 and print "key<TAB>max" at the end.
# $1 is never stored, and for (i in arr) visits keys in an unspecified order.
awk -f avg.sh test | awk 'BEGIN {OFS = "\t"} arr[$2]==0 {arr[$2]=$3} ($3 > arr[$2]) {arr[$2]=$3} END{for (i in arr) {print i, arr[i]}}'

from here

how to conditionally filter rows in awk

but I would like to also keep $1 in the output and keep the same ordering as in the input.

The answer to this

Computing averages of chunks of a column

shows how to build an array that keeps the original ordering, but I'm failing to put the two together.

Upvotes: 1

Answers (3)

stack0114106

Reputation: 8791

You can do this with sort and awk.

If the original ordering does not need to be preserved:

# Sort by key ($2), then by $3 descending, so the first line of every key group holds its maximum;
# awk prints a line only the first time its key is seen (and prints nothing for empty input).
$ sort -k2,2  -k3,3nr madza.txt  |  awk '!seen[$2]++'
ID=exon-XR_004369230.1  LOC116808959    2289
ID=exon-XM_030285707.2  LOC100221041    8963
ID=exon-XM_030285647.2  FILIP1L 6488
ID=exon-XM_030285694.2  DCBLD2  5838
ID=exon-XM_012570107.3  CMSS1   1502
$

To keep the ordering, you can add sequence numbers and remove them at the end.

# 1) append the input line number as an extra last field, 2) sort by key and by $3 descending,
# 3) keep the first (largest $3) line per key, 4) restore input order using the appended number,
# 5) drop that number again. Fields are re-joined with single spaces by awk.
# Printing on first sight of a key (instead of in END) emits nothing for empty input, so the
# final stage never has to shrink NF on an empty record.
$ awk '{ $(NF+1) = NR } 1' madza.txt | sort -k2,2  -k3,3nr  |  awk '!seen[$2]++' | sort -k4 -n | awk '{ NF = NF - 1 } 1'
ID=exon-XM_030285707.2 LOC100221041 8963
ID=exon-XM_030285694.2 DCBLD2 5838
ID=exon-XM_012570107.3 CMSS1 1502
ID=exon-XM_030285647.2 FILIP1L 6488
ID=exon-XR_004369230.1 LOC116808959 2289
$

Upvotes: 1

anubhava

Reputation: 786241

You may use this awk:

# For every key in $2, keep the whole line with the largest $3; print in first-seen key order.
awk '
# New key: record its position and take this line as the current best.
!($2 in best) {
   order[++count] = $2
   best[$2] = $3
   line[$2] = $0
   next
}
# Known key: replace the stored line only when $3 is strictly larger.
$3 > best[$2] {
   best[$2] = $3
   line[$2] = $0
}
END {
   for (k = 1; k <= count; k++)
      print line[order[k]]
}' file | column -t

ID=exon-XM_030285707.2  LOC100221041  8963
ID=exon-XM_030285694.2  DCBLD2        5838
ID=exon-XM_012570107.3  CMSS1         1502
ID=exon-XM_030285647.2  FILIP1L       6488
ID=exon-XR_004369230.1  LOC116808959  2289

Upvotes: 2

RavinderSingh13

Reputation: 133770

Could you please try the following, written and tested with the shown samples in GNU awk.

# Print "$1 $2 max($3)" for every distinct $2, in the order the keys first appear.
awk '
# First occurrence of a key: remember its position and seed the maximum from this
# line, so that groups containing only negative values are handled correctly.
!($2 in arr){
  found[++count]=$2
  arr[$2]=$3
  val[$2]=$1
  next
}
# Later occurrence: on a larger (or equal) value, take over both the value and its $1.
$3>=arr[$2]{
  arr[$2]=$3
  val[$2]=$1
}
END{
  for(i=1;i<=count;i++){
    print val[found[i]],found[i],arr[found[i]]
  }
}'  Input_file

Output will be as follows.

ID=exon-XM_030285707.2 LOC100221041 8963
ID=exon-XM_030285694.2 DCBLD2 5838
ID=exon-XM_012570107.3 CMSS1 1502
ID=exon-XM_030285647.2 FILIP1L 6488
ID=exon-XR_004369230.1 LOC116808959 2289

To get the output in TAB-separated form, try the following.

# Same as above with TAB as the output separator; column aligns the TAB-separated fields.
awk -v OFS="\t" '
# First occurrence of a key: remember its position and seed the maximum from this
# line, so that groups containing only negative values are handled correctly.
!($2 in arr){
  found[++count]=$2
  arr[$2]=$3
  val[$2]=$1
  next
}
# Later occurrence: on a larger (or equal) value, take over both the value and its $1.
$3>=arr[$2]{
  arr[$2]=$3
  val[$2]=$1
}
END{
  for(i=1;i<=count;i++){
    print val[found[i]],found[i],arr[found[i]]
  }
}' Input_file | 
column -t -s $'\t'

Upvotes: 2

For each different occurrence in field, print lines with max value associated

Answers (3)

Related Questions