Reputation: 1061

Counting number of occurrence of different strings per line and appending counts as columns

I have

NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref

I want to count the number of occurrences of "homorefref", "homodevdev", "hetrefdev", "hetdevref" and "hetdevdev" on each line and append each count as a new column, as in

NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 0 1
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref 8 0 3 0 0
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 9 0 2 0 0
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev 0 7 0 4 0
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref 9 0 2 0 0
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref 10 0 1 0 0

I know how to do this for one string at a time, in a loop using grep, and then paste the result as a new column

# For each line of `file`, print how many times "homorefref" occurs on it.
# `IFS= read -r` keeps blanks and backslashes intact, and the quoted expansion
# prevents word splitting and globbing of the line before it reaches grep.
while IFS= read -r line; do printf '%s\n' "$line" | grep -o "homorefref" | wc -l; done < file

or also for individual strings using awk

# Print the first two columns of each line of `file`, followed by the number of
# "homorefref" matches that gsub() replaced on that line.
awk '{ first = $1; second = $2; hits = gsub(/homorefref/, ""); print first, second, hits }' file

But I can't find a way of doing it for all strings at once.

I also tried this

# Comma-separated list of the strings to count, and the delimiter used in it.
var="homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev"
char=","
# Prints NF-1 (number of fields minus one, with "," as the field separator)
# for every line of `file`.
# NOTE(review): `${string}` is never assigned in this snippet (the list lives in
# `var`), so the here-string is empty; and because a file operand is given, awk
# reads `file` rather than standard input, so the here-string is not consumed.
awk -F"${char}" '{print NF-1}' file <<< "${string}"

Upvotes: 1

Answers (3)

stack0114106

Reputation: 8711

Another awk:

This uses a two-input approach: the first input is an inline "file" produced by echo "homorefref,homodevdev..." and split into records with RS=",", and the second input is the actual data file, read with RS="\n".

$ awk ' NR==FNR {sub(/\n$/,""); a[++n]=$0; next} { delete b; for(i=3;i<=NF;i++) b[$i]++; printf "%s ",$0; for(i=1;i<=n;i++) printf "%d ", b[a[i]] ; print "" } ' RS="," <(echo "homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev") RS="\n" data.txt
NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 0 1
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref 8 0 3 0 0
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 9 0 2 0 0
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev 0 7 0 4 0
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref 9 0 2 0 0
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref 10 0 1 0 0

Inserting newlines for code readability

awk '
# First input (the tag list, split on RS=","): remember each tag in list order.
NR==FNR {
  sub(/\n$/, "")    # echo appends a newline that would otherwise stay glued to the last tag
  a[++n] = $0       # n ends up as the number of tags, so nothing is hard-coded below
  next
}
# Second input (the data file, split on RS="\n"): tally the tags on this line.
{
  delete b                              # start every line with empty counters
  for (i = 3; i <= NF; i++) b[$i]++     # tags start at column 3 in the sample data
  printf "%s ", $0
  for (i = 1; i <= n; i++) printf "%d ", b[a[i]]   # %d turns a missing counter into 0
  print ""
} ' RS="," <(echo "homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev") RS="\n" data.txt

Upvotes: 1

Ed Morton

Reputation: 203625

$ cat tst.awk
# Append to every input line the number of times each listed tag occurs in
# fields 3..NF, one OFS-separated count per tag, in the order listed below.
BEGIN {
    tagTotal = split("homorefref homodevdev hetrefdev hetdevref hetdevdev", tagList)
}
{
    # Fresh tallies for this line.
    delete seen
    for (fld = 3; fld <= NF; fld++)
        seen[$fld]++

    # Build the whole output record first, then emit it once.
    line = $0
    for (k = 1; k <= tagTotal; k++)
        line = line OFS (seen[tagList[k]] + 0)   # +0 makes an unseen tag print as 0
    print line
}

$ awk -f tst.awk file
NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 0 1
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref 8 0 3 0 0
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 9 0 2 0 0
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev 0 7 0 4 0
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref 9 0 2 0 0
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref 10 0 1 0 0

Upvotes: 1

anubhava

Reputation: 785196

You may use this awk:

# Append to each line of `file` the per-line count of every listed tag, in list order.
awk 'BEGIN {count=split("homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev", tag, ","); for (k=1; k<=count; ++k) wanted[tag[k]]} {for (f=1; f<=NF; ++f) if ($f in wanted) ++seen[$f]; out=$0; for (k=1; k<=count; ++k) out=out OFS (seen[tag[k]]+0); print out; delete seen}' file

NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 0 1
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref 8 0 3 0 0
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 9 0 2 0 0
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev 0 7 0 4 0
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref 9 0 2 0 0
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref 10 0 1 0 0

A more readable format:

awk '
BEGIN {
   # Tags to count; their totals are appended in this order.
   count = split("homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev", tag, ",")
   # Referencing wanted[...] creates the key, so "in" can test membership later.
   for (k = 1; k <= count; ++k)
      wanted[tag[k]]
}
{
   # Tally only the fields that are one of the wanted tags.
   for (f = 1; f <= NF; ++f)
      if ($f in wanted)
         ++seen[$f]

   # Original line followed by one OFS-separated count per tag (+0 prints unseen tags as 0).
   out = $0
   for (k = 1; k <= count; ++k)
      out = out OFS (seen[tag[k]] + 0)
   print out

   # Reset the tallies before the next line.
   delete seen
}' file

Upvotes: 1

Counting number of occurrence of different strings per line and appending counts as columns

Answers (3)

Related Questions