Geroge
Geroge

Reputation: 561

Create reverse complement sequence based on AWK

Dear stackoverflow users,

I have TAB sep data like this:

head -4 input.tsv

seq A      C change
seq T      A ok
seq C      C change
seq AC   CCT change

And I need create reverse complement function in awk which do something like this

head -4 output.tsv

seq T      G change
seq T      A ok
seq G      G change
seq GT   AGG change

So if 4th column is flagged "change" I need to create reverse complement sequence.

HINT - the same things doing for example tr in bash - Bash one liner for this task is:

echo "ACCGA" | rev | tr "ATGC" "TACG"

I was tried something like this

awk 'BEGIN {c["A"] = "T"; c["C"] = "G"; c["G"] = "C"; c["T"] = "A" }{OFS="\t"}
function revcomp(   i, o) {
    o = ""
    for(i = length; i > 0; i--)
        o = o c[substr($0, i, 1)]
    return(o)
}
{

if($4 == "change"){$2 = revcom(); $3 = revcom()} print $0; else print $0}' input

Biological reverse sequence mean:

A => T
C => G
G => C
T => A

and reverse complement mean:

ACCATG => CATGGT

Edited: Also anybody just for education can share this solution in python.

Upvotes: 2

Views: 1332

Answers (4)

Sundeep
Sundeep

Reputation: 23667

If you are okay with perl:

$ perl -F'\t' -lane 'if($F[3] eq "change") {
                     $F[1] = (reverse $F[1] =~ tr/ATGC/TACG/r);
                     $F[2] = (reverse $F[2] =~ tr/ATGC/TACG/r) }
                     print join "\t", @F' ip.txt
seq T   G   change
seq T   A   ok
seq G   G   change
seq GT  AGG change

Can also use, but this is not specific to columns, will change any sequence of ATCG characters:

perl -lpe 's/\t\K[ATCG]++(?=.*\tchange$)/reverse $&=~tr|ATGC|TACG|r/ge'

Upvotes: 1

Ed Morton
Ed Morton

Reputation: 203684

Kinda inefficient for this particular application since it creates the mapping array on each call to tr() and does the same loop in tr() and then again in rev() but figured I'd show how to write standalone tr() and rev() functions and it'll probably be fast enough for your needs anyway:

$ cat tst.awk
BEGIN { FS=OFS="\t" }
$4 == "change" {
    for ( i=2; i<=3; i++) {
        $i = rev(tr($i,"ACGT","TGCA"))
    }
}
{ print }

function tr(instr,old,new,      outstr,pos,map) {
    for (pos=1; pos<=length(old); pos++) {
        map[substr(old,pos,1)] = substr(new,pos,1)
    }
    for (pos=1; pos<=length(instr); pos++) {
        outstr = outstr map[substr(instr,pos,1)]
    }
    return outstr
}

function rev(instr,     outstr,pos) {
    for (pos=1; pos<=length(instr); pos++) {
        outstr = substr(instr,pos,1) outstr
    }
    return outstr
}

.

$ awk -f tst.awk file
seq     T       G       change
seq     T       A       ok
seq     G       G       change
seq     GT      AGG     change

Upvotes: 2

Inian
Inian

Reputation: 85683

With a little tinkering of your attempt you can do something like below.

function revcomp(arg) {
    o = ""
    for(i = length(arg); i > 0; i--)
        o = o c[substr(arg, i, 1)]
    return(o)
}

BEGIN {c["A"] = "T"; c["C"] = "G"; c["G"] = "C"; c["T"] = "A" ; OFS="\t"}

{
    if($4 == "change") {
        $2 = revcomp($2); 
        $3 = revcomp($3)
    } 
}1

The key here was to use the function revcomp to take the arguments as the column values and operate on it by iterating from end. You were previously doing on the whole line $0, i.e. substr($0, i, 1) which would be causing a lot of unusual lookups on the array c.

I've also taken the liberty of changing the prototype of your function revcomp to take the input string and return the reversed one. Because I wasn't sure how you were intending to use in your original attempt.

If you are intending to use the above in a part of a larger script, I would recommend putting the whole code as-above in a script file, set the she-bang interpreter to #!/usr/bin/awk -f and run the script as awk -f script.awk input.tsv

A crude bash version implemented in awk would like below. Note that, it is not clean and not a recommended approach. See more at AllAboutGetline

As before call the function as $2 = revcomp_bash($2) and $3 = revcomp_bash($3)

function revcomp_bash(arg) {
    o = ""
    cmd = "printf \"%s\" " arg "| rev | tr \"ATGC\" \"TACG\""
    while ( ( cmd | getline o ) > 0 ) { 

    }
    close(cmd);
    return(o)
}

Your whole code speaks GNU awk-ism, so didn't care for converting it to a POSIX compliant one. You could use split() with a empty de-limiter instead of length() but the POSIX specification gladly says that "The effect of a null string as the value of fs is unspecified."

Upvotes: 3

RavinderSingh13
RavinderSingh13

Reputation: 133538

Could you please try following, written and tested with shown samples(in GNU awk).

awk '
BEGIN{
  label["A"]="T"
  label["C"]="G"
  label["G"]="C"
  label["T"]="A"
}
function cVal(field){
  delete array
  num=split($field,array,"")
  for(k=1;k<=num;k++){
   if(array[k] in label){
     val=label[array[k]] val
   }
  }
  $field=val
  val=""
}
$NF=="change"{
  for(i=2;i<=(NF-1);i++){
    cVal(i)
  }
}
1
'  Input_file | column -t

Explanation: Adding detailed explanation for above code.

awk '                                ##Starting awk program from here.
BEGIN{                               ##Starting BEGIN section of this code here.
  label["A"]="T"                     ##Creating array label with index A and value T.
  label["C"]="G"                     ##Creating array label with index C and value G.
  label["G"]="C"                     ##Creating array label with index G and value C.
  label["T"]="A"                     ##Creating array label with index T and value A.
}
function cVal(field){                ##Creating function named cVal here with passing value field in it.
  delete array                       ##Deleting array here.
  num=split($field,array,"")         ##Splitting current field value passed to it and creating array.
  for(k=1;k<=num;k++){               ##Running for loop fromk=1 to till value of num here.
   if(array[k] in label){            ##Checking condition if array value with index k is present in label array then do following.
     val=label[array[k]] val         ##Creating val which has label value with index array with index k and keep concatenating its value to it.
   }
  }
  $field=val                         ##Setting current field value to val here.
  val=""                             ##Nullifying val here.
}
$NF=="change"{                       ##Checking condition if last field is change then do following.
  for(i=2;i<=(NF-1);i++){            ##Running for loop from 2nd field to 2nd last field.
    cVal(i)                          ##Calling function with passing current field number to it.
  }
}
1                                    ##1 will print current line here.
' Input_file | column -t             ##Mentioning Input_file name here.

Upvotes: 2

Related Questions