Reputation: 23
I have a long list of IDs I need to parse. I want to extract three pieces of information and write them to a 3-column CSV. Column 1 = the field between the pipes in tr|XXXX|, column 3 = the field after the second | but before OS=.
Column 2 would be conditional. If there is 'GN=XXX' in the line, I'd like it to return XXX. If GN= isn't present, I'd like to write the first section of column 3 (i.e. up to the first space).
Input:
>tr|I1WXP1|I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment) OS=uncultured euryarchaeote OX=114243 GN=mcrA PE=4 SV=1
>tr|A0A059VAR9|A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment) OS=Halorubrum sp. Ga66 OX=1480727 GN=atpB PE=3 SV=1
>tr|Q51760|Q51760_9EURY Glutaredoxin-like protein OS=Pyrococcus furiosus OX=2261 PE=1 SV=1
Desired output:
I1WXP1,mcrA,I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
A0A059VAR9,atpB,A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment)
Q51760,Q51760_9EURY,Q51760_9EURY Glutaredoxin-like protein
I can get the first two with awk, e.g.:
# Print the accession: the second "|"-separated piece of every input line.
awk '{split($0,a,"|"); print a[2]}' file
But I can't work out the conditional, or how to act on the 'GN=' pattern neatly.
So for example, extracting bold text:
tr|**I1WXP1**|**I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)** OS=uncultured euryarchaeote OX=114243 GN=**mcrA** PE=4 SV=1
Becomes:
I1WXP1,mcrA,I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
Upvotes: 2
Views: 116
Reputation: 133458
1st solution (GNU awk version): With your shown samples, please try the following awk code. Written and tested with the shown samples in GNU awk.
# GNU awk only: relies on the 3-argument match() and the \S regex operator.
# The field separator is either the leading ">tr|" or the " OS=" marker, so
# $2 holds "ACCESSION|ENTRY_NAME description".
awk -F'^>tr\\|| OS=' '
BEGIN { OFS = "," }

# Records that did not split into at least two fields are ignored.
NF < 2 { next }

{
  # Turn the remaining pipe(s) of $2 into commas: "ACCESSION,ENTRY_NAME description".
  gsub(/\|/, OFS, $2)

  if (match($0, /GN=(\S+)/, gene)) {
    # Gene name present: accession, gene name, full description.
    match($2, /(^[^,]*),(.*)/, part)
    print part[1] OFS gene[1] OFS part[2]
  }
  else {
    # No gene name: the first word of the description doubles as column 2.
    match($2, /(^[^,]*),(\S+)(.*)/, part)
    print part[1] OFS part[2] OFS part[2] part[3]
  }
}
' Input_file
2nd solution (with any awk version): Adding additional awk code which should work with any version of it.
# Portable awk: only the 2-argument match() plus RSTART/RLENGTH are used.
awk '
BEGIN { OFS = "," }

# Only lines that start with ">tr" and contain " OS=" are processed.
match($0, /^>tr.* OS=/) {
  # Drop the 4-character ">tr|" prefix and the 4-character " OS=" suffix,
  # then turn the remaining pipe(s) into commas.
  head = substr($0, RSTART + 4, RLENGTH - 8)
  gsub(/\|/, ",", head)

  # Gene name: the text after "GN=" (empty when there is no GN= tag).
  match($0, /GN=[^[:space:]]+/)
  gene = substr($0, RSTART + 3, RLENGTH - 3)

  if (!gene) {
    # No gene name: the first word after the comma is used for column 2
    # and also starts column 3.
    match(head, /,[^[:space:]]+/)
    entry = substr(head, RSTART + 1, RLENGTH - 1)
    print substr(head, 1, RSTART - 1), entry, entry substr(head, RSTART + RLENGTH)
  }
  else {
    # Gene name present: insert it after the accession; the rest of head
    # still begins with its own comma.
    match(head, /^[^,]*/)
    print substr(head, RSTART, RLENGTH), gene substr(head, RSTART + RLENGTH)
  }
}
' Input_file
Upvotes: 0
Reputation: 3975
GNU Awk
# GNU awk only (gensub). Fields are split on "|": $2 is the accession and
# $3 is "ENTRY_NAME description OS=...".
awk '
BEGIN { FS = "|"; OFS = "," }
{
  if ($0 ~ /GN=/)
    gene = gensub(/.* GN=([^ ]*) .*/, "\\1", 1)   # value of the GN= tag
  else
    gene = gensub(/ .*/, "", 1, $3)               # first word of $3

  # Cut the record at " OS="; changing $0 makes awk re-split the fields,
  # so $3 now ends where the description ends.
  sub(/ OS=.*$/, "")
  print $2, gene, $3
}
' file
I1WXP1,mcrA,I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
A0A059VAR9,atpB,A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment)
Q51760,Q51760_9EURY,Q51760_9EURY Glutaredoxin-like protein
# GNU awk only (3-argument match). Fields are split on "|": $2 is the
# accession and $3 is "ENTRY_NAME description OS=...".
awk '
BEGIN { FS = "|"; OFS = "," }
{
  if (match($3, /(.*) OS=.* GN=([^ ]*) .*/, m)) {
    # GN= present: m[1] is the description, m[2] the gene name.
    print $2, m[2], m[1]
  }
  else if (match($3, /^([^ ]*) (.*) OS=.*/, m)) {
    # No GN=: m[1] is the first word, m[2] the rest of the description.
    print $2, m[1], m[1] " " m[2]
  }
}
' file
I1WXP1,mcrA,I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
A0A059VAR9,atpB,A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment)
Q51760,Q51760_9EURY,Q51760_9EURY Glutaredoxin-like protein
Upvotes: 0
Reputation: 58371
This might work for you (GNU sed):
# Print "accession,gene,description" for each matching line; lines that match
# neither pattern produce no output (-n).
#   1st s///: line with GN=  -> \1 accession, \3 gene name, \2 description.
#   t:        skip the fallback when the first substitution succeeded.
#   2nd s///: no GN=         -> the first word of the description is column 2.
# The description ends at " OS=" (with the equals sign). Matching only " OS"
# would let the greedy (.*) run on to a later word that merely starts with
# "OS" (e.g. a strain name such as "OS155") and pull the organism text into
# the description column.
sed -En 's/^[^|]*\|([^|]*)\|(.*) OS=.*GN=(\S+).*/\1,\3,\2/p;t
s/^[^|]*\|([^|]*)\|((\S+).*) OS=.*/\1,\3,\2/p' file
Use pattern matching and back references to format the desired result.
If the first match fails use the second.
If neither matches no output.
Upvotes: 0
Reputation: 10123
As an alternative in sed:
# Basic-regex sed, no -n: lines matching neither pattern are printed unchanged.
#   1st expression: line with " GN=" -> accession,gene,description.
#   t:              if that substitution succeeded, skip to the next line.
#   3rd expression: no GN= -> the first word of the description is column 2.
sed -e 's/.*tr|\([^|]*\)|\(.*\) OS=.* GN=\([^ ]*\).*/\1,\3,\2/' \
    -e 't' \
    -e 's/.*tr|\([^|]*\)|\(\([^ ]*\).*\) OS=.*/\1,\3,\2/' \
    file
Upvotes: 0
Reputation: 203229
Whenever your input contains tag=value pairs, as yours does, I find it best to first create an array that contains that mapping; you can then access the values by their tags (names) however you like, e.g. using any awk:
$ cat tst.awk
# Input lines look like:
#   >tr|ACCESSION|ENTRY_NAME description TAG=value TAG=value ...
# Output: accession, gene name (GN), description.
BEGIN { FS="[|]"; OFS="," }
{
    delete tag2val

    # The description is everything in $3 before the first " TAG=" token;
    # the remainder is the list of tag=value assignments.
    description = $3; sub(/ +[^ ]+=.*/,"",description)
    assignments = substr($3,length(description)+1)

    # Default GN to the first word of the description; a GN= tag found
    # below overrides it.
    tag2val["GN"] = description; sub(/ .*/,"",tag2val["GN"])

    # Walk the tokens in input order. A token of the form TAG=... starts a
    # new tag; any other token continues the value of the current tag, so
    # multi-word values such as "OS=Pyrococcus furiosus" are kept whole
    # instead of being cut at the first space.
    n = split(assignments,a," ")
    tag = ""
    for ( i=1; i<=n; i++ ) {
        if ( a[i] ~ /^[^=]+=/ ) {
            tag = a[i]; sub(/=.*/,"",tag)
            tag2val[tag] = substr(a[i],length(tag)+2)
        }
        else if ( tag != "" ) {
            tag2val[tag] = tag2val[tag] " " a[i]
        }
    }
    print $2, tag2val["GN"], description
}
$ awk -f tst.awk file
I1WXP1,mcrA,I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
A0A059VAR9,atpB,A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment)
Q51760,Q51760_9EURY,Q51760_9EURY Glutaredoxin-like protein
With that approach if you want to print or test other fields it's trivial, e.g.:
$ cat tst.awk
# Input lines look like:
#   >tr|ACCESSION|ENTRY_NAME description TAG=value TAG=value ...
# Output: accession, gene name (GN), organism (OS), PE value, description.
BEGIN { FS="[|]"; OFS="," }
{
    delete tag2val

    # The description is everything in $3 before the first " TAG=" token;
    # the remainder is the list of tag=value assignments.
    description = $3; sub(/ +[^ ]+=.*/,"",description)
    assignments = substr($3,length(description)+1)

    # Default GN to the first word of the description; a GN= tag found
    # below overrides it.
    tag2val["GN"] = description; sub(/ .*/,"",tag2val["GN"])

    # Walk the tokens in input order. A token of the form TAG=... starts a
    # new tag; any other token continues the value of the current tag, so
    # multi-word values such as "OS=Pyrococcus furiosus" are kept whole
    # instead of being cut at the first space.
    n = split(assignments,a," ")
    tag = ""
    for ( i=1; i<=n; i++ ) {
        if ( a[i] ~ /^[^=]+=/ ) {
            tag = a[i]; sub(/=.*/,"",tag)
            tag2val[tag] = substr(a[i],length(tag)+2)
        }
        else if ( tag != "" ) {
            tag2val[tag] = tag2val[tag] " " a[i]
        }
    }
    print $2, tag2val["GN"], tag2val["OS"], tag2val["PE"], description
}
$ awk -f tst.awk file
I1WXP1,mcrA,uncultured euryarchaeote,4,I1WXP1_9EURY Methyl coenzyme M reductase subunit A (Fragment)
A0A059VAR9,atpB,Halorubrum sp. Ga66,3,A0A059VAR9_9EURY V-type ATP synthase beta chain (Fragment)
Q51760,Q51760_9EURY,Pyrococcus furiosus,1,Q51760_9EURY Glutaredoxin-like protein
Upvotes: 2