Reputation: 1316
cat file.txt
MNS GYPA*N
MNS GYPA*M c.59T>C;c.71A>G;c.72G>T
MNS GYPA*Mc c.71G>A;c.72T>G
MNS GYPA*Vw c.140C>T
MNS GYPA*Mg c.68C>A
MNS GYPA*Vr c.197C>A
MNS GYPB*Mta c.230C>T
MNS GYPB*Ria c.226G>A
MNS GYPB*Nya c.138T>A
MNS GYPB*Hut c.140C>A
.
.
.
The second-column values can start with GYPA, GYPB, GYPC, GYPD, ... GYPZ. I would like to assign a position count within each GYP* group and split the third column as follows:
1 MNS GYPA*N
2 MNS GYPA*M c.59T>C
2 MNS GYPA*M c.71A>G
2 MNS GYPA*M c.72G>T
3 MNS GYPA*Mc c.71G>A
3 MNS GYPA*Mc c.72T>G
4 MNS GYPA*Vw c.140C>T
5 MNS GYPA*Mg c.68C>A
6 MNS GYPA*Vr c.197C>A
1 MNS GYPB*Mta c.230C>T
2 MNS GYPB*Ria c.226G>A
3 MNS GYPB*Nya c.138T>A
4 MNS GYPB*Hut c.140C>A
.
.
.
cat format.awk
# format.awk: attempt to split the ";"-separated third column into one output
# row per item, prefixing each row with a number.
# Input and output fields are tab-separated.
BEGIN {FS=OFS="\t"}
# NOTE: a pattern on a line of its own is a complete rule with the default
# action, so this prints every record whose $2 matches GYPA unchanged.
$2 ~ /GYPA/
# NOTE: because it starts on a new line, this block is a separate rule with no
# pattern and therefore runs for every record, not only for GYPA ones.
# NR is the running record number of the whole input, so it is not restarted
# per GYP group.
{ num=split($3,arr,/;/);
for (i=1;i<=num;i++)
{ print NR,$1,$2,arr[i]}}
# Same layout as above: a bare pattern rule followed by an unconditional block,
# so every record is split and printed a second time here.
$2 ~ /GYPB/
{ num=split($3,arr,/;/);
for (i=1;i<=num;i++)
{ print NR,$1,$2,arr[i]} }
...
I am not sure how to reset NR when it reaches the next ~ GYP. The GYP{A..Z} are in order from A to Z.
Upvotes: 2
Views: 679
Reputation: 133700
awk '
# Number each allele (2nd field) within its GYP gene group and print one
# output row per variant listed in the ";"-separated 3rd field.
# Fields are split on runs of whitespace (default FS); output uses a single
# space (default OFS).

# Rows without a 2nd field (blank lines, stray markers) are passed through
# unchanged and do not affect the numbering.
NF < 2 {
  print
  next
}
{
  # Gene group = everything in the 2nd field before the first "*", e.g. GYPA.
  match($2, /[^*]*/)
  gy_value = substr($2, RSTART, RLENGTH)
}
gy_value != prev_gy_value {
  count = 0                      # new gene group: restart the numbering
}
!arr[$2]++ {
  count++                        # first time this allele is seen: next position
}
{
  num = split($3, array, ";")    # split() returns 0 when the 3rd field is empty
  if (num == 0) {
    # No variants listed: still emit the row, prefixed with its position,
    # instead of echoing the raw input line without a count.
    print count, $1, $2
  }
  for (i = 1; i <= num; i++) {
    print count, $1, $2, array[i]
  }
  prev_gy_value = gy_value       # remembered to detect the next group change
}
' file.txt
Explanation: Adding a detailed explanation for the above code.
awk '                                      ##Starting awk program from here.
NF<2{                                      ##Checking if the line has fewer than 2 fields (blank line or no 2nd column).
  print                                    ##Printing such a line unchanged.
  next                                     ##Skipping the rest of the program for this line, so it does not affect the numbering.
}
{
  match($2,/[^*]*/)                        ##Using match function to match everything before the first * in 2nd field.
  gy_value=substr($2,RSTART,RLENGTH)       ##Creating variable gy_value which holds the matched part of 2nd field (eg: GYPA).
}
gy_value!=prev_gy_value{                   ##Checking if gy_value differs from the value of the previous line (a new GYP group starts).
  count=0                                  ##Setting variable count to 0 here.
}
!arr[$2]++{                                ##Checking if this 2nd field is seen for the first time.
  count++                                  ##Increasing value of count with 1 here.
}
{
  num=split($3,array,";")                  ##Splitting 3rd field into an array with delimiter ; and its count is stored into num variable (0 if 3rd field is empty).
  if(num==0){                              ##Checking if there is nothing in 3rd field.
    print count,$1,$2                      ##Printing count, $1 and $2 so that the line still gets its position number.
  }
  for(i=1;i<=num;i++){                     ##Starting for loop from i=1 to till value of num here.
    print count,$1,$2,array[i]             ##Printing count, $1, $2 and array element with index variable i here.
  }
}
{
  prev_gy_value=gy_value                   ##Setting value of variable gy_value to variable named prev_gy_value here (which is used in above code to detect a group change).
}
' Input_file                               ##Mentioning Input_file name here.
Upvotes: 3
Reputation: 26531
> I am not sure how to reset NR when it reaches the next ~ GYP. The GYP{A..Z} are in order from A to Z.
It is best not to reset or change awk's built-in variables such as NR, FNR or NF: awk maintains these values itself, and although assigning to them is permitted, doing so alters awk's own record and field bookkeeping. The easiest approach is to keep track of an alternative to NR in a variable of your own, which you can name c or anything else. This value can be reset under any condition to any value you want.
Example: have a counter that resets to one every time it sees foo in a record:
awk '
  # c numbers the records; it starts over whenever a record contains foo.
  /foo/ { c = 0 }            # reset first, so the increment below yields 1
        { print ++c, $0 }    # pre-increment, then print the counter and the record
'
In case of the OP, something like this might be used:
awk 'BEGIN { FS = OFS = "\t" }   # tab-separated input and output
# Number each row within its gene group and print one output row per
# ";"-separated variant of the 3rd field.

# Rows without a 2nd field carry no allele; ignore them so they neither
# produce output nor disturb the numbering.
NF < 2 { next }
{
  c++
  # Gene group = text of the 2nd field before the first "*". When there is
  # no "*", use the whole field rather than an empty key.
  star = index($2, "*")
  key = (star ? substr($2, 1, star - 1) : $2)
}
(key != key_prev) { c = 1 }      # new gene group: restart the counter
{ prefix = "" }
(key == "GYPA") { prefix = "NM_002099.7:" }   # reference prefix applied to GYPA variants only
{
  num = split($3, a, ";")        # split() returns 0 when the 3rd field is empty
  if (num == 0) {
    # No variants listed: still emit the numbered row instead of dropping it.
    print c, $1, $2
  }
  for (i = 1; i <= num; ++i) {
    print c, $1, $2, prefix a[i]
  }
}
{ key_prev = key }' file
Upvotes: 1