Reputation: 41
I am trying to run some modified code from a previous thread on this subject. I have a file data.txt where the first row is the header. I would like to create a new file with only those columns that match the entries from a second file (list.txt).
data.txt
1,2,3,4,5,6,7,8,9,10
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0
list.txt
3
5
7
9
Desired output is
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0,1.000000,0
0,0,0,0
0,0,0,1.000000
I have used the code below
echo "${DATAFILE:-data.txt}"
echo "${COLUMNFILE:-list.txt}"
awk {
j=1
while ((getline < COLUMNFILE) > 0) {
col[j++] = $1
}
n=j-1;
close(COLUMNFILE)
for (i=1; i<=n; i++) s[col[i]]=i
}
NR==1 {
for (f=1; f<=NF; f++)
if ($f in s) c[s[$f]]=f
next
}
{ sep=","
for (f=1; f<=n; f++) {
printf("%c%s",sep,$c[f])
sep=FS
}
print ""
}
DATAFILE
I get the result below which is duplicating the rows in data.txt and not doing any selection. The entries from list.txt are printing at the end of the file
1,2,3,4,5,6,7,8,9,10
1,2,3,4,5,6,7,8,9,10
1.000000,0,0,0,0,0,0,0,0,0
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0
0,0,0,0,0,0,0,0,1.000000,0
3
3
5
5
7
7
9
9
Any help is much appreciated.
Upvotes: 0
Views: 407
Reputation: 2882
awk solution, taking into account that columnnames can be anything, not only the index of the columns.
BEGIN { FS=OFS="," }
NR==FNR { l[$0]++; next } # save headers from list
FNR==1{ for (i=1; i<=NF; i++)
if ($i in l){ max=i; c[i]++ }} # save column index in c;
# max index in max
{ for(j=1; j<=NF; j++) # loop over column indices
if(j in c) # if index in c
printf "%s%s", $j, (j==max ? ORS : OFS) # print column
}
with input:
$ cat list.txt
C
E
G
I
and
$ cat data.txt
A,B,C,D,E,F,G,H,I,J
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0
gives as result:
$ awk 'BEGIN {FS=OFS=","} NR==FNR{l[$0]++;next} FNR==1{ for (i=1; i<=NF; i++) if ($i in l){max=i; c[i]++}}{for (j=1;j<=NF;j++) if(j in c) printf "%s%s",$j,(j==max ?ORS:OFS) }' list.txt data.txt
C,E,G,I
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
Upvotes: 0
Reputation: 37394
Here's an awk that processes the list.txt
to a field list and calls another awk with that list to process the data.txt
:
$ awk '
BEGIN { FS=OFS="," } # set the delimiters for the list file
NR==FNR { # process the list file
p=p (p==""?"":OFS) "$" $1 # make a field list ($3,$5,$7,$9)
next
}
{ # process the data or call the processor
RS="" # for getline to return multilined output
cmd="awk \047BEGIN{FS=OFS=\",\"}{print "p"}\047 " FILENAME # build awk call
cmd | getline res # actual awk call and output to res
print res # output res
exit # exit after first record
}
' list data
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
Upvotes: 1
Reputation: 203229
$ awk '
BEGIN { FS=OFS="," }
NR==FNR { f[++nf]=$0; next }
{ for (i=1; i<=nf; i++) printf "%s%s", $(f[i]), (i<nf?OFS:ORS) }
' list.txt data.txt
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
Upvotes: 3
Reputation: 67467
a non awk
solution to compare and contrast...
$ join -t, <(sort list) <(<file tr ',' '\n' | pr -10ts, | sort) |
sort -n |
tr ',' '\n' |
pr -4ts,
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
you need the magic numbers 10
and 4
which are the column numbers of the original file and extracted ones (those can be automated as well). Multiple sorts required to convert numerical sorting to lexicographical and back (required for join
).
The algorithm in essence is transpose
-join
-transpose
Upvotes: 1
Reputation: 92854
awk solution:
awk -F, 'function pr(a){ r=""; for(i=1;i<=NF;i++) if(i in a) r=(r!="")? r","$i:$i; print r }
NR==FNR{ a[$0]; next }{ pr(a) }' list.txt data.txt
The output:
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
Upvotes: 1
Reputation: 50034
You can pass both your position file and your data file into awk and do your logic internally:
awk -F"," 'FILENAME=="list.txt"{a[NR]=$1}FILENAME=="data.txt"{for(i=1; i<=length(a); i++){printf (i==length(a)?"%s\n":"%s,"),$a[i]}}' list.txt data.txt
Here we are:
-F","
)FILENAME=="list.txt"
)a[NR]=$1
)FILENAME=="data.txt"
)for(i=1; i<=length(a); i++)
$a[i]
). If the position is the last position found (i==length(a)
) then print it out with a line feed following ("%s\n"
) otherwise print it out with a comma following ($a[i]
).The other option is to pass in your positions via the -v (variable) flag, but this doesn't account well for a variable number of positions:
awk -F"," -v f1=$(awk 'NR==1' list.txt) -v f2=$(awk 'NR==2' list.txt) -v f3=$(awk 'NR==3' list.txt) -v f4=$(awk 'NR==4' list.txt) '{print $f1, $f2, $f3, $f4}' data.txt
Upvotes: 1