Reputation: 41

awk extract columns from file based on header from second file 3

I am trying to run some modified code from a previous thread on this subject. I have a file data.txt where the first row is the header. I would like to create a new file with only those columns that match the entries from a second file (list.txt).

data.txt

1,2,3,4,5,6,7,8,9,10
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0

list.txt

Desired output is

3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0,1.000000,0
0,0,0,0
0,0,0,1.000000

I have used the code below

echo "${DATAFILE:-data.txt}"
echo "${COLUMNFILE:-list.txt}"

awk {
     j=1
     while ((getline < COLUMNFILE) > 0) {
        col[j++] = $1
     }
     n=j-1;
     close(COLUMNFILE)
     for (i=1; i<=n; i++) s[col[i]]=i
   }
   NR==1 {
     for (f=1; f<=NF; f++)
       if ($f in s) c[s[$f]]=f
     next
   }
   { sep=","
     for (f=1; f<=n; f++) {
       printf("%c%s",sep,$c[f])
       sep=FS
     }
     print ""
 } 
 DATAFILE

I get the result below which is duplicating the rows in data.txt and not doing any selection. The entries from list.txt are printing at the end of the file

1,2,3,4,5,6,7,8,9,10
1,2,3,4,5,6,7,8,9,10
1.000000,0,0,0,0,0,0,0,0,0
1.000000,0,0,0,0,0,0,0,0,0

0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0

0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0

0,0,0,1.000000,0,0,0,0,0,0
0,0,0,1.000000,0,0,0,0,0,0

0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0

0,0,0,0,0,1.000000,0,0.062500,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0

0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0

0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0

0,0,0,0,0,0,0,0,1.000000,0
0,0,0,0,0,0,0,0,1.000000,0

3
3

5
5

7
7

9
9

Any help is much appreciated.

Upvotes: 0

Answers (6)

Marc Lambrichs

Reputation: 2882

awk solution, taking into account that columnnames can be anything, not only the index of the columns.

BEGIN { FS=OFS="," }
NR==FNR { l[$0]++; next }                      # save headers from list
FNR==1{ for (i=1; i<=NF; i++) 
            if ($i in l){ max=i; c[i]++ }}         # save column index in c;
                                                   # max index in max 
{ for(j=1; j<=NF; j++)                             # loop over column indices
      if(j in c)                                   # if index in c
          printf "%s%s", $j, (j==max ? ORS : OFS)  # print column
}

with input:

$ cat list.txt
C
E
G
I

and

$ cat data.txt
A,B,C,D,E,F,G,H,I,J
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0

gives as result:

$ awk 'BEGIN {FS=OFS=","} NR==FNR{l[$0]++;next} FNR==1{ for (i=1; i<=NF; i++) if ($i in l){max=i; c[i]++}}{for (j=1;j<=NF;j++) if(j in c) printf "%s%s",$j,(j==max ?ORS:OFS) }' list.txt data.txt
C,E,G,I
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

Upvotes: 0

James Brown

Reputation: 37394

Here's an awk that processes the list.txt to a field list and calls another awk with that list to process the data.txt:

$ awk '
BEGIN { FS=OFS="," }          # set the delimiters for the list file
NR==FNR {                     # process the list file
    p=p (p==""?"":OFS) "$" $1 # make a field list ($3,$5,$7,$9)
    next
}
{                             # process the data or call the processor
    RS=""                     # for getline to return multilined output
    cmd="awk \047BEGIN{FS=OFS=\",\"}{print "p"}\047 " FILENAME   # build awk call
    cmd | getline res         # actual awk call and output to res
    print res                 # output res
    exit                      # exit after first record
}
' list data
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

Upvotes: 1

Ed Morton

Reputation: 203229

$ awk '
    BEGIN { FS=OFS="," }
    NR==FNR { f[++nf]=$0; next }
    { for (i=1; i<=nf; i++) printf "%s%s", $(f[i]), (i<nf?OFS:ORS) }
' list.txt data.txt
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

Upvotes: 3

karakfa

Reputation: 67467

a non awk solution to compare and contrast...

$ join -t, <(sort list) <(<file tr ',' '\n' | pr -10ts, | sort) | 
  sort -n | 
  tr ',' '\n' | 
  pr -4ts,


3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

you need the magic numbers 10 and 4 which are the column numbers of the original file and extracted ones (those can be automated as well). Multiple sorts required to convert numerical sorting to lexicographical and back (required for join).

The algorithm in essence is transpose-join-transpose

Upvotes: 1

RomanPerekhrest

Reputation: 92854

awk solution:

awk -F, 'function pr(a){ r=""; for(i=1;i<=NF;i++) if(i in a) r=(r!="")? r","$i:$i; print r }
         NR==FNR{ a[$0]; next }{ pr(a) }' list.txt data.txt

The output:

3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

Upvotes: 1

JNevill

Reputation: 50034

You can pass both your position file and your data file into awk and do your logic internally:

 awk -F"," 'FILENAME=="list.txt"{a[NR]=$1}FILENAME=="data.txt"{for(i=1; i<=length(a); i++){printf (i==length(a)?"%s\n":"%s,"),$a[i]}}' list.txt data.txt

Here we are:

Splitting incoming files with a comma delimiter (-F",")
If the FILENAME awk variable is "list.txt" (FILENAME=="list.txt")
++Then add the value in the line to an array using the line number as the index (a[NR]=$1)
If the FILENAME awk variable is "data.txt" (FILENAME=="data.txt")
++Then loop through each element in the array for(i=1; i<=length(a); i++)
++++And print out the value of the item at that position ($a[i]). If the position is the last position found (i==length(a)) then print it out with a line feed following ("%s\n") otherwise print it out with a comma following ($a[i]).

The other option is to pass in your positions via the -v (variable) flag, but this doesn't account well for a variable number of positions:

awk -F"," -v f1=$(awk 'NR==1' list.txt) -v f2=$(awk 'NR==2' list.txt) -v f3=$(awk 'NR==3' list.txt) -v f4=$(awk 'NR==4' list.txt) '{print $f1, $f2, $f3, $f4}' data.txt

Upvotes: 1

awk extract columns from file based on header from second file 3

Answers (6)

Related Questions