Reputation: 73
I have a large csv matrix file (input.csv) in the following format:
Patient,sample 66_pos_LC/MS Pos_con,sample 57_net_LC/MS Neg_dis,sample 1_LC/MS Polar_con,sample 3_LC/MS Neg_net
xx,2.5,-7.8,2.5,3.6
ab 1,5.4,3,0.3
yy,43,33,77,55
depending on the first line of the file, I want to split file content based on LC/MS feature. That is, if I want all LC/MS Neg, output file would look like :
output1.csv
Patient,sample 57_net_LC/MS Neg_dis,sample 3_LC/MS Neg_net
xx,-7.8,3.6
ab,5.4,0.3
yy,33,55
For this I use:
head -n 1 input.csv | tr ',' '\n' | cat -n | grep 'LC/MS Neg'
This gives the column numbers where I should look (in this case: 3 and 5); to get the above output, I use:
cut -d, -f1,3,5 input.csv > output1.csv
Although it gives me the desired output, but I find this would be complicated for extracting many columns at a time.
I will highly appreciate some awk/sed solution.
Thank you.
Upvotes: 1
Views: 52
Reputation: 23667
$ cat get_cols.awk
BEGIN{ FS=OFS="," }
NR==1 {
i = 1
idx[i++] = 1
for(j=2; j<=NF; j++)
if($j ~ /LC\/MS Neg/)
idx[i++] = j
}
{
for(k=1; k<i; k++)
printf "%s", k==1 ? $idx[k] : OFS $idx[k]
print ""
}
,
idx
to save the index we need to print
Run it as:
$ awk -f get_cols.awk ip.txt
Patient,sample 57_net_LC/MS Neg_dis,sample 3_LC/MS Neg_net
xx,-7.8,3.6
ab,5.4,0.3
yy,33,55
Same concept with perl
$ perl -F, -lane '@idx = grep { $F[$_] =~ m|LC/MS Neg| } (1..$#F) if $.==1;
print join ",", @F[0,@idx]' ip.txt
Patient,sample 57_net_LC/MS Neg_dis,sample 3_LC/MS Neg_net
xx,-7.8,3.6
ab,5.4,0.3
yy,33,55
Upvotes: 0
Reputation: 203522
$ cat tst.awk
BEGIN { FS=OFS="," }
NR==1 {
f[++numFlds] = 1
for (i=2; i<=NF; i++) {
if ($i ~ sel) {
f[++numFlds] = i
}
}
}
{
for (i=1; i<=numFlds; i++) {
printf "%s%s", $(f[i]), (i<numFlds ? OFS : ORS)
}
}
.
$ awk -v sel=Neg -f tst.awk file
Patient,sample 57_net_LC/MS Neg_dis,sample 3_LC/MS Neg_net
xx,-7.8,3.6
ab 1,3,
yy,33,55
$ awk -v sel=Pos -f tst.awk file
Patient,sample 66_pos_LC/MS Pos_con
xx,2.5
ab 1,5.4
yy,43
$ awk -v sel=Polar -f tst.awk file
Patient,sample 1_LC/MS Polar_con
xx,2.5
ab 1,0.3
yy,77
$ awk -v sel='Pos|Neg' -f tst.awk file
Patient,sample 66_pos_LC/MS Pos_con,sample 57_net_LC/MS Neg_dis,sample 3_LC/MS Neg_net
xx,2.5,-7.8,3.6
ab 1,5.4,3,
yy,43,33,55
Upvotes: 2