Reputation: 218
I am looking to learn how to concatenate multiple columns in Linux. I have a dataset which looks like this:
gene match_type drug sources pmids
ABO Definite CHEMBL50267 DrugBank 17139284|17016423
ABO Definite URIDINE_DIPHOSPHATE TdgClinicalTrial 17139284|17016423
ABO Definite CHEMBL439009 DrugBank 12972418
ABO Definite CHEMBL1232343 DrugBank NA
ABO Definite CHEMBL503075 DrugBank NA
I am trying to bring this into one row (concatenating the drug column, the sources column and the pmids column) to look like:
gene match_type drug sources pmids
ABO Definite CHEMBL1232343 CHEMBL439009 CHEMBL50267 CHEMBL503075 URIDINE_DIPHOSPHATE NA DrugBank TdgClinicalTrial DrugBank DrugBank DrugBank 0 12972418 17139284|17016423 17139284|17016423 NA NA
I have looked into using awk with if statements, but I am not really sure where to start with this. Any help in the right direction would be appreciated.
Upvotes: 0
Views: 42
Reputation: 133438
If you are not worried about the spacing of the header line, then please try the following.
# Collapse all rows that share the same first column (gene) into one row.
# The header line is passed through unchanged. For every other line,
# columns 3..N are merged per column (values joined with FS, a single space
# by default) and the output columns are separated by OFS (TAB).
# Groups are printed in the order in which each gene first appears.
awk '
FNR==1{                                   # header line: print as-is
  print
  next
}
!NF{ next }                               # ignore blank lines (no empty-key group)
{
  if(!($1 in b)){ order[++n]=$1 }         # remember first-seen order of genes
  b[$1]=$2                                # match_type of the gene (last one wins)
  if(NF>maxnf){ maxnf=NF }                # widest record decides the column count
  for(i=3;i<=NF;i++){
    # Test key existence, not truthiness, so a first value of 0 is not lost.
    a[$1,i]=((($1,i) in a) ? a[$1,i] FS $i : $i)
  }
}
END{
  for(k=1;k<=n;k++){
    gene=order[k]
    # Data goes through %s only; it is never used as the format string.
    printf "%s%s%s%s", gene, OFS, b[gene], (maxnf>2 ? OFS : ORS)
    for(i=3;i<=maxnf;i++){
      printf "%s%s", a[gene,i], (i==maxnf ? ORS : OFS)
    }
  }
}' OFS="\t" Input_file
Explanation: a detailed, line-by-line explanation of the above command follows.
awk '                                     ##Starting awk program here.
FNR==1{                                   ##Checking condition if FNR==1, i.e. the first line of Input_file, then do following.
  print                                   ##Printing the current (header) line unchanged.
  next                                    ##next skips all further statements for this line.
}                                         ##Closing FNR==1 condition BLOCK here.
!NF{ next }                               ##Skipping blank lines so that they do not create an empty group.
{                                         ##Starting main BLOCK which is executed for every remaining line of Input_file.
  if(!($1 in b)){ order[++n]=$1 }         ##Remembering each new $1 in array order so that output keeps the input order.
  b[$1]=$2                                ##Creating array named b whose index is $1 and value is $2.
  if(NF>maxnf){ maxnf=NF }                ##Keeping the largest field count seen, since NF in END only reflects the last line.
  for(i=3;i<=NF;i++){                     ##Starting a for loop which runs from i=3 up to the value of NF.
    a[$1,i]=((($1,i) in a) ? a[$1,i] FS $i : $i)  ##Appending $i to array a under index ($1,i); the in test (not the value) decides, so a first value of 0 is kept.
  }                                       ##Closing for loop block here.
}                                         ##Closing main BLOCK here.
END{                                      ##Starting END block of awk program here.
  for(k=1;k<=n;k++){                      ##Traversing the stored $1 values in their first-seen order.
    gene=order[k]                         ##Fetching the current $1 value.
    printf "%s%s%s%s", gene, OFS, b[gene], (maxnf>2 ? OFS : ORS)  ##Printing $1, OFS, the b value and then OFS (or ORS when there are no merged columns); data is passed only through %s, never as the format.
    for(i=3;i<=maxnf;i++){                ##Starting for loop from i=3 up to the largest field count seen.
      printf "%s%s", a[gene,i], (i==maxnf ? ORS : OFS)  ##Printing the merged column followed by OFS, or by ORS (new line) after the last column.
    }                                     ##Closing block for inner for loop here.
  }                                       ##Closing block for outer for loop here.
}' OFS="\t" Input_file                    ##Setting OFS as TAB here and mentioning Input_file name here.
Upvotes: 1