I am using the following bash function that operates on CSV files: for each CSV it executes an AWK script that performs some math operations on the column data and eventually saves the processed CSV to a new file.
home="$PWD"
# folder with the outputs
rescore="${home}"/rescore
# folder with the folders to analyse
storage="${home}"/results_bench
cd "${storage}"
# pattern of the CSV file located inside each sub-directory of "${storage}"
str='*str1.csv'
rescore_data2 () {
    str_name=$(basename "${str}" .csv)
    printf >&2 'Dataset for %s is being rescored... ' "${str_name}"; sleep 0.1
    mkdir "${rescore}"/"${str_name}"
    # Apply the following AWK code for rescoring and final data collecting
    while read -r d; do
        awk -F', *' -v OFS=', ' '
        FNR==1 {
            path=FILENAME
            sub(/\/[^/]+$/,"",path)
            prefix=suffix=FILENAME
            sub(/_.*/, "", prefix)
            sub(/\/[^\/]+$/, "", suffix); sub(/^.*_/, "", suffix)
            print suffix, "dG(rescored)"
            next
        }
        {
            print $1, sqrt((($3+12)/12)^2+(($2-240)/240)^2)
        }
        ' "${d}_"*/${str} > "${rescore}/${str_name}/${d%%_*}.csv"
    done < <(find . -maxdepth 1 -type d -name '*_*_*' | awk -F '[_/]' '!seen[$2]++ {print $2}')
}
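For clarity, the process substitution feeding the while loop emits each prefix only once. A rough dry run with made-up directory names (purely illustrative) would look like this:
# hypothetical sub-directories under "${storage}":
#   ./10V1_cne_lig12   ./10V1_cne_lig278   ./7DK_abc_lig40
find . -maxdepth 1 -type d -name '*_*_*' | awk -F '[_/]' '!seen[$2]++ {print $2}'
# prints each prefix once (in whatever order find lists the directories):
#   10V1
#   7DK
# so the loop body runs once per prefix, and "${d}_"*/${str} expands to
# every matching CSV of that prefix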
Basically, each input CSV has the following format:
# input CSV located in the folder 10V1_cne_lig12
ID, POP, dG
1, 142, -5.6500
2, 10, -5.5000
3, 2, -4.9500
4, 150, -4.1200
My AWK code converts it to a two-column format (by applying a math equation to the 2nd and 3rd columns):
# output.csv
lig12, dG(rescored)
1, 0.596625
2, 1.05873
3, 1.11285
4, 0.697402
Note that lig12 in the first line is the suffix (used as the ID of the CSV), which my AWK code extracts from the name of the FOLDER containing this CSV, and 10V1 is the prefix (it defines the type of the CSV).
I need to pipe my AWK script into something like sed or AWK that will further modify the obtained output.csv, converting it to a one-line format containing the suffix (lig12), the minimal value detected in the second column of the output (here 0.596625), as well as its corresponding ID number from the first column (1):
lig12, 0.596625 (1)
Here is a one-line AWK solution, which does the job for just one CSV:
awk -F ', ' 'NR==1 { coltitle=$1 } NR==2 { min=$2; id=$1 } NR>2 && $2<min { min=$2; id=$1 } END { print coltitle FS min" ("id")" }'
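For example, applied to the sample output.csv shown above (the file name is assumed here just for illustration), it collapses the file to the one-line format:
awk -F ', ' 'NR==1 { coltitle=$1 } NR==2 { min=$2; id=$1 } NR>2 && $2<min { min=$2; id=$1 } END { print coltitle FS min" ("id")" }' output.csv
# lig12, 0.596625 (1)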
Can it be piped correctly into the first AWK code inside rescore_data2(), which is applied to the many CSVs processed by my bash function? The expected output stored in "${rescore}/${str_name}/${d%%_*}.csv" should then contain as many lines (each with the dG(min) of one CSV) as the number of processed CSVs:
# expected output for 10 processed CSVs belonging to the prefix 10V1
# currently it does not print dGmin correctly for different CSVs
name: 10V1, dG(min)  # the header with the prefix should be at the top!
lig199, 0.946749 (1)
lig211, 0.946749 (1)
lig278, 0.756155 (2)
lig40, 0.756155 (2)
lig576, 0.594778 (1)
lig619, 0.594778 (1)
lig697, 0.594778 (1)
lig800, 0.594778 (1)
lig868, 0.594778 (1)
lig868, 0.594778 (1)
I have extracted the awk script as follows (with minor modifications):
awk -F', *' -v OFS=', ' '
FNR==1 {
    path=FILENAME
    sub(/\/[^/]+$/,"",path)
    prefix=suffix=FILENAME
    sub(/_.*/, "", prefix)
    sub(/\/[^\/]+$/, "", suffix); sub(/^.*_/, "", suffix)
    print suffix, "dG(rescored)"
    next
}
{
    print $1, sqrt((($3+12)/12)^2+(($2-240)/240)^2)
}
' 10V1_cne_lig12/foo_str3a.csv
The output looks like:
lig12, dG(rescored)
1, 0.668396
2, 1.10082
3, 1.15263
4, 0.756198
Although the values differ slightly from the provided result, please let me go on as is.
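For reference, a quick check suggests the difference comes from the data rather than from the extracted script: plugging the first data row of the sample input (POP = 142, dG = -5.6500) into the formula reproduces the value printed above.
awk 'BEGIN { print sqrt(((-5.65+12)/12)^2 + ((142-240)/240)^2) }'
# prints 0.668396, i.e. sqrt(0.529167^2 + 0.408333^2)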
Then add modifications to the awk script as follows:
awk -F', *' -v OFS=', ' '
FNR==1 {
    dgmin = ""          # initialize the min value
    path=FILENAME
    sub(/\/[^/]+$/,"",path)
    prefix=suffix=FILENAME
    sub(/_.*/, "", prefix)
    sub(/\/[^\/]+$/, "", suffix); sub(/^.*_/, "", suffix)
    print suffix, "dG(rescored)"
    next
}
{
    dG = sqrt((($3+12)/12)^2+(($2-240)/240)^2)
    if (dGmin == "" || dG < dGmin) {
        dGmin = dG      # update the min dG value
        dGminid = $1    # update the ID with the min dG
    }
}
END {
    print suffix, dGmin " (" dGminid ")"    # report the results
}
' 10V1_cne_lig12/foo_str3a.csv
Output:
lig12, dG(rescored)
lig12, 0.668396 (1)
You'll see the 1st record is picked along with its ID.
The awk script above assumes there is just one input file. If you want to process multiple CSV files at once, you will need to put the "report the results" line not only in the END{} block but also at the start of the FNR==1{} block, which fires whenever processing of one file finishes and the next begins.
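Here is a minimal sketch of that idiom (the file names a.csv and b.csv and the bare "name min" output are made up for illustration; each file is assumed to start with a header line):
awk -F', *' '
FNR==1 { if (name) print name, min   # flush the previous file before starting a new one
         name=FILENAME; min=""; next }
{ if (min=="" || $2<min) min=$2 }    # track the per-file minimum of column 2
END { if (name) print name, min }    # flush the last file
' a.csv b.csv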
[Update]
Would you replace your rescore_data3() function with:
rescore_data3 () {
    str_name=$(basename "${str}" .csv)
    printf >&2 'Dataset for %s is being rescored... ' "${str_name}"; sleep 0.1
    mkdir -p "${rescore}"/"${str_name}"
    # Apply the following AWK code for rescoring and final data collecting
    while read -r d; do
        awk -F', *' -v OFS=', ' '
        FNR==1 {
            if (suffix)                                 # suppress the empty line
                print suffix, dGmin " (" dGminid ")"    # report the results
            dGmin = ""                                  # initialize the min value
            path=FILENAME
            sub(/\/[^/]+$/,"",path)
            prefix=suffix=FILENAME
            sub(/_.*/, "", prefix)
            sub(/\/[^\/]+$/, "", suffix); sub(/^.*_/, "", suffix)
            if (FNR==NR)
                print prefix                            # print the header line
            next
        }
        {
            dG = sqrt((($3+12)/12)^2+(($2-240)/240)^2)
            if (dGmin == "" || dG < dGmin) {
                dGmin = dG                              # update the min dG value
                dGminid = $1                            # update the ID with the min dG
            }
        }
        END {
            print suffix, dGmin " (" dGminid ")"        # report the results
        }
        ' "${d}_"*/${str} > "${rescore}/${str_name}/${d%%_*}.csv"
    done < <(find . -maxdepth 1 -type d -name '*_*_*' | awk -F '[_/]' '!seen[$2]++ {print $2}')
}
The main changes from the previous version:
- Added if (suffix) ... in the FNR==1{} block to suppress the empty line at the beginning of the result file.
- Fixed dgmin = "", which should be dGmin = "", in my previous answer.
- Added the -p option to mkdir so you can avoid the "mkdir: cannot create directory: File exists" error.
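If it helps to visualize the result, here is the shape of one output file under these assumptions (directory names other than 10V1_cne_lig12, and all the values, are placeholders):
# hypothetical input directories matched by "${d}_"*/${str} for d=10V1:
#   10V1_cne_lig12/foo_str1.csv   10V1_cne_lig278/bar_str1.csv   ...
# resulting "${rescore}/${str_name}/10V1.csv":
#   10V1
#   lig12, <min dG of that CSV> (<its ID>)
#   lig278, <min dG of that CSV> (<its ID>)
#   ...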