Reputation: 89
I have a file containing data as below (the number of file entries always varies):
May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000 2 ✔
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055 185 ✈
I want the output in one line like below, but I need the sum of the record counts (column 6), and it's better to print the latest file (according to its timestamp):
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000 187 ✔
I tried the sed command below so that I can ignore the timestamp in the file name. But can you please help me remove the duplicates along with summing the counts (column 6)?
sed -e 's/\(.*\)[0-9]\{6\}\.\([^.]*\)/\1.\2/'
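For reference (a quick check, assuming the sample lines are in a file named file), that substitution strips only the trailing HHMMSS digits, so both lines end up with the same name and the duplicates become visible:
$ sed -e 's/\(.*\)[0-9]\{6\}\.\([^.]*\)/\1.\2/' file
May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501.csv 0.000 2 ✔
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501.csv 0.055 185 ✈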
Upvotes: 2
Views: 250
Reputation: 8711
Here is a Perl one-liner. It keys on the filename (field 4) with the trailing _<timestamp>.<ext> part stripped, sums field 6 per key, remembers the last line seen for each key, and in the END block replaces that line's count with the accumulated sum:
$ cat barcode.txt
May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000 2 ✔
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055 185 ✈
May 1 19:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501194500.csv 0.055 3 ⤈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502174500.txt 0.055 1 ✈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502184500.txt 0.015 2 ✈
$ perl -lane ' $x=$F[3];$x=~s/(.*)(_.*)/$1/g; $kv{$x}+=$F[5];$kv2{$x}=$_; END {for(keys %kv) { $kv2{$_}=~s/(.*)(\d+)(\s\S+)$/$1$kv{$_}$3/g; print $kv2{$_} } } ' barcode.txt
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502184500.txt 0.015 3 ✈
May 1 19:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501194500.csv 0.055 190 ⤈
Upvotes: 0
Reputation: 203684
Borrowing @Allan's sample input:
$ cat tst.awk
{
    base = time = $4
    sub(/_[^_]+$/,"",base)      # base: the name with the trailing _<timestamp>.<ext> stripped (the group key)
    sub(/.*_/,"",time)          # time: just the <timestamp>.<ext> part
    if (time > max[base]) {     # keep the fields of the latest file seen for this key
        max[base] = time
        pre[base] = $1 OFS $2 OFS $3 OFS $4 OFS $5
        suc[base] = $7
    }
    tot[base] += $6             # sum column 6 per key
}
END {
    for (base in tot) {
        print pre[base], tot[base], suc[base]
    }
}
$ awk -f tst.awk file
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055 187 ✈
May 2 12:00 ./archive/xxx_123_20180502120001.csv 0.000 3 ✒
May 1 18:45 ./archive/xxx_uvw_ABC_20180501184500.csv 0.055 138 ✕
May 1 19:45 ./archive/xxx_456_20180501194500.csv 0.055 135 ✕
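Note that for (base in tot) visits keys in an unspecified order, so the lines above may print in any order; if you need them sorted, pipe the output through sort (a minimal example, using the same tst.awk and input file):
$ awk -f tst.awk file | sort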
Upvotes: 2
Reputation: 12438
An awk solution that works even if the csv files are not in timestamp order, or not in any order at all!
INPUT:
$ more csv_list.input
May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000 2 ✔
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055 185 ✈
May 1 12:00 ./archive/xxx_uvw_ABC_20180501120001.csv 0.000 3 ✒
May 2 12:00 ./archive/xxx_123_20180502120001.csv 0.000 3 ✒
May 1 18:45 ./archive/xxx_uvw_ABC_20180501184500.csv 0.055 135 ✕
May 1 19:45 ./archive/xxx_456_20180501194500.csv 0.055 135 ✕
AWK 1-LINER CMD:
awk '{tmp=$4;gsub(/_[0-9]{14}\.csv/,"",$4);a[$4]+=$6;sub(/\.csv$/,"",tmp); tmp=substr(tmp,length(tmp)-13, length(tmp));if(!timestamp[$4] || tmp>timestamp[$4]){timestamp[$4]=tmp;line1[$4]=$1 OFS $2 OFS $3; line2[$4]=$5; line3[$4]=$7};}END{for(i in a){print line1[i] OFS i"_"timestamp[i]".csv" OFS line2[i] OFS a[i] OFS line3[i]}}' csv_list.input
AWK SCRIPT & EXPLANATIONS:
# gawk profile, created Wed May 2 15:00:50 2018
# Rule(s)
{
    tmp = $4
    gsub(/_[0-9]{14}\.csv/, "", $4)                    # find the filename without the timestamp
    a[$4] += $6                                        # sum the 6th column value, key = filename without timestamp
    sub(/\.csv$/, "", tmp)                             # remove the .csv
    tmp = substr(tmp, length(tmp) - 13, length(tmp))   # get the timestamp of the file
    if (! timestamp[$4] || tmp > timestamp[$4]) {      # if the element is empty or the new timestamp is bigger than the previous one
        timestamp[$4] = tmp                            # save the new timestamp
        line1[$4] = $1 OFS $2 OFS $3                   # save the first 3 columns of the latest file
        line2[$4] = $5                                 # save the 5th column
        line3[$4] = $7                                 # save the 7th column
    }
}
# END rule(s)
END {
    for (i in a) {   # recombine the information to generate the output
        print line1[i] OFS i "_" timestamp[i] ".csv" OFS line2[i] OFS a[i] OFS line3[i]
    }
}
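Side note on the substr call: it keeps the last 14 characters of tmp (the timestamp); the oversized third argument length(tmp) is harmless because awk clamps it to the end of the string. A quick check of the arithmetic:
$ echo './archive/xxx_123_20180502120001' | awk '{print substr($0, length($0)-13)}'
20180502120001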
OUTPUT:
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055 187 ✈
May 2 12:00 ./archive/xxx_123_20180502120001.csv 0.000 3 ✒
May 1 18:45 ./archive/xxx_uvw_ABC_20180501184500.csv 0.055 138 ✕
May 1 19:45 ./archive/xxx_456_20180501194500.csv 0.055 135 ✕
Upvotes: 1
Reputation: 92854
Awk solution:
Sample input.txt:
May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000 2 ✔
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055 185 ✈
May 1 19:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501194500.csv 0.055 3 ⤈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502174500.txt 0.055 1 ✈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502184500.txt 0.015 2 ✈
awk '{
    curr_fn = $4
    sub(/[0-9]{14}/, "", $4)
}
fn {
    if (fn == $4) { cnt += $6; $6 = cnt }
    else { print rec; cnt = 0 }
}
{
    fn = $4; $4 = curr_fn
    cnt = $6; rec = $0
}
END{ print rec }' input.txt
curr_fn - the current filename (given by the 4th field $4)
sub(/[0-9]{14}/, "", $4) - removes the datetime value from the filename for the later filename comparison
rec = $0 - captures the whole current record
The output:
May 1 19:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501194500.csv 0.055 190 ⤈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502184500.txt 0.015 3 ✈
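Note that this solution streams line by line and relies on entries with the same base filename being adjacent, as in the sample. If the input can come in arbitrary order, one option (a sketch, assuming lexical order on the 4th field groups the names correctly) is to pre-sort on the filename field before piping into the same awk script, which also guarantees the last line per group carries the latest timestamp:
$ sort -k4,4 input.txt | awk '...'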
Upvotes: 3