Barcode

Reputation: 89

How to remove duplicates and get SUM of a particular column in unix shell?

I have a file with data as below (the number of files always varies):

May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000       2   ✔
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055       185 ✈

I want the output on one line like below, but with the sum of the record counts (column 6); ideally it should print the latest file (according to the timestamp):

May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000       187   ✔

I tried the sed command below so that I can ignore the timestamp in the filename. But can you please help me remove the duplicates along with the SUM of the counts (column 6)?

sed -e 's/\(.*\)[0-9]\{6\}\.\([^.]*\)/\1.\2/'
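(For reference, on a bare filename this sed keeps the date but drops only the six HHMMSS digits just before the extension:)

```shell
echo "./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv" |
  sed -e 's/\(.*\)[0-9]\{6\}\.\([^.]*\)/\1.\2/'
# → ./archive/xxx_cs_app_gmas_reject_MDM_20180501.csv
```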

Upvotes: 2

Views: 250

Answers (4)

stack0114106

Reputation: 8711

Here is a Perl one-liner:

$ cat barcode.txt
May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000       2   ✔
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055       185 ✈
May 1 19:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501194500.csv 0.055       3 ⤈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502174500.txt 0.055       1 ✈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502184500.txt 0.015       2 ✈

$ perl -lane ' $x=$F[3];$x=~s/(.*)(_.*)/$1/g; $kv{$x}+=$F[5];$kv2{$x}=$_; END {for(keys %kv) { $kv2{$_}=~s/(.*)(\d+)(\s\S+)$/$1$kv{$_}$3/g; print $kv2{$_} } } ' barcode.txt
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502184500.txt 0.015       3 ✈
May 1 19:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501194500.csv 0.055       190 ⤈

Upvotes: 0

Ed Morton

Reputation: 203684

Borrowing @Allan's sample input:

$ cat tst.awk
{
    base = time = $4
    sub(/_[^_]+$/,"",base)
    sub(/.*_/,"",time)
    if (time > max[base]) {
        max[base] = time
        pre[base] = $1 OFS $2 OFS $3 OFS $4 OFS $5
        suc[base] = $7
    }
    tot[base] += $6
}
END {
    for (base in tot) {
        print pre[base], tot[base], suc[base]
    }
}

$ awk -f tst.awk file
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055 187 ✈
May 2 12:00 ./archive/xxx_123_20180502120001.csv 0.000 3 ✒
May 1 18:45 ./archive/xxx_uvw_ABC_20180501184500.csv 0.055 138 ✕
May 1 19:45 ./archive/xxx_456_20180501194500.csv 0.055 135 ✕
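Side note on the `time > max[base]` test: both operands are strings (the extracted `time` still carries the extension), so awk compares them lexically, which matches chronological order because the 14-digit timestamps are fixed-width; and an unset `max[base]` compares as the empty string, so the first file of each group always wins the initial comparison. A quick check:

```shell
awk 'BEGIN {
    newer = ("20180501174500.csv" > "20180501090001.csv")  # lexical string compare
    first = ("20180501090001.csv" > "")                    # any timestamp beats unset max[base]
    print newer, first
}'
# prints: 1 1
```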

Upvotes: 2

Allan

Reputation: 12438

An awk solution that works even if the csv files are not in timestamp order, or not in any order at all!

INPUT:

$ more csv_list.input 
May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000       2   ✔
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055       185 ✈
May 1 12:00 ./archive/xxx_uvw_ABC_20180501120001.csv 0.000       3   ✒
May 2 12:00 ./archive/xxx_123_20180502120001.csv 0.000       3   ✒
May 1 18:45 ./archive/xxx_uvw_ABC_20180501184500.csv 0.055       135 ✕
May 1 19:45 ./archive/xxx_456_20180501194500.csv 0.055       135 ✕

AWK 1-LINER CMD:

awk '{tmp=$4;gsub(/_[0-9]{14}\.csv/,"",$4);a[$4]+=$6;sub(/\.csv$/,"",tmp); tmp=substr(tmp,length(tmp)-13, length(tmp));if(!timestamp[$4] || tmp>timestamp[$4]){timestamp[$4]=tmp;line1[$4]=$1 OFS $2 OFS $3; line2[$4]=$5; line3[$4]=$7};}END{for(i in a){print line1[i] OFS i"_"timestamp[i]".csv" OFS line2[i] OFS a[i] OFS line3[i]}}' csv_list.input 

AWK SCRIPT & EXPLANATIONS:

# gawk profile, created Wed May  2 15:00:50 2018

# Rule(s)

{
        tmp = $4 
        gsub(/_[0-9]{14}\.csv/, "", $4) #find the filename without timestamp
        a[$4] += $6 #sum the 6th column value, key=filename without timestamp
        sub(/\.csv$/, "", tmp) #remove the .csv
        tmp = substr(tmp, length(tmp) - 13, length(tmp)) # get the timestamp of the file
        if (! timestamp[$4] || tmp > timestamp[$4]) { # if the element is empty or if the new timestamp is bigger than the previous one
                timestamp[$4] = tmp #save the new timestamp
                line1[$4] = $1 OFS $2 OFS $3 #save the 3 first columns of the latest file
                line2[$4] = $5 # save the 5th column
        line3[$4] = $7 # save the 7th column
        }
}

# END rule(s)

END {
        for (i in a) { #recombine the information to generate the output
                print line1[i] OFS i "_" timestamp[i] ".csv" OFS line2[i] OFS a[i] OFS line3[i]
        }
}

OUTPUT:

May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055 187 ✈
May 2 12:00 ./archive/xxx_123_20180502120001.csv 0.000 3 ✒
May 1 18:45 ./archive/xxx_uvw_ABC_20180501184500.csv 0.055 138 ✕
May 1 19:45 ./archive/xxx_456_20180501194500.csv 0.055 135 ✕
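A note on the `substr(tmp, length(tmp) - 13, length(tmp))` call above: substr's third argument is a length, not an end position, so `length(tmp)` asks for more characters than actually remain; awk simply clamps it to the end of the string, and the last 14 characters come out either way. For example:

```shell
awk 'BEGIN {
    tmp = "./archive/xxx_cs_app_gmas_reject_MDM_20180501174500"  # .csv already removed
    print substr(tmp, length(tmp) - 13, length(tmp))             # last 14 characters
}'
# prints: 20180501174500
```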

Upvotes: 1

RomanPerekhrest

Reputation: 92854

Awk solution:

Sample input.txt:

May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000       2   ✔
May 1 17:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501174500.csv 0.055       185 ✈
May 1 19:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501194500.csv 0.055       3 ⤈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502174500.txt 0.055       1 ✈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502184500.txt 0.015       2 ✈

awk '{
         curr_fn = $4;
         sub(/[0-9]{14}/, "", $4)
     }
     fn {
         if (fn == $4) { cnt += $6; $6 = cnt }
         else { print rec; cnt = 0 }
     }
     {
         fn = $4; $4 = curr_fn;
         cnt = $6; rec = $0
     }
     END{ print rec }' input.txt
  • curr_fn - current filename (given by the 4th field $4)
  • sub(/[0-9]{14}/, "", $4) - remove datetime value from filename for further filename comparison
  • rec = $0 - capturing the whole current record
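Note that the `fn == $4` comparison only merges *consecutive* lines, so this script assumes duplicate filenames are adjacent and their timestamps ascend within each group, as in the sample input. If the list can arrive unordered, a plain lexical pre-sort on the filename field (a sketch, not part of the original answer; `input_unsorted.txt` is a hypothetical shuffled sample) restores both properties:

```shell
# Hypothetical unordered version of the sample list (trailing symbols replaced with x)
cat > input_unsorted.txt <<'EOF'
May 1 19:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501194500.csv 0.055       3 x
May 1 09:00 ./archive/xxx_cs_app_gmas_reject_MDM_20180501090001.csv 0.000       2 x
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502174500.txt 0.055       1 x
EOF
# Sort on field 4 (the path): duplicates become adjacent, and since the
# basenames differ only in the trailing timestamp, lexical order within
# each group is also chronological order.
sort -k4,4 input_unsorted.txt
```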

The output:

May 1 19:45 ./archive/xxx_cs_app_gmas_reject_MDM_20180501194500.csv 0.055 190 ⤈
May 1 17:45 ./archive/aaa_cs_app_g_reject_MDM_20180502184500.txt 0.015 3 ✈

Upvotes: 3
