Diana Ysabel
Diana Ysabel

Reputation: 106

Unix, group rows and sum values of columns from file.csv

I have this file.csv

"201707"|"51976551"|1|0|1|"20170702"
"201707"|"51955194"|1|0|0|"20170702"
"201707"|"51923555"|1|0|1|"20170702"
"201707"|"51976551"|1|0|1|"20170703"
"201707"|"51955194"|1|0|0|"20170703"
"201707"|"51923555"|1|0|1|"20170703"
"201707"|"51960597"|1|0|0|"20170703"

And my hope result is group by the number and sum the column 3, 4 and 5

"201707"|"51976551"|2|0|2
"201707"|"51955194"|2|0|0
"201707"|"51923555"|2|0|2
"201707"|"51960597"|1|0|0

I've tried with:

cat file.csv | awk -F"|" '
  { a[$2] += $3 }
  END {
    for (i in a) {
      printf "%s|%s\n", i, a[i];
    }
  }
'

And the result is:

"51976551"|2
"51955194"|2
"51923555"|2
"51960597"|1

Only shows the sum of third column, but I need 2 columns more. what should I do in this case?

Upvotes: 2

Views: 1884

Answers (2)

Akshay Hegde
Akshay Hegde

Reputation: 16997

To Preserve order:

By processing in END block

awk  'BEGIN{
            FS=OFS="|"
      }
      {
             k = $1 OFS $2; 
             if(!(k in t)){
                    o[++c]=k; 
                    t[k]
             } 
             for(i=3; i<=5; i++)
                    a[k OFS i]+=$i
       }
    END{
             for(i=1; i in o; i++)
             {
                 printf "%s", o[i]; 
                 for(j=3; j<=5; j++)
                     printf "%s%s", OFS, a[o[i] OFS j]; 
                 print ""
              }
        }
     ' infile

Or by reading same file twice (GNU awk)

awk  'BEGIN{
        FS=OFS="|"
      }
      function ps(f)
      {
           for(i=3;i<=5;i++)
           if(f)
           { 
                   a[k OFS i]+=$i; 
                   t[k] 
           }else 
                   s=(s ? s OFS :"") a[k OFS i]
       }
       {
         k=$1 OFS $2
       }
       FNR==NR{
         ps(1); 
         next
       }
       k in t{
         s=""; 
         ps();  
         print k, s; 
         delete t[k] 
       }
     ' infile infile

Input:

$ cat input
"201707"|"51976551"|1|0|1|"20170702"
"201707"|"51955194"|1|0|0|"20170702"
"201707"|"51923555"|1|0|1|"20170702"
"201707"|"51976551"|1|0|1|"20170703"
"201707"|"51955194"|1|0|0|"20170703"
"201707"|"51923555"|1|0|1|"20170703"
"201707"|"51960597"|1|0|0|"20170703"

Output-1:

$ awk  'BEGIN{FS=OFS="|"}{k = $1 OFS $2; if(!(k in t)){o[++c]=k; t[k]} for(i=3; i<=5; i++)a[k OFS i]+=$i}END{for(i=1; i in o; i++){printf "%s", o[i]; for(j=3; j<=5; j++)printf "%s%s", OFS, a[o[i] OFS j]; print ""}}' infile
"201707"|"51976551"|2|0|2
"201707"|"51955194"|2|0|0
"201707"|"51923555"|2|0|2
"201707"|"51960597"|1|0|0

Output-2:

$ awk  'BEGIN{FS=OFS="|"}function ps(f){for(i=3;i<=5;i++)if(f){ a[k OFS i]+=$i; t[k] }else s=(s ? s OFS :"") a[k OFS i]}{k=$1 OFS $2}FNR==NR{ps(1); next}k in t{s=""; ps();  print k, s; delete t[k] }' infile infile
"201707"|"51976551"|2|0|2
"201707"|"51955194"|2|0|0
"201707"|"51923555"|2|0|2
"201707"|"51960597"|1|0|0

Upvotes: 1

John1024
John1024

Reputation: 113844

Try:

$ awk -F"|" '{ a[$1 OFS $2]+=$3; b[$1 OFS $2]+=$4; c[$1 OFS $2]+=$5 }
  END {
    for (i in a) {
      print i, a[i], b[i], c[i];
    }
  }
' OFS=\| file.csv
"201707"|"51976551"|2|0|2
"201707"|"51960597"|1|0|0
"201707"|"51923555"|2|0|2
"201707"|"51955194"|2|0|0

How it works

  • -F"|"

    This sets the field separator on input to |.

  • a[$1 OFS $2]+=$3; b[$1 OFS $2]+=$4; c[$1 OFS $2]+=$5

    This keeps track of the totals of the third, fourth, and fifth columns.

  • END { for (i in a) { print i, a[i], b[i], c[i]; } }

    This prints out the results.

  • OFS=\|

    This tells awk to use | as the field separator on output.

Upvotes: 4

Related Questions