Reputation: 106
I have this file.csv
"201707"|"51976551"|1|0|1|"20170702"
"201707"|"51955194"|1|0|0|"20170702"
"201707"|"51923555"|1|0|1|"20170702"
"201707"|"51976551"|1|0|1|"20170703"
"201707"|"51955194"|1|0|0|"20170703"
"201707"|"51923555"|1|0|1|"20170703"
"201707"|"51960597"|1|0|0|"20170703"
And my hope result is group by the number and sum the column 3, 4 and 5
"201707"|"51976551"|2|0|2
"201707"|"51955194"|2|0|0
"201707"|"51923555"|2|0|2
"201707"|"51960597"|1|0|0
I've tried with:
cat file.csv | awk -F"|" '
{ a[$2] += $3 }
END {
for (i in a) {
printf "%s|%s\n", i, a[i];
}
}
'
And the result is:
"51976551"|2
"51955194"|2
"51923555"|2
"51960597"|1
Only shows the sum of third column, but I need 2 columns more. what should I do in this case?
Upvotes: 2
Views: 1884
Reputation: 16997
To Preserve order:
By processing in END block
awk 'BEGIN{
FS=OFS="|"
}
{
k = $1 OFS $2;
if(!(k in t)){
o[++c]=k;
t[k]
}
for(i=3; i<=5; i++)
a[k OFS i]+=$i
}
END{
for(i=1; i in o; i++)
{
printf "%s", o[i];
for(j=3; j<=5; j++)
printf "%s%s", OFS, a[o[i] OFS j];
print ""
}
}
' infile
Or by reading same file twice (GNU awk)
awk 'BEGIN{
FS=OFS="|"
}
function ps(f)
{
for(i=3;i<=5;i++)
if(f)
{
a[k OFS i]+=$i;
t[k]
}else
s=(s ? s OFS :"") a[k OFS i]
}
{
k=$1 OFS $2
}
FNR==NR{
ps(1);
next
}
k in t{
s="";
ps();
print k, s;
delete t[k]
}
' infile infile
Input:
$ cat input
"201707"|"51976551"|1|0|1|"20170702"
"201707"|"51955194"|1|0|0|"20170702"
"201707"|"51923555"|1|0|1|"20170702"
"201707"|"51976551"|1|0|1|"20170703"
"201707"|"51955194"|1|0|0|"20170703"
"201707"|"51923555"|1|0|1|"20170703"
"201707"|"51960597"|1|0|0|"20170703"
Output-1:
$ awk 'BEGIN{FS=OFS="|"}{k = $1 OFS $2; if(!(k in t)){o[++c]=k; t[k]} for(i=3; i<=5; i++)a[k OFS i]+=$i}END{for(i=1; i in o; i++){printf "%s", o[i]; for(j=3; j<=5; j++)printf "%s%s", OFS, a[o[i] OFS j]; print ""}}' infile
"201707"|"51976551"|2|0|2
"201707"|"51955194"|2|0|0
"201707"|"51923555"|2|0|2
"201707"|"51960597"|1|0|0
Output-2:
$ awk 'BEGIN{FS=OFS="|"}function ps(f){for(i=3;i<=5;i++)if(f){ a[k OFS i]+=$i; t[k] }else s=(s ? s OFS :"") a[k OFS i]}{k=$1 OFS $2}FNR==NR{ps(1); next}k in t{s=""; ps(); print k, s; delete t[k] }' infile infile
"201707"|"51976551"|2|0|2
"201707"|"51955194"|2|0|0
"201707"|"51923555"|2|0|2
"201707"|"51960597"|1|0|0
Upvotes: 1
Reputation: 113844
Try:
$ awk -F"|" '{ a[$1 OFS $2]+=$3; b[$1 OFS $2]+=$4; c[$1 OFS $2]+=$5 }
END {
for (i in a) {
print i, a[i], b[i], c[i];
}
}
' OFS=\| file.csv
"201707"|"51976551"|2|0|2
"201707"|"51960597"|1|0|0
"201707"|"51923555"|2|0|2
"201707"|"51955194"|2|0|0
-F"|"
This sets the field separator on input to |
.
a[$1 OFS $2]+=$3; b[$1 OFS $2]+=$4; c[$1 OFS $2]+=$5
This keeps track of the totals of the third, fourth, and fifth columns.
END {
for (i in a) {
print i, a[i], b[i], c[i];
}
}
This prints out the results.
OFS=\|
This tells awk to use |
as the field separator on output.
Upvotes: 4