speed bump
speed bump

Reputation: 451

AWK: Help on transforming data table

I have the following file called in.txt:

2020-01-01  fruit   banana  3.4
2020-03-02  alcohol smirnov 26.99
2020-03-10  fruit   orange  4.20
2020-04-03  fruit   orange  4.20
2021-09-01  alcohol beer    6.00
2021-08-03  fruit   mango   6.99
2022-01-01  fruit   orange  4.30
2022-03-04  alcohol beer    6.00
2022-03-03  alcohol beer    6.00
2022-04-01  fruit   mango   7.20

I want to transform the file so it reads something like this:

                2020-01-01      2021-01-01      2022-01-01
                -2020-12-31     -2021-12-31     -2022-12-31
fruit   banana  3.40             0.00            0.00
        orange  8.40            0.00            4.30
        mango   0.00            6.99            7.20

Subt            11.80           6.99            11.50


alcohol beer    0.00            6.00            12.00
        smirnov 26.99           0.00            0.00

Subt            26.99           6.00            12.00

Total           38.59           12.99           23.50

I have started writing the following script but am stuck on how to approach this. How can I display totals columns side by side. The other problem is that this is just dummy data. I have many different categories other than fruit and alcohol and it seems wrong to write if statements and for-loops for each one. Also how can I print fruit and alcohol out just once rather than for every iteration of column 3 and bring the date range to the top. Help is much appreciated.

#!/usr/bin/env bash
  
awk '

        BEGIN{
        FS=OFS="\t";
        }

        {        
        if ($2 ~ fruit && $1 >= "2020-01-01" && $1 <= "2020-12-31") {                        
                        a[$3]+=$4;
                        sa+=$4;
                }
        }       
        
        END {
        PROCINFO["sorted_in"]="@ind_str_asc";
        for (i in a) {
                        print "fruit", i, a[i]
                }
        }        
        
' "${@:--}"

Upvotes: 1

Views: 124

Answers (3)

tshiono
tshiono

Reputation: 22012

Would you please try the following:

#!/bin/bash

awk '
    {
        year = substr($1, 1, 4)                         # extract year
        if (from == "" || from > year) from = year      # first (smallest) year
        if (to == "" || to < year) to = year            # last (largest) year

        if ($3 in category == 0) {
            category[$3] = $2                           # map item to category
            list[$2] = list[$2] fs[$2] $3               # csv of items
            fs[$2] = ","                                # delimiter for csv
        }
        sum[$3,year] += $4                              # sum of the item in the year
        subt[$2,year] += $4                             # sum of the category in the year
        ttl[year] += $4                                 # sum in the year

    }
    END {
        format1 = "%-10s%-10s"                          # format for the left cells
        format2 = "%-16s"                               # format for the header
        format3 = "%-16.2f"                             # format for the amounts

        # print upper header
        printf(format1, "", "")
        for (y = from; y <= to; y++) {
            printf(format2, y "-01-01")
        }
        print ""

        # print second header
        printf(format1, "", "")
        for (y = from; y <= to; y++) {
            printf(format2, "-" y "-12-31")
        }
        print ""

        for (cat in list) {                             # loop over the categories ("fruit" and "alcohol")
            n = split(list[cat], item, ",")             # split into items
            for (i = 1; i <= n; i++) {                  # loop over the items
                printf(format1, i == 1 ? cat : "", item[i])
                for (y = from; y <= to; y++) {          # loop over years
                    printf(format3, sum[item[i],y])     # append the sum of the year
                }
                print ""                                # finally break the line
            }
            print ""                                    # insert blank line
            printf(format1, "Subt", "")
            for (y = from; y <= to; y++) {
                printf(format3, subt[cat,y])            # append the subtotal
            }
            print "\n"
        }
        printf(format1, "Total", "")
        for (y = from; y <= to; y++) {
            printf(format3, ttl[y])                     # append the total amount
        }
        print ""
    }
' in.txt

Output with the provided input:

                    2020-01-01      2021-01-01      2022-01-01      
                    -2020-12-31     -2021-12-31     -2022-12-31     
alcohol   smirnov   26.99           0.00            0.00            
          beer      0.00            6.00            12.00           

Subt                26.99           6.00            12.00           

fruit     banana    3.40            0.00            0.00            
          orange    8.40            0.00            4.30            
          mango     0.00            6.99            7.20            

Subt                11.80           6.99            11.50           

Total               38.79           12.99           23.50           

Please forgive me the order of items are not same as the OP's.

Upvotes: 1

Ed Morton
Ed Morton

Reputation: 203324

Using GNU awk for arrays of arrays:

$ cat tst.awk
BEGIN { OFS="\t" }
{
    sub(/-.*/,"",$1)
    minYear = ( NR==1 || $1 < minYear ? $1 : minYear )
    maxYear = ( NR==1 || $1 > maxYear ? $1 : maxYear )
    items[$2][$3]
    vals[$1][$2][$3] += $4
    typeTots[$1][$2] += $4
    yearTots[$1] += $4
}
END {
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%s", OFS, year
    }
    print ""

    for ( type in items ) {
        itemCnt = 0
        for ( item in items[type] ) {
            printf "%s%s%s", (itemCnt++ ? "" : type), OFS, item
            for ( year=minYear; year<=maxYear; year++ ) {
                printf "%s%0.2f", OFS, vals[year][type][item]
            }
            print ""
        }
        printf "Subt%s", OFS
        for ( year=minYear; year<=maxYear; year++ ) {
            printf "%s%0.2f", OFS, typeTots[year][type]
        }
        print ORS
    }

    printf "Total%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%0.2f", OFS, yearTots[year]
    }
    print ""
}

$ awk -f tst.awk in.txt
                2020    2021    2022
alcohol beer    0.00    6.00    12.00
        smirnov 26.99   0.00    0.00
Subt            26.99   6.00    12.00

fruit   orange  8.40    0.00    4.30
        mango   0.00    6.99    7.20
        banana  3.40    0.00    0.00
Subt            11.80   6.99    11.50

Total           38.79   12.99   23.50

or if you really want specific date ranges instead of just the year in the header:

$ cat tst.awk
BEGIN { OFS="\t" }
{
    sub(/-.*/,"",$1)
    minYear = ( NR==1 || $1 < minYear ? $1 : minYear )
    maxYear = ( NR==1 || $1 > maxYear ? $1 : maxYear )
    items[$2][$3]
    vals[$1][$2][$3] += $4
    typeTots[$1][$2] += $4
    yearTots[$1] += $4
}
END {
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%s-01-01", OFS, year
    }
    print ""

    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s-%s-12-31", OFS, year
    }
    print ""

    for ( type in items ) {
        itemCnt = 0
        for ( item in items[type] ) {
            printf "%s%s%s", (itemCnt++ ? "" : type), OFS, item
            for ( year=minYear; year<=maxYear; year++ ) {
                printf "%s%0.2f", OFS, vals[year][type][item]
            }
            print ""
        }
        printf "Subt%s", OFS
        for ( year=minYear; year<=maxYear; year++ ) {
            printf "%s%0.2f", OFS, typeTots[year][type]
        }
        print ORS
    }

    printf "Total%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%0.2f", OFS, yearTots[year]
    }
    print ""
}

$ awk -f tst.awk in.txt | column -s$'\t' -t
                  2020-01-01   2021-01-01   2022-01-01
                  -2020-12-31  -2021-12-31  -2022-12-31
alcohol  beer     0.00         6.00         12.00
         smirnov  26.99        0.00         0.00
Subt              26.99        6.00         12.00
fruit    orange   8.40         0.00         4.30
         mango    0.00         6.99         7.20
         banana   3.40         0.00         0.00
Subt              11.80        6.99         11.50
Total             38.79        12.99        23.50

Upvotes: 0

kvantour
kvantour

Reputation: 26471

I believe the following piece of awk code is a good start. The remaining part to do is just some cleanup and some extra code for the sums.

BEGIN{ 
   # how many divisions per year
   n=1
   # initialisation of some variables
   tmax=0;tmin=999999; ymax=qmax=0;ymin=9999;qmin=99
}
# convert date to quarter,trim,half
{ y=$1+0; q=(substr($1,6,7)+0)%n}
# compute min max time
(y*100+q < tmin) { ymin=y;qmin=q;tmin=y*100+q }
(y*100+q > tmax) { ymax=y;qmax=q;tmax=y*100+q }
# Create arrays that keep track of everything
# a : prices by year,q,category and element
# b : just a list of categories, eg fruit
# c : just a list of elements and the category it belongs to.
{ a[y,q,$2,$3]=$4; b[$2]; c[$3]=$2 }
END{
   # loop over categories (eg fruit)
   for(i in b) {
     # loop over elemnts
     for(j in c) {
        # exclude elements that do not belong to category
        if (i!=c[j]) continue
        s=i OFS j;
        # loop over the time
        for (y=ymin;y<=ymax;y++) {
          for (q=0;q<n;++q) {
             if (y*100+q < tmin) continue
             if (y*100+q > tmax) continue
             s=s OFS a[y,q,i,j]+0
          }
        }
        print s
     }
   }
}

This currently outputs:

alcohol beer 0 6 6
alcohol smirnov 26.99 0 0
fruit orange 4.2 0 4.3
fruit mango 0 6.99 7.2
fruit banana 3.4 0 0

Upvotes: 0

Related Questions