speed bump
speed bump

Reputation: 451

Awk script to add two columns depending on search pattern

I have the following awk script:

#! /usr/bin/awk -f

BEGIN{
FS=OFS="\t";

split("Mtm|^Dept|^Im|^TT|ge[mn]$,^Mini", arr1, ",")
split("Variable Expenses,Fixed Expenses", arr2, ",")
itm="Accounting & Legal";
}

$3<0 && $4~arr1[1] {$7=itm; $8=arr2[1]; print $0}
$3<0 && $4~arr1[2] {$7=itm; $8=arr2[2]; print $0}

Which produces the following output:

c_wp_e_2025 2021-05-10  -120.00 8430Ken Jebsen  BILL PAYMENT    12:08-11662 Accounting & Legal  Variable Expenses
c_wp_e_2148 2021-07-27  -90.85  6450Barrie L. Jorgen    BILL PAYMENT    09:31-35651 Accounting & Legal  Variable Expenses
e_wp_e_789  2018-06-12  -190.00 Immigration NZ 11   DEBIT       Accounting & Legal  Variable Expenses
e_wp_e_818  2018-06-29  -153.43 Dept of Home Affa 26    DEBIT   AUD 141.37  Accounting & Legal  Variable Expenses
e_wp_e_839  2018-07-18  -67.47  TT Services New Z 16    DEBIT       Accounting & Legal  Variable Expenses
e_wp_e_2512 2021-07-27  -470.20 Dept Internal Aff 23    DEBIT       Accounting & Legal  Variable Expenses
c_az_262    2021-10-01  -210.45 Mtm Accounting Ltd. Mlm Accounting  Accounting & Legal  Variable Expenses
c_az_421    2021-07-19  -561.20 Paypal *Mtm     Paypal *Mtm Accounting & Legal  Variable Expenses
c_az_1082   2020-11-25  -52.20  Ministryofbusiness  285515Dpsa2A    Mbie    Accounting & Legal  Fixed Expenses
c_az_1110   2020-11-16  -2228.70    Mtm Accounting Ltd. Mlm Accounting  Accounting & Legal  Variable Expenses

This works as desired. I have however many more categories besides itm="Accounting & Legal" (e.g. itm="Bank_charges) and sometimes there are Fixed Expenses and Variable Expenses. At other times one of them only in which case the arrays created in the splits would only have one value. In such a case the second statement in the main body would become unused. For every new category there are different search patterns. I am new to writing awk programs and am stuck on how to approach this. How could this be written efficiently to accommodate the above scenario.

The input (excerpt) would be:

c_wp_e_2025     2021-05-10      -120.00 8430Ken Jebsen    BILL PAYMENT    12:08-11662
c_wp_e_2148     2021-07-27      -90.85  6450Barrie L. Jorgen    BILL PAYMENT    09:31-35651
e_wp_e_789      2018-06-12      -190.00 Immigration NZ 11       DEBIT
e_wp_e_818      2018-06-29      -153.43 Dept of Home Affa 26    DEBIT   AUD 141.37
e_wp_e_839      2018-07-18      -67.47  TT Services New Z 16    DEBIT
e_wp_e_2512     2021-07-27      -470.20 Dept Internal Aff 23    DEBIT
c_az_262        2021-10-01      -210.45 Mtm Accounting Ltd.     Mlm     Accounting
c_az_421        2021-07-19      -561.20 Paypal *Mtm             Paypal *Mtm
c_az_1082       2020-11-25      -52.20  Ministryofbusiness      285515Dpsa2A    Mbie
c_az_1110       2020-11-16      -2228.70        Mtm Accounting Ltd.     Mlm     Accounting

Criteria for Bank_charges e.g. would be:

split("^Cle|Forei|2 WBC|irnie W| \
wtown W|*M|^Repl|elex|^Unar|lert$,enance$|ebit|A/C|rice$", arr1, ",")
split("Variable Expenses,Fixed Expenses", arr2,",")
itm="Bank_charges

Some output from this criteria would result in:

c_az_1668   2020-03-06  -10.00  Visa Debit Card Fee 4825561******   4823    Bank_charges    Fixed Expenses
c_az_1687   2020-02-28  -8.50   Monthly A/C Fee         Bank_charges    Fixed Expenses
c_az_1688   2020-02-28  -2.50   Clearance Fee           Bank_charges    Variable Expenses
c_az_1785   2020-01-31  -8.50   Monthly A/C Fee         Bank_charges    Fixed Expenses

Another category that only has Variable expenses is:

split("^318|^74 |ASB|^City S|^Fix", arr1,",")
split("Variable Expenses", arr2,",")
itm="Bank_withdrawals"

This however does not work at all with the above action. All data from the input file is categorized Bank_withdrawals and classified as Variable expenses.

Upvotes: 0

Views: 73

Answers (3)

Ed Morton
Ed Morton

Reputation: 203129

I think this is what you're trying to do but it's obviously untested since there's isn't a concrete sample of input and expected output in your question to test with:

$ cat tst.sh
#!/usr/bin/env bash

awk '
    BEGIN{
        FS=OFS="\t"

        mkMap("Mtm|^Dept|^Im|^TT|ge[mn]$", "Variable Expenses", "Accounting & Legal")
        mkMap("^Mini", "Fixed Expenses", "Accounting & Legal")
        mkMap("^Cle|Forei|2 WBC|irnie W| wtown W|[*]M|^Repl|elex|^Unar|lert$", "Variable Expenses", "Bank_charges")
        mkMap("enance$|ebit|A/C|rice$", "Fixed Expenses", "Bank_charges")
        mkMap("^318|^74 |ASB|^City S|^Fix", "Variable Expenses", "Bank_withdrawals")
    }

    $3 < 0 {
        found = 0
        for ( re in re2type ) {
            if ( $4 ~ re ) {
                $7 = re2item[re]
                $8 = re2type[re]
                found = 1
            }
        }
        if ( found ) {
            print
        }
    }
    function mkMap(re,type,item) {
        re2type[re] = type
        re2item[re] = item
    }
' "${@:--}"

Upvotes: 1

tshiono
tshiono

Reputation: 22002

Would you please try the following:

#!/usr/bin/awk -f

BEGIN {
    FS=OFS="\t";

    # enumerate the item, keyword and the statement in order
    itm = "Accounting & Legal"
    key[itm] = "Mtm|^Dept|^Im|^TT|ge[mn]$,^Mini"
    stmnt[itm] = "Variable Expenses,Fixed Expenses"

    itm = "Bank_charges"
    key[itm] = "^Cle|Forei|2 WBC|irnie W|wtown W|*M|^Repl|elex|^Unar|lert$,enance$|ebit|A/C|rice$"
    stmnt[itm] = "Variable Expenses,Fixed Expenses"

    itm="Bank_withdrawals"
    key[itm] = "^318|^74 |ASB|^City S|^Fix"
    stmnt[itm] = "Variable Expenses"

    # initialize key2 and stmnt2 using itm, key and stmnt defined above
    for (itm in key) {
        split(key[itm], a, ",")
        key2[itm,1] = a[1]
        key2[itm,2] = a[2]

        split(stmnt[itm], a, ",")
        stmnt2[itm,1] = a[1]
        stmnt2[itm,2] = a[2]
    }
}

# main loop with the input lines
{
    for (itm in key) {
        for (i = 1; i <= 2; i++) {
            if (key2[itm,i] != "") {
                if ($3 < 0 && $4 ~ key2[itm,i]) {
                    $7 = itm; $8 = stmnt2[itm,i]
                    print $0
                }
            }
        }
    }
}

If you have more items, append them with the keys and statements below the "Bank_charges" lines referring to the existing codes. If an item has either statement "Variable Expenses" or "Fixed Expenses", just put one element without a comma.

Upvotes: 1

Jad
Jad

Reputation: 1286

If you're new to awk, it might be best to look at writing this long hand:

#! /usr/bin/awk -f

BEGIN{ FS=OFS="\t"; }

$3<0 && $4~"Mtm|^Dept|^Im|^TT|ge[mn]$" {$7=;"Accounting & Legal"; $8="Variable Expenses"; print $0}
$3<0 && $4~"^Mini" {$7="Accounting & Legal"; $8="Fixed Expenses"; print $0}

(Please excuse typos, writing on my phone)

However I don't believe this is what you're actually asking for, so I'd suggest looping through the elements of the arrays that have been created in the begin like so:

#! /usr/bin/awk -f

BEGIN{
  FS=OFS="\t";

  els=split("Mtm|^Dept|^Im|^TT|ge[mn]$,^Mini", arr1, ",");
  split("Variable Expenses,Fixed Expenses", arr2, ",");
  split("Accounting & Legal","Accounting & Legal", arr3, ",");
}
for (i=1;i<=els;i++){
  $3<0 && $4~arr1[i] {$7=arr3[i]; $8=arr2[i]; print $0}
}

Upvotes: 0

Related Questions