VIPIN KUMAR
VIPIN KUMAR

Reputation: 3147

How to parse multiple files and put the output in generated dynamic file using awk?

Here is my awk script filtered.awk which works fine for a single input file.

# Field separator: split each record on a comma, a colon or a double quote
BEGIN { FS="[,:\"]" }

# Searching and storing in an array:
# for every record containing "searchKeyword", append OFS plus field 6
# to the string kept under field 5 (so each match adds exactly one OFS)
/searchKeyword/ {a[$5]=a[$5]OFS$6}

# Looping on the array once all input has been read.
# gsub(OFS,OFS,a[k]) returns how many OFS it replaced in a[k], i.e. the
# number of records stored under key k. FILENAME is evaluated here, in END,
# so a single value is used for both the printed name and the output file.
END {
     for (k in a)
      {
        print FILENAME, k, gsub(OFS,OFS,a[k]) > ("output_" FILENAME)
      }
}

Sample input-

cat input1.txt
"YY/XX","searchKeyword-ZZZZ.abc:06","200OK",64594889937362
"YY/XX","searchKeyword-ZZZZ.abc:13","200OK",64594860937362
"YY/XX","searchKeyword-ZZZZ.abc:06","200OK",64594822937362
"YY/XX","searchKeyword-ZZZZ.abc:06","200OK",64594823937362
"YY/XX","searchKeyword-ZZZZ.pqr:13","200OK",64594890937362
"YY/XX","searchKeyword-ZZZZ.pqr:08","200OK",64594877937362
"YY/XX","searchKeyword-ZZZZ.pqr:13","200OK",64594860937362
"YY/XX","searchKeyword-ZZZZ.pqr:13","200OK",64594870937362
"YY/XX","searchKeyword-ZZZZ.cde:12","200OK",64594803937362
"YY/XX","searchKeyword-ZZZZ.cde:00","200OK",64594870937362
"YY/XX","searchKeyword-ZZZZ.cde:00","200OK",64594860937362
"YY/XX","searchKeyword-ZZZZ.cde:08","200OK",64594825193736

second input file -

cat input2.txt
"XXX/YYY","searchKeyword-YYYYY.pqr:99910","200OK",439865231,"4334373212"
"XXX/YYY","searchKeyword-YYYYY.cde:99904","200OK",439868231,"4334953212"
"XXX/YYY","searchKeyword-YYYYY.mno:99909","200OK",439827231,"4334178212"
"XXX/YYY","searchKeyword-YYYYY.pqr:99911","200OK",439874231,"4334353212"
"XXX/YYY","searchKeyword-YYYYY.cde:99900","200OK",439893231,"4334130212"
"XXX/YYY","searchKeyword-YYYYY.mno:99910","200OK",439886231,"4334868212"
"XXX/YYY","searchKeyword-YYYYY.pqr:99905","200OK",439850231,"4334495212"
"XXX/YYY","searchKeyword-YYYYY.cde:99905","200OK",439878231,"4334131212"
"XXX/YYY","searchKeyword-YYYYY.mno:99910","200OK",439871231,"4334895212"
"XXX/YYY","searchKeyword-YYYYY.pqr:99910","200OK",439874231,"4334353212"
"XXX/YYY","searchKeyword-YYYYY.cde:99908","200OK",439848231,"4334823212"
"XXX/YYY","searchKeyword-YYYYY.mno:99914","200OK",439820231,"4334177212"
"XXX/YYY","searchKeyword-YYYYY.pqr:99910","200OK",439882231,"4334579212"
"XXX/YYY","searchKeyword-YYYYY.cde:99903","200OK",439840231,"4334966212"
"XXX/YYY","searchKeyword-YYYYY.mno:99908","200OK",439894231,"4334365212"

third input file

cat input3.txt
"XXX/YYY","searchKeyword-YYYYY.cde:99900","200OK",439893231,"4334130212"
"XXX/YYY","searchKeyword-YYYYY.mno:99910","200OK",439886231,"4334868212"
"XXX/YYY","searchKeyword-YYYYY.pqr:99905","200OK",439850231,"4334495212"
"XXX/YYY","searchKeyword-YYYYY.cde:99905","200OK",439878231,"4334131212"
"XXX/YYY","searchKeyword-YYYYY.mno:99910","200OK",439871231,"4334895212"
"XXX/YYY","searchKeyword-YYYYY.pqr:99910","200OK",439874231,"4334353212"
"PPP/QQQ","searchKeyword-ZZZZ.abc:06","200OK",64594822937362
"PPP/QQQ","searchKeyword-ZZZZ.abc:06","200OK",64594823937362
"PPP/QQQ","searchKeyword-ZZZZ.pqr:13","200OK",64594890937362
"PPP/QQQ","searchKeyword-ZZZZ.pqr:08","200OK",64594877937362
"PPP/QQQ","searchKeyword-ZZZZ.pqr:13","200OK",64594860937362
"PPP/QQQ","searchKeyword-ZZZZ.pqr:13","200OK",64594870937362
"PPP/QQQ","searchKeyword-ZZZZ.cde:12","200OK",64594803937362
"PPP/QQQ","searchKeyword-ZZZZ.cde:00","200OK",64594870937362

I passed the input files like below and got the output in output_input3.txt file.

awk -f filtered.awk input*
cat output_input3.txt
input3.txt searchKeyword-ZZZZ.cde 6
input3.txt searchKeyword-YYYYY.cde 7
input3.txt searchKeyword-ZZZZ.pqr 8
input3.txt searchKeyword-YYYYY.pqr 7
input3.txt searchKeyword-ZZZZ.abc 6
input3.txt searchKeyword-YYYYY.mno 7

looks like it is not processing the first two files at all.

And I was expecting output in dynamically generated files like below -

==> output_input1.txt <==
input1.txt searchKeyword-ZZZZ.cde 4
input1.txt searchKeyword-ZZZZ.pqr 4
input1.txt searchKeyword-ZZZZ.abc 4

==> output_input2.txt <==
input2.txt searchKeyword-YYYYY.cde 5
input2.txt searchKeyword-YYYYY.pqr 5
input2.txt searchKeyword-YYYYY.mno 5

==> output_input3.txt <==
input3.txt searchKeyword-ZZZZ.cde 2
input3.txt searchKeyword-YYYYY.cde 2
input3.txt searchKeyword-ZZZZ.pqr 4
input3.txt searchKeyword-YYYYY.pqr 2
input3.txt searchKeyword-ZZZZ.abc 2
input3.txt searchKeyword-YYYYY.mno 2

But I am getting output in only one file, output_input3.txt. Any suggestions? And how can we further split the dynamic file generation to get output like below -

==> output_input1_cde.txt <==
input1.txt searchKeyword-ZZZZ.cde 4

==> output_input1_pqr.txt <==
input1.txt searchKeyword-ZZZZ.pqr 4

==> output_input1_abc.txt <==
input1.txt searchKeyword-ZZZZ.abc 4

==> output_input2_cde.txt <==
input2.txt searchKeyword-YYYYY.cde 5

==> output_input2_pqr.txt <==
input2.txt searchKeyword-YYYYY.pqr 5

==> output_input2_mno.txt <==
input2.txt searchKeyword-YYYYY.mno 5

==> output_input3_cde.txt <==
input3.txt searchKeyword-ZZZZ.cde 2
input3.txt searchKeyword-YYYYY.cde 2

==> output_input3_pqr.txt <==
input3.txt searchKeyword-ZZZZ.pqr 4
input3.txt searchKeyword-YYYYY.pqr 2

==> output_input3_abc.txt <==
input3.txt searchKeyword-ZZZZ.abc 2

==> output_input3_mno.txt <==
input3.txt searchKeyword-YYYYY.mno 2

NOTE: I am using awk on mac (awk version 20070501) and tried with ENDFILE, I think ENDFILE doesn't exist in awk on mac.

Upvotes: 0

Views: 158

Answers (1)

James Brown
James Brown

Reputation: 37414

END can only see the last value of FILENAME. If you are using GNU awk, try replacing END with ENDFILE and see if that's what you want (you may need to delete a, and maybe add close). Using GNU awk (due to ENDFILE):

$ cat foo.awk
# Split records on commas, colons and double quotes
BEGIN { FS = "[,:\"]" }

# For every matching record, append OFS and field 6 to the entry for field 5
/searchKeyword/ { a[$5] = a[$5] OFS $6 }

# ENDFILE (GNU awk) runs after each input file: write that file's counts
# and reset the array before the next file starts
ENDFILE {
    target = "output_" FILENAME              # output name, built just once
    for (key in a)
        # each stored record contributed one OFS, so gsub()'s return value
        # (number of replacements) is the record count for this key
        print FILENAME, key, gsub(OFS, OFS, a[key]) > target
    delete a                                 # start the next file empty
    close(target)                            # done with this output file
}

Results:

$ cat output_input1 
input1 searchKeyword-ZZZZ.abc 4
input1 searchKeyword-ZZZZ.cde 4
input1 searchKeyword-ZZZZ.pqr 4
$ cat output_input2
input2 searchKeyword-YYYYY.mno 5
input2 searchKeyword-YYYYY.cde 5
input2 searchKeyword-YYYYY.pqr 5
$ cat output_input3
input3 searchKeyword-ZZZZ.abc 2
input3 searchKeyword-YYYYY.mno 2
input3 searchKeyword-ZZZZ.cde 2
input3 searchKeyword-ZZZZ.pqr 4
input3 searchKeyword-YYYYY.pqr 2
input3 searchKeyword-YYYYY.cde 2

If you don't have GNU awk and ENDFILE available, you need to handle the FILENAME in the FNR==1 and END blocks. Of course you could (and should) make a function and call it from both of those blocks, but to highlight the idea:

#Field Separator: split records on commas, colons and double quotes
BEGIN { FS="[,:\"]" }

# Write one "<name> <key> <count>" line per collected key to output_<name>,
# then empty the collection so the next input file starts from scratch.
#   name   - input file the collected records came from
#   k, out - locals (declared as extra parameters, the usual awk idiom)
function write_counts(name,    k, out) {
    out="output_" name
    for (k in a)
    {
        # every stored record added one OFS to a[k], so the number of
        # replacements reported by gsub() is the record count for k
        print name, k, gsub(OFS,OFS,a[k]) > out
    }
    delete a                                            # empty env
    close(out)                                          # close used file
}

# First record of a new input file: flush what the previous file collected
FNR==1 {
    if(filename!="") {                                  # no file before the first
        write_counts(filename)                          # using previous filename
    }
    filename=FILENAME                                   # remember filename
}
#Searching and Storing in an Array
/searchKeyword/ {a[$5]=a[$5]OFS$6}

#Looping on Array
END {
     # Flush the last file that actually delivered records. Use the
     # remembered name rather than FILENAME: a trailing empty input file
     # never triggers FNR==1 but can still leave its name in FILENAME,
     # which would send these counts to the wrong output file.
     write_counts(filename)
}

Update: Updated as requested in the comments. Sorry, I totally missed that part the first time.

# Records are split on commas, colons and double quotes
BEGIN { FS = "[,:\"]" }

# Append one "<src> <key> <count>" line per counted key to
# output_<src>_<suffix>.txt, where <suffix> is the last piece of the key
# after splitting it on ".".
#   src - input file the counts belong to
#   key, nparts, parts, target - locals (extra parameters, awk idiom)
function append_counts(src,    key, nparts, parts, target) {
    for (key in a) {
        nparts = split(key, parts, ".")                     # abc, cde, ...
        target = "output_" src "_" parts[nparts] ".txt"     # per-suffix file
        print src, key, a[key] >> target                    # several keys may share a file
        close(target)                                       # keep few files open
    }
}

# First record of a new input file: write out the previous file's counts
FNR == 1 {
    if (filename != "") {                 # nothing to write before the first file
        append_counts(filename)
        delete a                          # start counting afresh
    }
    filename = FILENAME                   # remember which file we are in
}

# Count matching records per value of field 5
/searchKeyword/ { a[$5]++ }

# Write the counts of the last file
END { append_counts(filename) }

Upvotes: 2

Related Questions