Reputation: 57
The code provided reads a CSV file and prints the count of every string found, in descending order. However, I would like to know how to specify which fields should be included in the count. For example,
./example-awk.awk 1,2 file.csv
would read strings from fields 1 and 2 and print the counts
#!/bin/awk -f
# Counts occurrences of the strings found in the CSV fields named by the
# first command-line argument (a comma-separated list of field numbers),
# then prints each string with its count, ordered by count descending.
# Usage: ./example-awk.awk 1,2 file.csv
BEGIN {
    # ARGV[1] holds the field list (e.g. "1,2"); split it and record each
    # field number as an index of wanted[] for fast membership tests.
    n = split(ARGV[1], flds, ",")
    for (i = 1; i <= n; i++)
        wanted[flds[i]]
    delete ARGV[1]   # so awk does not try to open "1,2" as an input file
    FS = ", *"       # comma separator, tolerating spaces after the comma
}
{
    # NOTE(review): FNR != 1 skips the first record of each file (header
    # row?) — drop this test if the first data row should be counted too.
    if (FNR != 1)
        for (f in wanted)
            freq[$f]++
}
END {
    # gawk-only: iterate freq[] by value, numerically descending.
    PROCINFO["sorted_in"] = "@val_num_desc"
    for (s in freq)
        printf "%s\t%d\n", s, freq[s]
}
This is the code I am currently working with; ARGV[1] will of course be the specified fields. I am unsure how to store this value so I can use it.
For example ./example-awk.awk 1,2 simple.csv
with simple.csv
containing
A,B,C,A
B,D,C,A
C,D,A,B
D,C,A,A
Should result in
D 3
C 2
B 2
A 1
Because it only counts strings in fields 1 and 2
Upvotes: 4
Views: 139
Reputation: 133428
EDIT (as per OP's request): The OP needs a solution that uses ARGV,
so I am adding one now. (NOTE: cat script.awk
is shown only to display the contents of the actual awk
script.)
cat script.awk
BEGIN {
  FS  = ","
  OFS = "\t"
  # Every argument except the last (the input file) is a field number:
  # remember it, then remove it so awk does not open it as a file.
  for (argPos = 1; argPos < (ARGC - 1); argPos++) {
    arr[ARGV[argPos]]
    delete ARGV[argPos]
  }
}
{
  # Tally the value seen in each requested field of this record.
  for (fieldNum in arr)
    value[$fieldNum]++
}
END {
  # gawk-only: iterate indices in descending string order.
  PROCINFO["sorted_in"] = "@ind_str_desc"
  for (str in value)
    print str, value[str]
}
Now when we run it as follows:
awk -f script.awk 1 2 Input_file
D 3
C 2
B 2
A 1
My original solution: could you please try the following, written and tested with the shown samples. It is a generic solution in which the awk
program has a variable named fields
where you can list all the field numbers you want to process, separated by ,
(comma).
awk -v fields="1,2" '
# fields: comma-separated list of the field numbers to count (set via -v).
BEGIN{
FS=","
OFS="\t"
# Split the list and make each field number an index of key[].
num=split(fields,arr,",")
for(i=1;i<=num;i++){
key[arr[i]]
}
}
{
# For every record, bump the counter for the value of each wanted field.
for(i in key){
value[$i]++
}
}
END{
# Emit "value<TAB>count"; the external sort -rk1 orders the output descending.
for(i in value){
print i,value[i]
}
}' Input_file | sort -rk1
Output will be as follows.
D 3
C 2
B 2
A 1
Upvotes: 4
Reputation: 37394
I decided to give it a go in the spirit of the OP's attempt, since kids don't learn if they don't play. I tried ARGIND
manipulation (it doesn't work), deleting ARGV[]
entries, and some other approaches that also didn't work:
$ gawk '
BEGIN {
FS=","
OFS="\t"
split(ARGV[1],t,/,/) # field list (e.g. "1,2") picked from ARGV[1]
for(i in t) # turn the values into array indices for fast lookup
h[t[i]]
delete ARGV[1] # ARGIND manipulation does not work; drop the arg instead
}
{
for(i in h) # only the requested subset of fields is processed
a[$i]++ # count occurrences of each value
}
END {
PROCINFO["sorted_in"]="@val_num_desc" # gawk-only ordering, taken from the OP attempt
for(i in a)
print i,a[i]
}' 1,2 file
Output
D 3
B 2
C 2
A 1
You could as well drop the ARGV[]
manipulation and replace the BEGIN
block with:
$ gawk -v var=1,2 '
BEGIN {
FS=","
OFS="\t"
split(var,t,/,/) # field list picked from a var
for(i in t) # from vals to index
h[t[i]]
} ...
Upvotes: 2
Reputation: 203209
Don't use a shebang to invoke awk in a shell script as that robs you of the ability to use the shell and awk separately for what they both do best. Use the shebang to invoke your shell and then call awk within the script. You also don't need to use gawk-only sorting functions for this:
$ cat tst.sh
#!/usr/bin/env bash
# Usage: tst.sh <comma-separated-field-numbers> <file>
# Counts the values appearing in the given CSV fields and prints
# "value<TAB>count" lines, reverse-sorted.
(( $# == 2 )) || { echo "bad args: $0 $*" >&2; exit 1; }
cols=$1
shift
awk -v cols="$cols" '
BEGIN {
    FS = ","
    OFS = "\t"
    # cols is split on FS (","): each piece is a wanted field number.
    n = split(cols, pieces)
    for (j = 1; j <= n; j++) {
        fldNrs[pieces[j]]
    }
}
{
    # Tally the value found in every requested field of this record.
    for (nr in fldNrs) {
        cnt[$nr]++
    }
}
END {
    for (v in cnt) {
        print v, cnt[v]
    }
}
' "${@:--}" |
sort -r
$ ./tst.sh 1,2 file
D 3
C 2
B 2
A 1
Upvotes: 4