stack0114106
stack0114106

Reputation: 8711

awk - pull out pair columns and get the count of occurrences

I have a table schema: the names of the columns as a comma-separated list. For clarity, I'll show them one column name per line, as below:

$ cat cols_name.txt
id
resp
x_amt
rate1
rate2
rate3
pay1
pay2
rate_r1
rate_r2
x_rate1
x_rate2
x_rate3
x_rate_r1
x_rate_r2
x_pay1
x_pay2
rev1
x_rev1

I need to find the columns that have a matching "x_"-prefixed counterpart (e.g. pay1 -> x_pay1) and list each pair together as an intermediate output, like below:

x_rate1 rate1
x_rate2 rate2
x_rate3 rate3
x_pay1 pay1
x_pay2 pay2
x_rate_r1 rate_r1
x_rate_r2 rate_r2
x_rev1 rev1

And then finally print how many pairs exist for each base name (the column name with its trailing digits removed), as:

 pay 2
 rate 3
 rate_r 2
 rev 1

In my attempt to get the intermediate output, the below awk command is not working.

awk ' NR==FNR { if( $1~/^x_/ ) a[$1]=1 ; next }  $1~/"x_" a[$1]/ { print $0 } ' cols_name.txt cols_name.txt

It is not printing anything. Could you please help me fix it?

Upvotes: 3

Views: 188

Answers (4)

anubhava
anubhava

Reputation: 785038

Here is a single-pass awk solution to get it done:

 # Single pass: collect x_-prefixed names and plain names, then pair them up and count pairs per digit-free stem in END.
 awk '{ if ($0 ~ /^x_/) { prefixed[$0] } else { stem = $0; sub(/[0-9]+$/, "", stem); base[$0] = stem } } END { for (name in base) { twin = "x_" name; if (twin in prefixed) { print twin, name; tally[base[name]]++ } } print "== Summary =="; for (stem in tally) print stem, tally[stem] }' file

x_rev1 rev1
x_rate1 rate1
x_rate2 rate2
x_rate3 rate3
x_rate_r1 rate_r1
x_pay1 pay1
x_rate_r2 rate_r2
x_pay2 pay2
== Summary ==
rate_r 2
rate 3
rev 1
pay 2

A more readable form:

awk '
# Main rule: sort every input line into one of two arrays.
#   prefixed - set of names that start with x_
#   base     - plain name -> the same name with its trailing digits removed
{
   if ($0 ~ /^x_/) {
      prefixed[$0]
   } else {
      stem = $0
      sub(/[0-9]+$/, "", stem)
      base[$0] = stem
   }
}
# After all input: report each plain name that has an x_ twin, then the
# number of such pairs per digit-free stem.
END {
   for (name in base) {
      twin = "x_" name
      if (twin in prefixed) {
         print twin, name
         tally[base[name]]++
      }
   }
   print "== Summary =="
   for (stem in tally)
      print stem, tally[stem]
}' file

Upvotes: 2

Ed Morton
Ed Morton

Reputation: 203284

Using any awk in any shell on every Unix box and assuming every entry in your input file occurs only once as in your posted example:

$ cat tst.awk
# Pair detection in a single pass, assuming every input name occurs once.
# A plain name and its x_ twin reduce to the same name once the prefix is
# removed, so the second sighting of a name means the pair is complete.
{
    name = $0
    sub(/^x_/, "", name)

    # Anything other than the second sighting is not a freshly completed pair.
    if (++seen[name] != 2)
        next

    print "x_" name, name

    # Count the pair under its stem (the name without trailing digits).
    sub(/[0-9]+$/, "", name)
    tally[name]++
}

# Separator line followed by the number of pairs per stem.
END {
    print "---"
    for (stem in tally)
        print stem, tally[stem]
}

$ awk -f tst.awk cols_name.txt
x_rate1 rate1
x_rate2 rate2
x_rate3 rate3
x_rate_r1 rate_r1
x_rate_r2 rate_r2
x_pay1 pay1
x_pay2 pay2
x_rev1 rev1
---
rate_r 2
rev 1
rate 3
pay 2

Upvotes: 2

RavinderSingh13
RavinderSingh13

Reputation: 133458

Could you please try the following? It was written and tested in GNU awk with the samples shown.

awk -v s1="x_" '
# Pass 1 (first copy of the file): remember every line that starts with s1.
NR == FNR {
  if ($0 ~ ("^" s1))
    prefixed[$0] = $0
  next
}
# Pass 2 (second copy): a line is half of a pair when its prefixed twin
# was remembered during pass 1.
{
  twin = s1 $0
  if (!(twin in prefixed))
    next
  print prefixed[twin], $0
  stem = $0
  gsub(/^x_|[0-9]+$/, "", stem)
  tally[stem]++
}
# Finally print how many pairs were found for each stem.
END {
  for (stem in tally)
    print stem, tally[stem]
}
'  Input_file  Input_file

Explanation: a detailed, line-by-line explanation of the above.

awk -v s1="x_" '                 ##Start the awk program; s1 holds the prefix to look for.
NR == FNR {                      ##True only while the first copy of Input_file is being read.
  if ($0 ~ ("^" s1))             ##If the line matches s1 at its very start ...
    prefixed[$0] = $0            ##... remember it in prefixed, keyed by the whole line.
  next                           ##Skip the remaining rules during the first pass.
}
{                                ##From here on the second copy of Input_file is being read.
  twin = s1 $0                   ##Build the name the prefixed counterpart would have.
  if (!(twin in prefixed))       ##No such counterpart was remembered in the first pass ...
    next                         ##... so this line is not part of a pair; move on.
  print prefixed[twin], $0       ##Print the prefixed name followed by the current line.
  stem = $0                      ##Work on a copy of the current line.
  gsub(/^x_|[0-9]+$/, "", stem)  ##Strip a leading x_ and any trailing digits from the copy.
  tally[stem]++                  ##Count one more pair for this stem.
}
END {                            ##Runs once after all input has been read.
  for (stem in tally)            ##Walk over every stem that was counted.
    print stem, tally[stem]      ##Print the stem and its number of pairs.
}
'  Input_file  Input_file        ##Input_file is named twice so that it is read in two passes.

Upvotes: 1

Raman Sailopal
Raman Sailopal

Reputation: 12867

Assuming that the file is actually:

id,resp,x_amt,rate1,rate2,rate3,pay1,pay2,rate_r1,rate_r2,x_rate1,x_rate2,x_rate3,x_rate_r1,x_rate_r2,x_pay1,x_pay2,rev1,x_rev1

as suggested in the original post (it's not very clear), using GNU awk:

# For each comma-separated line: pair every "x_"-prefixed column with its un-prefixed twin, print the pairs,
# then print the number of pairs per base name. Only the leading "x_" and the trailing digits are stripped,
# and the lookup/counter arrays are reset for every input line.
awk '{ delete map1; delete map2; split($0,map,","); for (i in map) { map1[map[i]]="1" } for (i in map) { if ( map[i] ~ /^x_/ ) { hd=gensub(/^x_/,"",1,map[i]); hd1=gensub(/[[:digit:]]+$/,"",1,hd); if (hd in map1) { map2[hd1]++; print map[i]" "hd } } } printf "\n"; for (i in map2) { print i" "map2[i] } }' cols_name.txt

Explanation:

awk '{
        delete map1                                            # Start every input line with an empty lookup array ...
        delete map2                                            # ... and empty counters, so nothing leaks over from a previous line
        split($0,map,",");                                     # Split the line into an array called map, using comma as the separator
        for (i in map) {
           map1[map[i]]="1"                                    # Loop through map and create another array map1 with the values of map as indexes
        }
        for (i in map) {
           if ( map[i] ~ /^x_/ ) {                             # Only values prefixed with "x_" can be the first half of a pair
               hd=gensub(/^x_/,"",1,map[i]);                   # Remove only the leading "x_" (an "x_" later in the name is kept), reading the result into hd
               hd1=gensub(/[[:digit:]]+$/,"",1,hd);            # Remove only the trailing digits of hd (digits inside the name are kept) and read into hd1
               if (hd in map1) {                               # Test membership with "in" so that the lookup does not create an empty map1 entry
                 map2[hd1]++;                                  # Count the pair in a third array map2, indexed by hd1
                 print map[i]" "hd                             # A match exists in the map1 array, so print the pair
               }
            }
         }
         printf "\n";                                          # Blank line between the pairs and the counts
         for (i in map2) {
            print i" "map2[i]                                  # Loop through the count array and print the values
         }
       }' cols_name.txt

Output:

x_pay2 pay2
x_rev1 rev1
x_rate1 rate1
x_rate2 rate2
x_rate3 rate3
x_rate_r1 rate_r1
x_rate_r2 rate_r2
x_pay1 pay1

rate_r 2
rate 3
rev 1
pay 2

Upvotes: 1

Related Questions