Reputation: 8711
I have a table schema, i.e. the column names in comma-separated form. For clarity, I've listed them one per line below:
$ cat cols_name.txt
id
resp
x_amt
rate1
rate2
rate3
pay1
pay2
rate_r1
rate_r2
x_rate1
x_rate2
x_rate3
x_rate_r1
x_rate_r2
x_pay1
x_pay2
rev1
x_rev1
I need to find every column that has a matching x_-prefixed counterpart (e.g. pay1 -> x_pay1) and list each pair together as an intermediate output, like below:
x_rate1 rate1
x_rate2 rate2
x_rate3 rate3
x_pay1 pay1
x_pay2 pay2
x_rate_r1 rate_r1
x_rate_r2 rate_r2
x_rev1 rev1
Then, finally, print how many pairs share each base name (the column name with its trailing digits removed):
pay 2
rate 3
rate_r 2
rev 1
In my attempt to get the intermediate output, the below awk command is not working.
# NOTE(review): the text between the two slashes is a regex literal, so "x_" a[$1] is never evaluated as an expression.
# The pattern therefore looks for the literal characters "x_" a followed by $ or 1, which no input line contains.
awk ' NR==FNR { if( $1~/^x_/ ) a[$1]=1 ; next } $1~/"x_" a[$1]/ { print $0 } ' cols_name.txt cols_name.txt
It is not printing anything. Could you please help me fix it?
Upvotes: 3
Views: 188
Reputation: 785038
Here is a single-pass awk command to get it done:
# Single pass: names starting with x_ become keys of xk; every other name is mapped in xv to itself minus trailing digits.
# The END rule prints each plain name whose x_ twin was seen, counts it under its digit-less stem, then prints the counts.
awk '/^x_/ {xk[$0]; next} {s=$0; sub(/[0-9]+$/, "", s); xv[$0]=s} END {for (i in xv) if ("x_" i in xk) {print "x_" i, i; ++fq[xv[i]]}; print "== Summary =="; for (i in fq) print i, fq[i]}' file
x_rev1 rev1
x_rate1 rate1
x_rate2 rate2
x_rate3 rate3
x_rate_r1 rate_r1
x_pay1 pay1
x_rate_r2 rate_r2
x_pay2 pay2
== Summary ==
rate_r 2
rate 3
rev 1
pay 2
A more readable form:
awk '
# Names carrying the x_ prefix: remember the full name as a key of xk.
/^x_/ {
xk[$0]
next
}
# Every other name: map it to its stem (the name with trailing digits removed).
{
s = $0
sub(/[0-9]+$/, "", s)
xv[$0] = s
}
# Pairing happens only after all input is read, so the order of lines in the file is irrelevant.
END {
# Report each plain name whose x_ twin was seen and count it under its stem.
for (i in xv)
if ("x_" i in xk) {
print "x_" i, i
++fq[xv[i]]
}
print "== Summary =="
# The traversal order of for-in is unspecified in awk, so the order of output lines can vary.
for (i in fq)
print i, fq[i]
}' file
Upvotes: 2
Reputation: 203284
Using any awk in any shell on every Unix box and assuming every entry in your input file occurs only once as in your posted example:
$ cat tst.awk
# Pair up "name" / "x_name" lines and tally the pairs per digit-less stem.
# Relies on every input line being unique: a pair is recognized the second
# time its normalized key shows up, whichever of the two lines comes first.
{
    base = $0
    sub(/^x_/, "", base)            # drop the x_ marker, if present
    key = "x_" base OFS base        # same key for "name" and "x_name"

    seen[key]++
    if ( seen[key] == 2 ) {
        print key

        stem = base
        sub(/[0-9]+$/, "", stem)    # strip the trailing index
        tally[stem]++
    }
}

END {
    print "---"
    for (name in tally) {
        print name, tally[name]
    }
}
$ awk -f tst.awk cols_name.txt
x_rate1 rate1
x_rate2 rate2
x_rate3 rate3
x_rate_r1 rate_r1
x_rate_r2 rate_r2
x_pay1 pay1
x_pay2 pay2
x_rev1 rev1
---
rate_r 2
rev 1
rate 3
pay 2
Upvotes: 2
Reputation: 133458
With your shown samples, please try the following (written and tested with the shown samples in GNU awk).
# Two passes over the same file: pass 1 records every name that starts with
# the prefix s1; pass 2 prints each plain name whose prefixed form was
# recorded and counts it under its stem (the name minus trailing digits).
# The prefix is taken only from s1 and is always treated as a literal string.
awk -v s1="x_" '
FNR==NR{
  if(index($0,s1)==1){
    arr[$0]
  }
  next
}
((s1 $0) in arr){
  print s1 $0,$0
  sub(/[0-9]+$/,"")
  sum[$0]++
}
END{
  for(i in sum){
    print i,sum[i]
  }
}
' Input_file Input_file
Explanation: Adding detailed explanation for above.
awk -v s1="x_" '                 ## Start the awk program with the pair prefix stored in variable s1.
FNR==NR{                         ## This condition is TRUE only while the first copy of Input_file is being read.
  if(index($0,s1)==1){           ## Check whether the line starts with the literal text held in s1.
    arr[$0]                      ## Record the prefixed name as a key of array arr.
  }
  next                           ## Skip all further rules for lines of the first pass.
}
((s1 $0) in arr){                ## Second pass: TRUE when the prefixed form of the current line was recorded.
  print s1 $0,$0                 ## Print the prefixed name followed by the current (plain) name.
  sub(/[0-9]+$/,"")              ## Remove only the trailing digits from the current line, leaving the stem.
  sum[$0]++                      ## Increase the pair count of this stem by 1.
}
END{                             ## Start the END block of this program.
  for(i in sum){                 ## Traverse all stems collected in sum.
    print i,sum[i]               ## Print each stem together with its pair count.
  }
}
' Input_file Input_file ##Mentioning Input_file names here.
Upvotes: 1
Reputation: 12867
Assuming that the file is actually:
id,resp,x_amt,rate1,rate2,rate3,pay1,pay2,rate_r1,rate_r2,x_rate1,x_rate2,x_rate3,x_rate_r1,x_rate_r2,x_pay1,x_pay2,rev1,x_rev1
as suggested in the original post (it's not very clear), using GNU awk:
# GNU awk (gensub). Expects the comma-separated schema on a single input line;
# the lookup and count arrays are never reset between lines.
# Only a leading x_ and only the trailing digits are stripped, so names that
# contain x_ or digits elsewhere keep them.
awk '{ split($0,map,",");for (i in map) { map1[map[i]] } for (i in map) { if ( map[i] ~ /^x_/ ) { hd=gensub(/^x_/,"",1,map[i]);hd1=gensub(/[[:digit:]]+$/,"",1,hd);if (hd in map1) { map2[hd1]++;print map[i]" "hd } } } printf "\n";for (i in map2) { print i" "map2[i] } }' cols_name.txt
Explanation:
awk '{
  split($0,map,",");                             # Split the line into an array called map, using comma as the separator
  for (i in map) {
    map1[map[i]]                                 # Loop through map and create another array map1 with the values of map as indexes
  }
  for (i in map) {
    if ( map[i] ~ /^x_/ ) {
      hd=gensub(/^x_/,"",1,map[i]);              # If the value is prefixed with "x_", remove just that prefix, reading the result into hd
      hd1=gensub(/[[:digit:]]+$/,"",1,hd);       # Take the trailing digits off hd and read the result into hd1
      if (hd in map1) {                          # Membership test only; unlike reading map1[hd] it does not create an entry
        map2[hd1]++;                             # Create a third array map2 with the index hd1 and the value an incrementing counter
        print map[i]" "hd                        # A match exists in the map1 array, so print the pair
      }
    }
  }
  printf "\n";                                   # Blank line between the pairs and the counts
  for (i in map2) {
    print i" "map2[i]                            # Loop through the count array and print the values
  }
}' cols_name.txt
Output:
x_pay2 pay2
x_rev1 rev1
x_rate1 rate1
x_rate2 rate2
x_rate3 rate3
x_rate_r1 rate_r1
x_rate_r2 rate_r2
x_pay1 pay1
rate_r 2
rate 3
rev 1
pay 2
Upvotes: 1