Mathews Jose
Mathews Jose

Reputation: 409

Finding the longest sequence of digits from a string in bash using awk alone

I am trying to find the longest sequence of digits from a string in bash using awk alone. I have formed the below command and it is giving me the output.

$ echo "This_is_1234_and_44448888_1234567_111111_23456789_and_234" | sed 's/./\n&/g' | awk '
# Input: one character per line (produced by the sed stage).
# Output: the length of the longest run of digits, then every run of that length.
BEGIN { max_length = 0 }
{
    if (match($1, /[0-9]/)) {
        # Digit: append it to the run currently being built.
        numbers[i] = numbers[i] $0
        non_digit = 0
    } else if (non_digit < 1) {
        # First non-digit after a run: move on to a fresh slot for the next run.
        non_digit = 2
        i++
    }
}
END {
    i = 0
    for (key in numbers) {
        current_length = length(numbers[key])
        if (current_length > max_length) {
            # New maximum: throw away the shorter runs collected so far.
            # The traversal order of "for (key in numbers)" is unspecified, so
            # without this, stale entries could be printed with the new length.
            max_length = current_length
            delete max_length_strings
            i = 0
        }
        if (current_length >= max_length) {
            max_length_strings[++i] = numbers[key]
        }
    }
    print "max_length for the consecutive number portion is ", max_length
    for (j in max_length_strings) {
        print "String_Part: " max_length_strings[j] " and Length: " max_length
    }
}'
max_length for the consecutive number portion is  8
String_Part: 23456789 and Length: 8
String_Part: 44448888 and Length: 8

But for getting each character from the string for digit checking, I am using sed command.

So how can I avoid this use of sed and achieve the same result by using a single awk command?

Upvotes: 2

Views: 581

Answers (2)

anubhava
anubhava

Reputation: 786291

Using just awk:

s="This_is_1234_and_44448888_1234567_111111_23456789_and_234"

awk -v RS='[^[:digit:]]+' 'length($0) >= max{
   max=length($0)
   num[max]=(num[max]?num[max] "," $0:$0)
}
END {
   printf "max length=%s, numbers: %s\n", max, num[max]
}' <<< "$s"

max length=8, numbers: 44448888,23456789

Explanation:

  • Using a custom RS='[^[:digit:]]+', we make the record separator one or more non-digit characters, so each record is a run of digits
  • We then check the length of each record (each run of digits) and keep updating the max variable in the length($0) >= max block
  • We also keep all the max length fields in num array
  • In the END block we just print max and num array entry
  • This is gnu-awk specific due to multi character RS

Upvotes: 1

Ed Morton
Ed Morton

Reputation: 204638

With GNU awk 4.* for FPAT and true multi-dimensional arrays:

$ cat tst.awk
# For every input line, print each longest run of digits found on that line.
# Requires GNU awk 4+ (FPAT and arrays of arrays).
BEGIN { FPAT="[0-9]+" }     # every field is one run of digits
{
    delete strs
    max = 0                 # start over so the previous line cannot leak into this one
    for (i=1;i<=NF;i++) {
        cur = length($i)
        strs[cur][$i]       # group runs by length; the inner index removes duplicates
        # Every field is a candidate, including the first one on the line.
        if (cur > max) {
            max = cur
        }
    }
    # A line without digits has no strs[max]; print nothing for it.
    if (max in strs) {
        for (str in strs[max]) {
            printf "String_Part: %s and Length: %d\n", str, max
        }
    }
}

$ awk -f tst.awk file
String_Part: 23456789 and Length: 8
String_Part: 44448888 and Length: 8

The above assumes you want the output of the max for each input line and not across the whole file. If you do want it across the whole file then, again with GNU awk (for RT):

$ cat tst.awk
# Print each longest run of digits found anywhere in the input.
# Requires GNU awk: RS is a regular expression and RT holds the text that
# matched it, so RT is the run of digits that terminated the current record.
BEGIN { RS="[0-9]+" }
RT != "" {                  # the last record ends at end of input with an empty RT
    cur = length(RT)
    strs[cur][RT]           # group runs by length; the inner index removes duplicates
    # Every run is a candidate, including the very first one in the input.
    if (cur > max) {
        max = cur
    }
}
END {
    # Input without any digits leaves strs empty; print nothing in that case.
    if (max in strs) {
        for (str in strs[max]) {
            printf "String_Part: %s and Length: %d\n", str, max
        }
    }
}

$ awk -f tst.awk file
String_Part: 23456789 and Length: 8
String_Part: 44448888 and Length: 8

Upvotes: 0

Related Questions