Eve
Eve

Reputation: 785

Generate all valid values for a regular expression

I know by using Xeger, we can get a random value for a specified pattern.

String regex = "[0-9]{2}"; 
Xeger generator = new Xeger(regex);
String result = generator.generate();

I want to know is there a way to return all of the valid strings for the specified regex. For example, for pattern: [0-9]{2}, we can get all of the values from 00 to 99.

Thanks

Edit:

Here we don't consider the infinite outputs like + and *; how can we get all values for a finite regex?

Last edit:

Thanks everyone! Finally I don't consider all the possible values as there may be thousands. I limit a specific number as the number of values to reduce the amount.

Upvotes: 19

Views: 5081

Answers (4)

Andremoniy
Andremoniy

Reputation: 34920

Here is am open-source generator RegLdg, written in C language - the regular expression grammar language dictionary generator.

I believe it will be not very difficult to make a Java port of this program.

Upvotes: 5

berdario
berdario

Reputation: 1860

Since a regexp is defined by a finite state machine, I wondered if there was something out there able to automatically reason on such machines and that was a good fit to be repurposed for this work... and clojure.core.logic delivered

So, I looked at this definition of the regexp grammar (unfortunately, it lacks the {} quantifiers, but they should be pretty easy to add to my code) adapted it to the java escapes, and worked out this 110 lines long clojure program:

(ns regexp-unfolder.core
  (:require [instaparse.core :as insta])
  (:require [clojure.core.logic :as l])
  (:require [clojure.set :refer [union difference]])
  (:gen-class :methods [#^{:static true} [unfold [String] clojure.lang.LazySeq]])
)

(def parse-regexp (insta/parser 
             "re = union | simple-re?
             union = re '|' simple-re
             simple-re = concat | base-re
             concat = simple-re base-re
             base-re = elementary-re | star | plus
             star = elementary-re '*'
             plus = elementary-re '+'
             elementary-re = group | char | '$' | any | set
             any = '.'
             group = '(' re ')'
             set = positive-set | negative-set
             positive-set = '['  set-items ']'
             negative-set = '[^' set-items ']'
             set-items = set-item*
             set-item = range | char
             range = char '-' char
             char = #'[^\\\\\\-\\[\\]]|\\.'" ))

(def printables (set (map char (range 32 127))))

(declare fns handle-first)

(defn handle-tree [q qto [ type & nodes]]
  (if (nil? nodes)
    [[q [""] qto]]
    ((fns type handle-first) q qto nodes)))

(defn star [q qto node &]
  (cons [q [""] qto]
         (handle-tree q q (first node))))

(defn plus [q qto node &] 
  (concat (handle-tree q qto (first node))
          (handle-tree qto qto (first node))))

(defn any-char [q qto & _] [[q (vec printables) qto]] )

(defn char-range [[c1 _ c2]]
  (let [extract-char (comp int first seq second)]
    (set (map char (range (extract-char c1) (inc (extract-char c2)))))))

(defn items [nodes]
  (union (mapcat
    (fn [[_ [type & ns]]]
      (if (= type :char)
        #{(first ns)}        
        (char-range ns)))
    (rest (second nodes)))))

(defn handle-set [q qto node &] [[q (vec (items node)) qto]])

(defn handle-negset [q qto node &] [[q (vec (difference printables (items node))) qto]])

(defn handle-range [q qto & nodes] [[q (vec (char-range nodes)) qto]])

(defn handle-char [q qto node &] [[q (vec node) qto]] )

(defn handle-concat [q qto nodes] 
  (let [syms (for [x  (rest nodes)] (gensym q))]
    (mapcat handle-tree  (cons q syms) (concat syms [qto] ) nodes)
  ))

(defn handle-first [q qto [node & _]] (handle-tree q qto node))

(def fns {:concat handle-concat, :star star, :plus plus, :any any-char, :positive-set handle-set, :negative-set handle-negset, :char handle-char})

(l/defne transition-membero
  [state trans newstate otransition]
  ([_ _ _ [state trans-set newstate]]
     (l/membero trans trans-set)))

(defn transitiono [state trans newstate transitions]
  (l/conde
   [(l/fresh [f] 
             (l/firsto transitions f)
             (transition-membero state trans newstate f))]
   [(l/fresh [r]
             (l/resto transitions r)
             (transitiono state trans newstate r))])
  )

(declare transitions)

;; Recognize a regexp finite state machine encoded in triplets [state, transition, next-state], adapted from a snippet made by Peteris Erins

(defn recognizeo
  ([input]
     (recognizeo 'q0 input))
  ([q input]
     (l/matche [input] ; start pattern matching on the input
        (['("")]
           (l/== q 'ok)) ; accept the empty string if we are in an accepting state
        ([[i . nput]]
           (l/fresh [qto]
                  (transitiono q i qto transitions) ; assert it must be what we transition to qto from q with input symbol i
                  (recognizeo qto nput)))))) ; recognize the remainder


(defn -unfold [regex] 
  (def transitions 
    (handle-tree 'q0 'ok (parse-regexp regex)))
  (map (partial apply str) (l/run* [q] (recognizeo q))))

Being written with core.logic, it should be fairly easy to adapt it to work also as a regexp matcher

I limited the printables characters from 32 to 126 ascii, otherwise it'd be too cumbersome to deal with regexps such as [^c], but you can extend it quite easily... also, I haven't implemented yet unions, optional patterns, and the \w, \s, etc. escapes for character classes

This is the biggest thing I wrote in clojure up to now, but the basics seems to be covered just fine... some examples:

regexp-unfolder.core=> (-unfold "ba[rz]")
("bar" "baz")
regexp-unfolder.core=> (-unfold "[a-z3-7]")
("a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z" "3" "4" "5" "6" "7")
regexp-unfolder.core=> (-unfold "[a-z3-7][01]")
("a0" "a1" "b0" "b1" "c0" "c1" "d0" "d1" "e0" "e1" "f0" "f1" "g0" "g1" "h0" "h1" "i0" "i1" "j0" "j1" "k0" "k1" "l0" "l1" "m0" "m1" "n0" "n1" "o0" "o1" "p0" "p1" "q0" "q1" "r0" "r1" "s0" "s1" "t0" "t1" "u0" "u1" "v0" "v1" "w0" "w1" "x0" "x1" "y0" "y1" "z0" "z1" "30" "31" "40" "41" "50" "51" "60" "70" "61" "71")
regexp-unfolder.core=> (-unfold "[^A-z]")
(" " "@" "!" "\"" "#" "$" "%" "&" "'" "(" ")" "*" "+" "," "-" "." "/" "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" ":" ";" "{" "<" "|" "=" "}" ">" "~" "?")
regexp-unfolder.core=> (take 20 (-unfold "[abc]*"))
("" "a" "b" "c" "aa" "ab" "ac" "ba" "ca" "aaa" "bb" "cb" "aab" "bc" "cc" "aac" "aba" "aca" "baa" "caa")
regexp-unfolder.core=> (take 20 (-unfold "a+b+"))
("ab" "aab" "abb" "abbb" "aaab" "abbbb" "aabb" "abbbbb" "abbbbbb" "aabbb" "abbbbbbb" "abbbbbbbb" "aaaab" "aabbbb" "aaabb" "abbbbbbbbb" "abbbbbbbbbb" "aabbbbb" "abbbbbbbbbbb" "abbbbbbbbbbbb")

Since I started this way, I implemented also infinite outputs :)

If someone is interested, I uploaded it here

and obviously, here's an example of how to invoke unfold from plain old Java:

import static regexp_unfolder.core.unfold;

public class UnfolderExample{
    public static void main(String[] args){
        @SuppressWarnings("unchecked")
        Iterable<String> strings = unfold("a+b+");
        for (String s : strings){
            System.out.println(s);
        }
    }
}

Upvotes: 6

Sergiu Toarca
Sergiu Toarca

Reputation: 2759

Finding all matches is very similar to finding a random match. Below is a simple modification of the logic that generates random matches on www.debuggex.com, assuming you already have a parse tree.

The idea is that for every subtree, you return a list of all possible generated strings, given a string that was generated by all previous nodes in your parse tree.

AltTree.all = (prefix) ->
    rets = []
    for child in children
        rets.extend(child.all(prefix))

ConcatTree.all = (prefix) ->
    prefixes = [prefix]
    for child in children
        newPrefixes = []
        for p in prefixes
            newPrefixes.extend(child.all(p))
        prefixes = newPrefixes
    return prefixes

RepeatTree.all = (prefix) ->
    prefixes = [prefix]
    rets = []
    for i up to max
        newPrefixes = []
        for p in prefixes
            newPrefixes.extend(onlyChild.all(p))
        prefixes = newPrefixes
        if i >= min
            rets.extend(prefixes)
    return rets

CharsetTree.all = (prefix) ->
    rets = []
    for char in allValidChars():
        rets.push(prefix + char)
    return rets

The rest of the trees are left as exercises (most notably the literal tree).

Note that there are intentionally no optimizations for the sake of clarity. Calling myTree.all('') will generate a list such that every valid matching string appears once for every path that generates this string. You will probably want to add deduplication and get rid of the excessive copying.

I should also add that this will only work for regular expressions that have a small number of total matching strings. This is because all of the strings are being stored. If you want to get around this limitation, you can yieldify this algorithm. You will need to maintain a stack (think of it as being a bread crumb trail) of where you are in the tree. When a new string is asked for, you will create it from the path you travelled, and then update the path.

Upvotes: 2

Lie Ryan
Lie Ryan

Reputation: 64923

A trivial implementation of such algorithm is simply:

def generate_matching(pattern):
    alphabets = [...]
    l = 1
    while True:
        # generate all Cartesian product of the alphabets of length `l`
        for s in itertools.product(alphabets, repeat=l):
            s = "".join(s)
            if pattern.match(s):
                print s
        l += 1

Upvotes: 0

Related Questions