Dmitry Matveyev
Dmitry Matveyev

Reputation: 481

Random AST generation with the specified size in Clojure

I would like to generate a random abstract syntax tree

(def terminal-set #{'x 'R})
(def function-arity {'+ 2, '- 2, '* 2, '% 2})
(def function-set (into #{} (keys function-arity)))
(def terminal-vec (into [] terminal-set))
(def function-vec (into [] function-set))

;; protected division
(defn % [^Number x ^Number y]
  (if (zero? y)
    0
    (/ x y)))

with the specified size

(defn treesize [tree] (count (flatten tree)))

following the algorithm from the book Sean Luke, 2013, Essentials of Metaheuristics, Lulu, second edition, available at https://cs.gmu.edu/~sean/book/metaheuristics/

We randomly extend the horizon of a tree with nonleaf nodes until the number of nonleaf nodes, plus the remaining spots, is greater than or equal to the desired size. We then populate the remaining slots with leaf nodes:

ptc2 algorithm

For example

(+ (* x (+ x x)) x)

is of size 7.

The algorithm in the book uses pointers/references Q which is very convenient there. In my case I have to use some kind of recursion to construct the tree. The problem is that I can't keep the state size of the tree between all algorithms using recursion which results in larger trees:

(defn ptc2-tree
  "Generate a random tree up to its `max-size`.
  Note: `max-size` is the number of nodes, not the same as its depth."
  [max-size]
  (if (> 2 max-size)
    (rand-nth terminal-vec)
    (let [fun   (rand-nth function-vec)
          arity (function-arity fun)]
      (cons fun (repeatedly arity #(ptc2-tree (- max-size arity 1)))))))

I also tried using atom for size but still couldn't get the exact tree size I want, it was either too small or too big depending on implementation.

Beside this I also have to somehow randomize the location where I insert the new node/tree.

How do I write this algorithm?

EDIT: A final touch to the correct solution:

(defn sequentiate [v] 
  (map #(if (seqable? %) (sequentiate %) %) (seq v)))

Upvotes: 3

Views: 311

Answers (2)

Alan Thompson
Alan Thompson

Reputation: 29958

As a coincidence, I have been working on AST manipulation code in the Tupelo Forest library. You can see example code here, and a video from the 2017 Clojure/Conj here.

The following shows how I would solve this problem. I tried to make the names as self-evident as possible so it should be easy to understand how the algorithm proceeds.

Basics:

(def op->arity {:add 2
                :sub 2
                :mul 2
                :div 2
                :pow 2})
(def op-set (set (keys op->arity)))
(defn choose-rand-op [] (rand-elem op-set))

(def arg-set #{:x :y})
(defn choose-rand-arg [] (rand-elem arg-set))

(defn num-hids [] (count (all-hids)))

Helper functions:

(s/defn hid->empty-kids :- s/Int
  [hid :- HID]
  (let [op             (hid->attr hid :op)
        arity          (grab op op->arity)
        kid-slots-used (count (hid->kids hid))
        result         (- arity kid-slots-used)]
    (verify (= 2 arity))
    (verify (not (neg? result)))
    result))

(s/defn node-has-empty-slot? :- s/Bool
  [hid :- HID]
  (pos? (hid->empty-kids hid)))

(s/defn total-empty-kids :- s/Int
  []
  (reduce +
    (mapv hid->empty-kids (all-hids))))

(s/defn add-op-node :- HID
  [op :- s/Keyword]
  (add-node {:tag :op :op op} )) ; add node w no kids

(s/defn add-leaf-node :- tsk/KeyMap
  [parent-hid :- HID
   arg :- s/Keyword]
  (kids-append parent-hid [(add-leaf {:tag :arg :arg arg})]))

(s/defn need-more-op? :- s/Bool
  [tgt-size :- s/Int]
  (let [num-op            (num-hids)
        total-size-so-far (+ num-op (total-empty-kids))
        result            (< total-size-so-far tgt-size)]
    result))

Main algorithm:

(s/defn build-rand-ast :- tsk/Vec ; bush result
  [ast-size]
  (verify (<= 3 ast-size)) ; 1 op & 2 args minimum;  #todo refine this
  (with-debug-hid
    (with-forest (new-forest)
      (let [root-hid (add-op-node (choose-rand-op))] ; root of AST
        ; Fill in random op nodes into the tree
        (while (need-more-op? ast-size)
          (let [node-hid (rand-elem (all-hids))]
            (when (node-has-empty-slot? node-hid)
              (kids-append node-hid
                [(add-op-node (choose-rand-op))]))))
        ; Fill in random arg nodes in empty leaf slots
        (doseq [node-hid (all-hids)]
          (while (node-has-empty-slot? node-hid)
            (add-leaf-node node-hid (choose-rand-arg))))
        (hid->bush root-hid)))))

(defn bush->form [it]
  (let [head (xfirst it)
        tag  (grab :tag head)]
    (if (= :op tag)
      (list (kw->sym (grab :op head))
        (bush->form (xsecond it))
        (bush->form (xthird it)))
      (kw->sym (grab :arg head)))))

(dotest
  (let [tgt-size 13]
    (dotimes [i 5]
      (let [ast     (build-rand-ast tgt-size)
            res-str (pretty-str ast)]
        (nl)
        (println res-str)
        (println (pretty-str (bush->form ast))) ))))

It prints results in both hierical "bush" format, and in lispy forms as well. Here are 2 typical results:

[{:tag :op, :op :mul}
 [{:tag :op, :op :div}
  [{:tag :op, :op :pow}
   [{:tag :op, :op :sub}
    [{:tag :arg, :arg :y, :value nil}]
    [{:tag :arg, :arg :x, :value nil}]]
   [{:tag :op, :op :div}
    [{:tag :arg, :arg :y, :value nil}]
    [{:tag :arg, :arg :y, :value nil}]]]
  [{:tag :arg, :arg :y, :value nil}]]
 [{:tag :op, :op :pow}
  [{:tag :arg, :arg :x, :value nil}]
  [{:tag :arg, :arg :y, :value nil}]]]

(mul (div (pow (sub y x) (div y y)) y) (pow x y))


[{:tag :op, :op :div}
 [{:tag :op, :op :mul}
  [{:tag :op, :op :pow}
   [{:tag :arg, :arg :x, :value nil}]
   [{:tag :arg, :arg :y, :value nil}]]
  [{:tag :op, :op :add}
   [{:tag :op, :op :div}
    [{:tag :arg, :arg :x, :value nil}]
    [{:tag :arg, :arg :y, :value nil}]]
   [{:tag :arg, :arg :x, :value nil}]]]
 [{:tag :op, :op :mul}
  [{:tag :arg, :arg :x, :value nil}]
  [{:tag :arg, :arg :y, :value nil}]]]

(div (mul (pow x y) (add (div x y) x)) (mul x y))

I used three-letter op-codes instead of math symbols for simplicity, but they could be easily replaced with Clojure function symbol names for input to eval.

Upvotes: 1

Aleph Aleph
Aleph Aleph

Reputation: 5395

The below is more or less a word-for-word translation of the PTC2 algorithm in the article. It's not quite idiomatic Clojure code; you may want to split it into functions / smaller blocks as you see reasonable.

(defn ptc2 [target-size]
  (if (= 1 target-size)
    (rand-nth terminal-vec)
    (let [f (rand-nth function-vec)
          arity (function-arity f)]
      ;; Generate a tree like [`+ nil nil] and iterate upon it
      (loop [ast (into [f] (repeat arity nil))
             ;; q will be something like ([1] [2]), being a list of paths to the
             ;; nil elements in the AST
             q (for [i (range arity)] [(inc i)])
             c 1]
        (if (< (+ c (count q)) target-size)
          ;; Replace one of the nils in the tree with a new node
          (let [a (rand-nth q)
                f (rand-nth function-vec)
                arity (function-arity f)]
            (recur (assoc-in ast a (into [f] (repeat arity nil)))
                   (into (remove #{a} q)
                         (for [i (range arity)] (conj a (inc i))))
                   (inc c)))
          ;; In the end, fill all remaining slots with terminals
          (reduce (fn [t path] (assoc-in t path (rand-nth terminal-vec)))
                  ast q))))))

You can use Clojure's loop construct (or reduce to keep the state of your iteration - in this algorith, the state includes):

  • ast, which is a nested vector that represents the formula which is being built, where the not-yet-completed nodes are marked as nil;
  • q, which corresponds to Q in the pseudocode and is a list of the paths to unfinished nodes in the ast,
  • and c, which is the count of the non-leaf nodes in the tree.

In the result, you get something like:

(ptc2 10) ;; => [* [- R [% R [% x x]]] [- x R]]

We make the AST using vectors (as opposed to lists) as it allows us to use assoc-in to progressively build the tree; you may want to convert it to nested lists by yourself if you need.

Upvotes: 3

Related Questions