Hanik
Hanik

Reputation: 105

Slow lexer in clojure

I'm trying to write a simple lexer in clojure. For now, it recognizes only white-space separated identifiers.

(refer 'clojure.set :only '[union])

(defn char-range-set
  "Generate set containing all characters in the range [from; to]"
  [from to]
  (set (map char (range (int from) (inc (int to))))))

;; Characters that may start an identifier: [A-Za-z_].
(def ident-initial (union (char-range-set \A \Z) (char-range-set \a \z) #{\_}))

;; Characters that may continue an identifier: the initial set plus digits.
(def ident-subseq (union ident-initial (char-range-set \0 \9)))

;; Append `token` to the lexer's :tokens vector and replace :source with
;; the characters remaining after the token was consumed.
(defn update-lex [lex token source]
  (assoc (update lex :tokens conj token) :source source))

(defn scan-identifier [lex]
  ;; Caller guarantees the first source character can start an identifier.
  (assert (ident-initial (first (:source lex))))
  ;; Walk forward while characters belong to ident-subseq, accumulating
  ;; them in `value`; emit an :identifier token with the rest as new source.
  (loop [[c & cs :as source] (rest (:source lex))
         value [(first (:source lex))]]
    (if (ident-subseq c)
      (recur cs (conj value c))
      (update-lex lex {:type :identifier :value value} source))))

;; Consume one whitespace character or one identifier from the source.
;; NOTE(review): the cond has no default branch, so any other character
;; makes scan return nil and tokenize then returns nil — fine here since
;; the input contains only identifiers and whitespace.
(defn scan [{tokens :tokens [c & cs :as source] :source :as lex}]
  (cond
    (Character/isWhitespace c) (assoc lex :source cs)
    (ident-initial c) (scan-identifier lex)))

(defn tokenize [source]
  ;; Thread the lexer state {:tokens [...] :source remaining-chars}
  ;; through scan until the input is exhausted.
  (loop [lex {:tokens [] :source source}]
    (if (empty? (:source lex))
      (:tokens lex)
      (recur (scan lex)))))

(defn measure-tokenizer [n]
  ;; Build a string of n copies of "abcde ", time tokenizing it, and
  ;; return the total number of characters processed.
  (let [s (clojure.string/join (repeat n "abcde "))]
    (time (tokenize s))
    (* n (count "abcde "))))

The lexer takes about 15 seconds to process approximately 6 million characters.

=> (measure-tokenizer 1000000)
"Elapsed time: 15865.909399 msecs"

After that, I converted all maps and vectors into transients. This gave no improvement.

Also, I've implemented an analogous algorithm in C++. It takes only 0.2 seconds for the same input.

My question is: How can I improve my code? Maybe I use clojure data structures incorrectly?

UPDATE:

So here's my C++ code.

#include <iostream>
#include <vector>
#include <chrono>
#include <unordered_set>
#include <cstdlib>
#include <string>
#include <cctype>
using namespace std;

// A lexical token; currently only identifiers are produced.
struct Token
{
   enum { IDENTIFIER = 1 };
   int type;
   std::string value;
};

// Splits a source string into whitespace-separated identifier tokens.
// An identifier starts with [A-Za-z_] and continues with [A-Za-z0-9_];
// any other character is skipped.
class Lexer
{
public:
   explicit Lexer(const std::string& source)
      : mSource(source)
      , mIndex(0)
   {
      initCharSets();
   }

   // Scans the whole source and returns all tokens found.
   std::vector<Token> tokenize()
   {
      while (mIndex < mSource.size())
      {
         scan();
      }

      return mResult;
   }

private:

   // Builds the character classification sets once per lexer instance.
   void initCharSets()
   {
      for (char c = 'a'; c <= 'z'; ++c)
         mIdentifierInitial.insert(c);
      for (char c = 'A'; c <= 'Z'; ++c)
         mIdentifierInitial.insert(c);
      mIdentifierInitial.insert('_');

      mIdentifierSubsequent = mIdentifierInitial;
      for (char c = '0'; c <= '9'; ++c)
         mIdentifierSubsequent.insert(c);
   }

   // Consumes leading whitespace, then either scans one identifier or
   // skips a single unrecognized character.
   void scan()
   {
      skipSpaces();

      if (mIndex >= mSource.size())
         return;

      if (mIdentifierInitial.count(mSource[mIndex]) != 0)
      {
         // BUG FIX: append a token only when an identifier was actually
         // scanned. Previously a stale `mToken` member was pushed for any
         // non-identifier character and mIndex was never advanced, which
         // produced garbage tokens and an infinite loop on input like "+".
         mResult.push_back(scanIdentifier());
      }
      else
      {
         ++mIndex;   // skip a character we do not recognize
      }
   }

   // Scans one identifier starting at mIndex and advances past it.
   Token scanIdentifier()
   {
      size_t i = mIndex;

      while ((i < mSource.size()) && (mIdentifierSubsequent.count(mSource[i]) != 0))
         ++i;

      Token token;
      token.type = Token::IDENTIFIER;
      token.value = mSource.substr(mIndex, i - mIndex);
      mIndex = i;
      return token;
   }

   // Advances mIndex past any whitespace characters.
   void skipSpaces()
   {
      // Cast to unsigned char: passing a negative char to isspace is UB.
      while ((mIndex < mSource.size()) && std::isspace(static_cast<unsigned char>(mSource[mIndex])))
         ++mIndex;
   }

   std::unordered_set<char> mIdentifierInitial;
   std::unordered_set<char> mIdentifierSubsequent;
   std::string mSource;
   size_t mIndex;
   std::vector<Token> mResult;
};

// Builds a string of n copies of "jobbi ", tokenizes it, and prints n
// followed by the elapsed time in milliseconds.
void measureBigString(int n)
{
   const std::string substri = "jobbi ";
   std::string bigstr;
   bigstr.reserve(substri.size() * static_cast<size_t>(n));   // one allocation instead of many reallocations
   for (int i = 0; i < n; ++i)
      bigstr += substri;

   Lexer lexer(bigstr);

   const auto begin = std::chrono::steady_clock::now();

   lexer.tokenize();

   const auto end = std::chrono::steady_clock::now();

   std::cout << n << '\n';   // '\n' instead of endl: avoid a needless flush
   std::cout << "Time difference = "
             << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count()
             << '\n';
   std::cout << "\n\n\n";
}



// Entry point: benchmark the lexer on a one-million-word input.
int main()
{
   measureBigString(1000000);
   return 0;
}

Upvotes: 2

Views: 545

Answers (2)

rmcv
rmcv

Reputation: 1976

UPDATE:

One more significant tuning is on vector de-structuring. By replacing code like this:

(let [[c & cs] xs] ...)

with:

(let [c  (first xs)
      cs (rest xs)] ...)

you will get another x2 performance improvement. All together you will get a x26 speedup - which should be on par with the C++ implementation.

So in short:

  1. Type hint avoids all the reflection call
  2. Record gives you optimised access/update to properties
  3. first and rest avoids vector de-structuring - which uses nth/nthFrom and performs sequential access for seq.

Hopefully vector de-structuring can be optimised to avoid nthFrom for common case like this (where only first and rest are there in the binding).


FIRST TUNING - with type hint and record:

You can also use record instead of generic map:

(refer 'clojure.set :only '[union])

(defn char-range-set
  "Generate set containing all characters in the range [from; to]"
  [from to]
  (set (map char (range (int from) (inc (int to))))))

;; Characters that may start an identifier: [A-Za-z_].
(def ident-initial (union (char-range-set \A \Z) (char-range-set \a \z) #{\_}))

;; Characters that may continue an identifier: the initial set plus digits.
(def ident-subseq (union ident-initial (char-range-set \0 \9)))

;; Records replace the generic maps of the original version, giving the
;; optimized field access/update described in this answer.
(defrecord Token [type value])
(defrecord Lex [tokens source])

;; Append `token` and set the remaining `source`; the ^Lex/^Token type
;; hints avoid reflective calls.
(defn update-lex [^Lex lex ^Token token source]
  (assoc (update lex :tokens conj token) :source source))

(defn scan-identifier [^Lex lex]
  ;; Accumulate ident-subseq characters after the initial one, then emit
  ;; an :identifier Token with the unconsumed characters as new source.
  (let [[x & xs] (:source lex)]
    (loop [[c & cs :as source] xs
           value               [x]]
      (if (ident-subseq c)
        (recur cs (conj value c))
        (update-lex lex (Token. :identifier value) source)))))

(defn scan [^Lex lex]
  ;; Consume one whitespace character or one identifier.
  ;; NOTE(review): as in the original, the cond has no default branch, so
  ;; any other character yields nil.
  (let [[c & cs] (:source lex)
        tokens   (:tokens lex)]
    (cond
      (Character/isWhitespace ^char c) (assoc lex :source cs)
      (ident-initial c)                (scan-identifier lex))))

(defn tokenize [source]
  ;; Thread the Lex record through scan until the source is exhausted.
  (loop [lex (Lex. [] source)]
    (if (empty? (:source lex))
      (:tokens lex)
      (recur (scan lex)))))

(use 'criterium.core)

(defn measure-tokenizer [n]
  ;; Benchmark with criterium's `bench` instead of `time`; returns the
  ;; total number of characters processed.
  (let [s (clojure.string/join (repeat n "abcde "))]
    (bench (tokenize s))
    (* n (count "abcde "))))

(measure-tokenizer 1000)

Using criterium:

Evaluation count : 128700 in 60 samples of 2145 calls.
             Execution time mean : 467.378916 µs
    Execution time std-deviation : 329.455994 ns
   Execution time lower quantile : 466.867909 µs ( 2.5%)
   Execution time upper quantile : 467.984646 µs (97.5%)
                   Overhead used : 1.502982 ns

Comparing to the original code:

Evaluation count : 9960 in 60 samples of 166 calls.
             Execution time mean : 6.040209 ms
    Execution time std-deviation : 6.630519 µs
   Execution time lower quantile : 6.028470 ms ( 2.5%)
   Execution time upper quantile : 6.049443 ms (97.5%)
                   Overhead used : 1.502982 ns

The optimized version is roughly x13 speedup. With n=1,000,000, it now takes ~0.5 second.

Upvotes: 3

Alex Miller
Alex Miller

Reputation: 70201

I don't see anything obviously wrong with this code. I wouldn't expect transients to help you too much as you are not bulk loading, but rather updating once per loop (plus I doubt that's actually the slowest part).

My guess at which things are slow:

  • checking the character in the set (requires hashing and traversing the internal hash tree). Instead of building sets, creating functions that actually did an int-based check on the character ranges (> this, < that, etc) would not be as pretty but would almost certainly be faster, particularly if you took the care to use primitive type hints and avoid boxing to objects.
  • each time through the loop bashes a value nested in a hashmap. That's not going to be the fastest operation. If you did keep that thing as an independent transient vector that would be faster and avoid rebuilding the upper tree. Depending how far outside idiomatic Clojure and into Java land you wanted to go, you could also use a mutable ArrayList. It's dirty, but it's fast - if you constrain the scope of who is exposed to that mutable state, then I would consider something like this. Conceptually, same thing as a transient vector.

Upvotes: 3

Related Questions