mac01021
mac01021

Reputation: 825

Generating recursive structures in scalacheck

I'm trying to make a generator for a recursive datatype called Row. A row is a list of named Vals, where a Val is either an atomic Bin or else a nested Row.

This is my code:

package com.dtci.data.anonymize.parquet

import java.nio.charset.StandardCharsets
import org.scalacheck.Gen

/** Reproduction: builds ScalaCheck generators for the recursive `Val` data type and prints one sampled `Row`. */
object TestApp extends App {

  // A value is either an atomic byte blob (Bin) or a nested Row.
  sealed trait Val
  case class Bin(bytes: Array[Byte]) extends Val
  object Bin {
    // Builds a Bin from the UTF-8 encoding of `str`.
    def from_string(str: String): Bin = Bin(str.getBytes(StandardCharsets.UTF_8))
  }
  // A row is a list of (field name, value) pairs; values may themselves be rows.
  case class Row(flds: List[(String, Val)]) extends Val

  // Atomic values: alphabetic strings converted to Bin.
  val gen_bin = Gen.alphaStr.map(Bin.from_string)
  // Field names: lower-case alphabetic strings.
  val gen_field_name = Gen.alphaLowerStr
  // NOTE(review): this calls gen_val while gen_row (declared on the next line) has not
  // been assigned yet, so the Gen.oneOf inside gen_val most likely receives a null
  // gen_row here -- the probable source of the NullPointerException. Confirm.
  val gen_field = Gen.zip(gen_field_name, gen_val)
  // A row is a non-empty list of generated fields.
  val gen_row = Gen.nonEmptyListOf(gen_field).map(Row.apply)
  // Picks between an atomic Bin and a nested Row; there is no depth bound on the nesting.
  def gen_val: Gen[Val] = Gen.oneOf(gen_bin, gen_row)

  // Sample a single row and print each field as "name --> value".
  gen_row.sample.get.flds.foreach( fld => println(s"${fld._1} --> ${fld._2}"))
}

It crashes with the following stack trace:

Exception in thread "main" java.lang.NullPointerException
    at org.scalacheck.Gen.$anonfun$flatMap$2(Gen.scala:84)
    at org.scalacheck.Gen$R.flatMap(Gen.scala:243)
    at org.scalacheck.Gen$R.flatMap$(Gen.scala:240)
    at org.scalacheck.Gen$R$$anon$3.flatMap(Gen.scala:228)
    at org.scalacheck.Gen.$anonfun$flatMap$1(Gen.scala:84)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$5.doApply(Gen.scala:255)
    at org.scalacheck.Gen$$anon$1.$anonfun$doApply$1(Gen.scala:110)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$1.doApply(Gen.scala:109)
    at org.scalacheck.Gen.$anonfun$map$1(Gen.scala:79)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$5.doApply(Gen.scala:255)
    at org.scalacheck.Gen.$anonfun$flatMap$2(Gen.scala:84)
    at org.scalacheck.Gen$R.flatMap(Gen.scala:243)
    at org.scalacheck.Gen$R.flatMap$(Gen.scala:240)
    at org.scalacheck.Gen$R$$anon$3.flatMap(Gen.scala:228)
    at org.scalacheck.Gen.$anonfun$flatMap$1(Gen.scala:84)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$5.doApply(Gen.scala:255)
    at org.scalacheck.Gen$$anon$1.$anonfun$doApply$1(Gen.scala:110)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$1.doApply(Gen.scala:109)
    at org.scalacheck.Gen$.$anonfun$sequence$2(Gen.scala:492)
    at scala.collection.LinearSeqOps.foldLeft(LinearSeq.scala:168)
    at scala.collection.LinearSeqOps.foldLeft$(LinearSeq.scala:164)
    at scala.collection.immutable.List.foldLeft(List.scala:79)
    at org.scalacheck.Gen$.$anonfun$sequence$1(Gen.scala:490)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$5.doApply(Gen.scala:255)
    at org.scalacheck.Gen.$anonfun$map$1(Gen.scala:79)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$5.doApply(Gen.scala:255)
    at org.scalacheck.Gen$$anon$1.$anonfun$doApply$1(Gen.scala:110)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$1.doApply(Gen.scala:109)
    at org.scalacheck.Gen.$anonfun$flatMap$2(Gen.scala:84)
    at org.scalacheck.Gen$R.flatMap(Gen.scala:243)
    at org.scalacheck.Gen$R.flatMap$(Gen.scala:240)
    at org.scalacheck.Gen$R$$anon$3.flatMap(Gen.scala:228)
    at org.scalacheck.Gen.$anonfun$flatMap$1(Gen.scala:84)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$5.doApply(Gen.scala:255)
    at org.scalacheck.Gen$.$anonfun$sized$1(Gen.scala:551)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$5.doApply(Gen.scala:255)
    at org.scalacheck.Gen$$anon$1.$anonfun$doApply$1(Gen.scala:110)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$1.doApply(Gen.scala:109)
    at org.scalacheck.Gen.$anonfun$map$1(Gen.scala:79)
    at org.scalacheck.Gen$Parameters.useInitialSeed(Gen.scala:318)
    at org.scalacheck.Gen$$anon$5.doApply(Gen.scala:255)
    at org.scalacheck.Gen.sample(Gen.scala:154)

What's wrong with my code, and what would have been the best way for me to diagnose it myself?

As a note, I've seen the remarks about Gen.oneOf being strict and needing Gen.lzy for recursive structures. But if, in my code, I wrap the definition of gen_val inside of Gen.lzy(...) then I get a stack overflow rather than the current null pointer exception.

Upvotes: 1

Views: 286

Answers (1)

Igor Ramazanov
Igor Ramazanov

Reputation: 41

First of all, be careful when using object Main extends App. I find its field-initialization semantics less obvious than those of a plain old main method, whose statements simply run one line after another:

// Explicit entry point instead of `extends App`; `{...}` stands for the program body.
object Main {
  def main(args: Array[String]): Unit = {...}
}

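Initialization order is the likely cause of the NullPointerException: gen_field is initialized before gen_row, and it calls gen_val, whose strict Gen.oneOf(gen_bin, gen_row) therefore captures a gen_row that is still null.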

Usually, this can be fixed by carefully checking the initialization order of the fields and marking some (or all) of the vals as lazy.

The StackOverflowError arises because the generated data structure is nested too deeply.

Generally, when you are dealing with any kind of recursion, always consider the base case, where the recursion should stop, and the recursive step, which must eventually reach that base case.

In your particular case we can use Gen.sized and Gen.resize, which control how "big" the generated elements are (check the ScalaCheck documentation for more information).

package com.dtci.data.anonymize.parquet

import java.nio.charset.StandardCharsets
import org.scalacheck.Gen

/** Generates and prints a random, depth-bounded [[Main.Row]] using ScalaCheck.
  *
  * Uses an explicit `main` method rather than `extends App`, so that generator
  * fields follow ordinary object-initialization rules.
  */
object Main {

  /** A value is either an atomic byte blob ([[Bin]]) or a nested [[Row]]. */
  sealed trait Val

  /** Atomic value wrapping raw bytes. */
  final case class Bin(bytes: Array[Byte]) extends Val
  object Bin {
    /** Builds a [[Bin]] from the UTF-8 encoding of `str`. */
    def from_string(str: String): Bin = Bin(str.getBytes(StandardCharsets.UTF_8))
  }

  /** A list of (field name, value) pairs; values may themselves be rows. */
  final case class Row(flds: List[(String, Val)]) extends Val

  /** Atomic values: alphabetic strings converted to [[Bin]]. */
  val gen_bin: Gen[Bin] = Gen.alphaStr.map(Bin.from_string)

  /** Field names: lower-case alphabetic strings. */
  val gen_field_name: Gen[String] = Gen.alphaLowerStr

  // gen_field, gen_row and gen_val refer to each other. Making the two vals
  // lazy means none of them can observe a not-yet-initialized (null) sibling,
  // regardless of the order in which they are declared.

  /** A single named field whose value is produced by [[gen_val]]. */
  lazy val gen_field: Gen[(String, Val)] = Gen.zip(gen_field_name, gen_val)

  /** A non-empty row. The size parameter is halved for the nested fields so
    * that every level of nesting gets a smaller budget than its parent.
    */
  lazy val gen_row: Gen[Row] =
    Gen.sized(size => Gen.resize(size / 2, Gen.nonEmptyListOf(gen_field).map(Row.apply)))

  /** Either an atomic [[Bin]] or a nested [[Row]].
    *
    * Base case of the recursion: once the size budget is used up only atomic
    * values are produced, which bounds the nesting depth.
    */
  def gen_val: Gen[Val] = Gen.sized { size =>
    if (size <= 0) gen_bin
    else Gen.oneOf(gen_bin, gen_row)
  }

  /** Samples one row and prints each field as `name --> value`. */
  def main(args: Array[String]): Unit =
    gen_row.sample match {
      case Some(row) =>
        row.flds.foreach { case (name, value) => println(s"$name --> $value") }
      case None =>
        // `sample` returns an Option; report the failure instead of throwing on `.get`.
        Console.err.println("gen_row failed to produce a sample")
    }
}

Upvotes: 0

Related Questions