Reputation: 5389
I am using Spark MLlib's decision tree algorithm (DecisionTree) and I get the error below. My dataset has more than 500 features. Could that be the problem? Any help would be great!
java.lang.UnknownError: no bin was found for continuous variable.
at org.apache.spark.mllib.tree.DecisionTree$.findBin$1(DecisionTree.scala:492)
at org.apache.spark.mllib.tree.DecisionTree$.org$apache$spark$mllib$tree$DecisionTree$$findBinsForLevel$1(DecisionTree.scala:529)
at org.apache.spark.mllib.tree.DecisionTree$$anonfun$3.apply(DecisionTree.scala:653)
at org.apache.spark.mllib.tree.DecisionTree$$anonfun$3.apply(DecisionTree.scala:653)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:144)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:201)
at scala.collection.AbstractIterator.aggregate(Iterator.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$21.apply(RDD.scala:838)
at org.apache.spark.rdd.RDD$$anonfun$21.apply(RDD.scala:838)
at org.apache.spark.SparkContext$$anonfun$23.apply(SparkContext.scala:1116)
at org.apache.spark.SparkContext$$anonfun$23.apply(SparkContext.scala:1116)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:111)
at org.apache.spark.scheduler.Task.run(Task.scala:51)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:187)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:744)
14/08/13 16:36:06 ERROR ExecutorUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-0,5,main]
java.lang.UnknownError: no bin was found for continuous variable.
(same stack trace as above)
Upvotes: 0
Views: 190
Reputation: 5389
It was indeed caused by unclean input data: a few rows had a "NaN" entry for a column. Once we cleaned those up, everything worked fine.
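A minimal sketch of that cleanup step, assuming the RDD-based MLlib API with LabeledPoint rows (the function and variable names here are illustrative, not from our original code):

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD

// Keep only rows whose label and feature values are all real numbers.
def dropNaNRows(data: RDD[LabeledPoint]): RDD[LabeledPoint] =
  data.filter(lp => !lp.label.isNaN && !lp.features.toArray.exists(_.isNaN))

// val cleaned = dropNaNRows(rawData)
// trainClassifier exists in later Spark 1.x releases; on the version in the
// trace above you would call DecisionTree.train with a Strategy instead.
// val model = DecisionTree.trainClassifier(cleaned, numClasses = 2,
//   categoricalFeaturesInfo = Map[Int, Int](), impurity = "gini",
//   maxDepth = 5, maxBins = 32)

With the NaN values gone, MLlib can place every continuous feature value into one of its split bins, so the "no bin was found for continuous variable" error no longer occurs.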
Upvotes: 1