Reputation: 2135
There is a table with two columns, `books` and `readers`,
where `books` and `readers` are book and reader IDs, respectively.
When trying to order readers by number of books they read, I get AbstractSparkSQLParser
exception:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.sql.functions._
object Small {
case class Book(book: Int, reader: Int)
val recs = Array(
Book(book = 1, reader = 30),
Book(book = 2, reader = 10),
Book(book = 3, reader = 20),
Book(book = 1, reader = 20),
Book(book = 1, reader = 10),
Book(book = 1, reader = 40),
Book(book = 2, reader = 40),
Book(book = 2, reader = 30))
def main(args: Array[String]) {
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
// set up environment
val conf = new SparkConf()
.setMaster("local[5]")
.setAppName("Small")
.set("spark.executor.memory", "2g")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val df = sc.parallelize(recs).toDF()
val readerGroups = df.groupBy("reader").count()
readerGroups.show()
readerGroups.registerTempTable("readerGroups")
readerGroups.printSchema()
// "SELECT reader, count FROM readerGroups ORDER BY count DESC"
val readerGroupsSorted = sqlContext.sql("SELECT * FROM readerGroups ORDER BY count DESC")
readerGroupsSorted.show()
println("Group Cnt: "+readerGroupsSorted.count())
And this is the output; `groupBy` works all right:
reader count
40 2
10 2
20 2
30 2
Resulting schema:
root
|-- reader: integer (nullable = false)
|-- count: long (nullable = false)
Yet SELECT * FROM readerGroups ORDER BY count DESC
fails with an exception (see below). In fact all other select
requests fail as well, except for SELECT * FROM readerGroups
and SELECT reader FROM readerGroups
- these work. Why is that?
How to make ORDER BY count DESC
work?
Exception in thread "main" java.lang.RuntimeException: [1.43] failure: ``('' expected but `desc' found
SELECT * FROM readerGroups ORDER BY count DESC
^
at scala.sys.package$.error(package.scala:27)
at org.apache.spark.sql.catalyst.AbstractSparkSQLParser.apply(AbstractSparkSQLParser.scala:40)
at org.apache.spark.sql.SQLContext$$anonfun$2.apply(SQLContext.scala:134)
at org.apache.spark.sql.SQLContext$$anonfun$2.apply(SQLContext.scala:134)
at org.apache.spark.sql.SparkSQLParser$$anonfun$org$apache$spark$sql$SparkSQLParser$$others$1.apply(SparkSQLParser.scala:96)
at org.apache.spark.sql.SparkSQLParser$$anonfun$org$apache$spark$sql$SparkSQLParser$$others$1.apply(SparkSQLParser.scala:95)
at scala.util.parsing.combinator.Parsers$Success.map(Parsers.scala:136)
at scala.util.parsing.combinator.Parsers$Success.map(Parsers.scala:135)
at scala.util.parsing.combinator.Parsers$Parser$$anonfun$map$1.apply(Parsers.scala:242)
at scala.util.parsing.combinator.Parsers$Parser$$anonfun$map$1.apply(Parsers.scala:242)
at scala.util.parsing.combinator.Parsers$$anon$3.apply(Parsers.scala:222)
at scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1$$anonfun$apply$2.apply(Parsers.scala:254)
at scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1$$anonfun$apply$2.apply(Parsers.scala:254)
at scala.util.parsing.combinator.Parsers$Failure.append(Parsers.scala:202)
at scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1.apply(Parsers.scala:254)
at scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1.apply(Parsers.scala:254)
at scala.util.parsing.combinator.Parsers$$anon$3.apply(Parsers.scala:222)
at scala.util.parsing.combinator.Parsers$$anon$2$$anonfun$apply$14.apply(Parsers.scala:891)
at scala.util.parsing.combinator.Parsers$$anon$2$$anonfun$apply$14.apply(Parsers.scala:891)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at scala.util.parsing.combinator.Parsers$$anon$2.apply(Parsers.scala:890)
at scala.util.parsing.combinator.PackratParsers$$anon$1.apply(PackratParsers.scala:110)
at org.apache.spark.sql.catalyst.AbstractSparkSQLParser.apply(AbstractSparkSQLParser.scala:38)
at org.apache.spark.sql.SQLContext$$anonfun$parseSql$1.apply(SQLContext.scala:138)
at org.apache.spark.sql.SQLContext$$anonfun$parseSql$1.apply(SQLContext.scala:138)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.sql.SQLContext.parseSql(SQLContext.scala:138)
at org.apache.spark.sql.SQLContext.sql(SQLContext.scala:933)
at Small$.main(Small.scala:60)
at Small.main(Small.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:140)
Upvotes: 1
Views: 9039
Reputation: 674
The problem is the name of the column, COUNT. COUNT is a reserved word in Spark SQL, so you can't use it as a column name in a query, or sort by this field.
You can try to do it with backticks:
select * from readerGroups ORDER BY `count` DESC
The other option is to rename the column count to something different, like NumReaders or whatever...
Upvotes: 3
Reputation: 2465
Use a derived table to order by a calculated field (such as top, max, count ...)
SELECT * FROM
(
SELECT reader, count(book) AS book_count
FROM readerbook
GROUP by reader) a
ORDER BY book_count desc
Actually on second thought, it might be possible to just do your order by if you use an alias like this:
SELECT reader, count(book) AS book_count
FROM readerbook
GROUP by reader
ORDER BY book_count desc
Upvotes: 0