Reputation: 73
How do you aggregate dynamically in Spark (Scala) based on column data types?
For example:
SELECT ID, SUM(when DOUBLE type)
     , APPEND(when STRING), MAX(when BOOLEAN)
FROM tbl
GROUP BY ID
Upvotes: 2
Views: 415
Reputation: 27383
You can do this by getting the runtime schema and matching on each column's data type, for example:
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import spark.implicits._

val df = Seq(
  (1, 1.0, true, "a"),
  (1, 2.0, false, "b")
).toDF("id", "d", "b", "s")

// map each column name to its runtime DataType
val dataTypes: Map[String, DataType] = df.schema.map(sf => (sf.name, sf.dataType)).toMap

// pick the aggregate function based on the column's data type
def genericAgg(c: String): Column = {
  dataTypes(c) match {
    case DoubleType  => sum(col(c))
    case StringType  => concat_ws(",", collect_list(col(c))) // "append"
    case BooleanType => max(col(c))
  }
}

// one aggregate expression per column, excluding the grouping key
val aggExprs: Seq[Column] = df.columns.filterNot(_ == "id").map(genericAgg)

df
  .groupBy("id")
  .agg(aggExprs.head, aggExprs.tail: _*) // agg takes (Column, Column*), hence head/tail
  .show()
gives
+---+------+------+-----------------------------+
| id|sum(d)|max(b)|concat_ws(,, collect_list(s))|
+---+------+------+-----------------------------+
| 1| 3.0| true| a,b|
+---+------+------+-----------------------------+
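Note that genericAgg throws a scala.MatchError for any column whose type the match doesn't cover. A minimal sketch of a safer variant, assuming first() is an acceptable fallback aggregate for unhandled types (genericAggSafe is a hypothetical name; the alias keeps the original column names in the output):

def genericAggSafe(c: String): Column = {
  (dataTypes(c) match {
    case DoubleType  => sum(col(c))
    case StringType  => concat_ws(",", collect_list(col(c))) // "append"
    case BooleanType => max(col(c))
    case _           => first(col(c)) // assumed fallback for other types
  }).alias(c) // keep the original column name, e.g. d instead of sum(d)
}

With the alias, the output columns come back as id, d, b, s instead of sum(d), max(b), concat_ws(,, collect_list(s)).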
Upvotes: 2