Reputation: 103
env: Spark 2.4.5

Input (test.csv):
id,date,item1,item2,item3
0,1,111,,
0,1,,222,
0,1,,,333
1,1,111,,
1,1,,222,
1,1,,,333
Expected output:
id,date,item1,item2,item3
0,1,111,222,333
1,1,111,222,333
As you can see, I want to merge rows with the same id and date into one.
I've tried the arrays_zip function, but it doesn't give me what I want:
val source = spark.read.option("header", "true").csv("/home/user/test.csv")
source.createOrReplaceTempView("source")
spark.sql("SELECT id, date, arrays_zip(collect_list(item1), collect_list(item2), collect_list(item3)) FROM source GROUP BY id, date").show(false)
+---+----+-------------------------------------------------------------------------+
|id |date|arrays_zip(collect_list(item1), collect_list(item2), collect_list(item3))|
+---+----+-------------------------------------------------------------------------+
|0 |1 |[[111, 222, 333]] |
|1 |1 |[[111, 222, 333]] |
+---+----+-------------------------------------------------------------------------+
Maybe I should explode this array into columns? I'd appreciate any suggestions.
Upvotes: 1
Views: 175
Reputation: 31540
1. Use flatten and array instead of arrays_zip, then use the element_at function to pick each item out of the flattened array:
import org.apache.spark.sql.functions._

val df = spark.read.option("header", "true").csv("/home/user/test.csv")

df.groupBy(col("id"), col("date"))
  .agg(flatten(array(
    collect_list(col("item1")),
    collect_list(col("item2")),
    collect_list(col("item3")))).alias("it"))
  .withColumn("item1", element_at(col("it"), 1)) // element_at is 1-based
  .withColumn("item2", element_at(col("it"), 2))
  .withColumn("item3", element_at(col("it"), 3))
  .drop("it")
  .show()
//+---+----+-----+-----+-----+
//| id|date|item1|item2|item3|
//+---+----+-----+-----+-----+
//| 0| 1| 111| 222| 333|
//| 1| 1| 111| 222| 333|
//+---+----+-----+-----+-----+
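As an aside, your original arrays_zip attempt was close: each collect_list keeps exactly one non-null value here, so the zipped array holds a single struct, and you can take element 0 and read its fields instead of exploding. A minimal sketch (the i1/i2/i3 aliases are illustrative; aggregating first is what gives the zipped struct named fields):

// Finish the arrays_zip route: aggregate, zip the named lists, then pull
// the single zipped struct apart (assumes one non-null value per item column per group).
df.groupBy(col("id"), col("date"))
  .agg(collect_list(col("item1")).alias("i1"),
       collect_list(col("item2")).alias("i2"),
       collect_list(col("item3")).alias("i3"))
  .select(col("id"), col("date"), arrays_zip(col("i1"), col("i2"), col("i3")).alias("z"))
  .select(col("id"), col("date"),
    col("z").getItem(0).getField("i1").alias("item1"),
    col("z").getItem(0).getField("i2").alias("item2"),
    col("z").getItem(0).getField("i3").alias("item3"))
  .show()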
2. Use groupBy with first(col, ignoreNulls = true), which skips the nulls within each group:
df.groupBy(col("id"), col("date"))
  .agg(first(col("item1"), ignoreNulls = true).alias("item1"),
       first(col("item2"), ignoreNulls = true).alias("item2"),
       first(col("item3"), ignoreNulls = true).alias("item3"))
  .show()
//+---+----+-----+-----+-----+
//| id|date|item1|item2|item3|
//+---+----+-----+-----+-----+
//| 0| 1| 111| 222| 333|
//| 1| 1| 111| 222| 333|
//+---+----+-----+-----+-----+
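Aggregate functions like max skip nulls as well, so the same collapse works without first; here's a DataFrame sketch of the max variant shown in SQL below (again assuming at most one non-null value per item column per group):

// max() ignores nulls, so it returns the lone non-null value in each group
df.groupBy(col("id"), col("date"))
  .agg(max(col("item1")).alias("item1"),
       max(col("item2")).alias("item2"),
       max(col("item3")).alias("item3"))
  .show()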
SQL:
df.createOrReplaceTempView("tmp")
//using first(..., ignoreNulls = true)
spark.sql("select id, date, first(item1, true) as item1, first(item2, true) as item2, first(item3, true) as item3 from tmp group by id, date").show()
//using max (aggregate functions skip nulls)
spark.sql("select id, date, max(item1) as item1, max(item2) as item2, max(item3) as item3 from tmp group by id, date").show()
//using flatten + array (the inner alias is renamed to items so it doesn't shadow the view name tmp)
spark.sql("""
  select id, date,
         element_at(items, 1) as item1,
         element_at(items, 2) as item2,
         element_at(items, 3) as item3
  from (
    select id, date,
           flatten(array(collect_list(item1), collect_list(item2), collect_list(item3))) as items
    from tmp
    group by id, date
  ) t
""").show()
//+---+----+-----+-----+-----+
//| id|date|item1|item2|item3|
//+---+----+-----+-----+-----+
//| 0| 1| 111| 222| 333|
//| 1| 1| 111| 222| 333|
//+---+----+-----+-----+-----+
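One caveat: Spark documents first as non-deterministic, because its result depends on the row order, which may change after a shuffle. With at most one non-null value per item column per group, as in this data, first(..., true), max, and the flatten variant all return the same thing; if a group could hold several non-null values, they may pick different ones.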
Dynamic way (without hardcoding how many item columns come out):
val df = spark.read.option("header", "true").csv("/home/user/test.csv")

val df1 = df.groupBy(col("id"), col("date"))
  .agg(flatten(array(
    collect_list(col("item1")),
    collect_list(col("item2")),
    collect_list(col("item3")))).alias("it"))

// the longest flattened array decides how many item columns we need
val len = df1.agg(max(size(col("it")))).collect()(0)(0).toString.toInt

// add one column per array position (apply() on an array column is 0-based)
(0 until len).foldLeft(df1)((acc, i) => acc.withColumn(s"item${i + 1}", col("it")(i)))
  .drop("it")
  .show()
//+---+----+-----+-----+-----+
//| id|date|item1|item2|item3|
//+---+----+-----+-----+-----+
//| 0| 1| 111| 222| 333|
//| 1| 1| 111| 222| 333|
//+---+----+-----+-----+-----+
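If the item column names shouldn't be hardcoded either, one possible generalization is to derive them from the schema; a sketch, assuming every column other than id and date is an item column to collapse:

// build one first(..., ignoreNulls = true) aggregate per non-key column
val itemCols = df.columns.filterNot(Set("id", "date"))
val aggExprs = itemCols.map(c => first(col(c), ignoreNulls = true).alias(c))

df.groupBy(col("id"), col("date"))
  .agg(aggExprs.head, aggExprs.tail: _*)
  .show()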
Upvotes: 2