Reputation:
I have a dataframe df with columns a,b,c,d,e,f,g.
I have a scala List L1 which is List[Any] = List(a,b,c)
How to perform a group by operation on DF and find duplicates if any using the list L1
Also how to find out if the dataframe has nulls/blanks/emptyvalues for the columns which are mentioned in list L1
e.g. df.groupby(l1) needs to be used as l1 may vary from time to time
Upvotes: 0
Views: 623
Reputation: 1214
// Null
case class Source(
a: Option[String],
b: Option[String],
c: Option[String],
d: Option[String],
e: Option[String],
f: Option[String],
g: Option[String] )
val l = List("a", "b", "c")
val sourceDF = Seq(
Source(None, Some("b1"), Some("c1"), Some("d1"), Some("e1"), Some("f1"), Some("g1")),
Source(Some("a2"), None, Some("c2"), Some("d2"), Some("e2"), Some("f2"), Some("g2")),
Source(Some("a3"), Some("b3"), None, Some("d3"), Some("e3"), Some("f3"), Some("g3")),
Source(Some("a4"), Some("b4"), Some("c4"), Some("d4"), Some("e4"), Some("f4"), Some("g4"))
).toDF()
sourceDF.show(false)
// +----+----+----+---+---+---+---+
// |a |b |c |d |e |f |g |
// +----+----+----+---+---+---+---+
// |null|b1 |c1 |d1 |e1 |f1 |g1 |
// |a2 |null|c2 |d2 |e2 |f2 |g2 |
// |a3 |b3 |null|d3 |e3 |f3 |g3 |
// |a4 |b4 |c4 |d4 |e4 |f4 |g4 |
// +----+----+----+---+---+---+---+
val f1 = l.map(i => s" $i is null").mkString(" or ")
sourceDF.where(f1).show(false)
// +----+----+----+---+---+---+---+
// |a |b |c |d |e |f |g |
// +----+----+----+---+---+---+---+
// |null|b1 |c1 |d1 |e1 |f1 |g1 |
// |a2 |null|c2 |d2 |e2 |f2 |g2 |
// |a3 |b3 |null|d3 |e3 |f3 |g3 |
// +----+----+----+---+---+---+---+
// groupBy
val gbDF = sourceDF.groupBy(l.head, l.tail:_*).count()
gbDF.show(false)
// +----+----+----+-----+
// |a |b |c |count|
// +----+----+----+-----+
// |a2 |null|c2 |1 |
// |a4 |b4 |c4 |1 |
// |a3 |b3 |null|1 |
// |null|b1 |c1 |1 |
// +----+----+----+-----+
Upvotes: 1