Reputation: 199
I have a table structure like this:
unique_id | group | value_1 | value_2 | value_3
abc_xxx 1 200 null 100
def_xxx 1 0 3 40
ghi_xxx 2 300 1 2
that I need to extract the following information from:
I can do the first one with a simple groupBy and count
df.select().groupBy(group).count()
I'm not so sure how to approach the next two which is needed for me to compute the null and zero rate from the total rows per group.
Upvotes: 0
Views: 536
Reputation: 669
data= [
('abc_xxx', 1, 200, None, 100),
('def_xxx', 1, 0, 3, 40 ),
('ghi_xxx', 2, 300, 1, 2 ),
]
df = spark.createDataFrame(data, ['unique_id','group','value_1','value_2','value_3'])
# new edit
df = df\
.withColumn('contains_null', when(isnull(col('value_1')) | isnull(col('value_2')) | isnull(col('value_3')), lit(1)).otherwise(lit(0)))\
.withColumn('contains_zero', when((col('value_1')==0) | (col('value_2')==0) | (col('value_3')==0), lit(1)).otherwise(lit(0)))
df.groupBy('group')\
.agg(count('unique_id').alias('total_rows'), sum('contains_null').alias('null_value_rows'), sum('contains_zero').alias('zero_value_rows')).show()
+-----+----------+---------------+---------------+
|group|total_rows|null_value_rows|zero_value_rows|
+-----+----------+---------------+---------------+
| 1| 2| 1| 1|
| 2| 1| 0| 0|
+-----+----------+---------------+---------------+
# total_count = (count('value_1') + count('value_2') + count('value_3'))
# null_count = (sum(when(isnull(col('value_1')), lit(1)).otherwise(lit(0)) + when(isnull(col('value_2')), lit(1)).otherwise(lit(0)) + when(isnull(col('value_3')), lit(1)).otherwise(lit(0))))
# zero_count = (sum(when(col('value_1')==0, lit(1)).otherwise(lit(0)) + when(col('value_2')==0, lit(1)).otherwise(lit(0)) + when(col('value_3')==0, lit(1)).otherwise(lit(0))))
# df.groupBy('group')\
# .agg(total_count.alias('total_numbers'), null_count.alias('null_values'), zero_count.alias('zero_values')).show()
#+-----+-------------+-----------+-----------+
#|group|total_numbers|null_values|zero_values|
#+-----+-------------+-----------+-----------+
#| 1| 5| 1| 1|
#| 2| 3| 0| 0|
#+-----+-------------+-----------+-----------+
Upvotes: 3