Reputation: 361
Suppose, I have a dataframe like below:
simpleData = (("U1", "cd1dd155-ccd8-4b8c-bea7-571359e35fed", 1655605947), \
("U1", "7f20182f-8c82-4c70-8213-f7889cfdd5eb", 1655777060), \
("U1", "7f20182f-8c82-4c70-8213-f7889cfdd5eb", 1655777062),
("U1", "c4d5a218-d61d-4e9a-b1ea-646f676c4cb7", 1656209951), \
("U1", "c4d5a218-d61d-4e9a-b1ea-646f676c4cb7", 1656209952), \
("U1", "c4d5a218-d61d-4e9a-b1ea-646f676c4cb7", 1656209999), \
)
columns= ["UID", "Sess", "Time"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)
+---+------------------------------------+----------+
|UID|Sess |Time |
+---+------------------------------------+----------+
|U1 |cd1dd155-ccd8-4b8c-bea7-571359e35fed|1655605947|
|U1 |7f20182f-8c82-4c70-8213-f7889cfdd5eb|1655777060|
|U1 |7f20182f-8c82-4c70-8213-f7889cfdd5eb|1655777062|
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209951|
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209952|
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209999|
+---+------------------------------------+----------+
And I am assigning a rank of rows within a window partition as
df2 = df.withColumn("sess_2", F.dense_rank().over(Window.orderBy('UID', 'Sess')))
df2.show(truncate=False)
I get the following output:
+---+------------------------------------+----------+------+
|UID|Sess |Time |sess_2|
+---+------------------------------------+----------+------+
|U1 |7f20182f-8c82-4c70-8213-f7889cfdd5eb|1655777060|1 |
|U1 |7f20182f-8c82-4c70-8213-f7889cfdd5eb|1655777062|1 |
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209951|2 |
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209952|2 |
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209999|2 |
|U1 |cd1dd155-ccd8-4b8c-bea7-571359e35fed|1655605947|3 |
+---+------------------------------------+----------+------+
whereas I am expecting it to be
+---+------------------------------------+----------+------+
|UID|Sess |Time |sess_2|
+---+------------------------------------+----------+------+
|U1 |cd1dd155-ccd8-4b8c-bea7-571359e35fed|1655605947|1 |
|U1 |7f20182f-8c82-4c70-8213-f7889cfdd5eb|1655777060|2 |
|U1 |7f20182f-8c82-4c70-8213-f7889cfdd5eb|1655777062|2 |
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209951|3 |
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209952|3 |
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209999|3 |
+---+------------------------------------+----------+------+
How do I assign the correct sess_2 for each partition with UID and sess ordered by time?
Upvotes: 0
Views: 46
Reputation: 3242
To solve this, first you will need to find the session start time per UID
and Sess
and order your Window based on this.
from pyspark.sql import functions as F
from pyspark.sql import Window
simpleData = (("U1", "cd1dd155-ccd8-4b8c-bea7-571359e35fed", 1655605947), \
("U1", "7f20182f-8c82-4c70-8213-f7889cfdd5eb", 1655777060), \
("U1", "7f20182f-8c82-4c70-8213-f7889cfdd5eb", 1655777062),
("U1", "c4d5a218-d61d-4e9a-b1ea-646f676c4cb7", 1656209951), \
("U1", "c4d5a218-d61d-4e9a-b1ea-646f676c4cb7", 1656209952), \
("U1", "c4d5a218-d61d-4e9a-b1ea-646f676c4cb7", 1656209999), \
)
columns= ["UID", "Sess", "Time"]
df = spark.createDataFrame(data = simpleData, schema = columns)
# per UID and Sess find when the session started and add Sess to the end to resolve duplicates when multiple sessions can start at the same time.
sess_ws = Window.partitionBy("UID", "Sess")
df_with_session_start = df.withColumn("session_start", F.concat(F.min("time").over(sess_ws), F.col("Sess")))
df2 = df_with_session_start.withColumn("sess_2", F.dense_rank().over(Window.partitionBy("UID").orderBy("session_start")))
df2.show(truncate=False)
"""
+---+------------------------------------+----------+------+
|UID|Sess |Time |sess_2|
+---+------------------------------------+----------+------+
|U1 |cd1dd155-ccd8-4b8c-bea7-571359e35fed|1655605947|1 |
|U1 |7f20182f-8c82-4c70-8213-f7889cfdd5eb|1655777060|2 |
|U1 |7f20182f-8c82-4c70-8213-f7889cfdd5eb|1655777062|2 |
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209951|3 |
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209952|3 |
|U1 |c4d5a218-d61d-4e9a-b1ea-646f676c4cb7|1656209999|3 |
+---+------------------------------------+----------+------+
"""
Upvotes: 1