Reputation: 1863
I am writing a unit test for a PySpark function. Below is the actual function.
def get_some_timestamp(self, final_set):
    """Return each event together with the previous event time for the same id and day.

    ``final_set`` is registered as the temporary view ``session_data``
    (replacing any existing view with that name) and queried through
    ``self.spark.sql``.

    Args:
        final_set: object exposing ``createOrReplaceTempView``; presumably a
            Spark DataFrame with ``id`` and ``sent_at`` columns -- the query
            references only those two.

    Returns:
        Whatever ``self.spark.sql`` returns for the query, with the columns
        ``id``, ``date_without_timestamp``, ``date_time`` and
        ``prev_timestamp``.
    """
    final_set.createOrReplaceTempView("session_data")
    # A triple-quoted string already spans lines, so no backslash
    # continuations are needed.  The select list must not end with a comma:
    # "..., FROM session_data" is a SQL syntax error.
    return self.spark.sql("""
        SELECT
            id,
            date(sent_at) as date_without_timestamp,
            sent_at as date_time,
            CAST(lag(sent_at) OVER (PARTITION BY id, date(sent_at) ORDER BY sent_at) as timestamp) as prev_timestamp
        FROM session_data
    """)
The unit test for that function looks something like this:
def test_get_some_timestamp(self):
    """Compare the output of get_some_timestamp with a hand-built pandas frame."""
    input_rows = [
        ('1234','2019-01-01T23:01:01.123Z','pageview'),
        ('4567','2019-01-02T23:01:02.123Z','pageview'),
        ('1234','2019-01-01T23:03:01.123Z','click'),
        ('1234','2019-01-01T20:01:01.123Z','pageview'),
        ('4567','2019-01-02T18:01:10.678Z','pageview'),
        ('7890','2019-01-01T23:01:01.123Z','pageview'),
    ]
    input_columns = ['id', 'sent_at','event_name']
    test_data_df = self.spark.createDataFrame(input_rows, input_columns)

    # Expected values, one list per output column, in input-row order.
    # NOTE(review): the date and date_time columns hold plain strings here;
    # confirm the Spark side produces the same Python types, since object
    # columns are compared value by value.
    expected_columns = {
        'id': ['1234','4567','1234','1234','4567','7890'],
        'date_without_timestamp': ['2019-01-01','2019-01-02','2019-01-01','2019-01-01','2019-01-02','2019-01-01'],
        'date_time': [
            '2019-01-01T23:01:01.123',
            '2019-01-02T23:01:02.123',
            '2019-01-01T23:03:01.123',
            '2019-01-01T20:01:01.123',
            '2019-01-02T18:01:10.678',
            '2019-01-01T23:01:01.123',
        ],
        'prev_timestamp': [
            pd.to_datetime('2019-01-01T20:01:01.123'),
            '2019-01-02 18:01:10.678',
            '2019-01-01T23:01:01.123',
            'NaT',
            'NaT',
            'NaT',
        ],
        'event_name': ['pageview','pageview','click','pageview','pageview','pageview'],
    }
    expected_output_pandas_df = pd.DataFrame(expected_columns)

    spark_result = get_some_timestamp(self, test_data_df)
    actual_output_pandas_df = spark_result.toPandas()

    # Row order is not significant: the helper sorts both frames by the key
    # columns before comparing.
    self.assert_equal_with_sort(
        expected_output_pandas_df,
        actual_output_pandas_df,
        ['id','date_time'],
    )
My assert function is as follows:
def assert_equal_with_sort(self, results, expected, keycolumns):
    """Assert that two pandas DataFrames are equal, ignoring row order.

    Both frames are sorted by ``keycolumns`` and re-indexed from zero before
    being handed to ``assert_frame_equal``.

    Args:
        results: first DataFrame (the "left" side in failure messages).
        expected: second DataFrame (the "right" side in failure messages).
        keycolumns: column name or list of names passed to ``sort_values``.

    Raises:
        AssertionError: propagated from ``assert_frame_equal`` when the
            normalised frames differ.
    """
    def _normalised(frame):
        # Drop the original index so rows are compared by sorted position.
        return frame.sort_values(by=keycolumns).reset_index(drop=True)

    assert_frame_equal(_normalised(results), _normalised(expected))
Now, when I run this unit test, it fails with the following error:
Traceback (most recent call last):
File "/Users/neilshah/Documents/GitCode/ms_data_etl/tests/test_utm_session_tagging.py", line 161, in test_get_previous_activity_timestamp
self.assert_equal_with_sort(expected_output_pandas_df,actual_output_pandas_df,['anonymous_id','date_time'])
File "/Users/neilshah/Documents/GitCode/ms_data_etl/tests/test_utm_session_tagging.py", line 77, in assert_equal_with_sort
assert_frame_equal(results_sorted, expected_sorted,check_frame_type=False,check_dtype=False,check_index_type=False,check_column_type=False,check_datetimelike_compat=True)
File "/Users/neilshah/anaconda3/lib/python3.6/site-packages/pandas/util/testing.py", line 1348, in assert_frame_equal
obj='DataFrame.iloc[:, {idx}]'.format(idx=i))
File "/Users/neilshah/anaconda3/lib/python3.6/site-packages/pandas/util/testing.py", line 1216, in assert_series_equal
check_dtype=check_dtype)
File "/Users/neilshah/anaconda3/lib/python3.6/site-packages/pandas/util/testing.py", line 1087, in assert_numpy_array_equal
_raise(left, right, err_msg)
File "/Users/neilshah/anaconda3/lib/python3.6/site-packages/pandas/util/testing.py", line 1081, in _raise
raise_assert_detail(obj, msg, left, right)
File "/Users/neilshah/anaconda3/lib/python3.6/site-packages/pandas/util/testing.py", line 1018, in raise_assert_detail
raise AssertionError(msg)
AssertionError: numpy array are different
numpy array values are different (100.0 %)
[left]: [2019-01-01, 2019-01-01, 2019-01-01, 2019-01-02, 2019-01-02, 2019-01-01]
[right]: [2019-01-01, 2019-01-01, 2019-01-01, 2019-01-02, 2019-01-02, 2019-01-01]
I have tried adding different parameters given here https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.testing.assert_frame_equal.html But, it does not seem to work.
I have also printed the dtypes of both DataFrames: every column is of type object, except prev_timestamp, which is of type datetime64[ns] in both DataFrames.
Can anyone help me here?
Upvotes: 2
Views: 591
Reputation: 1863
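It seems that having the same column dtype is not enough. When comparing anything other than strings, the type of each value has to match exactly.
In my case the culprit was the date column: Spark returns it to pandas as datetime.date objects (stored in an object column), while my expected frame contained plain strings. The two print identically, which is why the [left] and [right] arrays in the error look the same.
I resolved it as follows: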
# Expected frame whose value types match what Spark hands back via toPandas().
expected_output_pandas_df = pd.DataFrame(
    {
        'id': ['1234', '4567', '1234', '1234', '4567', '7890'],
        # .date() yields datetime.date objects; plain strings would print the
        # same but compare unequal to the DATE values coming from Spark.
        'date_without_timestamp': [
            pd.to_datetime(day).date()
            for day in (
                '2019-01-01',
                '2019-01-02',
                '2019-01-01',
                '2019-01-01',
                '2019-01-02',
                '2019-01-01',
            )
        ],
        # Convert the whole list at once: a list mixing one Timestamp with raw
        # strings only becomes datetime64 on pandas versions that still infer
        # it, and stays object otherwise.
        # NOTE(review): assumes the Spark side yields timestamps for date_time -- confirm.
        'date_time': pd.to_datetime(
            [
                '2019-01-01T23:01:01.123',
                '2019-01-02T23:01:02.123',
                '2019-01-01T23:03:01.123',
                '2019-01-01T20:01:01.123',
                '2019-01-02T18:01:10.678',
                '2019-01-01T23:01:01.123',
            ]
        ),
        # None becomes NaT: the first event of each (id, day) has no predecessor.
        'prev_timestamp': pd.to_datetime(
            [
                '2019-01-01T20:01:01.123',
                '2019-01-02T18:01:10.678',
                '2019-01-01T23:01:01.123',
                None,
                None,
                None,
            ]
        ),
        'event_name': ['pageview', 'pageview', 'click', 'pageview', 'pageview', 'pageview'],
    }
)
I faced a similar issue with integer columns as well. The way to resolve it is:
# Build the integer column from values parsed by pd.to_numeric instead of
# plain Python literals.
some_pandas_df = pd.DataFrame(
    {'some_int_value': list(map(pd.to_numeric, ('123456', '543214')))}
)
Upvotes: 1