Reputation: 421
I have a DataFrame with a column containing JSON that I need to parse. It looks like the JSON is a bit malformed, as it does not have a key, just a list of key/value pairs. I have tried:
# Infer a schema by letting Spark read each row's `data` value as a JSON document.
json_schema = spark.read.json(df.rdd.map(lambda row: row.data)).schema
# Parse the `data` column with the inferred schema and store the result in a new `p_data` column.
df = df.withColumn('p_data', from_json(col('data'), json_schema))
but the p_data column just ends up with a list of the values from the key/value pairs in the object. Here is what's contained in the JSON data column of one of the records I've isolated to work with:
{
'build_version': '1619195909',
'device_id': 'ios',
'trials': {},
'appsflyer_id': '1598758374496-7286375',
'idfa': '01C61C80-8682-437D-BB4E-8F40BC6EAB08',
'platform': 'ios',
'user_id': 6746546.0,
'namespace': 'company',
'referrer': '',
'timestamp': 1620694797109,
'user_agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
'gpat': '',
'persistent_session_id': '6db3c852-0c61-4c4d-bd35-9aeeddb1fd3a',
'kind': 'view_carousel',
'carousel': 'new_on_gopuff',
'product_ids': [25823, 41910, 25802, 25724, 41912, 25816, 41772, 26589, 42321, 25714, 26418, 26773, 42534, 26419, 26437, 41869, 25715, 25720, 41707, 23255, 26376, 28983, 10768],
'source': '#categories',
'event_time': '2021-05-11 00:59:57.285-00',
'event_type': 'Mixcart.Metric',
'action': None,
'order_id': None,
'prefix': '',
'delivery_id': None
}
Upvotes: 0
Views: 476
Reputation: 6082
By defining a schema that matches your JSON, I can read it easily:
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Top-level keys of the JSON object stored in the `data` column, listed in the
# order they should appear as output columns.
field_names = [
    'build_version',
    'device_id',
    'trials',
    'appsflyer_id',
    'idfa',
    'platform',
    'user_id',
    'namespace',
    'referrer',
    'timestamp',
    'user_agent',
    'gpat',
    'persistent_session_id',
    'kind',
    'carousel',
    'product_ids',
    'source',
    'event_time',
    'event_type',
    'action',
    'order_id',
    'prefix',
    'delivery_id',
]

# Every field is declared as a string for now; replace the type of individual
# fields (e.g. T.LongType() for `timestamp`) to match your actual data types.
schema = T.StructType(
    [T.StructField(name, T.StringType()) for name in field_names]
)

(
    df
    # Parse the JSON text in `data` using the explicit schema above.
    .withColumn('data', F.from_json(F.col('data'), schema))
    # Expand every field of the parsed value into its own top-level column.
    .select('data.*')
    # Print the first 10 rows without truncating long values.
    .show(10, truncate=False)
)
# +-------------+---------+------+---------------------+------------------------------------+--------+---------+---------+--------+-------------+---------------------------------------------------------------------------------------------------------------+----+------------------------------------+-------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------+-----------+--------------------------+--------------+------+--------+------+-----------+
# |build_version|device_id|trials|appsflyer_id |idfa |platform|user_id |namespace|referrer|timestamp |user_agent |gpat|persistent_session_id |kind |carousel |product_ids |source |event_time |event_type |action|order_id|prefix|delivery_id|
# +-------------+---------+------+---------------------+------------------------------------+--------+---------+---------+--------+-------------+---------------------------------------------------------------------------------------------------------------+----+------------------------------------+-------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------+-----------+--------------------------+--------------+------+--------+------+-----------+
# |1619195909 |ios |{} |1598758374496-7286375|01C61C80-8682-437D-BB4E-8F40BC6EAB08|ios |6746546.0|company | |1620694797109|Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148| |6db3c852-0c61-4c4d-bd35-9aeeddb1fd3a|view_carousel|new_on_gopuff|[25823,41910,25802,25724,41912,25816,41772,26589,42321,25714,26418,26773,42534,26419,26437,41869,25715,25720,41707,23255,26376,28983,10768]|#categories|2021-05-11 00:59:57.285-00|Mixcart.Metric|null |null | |null |
# +-------------+---------+------+---------------------+------------------------------------+--------+---------+---------+--------+-------------+---------------------------------------------------------------------------------------------------------------+----+------------------------------------+-------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------+-----------+--------------------------+--------------+------+--------+------+-----------+
Upvotes: 2