Reputation: 465
I have a pyspark dataframe that contain one column
df.show(1)
table
[[,,hello,yes],[take,no,I,m],[hi,good,,]....]
df.printSchema
root
|--table:string (nullable:true)
My problem is how to convert that column to array of arrays: T.ArrayType(T.ArrayType(T.StringType()))
Upvotes: 1
Views: 4247
Reputation: 6323
Try this-
spark>=2.4
Use combination of translate
and regex_replace
val df = Seq("[[,,hello,yes],[take,no,I,m],[hi,good,,]]").toDF("table")
df.show(false)
df.printSchema()
/**
* +-----------------------------------------+
* |table |
* +-----------------------------------------+
* |[[,,hello,yes],[take,no,I,m],[hi,good,,]]|
* +-----------------------------------------+
*
* root
* |-- table: string (nullable = true)
*/
val p = df.withColumn("arr", split(
translate(
regexp_replace($"table", """\]\s*,\s*\[""", "##"), "][", ""
), "##"
))
val processed = p.withColumn("arr", expr("TRANSFORM(arr, x -> split(x, ','))"))
processed.show(false)
processed.printSchema()
/**
* +-----------------------------------------+----------------------------------------------------+
* |table |arr |
* +-----------------------------------------+----------------------------------------------------+
* |[[,,hello,yes],[take,no,I,m],[hi,good,,]]|[[, , hello, yes], [take, no, I, m], [hi, good, , ]]|
* +-----------------------------------------+----------------------------------------------------+
*
* root
* |-- table: string (nullable = true)
* |-- arr: array (nullable = true)
* | |-- element: array (containsNull = true)
* | | |-- element: string (containsNull = true)
*/
Upvotes: 1
Reputation: 31460
Use from_json
function from Spark-2.4+
Example:
df.show(10,False)
#+---------------------------------------------+
#|table |
#+---------------------------------------------+
#|[['','','hello','yes'],['take','no','i','m']]|
#+---------------------------------------------+
df.printSchema()
#root
# |-- table: string (nullable = true)
from pyspark.sql.functions import *
from pyspark.sql.types import *
#schema
sch=ArrayType(ArrayType(StringType()))
df.withColumn("dd",from_json(col("table"),sch)).select("dd").show(10,False)
#+------------------------------------+
#|dd |
#+------------------------------------+
#|[[, , hello, yes], [take, no, i, m]]|
#+------------------------------------+
#schema after converting to array
df.withColumn("dd",from_json(col("table"),sch)).select("dd").printSchema()
#root
# |-- dd: array (nullable = true)
# | |-- element: array (containsNull = true)
# | | |-- element: string (containsNull = true)
Upvotes: 4