Reputation: 761
I have a dataframe with array of struct and inside that another array of struct. Any easy way to select few of the structs in the main array and also few in the nested array without disturbing the structure of the entire dataframe?
SIMPLE INPUT:
-MainArray
---StructCol1
---StructCol2
---StructCol3
---SubArray
------SubArrayStruct4
------SubArrayStruct5
------SubArrayStruct6
SIMPLE OUTPUT:
-MainArray
---StructCol1
---StructCol2
---SubArray
------SubArrayStruct4
------SubArrayStruct5
The source code to try it is as below
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.types.IntegerType
val arrayStructData = Seq(
Row("Army",List(Row("1","Infantry","100",List(Row("Gun","Station"),Row("Bazooka","Barracks"))),Row("2","Cavalry","150",List(Row("Grenadier","Seige factory"),Row("Canon","Tank Factory"))))),
Row("Navy",List(Row("3","Transport","200",List(Row("Cruiser","Cruise Lines"),Row("SubMarine","Yard"))),Row("4","Battle Ships","250",List(Row("Frigate","Dock"),Row("Galleon","Hub")))))
)
val arrayStructSchema = new StructType()
.add("Category",StringType)
.add("ArmyOrNavy",ArrayType(new StructType()
.add("ID",StringType)
.add("Type",StringType)
.add("Count",StringType)
.add("Items",ArrayType(new StructType().add("ItemName",StringType).add("ItemTrainingArea",StringType)))
))
val df = spark.createDataFrame(spark.sparkContext.parallelize(arrayStructData),arrayStructSchema)
df.printSchema()
df.show(false)
root
|-- Category: string (nullable = true)
|-- ArmyOrNavy: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- ID: string (nullable = true)
| | |-- Type: string (nullable = true)
| | |-- Count: string (nullable = true)
| | |-- Items: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- ItemName: string (nullable = true)
| | | | |-- ItemTrainingArea: string (nullable = true)
+--------+-----------------------------------------------------------------------------------------------------------------------------------+
|Category|ArmyOrNavy |
+--------+-----------------------------------------------------------------------------------------------------------------------------------+
|Army |[[1, Infantry, 100, [[Gun, Station], [Bazooka, Barracks]]], [2, Cavalry, 150, [[Grenadier, Seige factory], [Canon, Tank Factory]]]]|
|Navy |[[3, Transport, 200, [[Cruiser, Cruise Lines], [SubMarine, Yard]]], [4, Battle Ships, 250, [[Frigate, Dock], [Galleon, Hub]]]] |
+--------+-----------------------------------------------------------------------------------------------------------------------------------+
The output I need is
root
|-- Category: string (nullable = true)
|-- ArmyOrNavy: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- ID: string (nullable = true)
| | |-- Items: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- ItemTrainingArea: string (nullable = true)
I tried doing something like this but this doesn't look right
val df2 = df.selectExpr("Category",
"Array (Struct(ArmyOrNavy.ID,CAST(ArmyOrNavy.Items AS array<array<struct<ItemName:string,ItemTrainingArea:string>>>) Items)) as ArmyOrNavy")
df2.printSchema
df2.show(false)
Upvotes: 0
Views: 1060
Reputation: 4481
You can do it using to_json
and from_json
and set new struct DateType
for struct field (array) while parsing json:
val newArrayType = ArrayType(
new StructType()
.add("ID", StringType)
.add("Items", ArrayType(
new StructType()
.add("ItemTrainingArea", StringType)
))
)
val jsonFieldName = "ArmyOrNavy_json"
val transformedDF = df.withColumn(jsonFieldName, to_json($"ArmyOrNavy"))
.withColumn("ArmyOrNavy", from_json(col(jsonFieldName), newArrayType))
.drop(jsonFieldName)
transformedDF.printSchema()
transformedDF.show(truncate = false)
// output
root
|-- Category: string (nullable = true)
|-- ArmyOrNavy: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- ID: string (nullable = true)
| | |-- Items: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- ItemTrainingArea: string (nullable = true)
+--------+----------------------------------------------------------------------+
|Category|ArmyOrNavy |
+--------+----------------------------------------------------------------------+
|Army |[[1, [[Station], [Barracks]]], [2, [[Seige factory], [Tank Factory]]]]|
|Navy |[[3, [[Cruise Lines], [Yard]]], [4, [[Dock], [Hub]]]] |
+--------+----------------------------------------------------------------------+
Upvotes: 1