Reputation: 6426
I am trying to select all elements from an array that exists in one of the fields in Athena, for example:
{
id: "1",
name: "bla",
array: [{
val1: "2",
val2: "2"
}, {
val1: "3",
val2: "4"
}]
}
{
id: "3",
name: "bla bla",
array: [{
val1: "5",
val2: "6"
}, {
val1: "7",
val2: "8"
}]
}
I am trying to create a view that selects all the elements from the inner array, the result would be:
+----+------+------+
| id | val1 | val2 |
+----+------+------+
| 1 | 2 | 2 |
+----+------+------+
| 1 | 3 | 4 |
+----+------+------+
| 2 | 5 | 6 |
+----+------+------+
| 2 | 7 | 8 |
+----+------+------+
What is the query to produce such an output?
The actual file will be one item per line, like this:
{ id: "1", name: "bla", array: [{ val1: "2", val2: "2" }, { val1: "3", val2: "4" }] }
{ id: "3", name: "bla bla", array: [{ val1: "5", val2: "6" }, { val1: "7", val2: "8" }] }
The DDL to create the table looks like this:
CREATE EXTERNAL TABLE all (
id STRING,
name STRING,
array ARRAY<
struct<
val1:STRING,
val2:STRING
> >
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION 's3://abc/def'
Upvotes: 0
Views: 298
Reputation: 5144
I was able to achieve the expected result by following below steps:
{ id: "1", name: "bla", array: [{ val1: "2", val2: "2" }, { val1: "3", val2: "4" }] }
TO
{
"id": "1",
"name": "bla",
"array": [
{
"val1": "2",
"val2": "2"
},
{
"val1": "3",
"val2": "4"
}
]
}
CREATE EXTERNAL TABLE testt_json2( `id` string COMMENT 'from deserializer', `name` string COMMENT 'from deserializer', `array` array<struct<val1:string,val2:string>> COMMENT 'from deserializer')
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
WITH SERDEPROPERTIES ('paths'='array,id,name')
STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://test/'
WITH dataset AS (
SELECT *
FROM testt_json2
)
SELECT id,
t.names.val1,
t.names.val2
FROM dataset
CROSS JOIN UNNEST(array) AS t(names)
Upvotes: 1