Deniz
Deniz

Reputation: 191

Hive read multiple elements from XML

I am trying to select multiple elements from hive external table .
my XML :

<?xml version="1.0" encoding="UTF-8"?>
<data>
    <units version="1">
        <unit>
            <id>1</id>
            <name>ABC</name>
            <details>
                <detail>
                    <subid>001</subid>
                </detail>
            </details>
        </unit>
        <unit>
            <id>2</id>
            <name>DEF</name>
            <details>
                <detail>
                    <subid>002</subid>
                </detail>
            </details>
        </unit>
    </units>
</data>

and my table script is like this :

create external table testxml (
  id array<string>,
  name array<string>,
  subid array<string>
)
row format serde "com.ibm.spss.hive.serde2.xml.XmlSerDe"
with serdeproperties (
  "column.xpath.id"="/units/unit/id/text()",
  "column.xpath.name"="/units/unit/name/text()",
  "column.xpath.subid"="/units/unit/details/detail/subid/text()"
)
stored as inputformat "com.ibm.spss.hive.serde2.xml.XmlInputFormat"
outputformat "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
location "somepath/test" 
tblproperties (
  "xmlinput.start"="<units version",
  "xmlinput.end"="</units>"
);

my select statement returns these values :

hive> select * from testxml;
OK
["1","2"]       ["ABC","DEF"]   ["001","002"]

but my desired output is should be like this :

+--+------+------+
|id|  name| subid|
+--+------+------+
| 1|   ABC|   001|
| 2|   DEF|   002|
+--+------+------+

any feedback would be relly helpful.

Upvotes: 1

Views: 555

Answers (1)

Bala
Bala

Reputation: 11244

Try this

create external table testxml (
  id string,
  name string,
  subid string
)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
with serdeproperties (
  "column.xpath.id"="/unit/id/text()",
  "column.xpath.name"="/unit/name/text()",
  "column.xpath.subid"="/unit/details/detail/subid/text()"
)
stored as inputformat "com.ibm.spss.hive.serde2.xml.XmlInputFormat"
outputformat "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
location "file:///home/cloudera/xmlfiles2" 
tblproperties (
  "xmlinput.start"="<unit>",
  "xmlinput.end"="</unit>"
);

INFO  : OK
+-------------+---------------+----------------+--+
| testxml.id  | testxml.name  | testxml.subid  |
+-------------+---------------+----------------+--+
| 1           | ABC           | 001            |
| 2           | DEF           | 002            |
+-------------+---------------+----------------+--+
2 rows selected (0.154 seconds)

Upvotes: 1

Related Questions