Reputation: 51
I am creating external table in Hive using parquet file as a storage
hive> CREATE EXTERNAL TABLE test_data(
c1 string, c2 int, c3 string, c4 string, c5 string, c6 float,
c7 string, c8 string, c9 string, c10 string, c11 string, c12 string)
ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION '/path/test_data/';
selecting this table getting NULL in any rows and columns
SELECT * FROM test_data;
OK
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
Time taken: 0.191 seconds, Fetched: 34 row(s)
I've got parquet file by Pig transformation from tab delimeted file using following sequence
grunt> A = LOAD '/path/test.data' USING PigStorage('\t')
AS ( c1: chararray,c2: int,c3: chararray,
c4: chararray,c5: chararray,c6: float,
c7: chararray,c8: chararray,c9: chararray,
c10: chararray, c11: chararray, c12: chararray );
grunt> STORE A INTO '/path/test_data' USING parquet.pig.ParquetStorer;
To check up that the parquet file contains valid data read it back
grunt> B = LOAD'/path/test_data' USING parquet.pig.ParquetLoader;
grunt> DUMP B;
(19,14370,rs6054257,G,A,29.0,PASS,NS=3;DP=14;AF=0.5;DB;H2,GT:GQ:DP:HQ,0|0:48:1:51,51,1|0:48:8:51,51,1/1:43:5:.,.)
(20,17330,.,T,A,3.0,q10,NS=3;DP=11;AF=0.017,GT:GQ:DP:HQ,0|0:49:3:58,50,0|1:3:5:65,3,0/0:41:3)
(20,1110696,rs6040355,A,G,T,67.0,PASS,NS=2;DP=10;AF=0.333,0.667;AA=T;DB,GT:GQ:DP:HQ,1|2:21:6:23,27,2|1:2:0:18,2,2/2:35:4)
(20,1230237,.,T,.,47.0,PASS,NS=3;DP=13;AA=T,GT:GQ:DP:HQ,0|0:54:7:56,60,0|0:48:4:51,51,0/0:61:2)
(20,1234567,microsat1,GTC,G,GTCTC,50.0,PASS,NS=3;DP=9;AA=G,GT:GQ:DP,0/1:35:4,0/2:17:2,1/1:40:3)
(20,2234567,.,C,[13:123457[ACGC,50.0,PASS,SVTYPE=BND;NS=3;DP=9;AA=G,GT:GQ:DP,0/1:35:4,0/1:17:2,1/1:40:3)
(20,2234568,.,C,.TC,50.0,PASS,SVTYPE=BND;NS=3;DP=9;AA=G,GT:GQ:DP,0/1:35:4,0/1:17:2,1/1:40:3)
(20,2234569,.,C,CT.,50.0,PASS,SVTYPE=BND;NS=3;DP=9;AA=G,GT:GQ:DP,0/1:35:4,0/1:17:2,1/1:40:3)
(20,3234569,.,C,<INV>,50.0,PASS,SVTYPE=BND;NS=3;DP=9;AA=G,GT:GQ:DP,0/1:35:4,0/1:17:2,1/1:40:3)
(20,4234569,.,N,.[13:123457[,50.0,PASS,SVTYPE=BND;NS=3;DP=9;AA=G,GT:GQ:DP,0/1:35:4,0/1:17:2,./.:40:3)
(20,5234569,.,N,[13:123457[.,50.0,PASS,SVTYPE=BND;NS=3;DP=9;AA=G,GT:GQ:DP,0/1:35:4,0/1:17:2,1/1:40:3)
(Y,17330,.,T,A,3.0,q10,NS=3;DP=11;AF=0.017,GT:GL,0:0,49,0:0,3,1:41,0)
What am I doing wrong?
Upvotes: 5
Views: 9692
Reputation: 71
I too faced similar issues.
TBLPROPERTIES ('PARQUET.COMPRESS'='GZIP')
Upvotes: 4
Reputation: 964
Try
CREATE EXTERNAL TABLE test_data( c1 string, c2 int, c3 string, c4 string, c5 string, c6 float, c7 string, c8 string, c9 string, c10 string, c11 string, c12 string) ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' WITH SERDEPROPERTIES ("casesensitive"="c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12") STORED AS INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat' OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' LOCATION '/path/test_data/'
Upvotes: 0
Reputation: 1063
In my case it seemed that Hive was sensitive to column names.
Having exported my parquet file from a dataframe in Spark, I had to use the exactly the same column name in Hive as I had in the Spark dataframe originally.
When I used generic column names such as c1
I'd get NULL
values for all values in that particular column.
Upvotes: 6
Reputation: 51
In my case, HDFS file was in comma delimited format, I was not defining row format delimited in command. Like:
CREATE EXTERNAL TABLE TABLENAME(COL1 INT, COL2 STRING....) LOCATION '/user/....';
it used to create table with same number of rows but with all values as null. I modified above command as follows and it works as charm:
CREATE EXTERNAL TABLE TABLENAME (COL1 INT, COL2 STRING....) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION '/user/.....';
Upvotes: 5