Reputation: 115
I have a file which is a variable length file but I need to be able to read it in and load it to a staging table. The position of each field is the same in the file, but the ending of it can vary as the last field isn't required.
The length can vary from 805 characters to a max of 822. For the below regex, if I try and load a record that is 805 characters long, it'll null out the whole record. I've tried making the last few fields variable length, but it's still not loading all of the records still.
drop table dz_1318_disc.asc_monthly_stg;
CREATE EXTERNAL TABLE dz_1318_disc.asc_monthly_stg
(
CMAFFID String COMMENT '/*@type=varchar(12)*/',
ADTMBRCHR String COMMENT '/*@type=varchar(97)*/',
SITENAME String COMMENT '/*@type=varchar(90)*/',
SRVADD1 String COMMENT '/*@type=varchar(30)*/',
SRVADD2 String COMMENT '/*@type=varchar(30)*/',
SRVADD3 String COMMENT '/*@type=varchar(30)*/',
SRVCITY String COMMENT '/*@type=varchar(30)*/',
SRVST String COMMENT '/*@type=varchar(3)*/',
SRVZIP5 String COMMENT '/*@type=varchar(5)*/',
SRVZIP4 String COMMENT '/*@type=varchar(12)*/',
BILFRST String COMMENT '/*@type=varchar(50)*/',
BILLAST String COMMENT '/*@type=varchar(90)*/',
BILADD1 String COMMENT '/*@type=varchar(30)*/',
BILADD2 String COMMENT '/*@type=varchar(30)*/',
BILADD3 String COMMENT '/*@type=varchar(30)*/',
BILCITY String COMMENT '/*@type=varchar(30)*/',
BILST String COMMENT '/*@type=varchar(3)*/',
BILZIP5 String COMMENT '/*@type=varchar(5)*/',
BILZIP4 String COMMENT '/*@type=varchar(12)*/',
BILPHON String COMMENT '/*@type=varchar(10)*/',
SRVPHON String COMMENT '/*@type=varchar(10)*/',
NEWCUSTNUM String COMMENT '/*@type=varchar(38)*/',
ACTDATE String COMMENT '/*@type=varchar(8)*/',
DISDATE String COMMENT '/*@type=varchar(8)*/',
ACTSTAT String COMMENT '/*@type=varchar(70)*/',
RECURR String COMMENT '/*@type=varchar(10)*/',
INSTALL String COMMENT '/*@type=varchar(10)*/',
LEADSRC String COMMENT '/*@type=varchar(3)*/',
CAMKGRUP String COMMENT '/*@type=varchar(3)*/',
DISCCODE String COMMENT '/*@type=varchar(6)*/',
DISTRIC String COMMENT '/*@type=varchar(9)*/ ',
COMBINE String COMMENT '/*@type=varchar (18)*)/'
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES
(
"input.regex" = "(.{12})(.{97})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{50})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{10})(.{10})(.{38})(.{8})(.{8})(.{70})(.{10})(.{10})(.{3})(.{3})(.{6})(.{9})(.{9}).*"
)
STORED AS TEXTFILE
LOCATION '/dz/dz_1318/disc.db/kpp_monthly/';
The file layout is as follows:
----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+----8----+----9----+----0----+----1----+----2--
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 2 123456789
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 25 123456789
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 258 123456789
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 2
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 25
xxxx 012345678 JOHNSON, JOHN 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 JOHN JOHNSON 1234 WEST STREET APT B ATTN JOHN SOME CITY TX 78216 1234567890123456789012345678 20151024201411101 479.87999926.7812347899 CMV 450 258
Upvotes: 0
Views: 301
Reputation: 7947
Based on your sample data, you have to specify a non fixed length for your last columns (.{0,9})
, additional to that you need to include one more column if you want to see your last element (123456789) in an independent column
CREATE EXTERNAL TABLE dz_1318_disc.asc_monthly_stg
(
CMAFFID String COMMENT '/*@type=varchar(12)*/',
ADTMBRCHR String COMMENT '/*@type=varchar(97)*/',
SITENAME String COMMENT '/*@type=varchar(90)*/',
SRVADD1 String COMMENT '/*@type=varchar(30)*/',
SRVADD2 String COMMENT '/*@type=varchar(30)*/',
SRVADD3 String COMMENT '/*@type=varchar(30)*/',
SRVCITY String COMMENT '/*@type=varchar(30)*/',
SRVST String COMMENT '/*@type=varchar(3)*/',
SRVZIP5 String COMMENT '/*@type=varchar(5)*/',
SRVZIP4 String COMMENT '/*@type=varchar(12)*/',
BILFRST String COMMENT '/*@type=varchar(50)*/',
BILLAST String COMMENT '/*@type=varchar(90)*/',
BILADD1 String COMMENT '/*@type=varchar(30)*/',
BILADD2 String COMMENT '/*@type=varchar(30)*/',
BILADD3 String COMMENT '/*@type=varchar(30)*/',
BILCITY String COMMENT '/*@type=varchar(30)*/',
BILST String COMMENT '/*@type=varchar(3)*/',
BILZIP5 String COMMENT '/*@type=varchar(5)*/',
BILZIP4 String COMMENT '/*@type=varchar(12)*/',
BILPHON String COMMENT '/*@type=varchar(10)*/',
SRVPHON String COMMENT '/*@type=varchar(10)*/',
NEWCUSTNUM String COMMENT '/*@type=varchar(38)*/',
ACTDATE String COMMENT '/*@type=varchar(8)*/',
DISDATE String COMMENT '/*@type=varchar(8)*/',
ACTSTAT String COMMENT '/*@type=varchar(70)*/',
RECURR String COMMENT '/*@type=varchar(10)*/',
INSTALL String COMMENT '/*@type=varchar(10)*/',
LEADSRC String COMMENT '/*@type=varchar(3)*/',
CAMKGRUP String COMMENT '/*@type=varchar(3)*/',
DISCCODE String COMMENT '/*@type=varchar(6)*/',
DISTRIC String COMMENT '/*@type=varchar(9)*/ ',
COMBINE String COMMENT '/*@type=varchar (18)*)/',
YOUREXTRACOLUMN String
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES
(
"input.regex" = "(.{12})(.{97})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{50})(.{90})(.{30})(.{30})(.{30})(.{30})(.{3})(.{5})(.{12})(.{10})(.{10})(.{38})(.{8})(.{8})(.{70})(.{10})(.{10})(.{3})(.{3})(.{6})(.{9})(.{0,9})(.{0,9}).*"
)
STORED AS TEXTFILE
;
test it
select combine, yourextracolumn, length(combine) from dz_1318_disc.asc_monthly_stg
output
2 ,123456789,9
25 ,123456789,9
258 ,123456789,9
3,,1
35,,2
358,,3
Upvotes: 1