gauravd2196
gauravd2196

Reputation: 185

Only STRING defined columns are loaded in HIVE i.e. columns with int and double are NULL

Only STRING defined columns are loaded in HIVE i.e. columns with int and double are NULL

Create table command

create table A(
id STRING,
member_id STRING,
loan_amnt DOUBLE,   
funded_amnt DOUBLE,
`funded_amnt_inv` DOUBLE,
`term` STRING,
`int_rate`  STRING, 
`installment` DOUBLE,   
`grade` STRING, 
`sub_grade` STRING, 
`emp_title` STRING, 
`emp_length` STRING,    
`home_ownership` STRING,    
`nnual_inc` INT,
`verification_status` STRING,
`issue_d` STRING,
`loan_status` STRING,
`pymnt_plan`    STRING,
`url`   STRING,
`desc`  STRING,
`purpose`   STRING,
`title` STRING,
`zip_code` STRING,  
`addr_state`    STRING,
`dti`   DOUBLE,
`delinq_2yrs`   INT,
`earliest_cr_line` STRING,  
`inq_last_6mths`    STRING,
`mths_since_last_delinq`    STRING,
`mths_since_last_record`    STRING,
`open_acc`  INT,
`pub_rec`   INT,
`revol_bal` INT,
`revol_util`    STRING,    
`total_acc` INT,    
`initial_list_status`   STRING,    
`out_prncp` DOUBLE,    
`out_prncp_inv` DOUBLE,
`total_pymnt`   DOUBLE,
`total_pymnt_inv`   DOUBLE,
`total_rec_prncp`   DOUBLE,
`total_rec_int` DOUBLE,
`total_rec_late_fee`    DOUBLE,
`recoveries`    DOUBLE,
`collection_recovery_fee`   DOUBLE,
`last_pymnt_d`  STRING,
`last_pymnt_amnt`   DOUBLE,
`next_pymnt_d`  STRING,
`last_credit_pull_d`    STRING,
`collections_12_mths_ex_med`    INT,
`mths_since_last_major_derog`   STRING,
`policy_code`   STRING,
`application_type`  STRING,
`annual_inc_joint`  STRING,
`dti_joint` STRING,
`verification_status_joint` STRING, 
`acc_now_delinq`    STRING,
`tot_coll_amt`  STRING,
`tot_cur_bal`   STRING,
`open_acc_6m`   STRING,
`open_il_6m`    STRING,
`open_il_12m`   STRING,
`open_il_24m`   STRING,
`mths_since_rcnt_il` STRING,    
`total_bal_il`   STRING,
`il_util`    STRING,
`open_rv_12m ` STRING,  
`open_rv_24m`  STRING,  
`max_bal_bc`  STRING,   
`all_util`   STRING,    
`total_credit_rv`   STRING,
`inq_fi`  STRING,   
`total_fi_tl`    STRING,
`inq_last_12m`  STRING
)  

ROW FORMAT delimited
fields terminated by ','

STORED AS TEXTFILE;

Loading data into table A

load data local inpath '/home/cloudera/Desktop/Project-3/1/LoanStats3a.txt' into table A;

Select data

hive> SELECT * FROM A LIMIT 1;

Output


"1077501" "1296599" NULL NULL NULL " 36 months" " 10.65%" NULL "B" "B2" "" "10+ years" "RENT" NULL "Verified" "Dec-2011" "Fully Paid" "n" "https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077501" " Borrower added on 12/22/11 > I need to upgrade my business technologies.
" "credit_card" "Computer" "860xx" "AZ" NULL NULL "Jan-1985" "1" "" "" NULL NULL NULL "83.7%"NULL "f" NULL NULL NULL NULL NULL NULL NULL NULL NULL "Jan-2015" NULL "" "Dec-2015" NULL "" "1" "INDIVIDUAL"

"" "" "" "0" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" "" ""


Upvotes: 2

Views: 787

Answers (2)

gauravd2196
gauravd2196

Reputation: 185

I found the solution :-

create table stat2(id String, member_id INT, loan_amnt FLOAT, funded_amnt FLOAT, funded_amnt_inv FLOAT, term String, int_rate String, installment FLOAT, grade String, sub_grade String, emp_title String, emp_length String, home_ownership String, annual_inc FLOAT, verification_status String, issue_d date, loan_status String, pymnt_plan String, url String, descp String, purpose String, title String, zip_code String, addr_state String, dti FLOAT, delinq_2yrs FLOAT, earliest_cr_line String, inq_last_6mths FLOAT, mths_since_last_delinq FLOAT, mths_since_last_record FLOAT, open_acc FLOAT, pub_rec FLOAT, revol_bal FLOAT, revol_util String, total_acc FLOAT, initial_list_status String, out_prncp FLOAT, out_prncp_inv FLOAT, total_pymnt FLOAT, total_pymnt_inv FLOAT, total_rec_prncp FLOAT, total_rec_int FLOAT, total_rec_late_fee FLOAT, recoveries FLOAT, collection_recovery_fee FLOAT, last_pymnt_d String, last_pymnt_amnt FLOAT, next_pymnt_d String, last_credit_pull_d String, collections_12_mths_ex_med FLOAT, mths_since_last_major_derog FLOAT, policy_code FLOAT, application_type String, annual_inc_joint FLOAT, dti_joint FLOAT, verification_status_joint String, acc_now_delinq FLOAT, tot_coll_amt FLOAT, tot_cur_bal FLOAT, open_acc_6m FLOAT, open_il_6m FLOAT, open_il_12m FLOAT, open_il_24m FLOAT, mths_since_rcnt_il FLOAT, total_bal_il FLOAT, il_util FLOAT, open_rv_12m FLOAT, open_rv_24m FLOAT, max_bal_bc FLOAT, all_util FLOAT, total_rev_hi_lim FLOAT, inq_fi FLOAT, total_cu_tl FLOAT, inq_last_12m FLOAT)

ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' with serdeproperties (

"separatorChar" = ",",

"quoteChar" = "\""
)

STORED AS TEXTFILE tblproperties ("skip.header.line.count"="2", "skip.footer.line.count"="4");

Upvotes: 1

Zoltan
Zoltan

Reputation: 3115

It seems that your CSV contains quotes around the individual fields. The surrounding quotes are not supported by HIVE and as a result they become part of the fields. In case of string fields, the quotes become part of the string. In case of numeric fields, the quotes make the field an invalid number, resulting in NULLs.

See csv-serde for a serde that supports quotes in CSV files.

Upvotes: 0

Related Questions