Reputation: 433
I have a requirement like below:
I've got a HIVE table containing below fields:
Table: USER_PRODUCT
user_id, product1_id, product2_id, product3_id, ... , product10_id
Here, the actual item for each user_id can be anything from 1 to 10 (Meaning for some user_id ONLY product1_id, product2_id is present)
I want to process above and remove items which are invalid based on another table containing product details:
Table: PRODUCT_DEAILS
product_id, product_status
I want to achieve this by writing a HIVE query.
Can someone help me in writing the query? My concern is how to iterate over all product_ids for each user_id?
For(all_rows in USER_PRODUCT) Iterate over all product_IDs from 1 to 10) Check if product is valid based on product status in PRODUCT_DEAILS if(valid) --> keep as it is else --> Remove product from table by setting it null
Upvotes: 1
Views: 1215
Reputation: 38335
If product_deals is small enough, build array of valid products, cross join with USER_PRODUCT and use array_contains to check if a product is valid:
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask=true;
set hive.mapjoin.smalltable.filesize=1000000000; --adjust to small table size
set hive.auto.convert.join.noconditionaltask=1000000000;
with valid_product as (
select collect_set(product_id) as list
from PRODUCT_DEAILS
where product_status='valid'
sort by product_id
)
insert overwrite table USER_PRODUCT
select p.user_id,
case when array_contains(v.list, p.product1_id) then p.product1_id end product1_id,
case when array_contains(v.list, p.product2_id) then p.product2_id end product2_id,
case when array_contains(v.list, p.product3_id) then p.product3_id end product3_id,
case when array_contains(v.list, p.product4_id) then p.product4_id end product4_id,
case when array_contains(v.list, p.product5_id) then p.product5_id end product5_id,
case when array_contains(v.list, p.product6_id) then p.product6_id end product6_id,
case when array_contains(v.list, p.product7_id) then p.product7_id end product7_id,
case when array_contains(v.list, p.product8_id) then p.product8_id end product8_id,
case when array_contains(v.list, p.product9_id) then p.product9_id end product9_id,
case when array_contains(v.list, p.product10_id) then p.product10_id end product10_id
from USER_PRODUCT p
cross join valid_product v; --cross join with single row containing array
If PRODUCT_DEALS is too big to fit in array, then use common joins:
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask=true;
set hive.mapjoin.smalltable.filesize=1000000000; --adjust to small table size
set hive.auto.convert.join.noconditionaltask=1000000000;
with valid_product as (
select distinct product_id --Get distinct IDs of valid products
from PRODUCT_DEAILS
where product_status='valid'
)
insert overwrite table USER_PRODUCT
select p.user_id,
case when v1.product_id is not null then p.product1_id end product1_id,
case when v2.product_id is not null then p.product2_id end product2_id,
case when v3.product_id is not null then p.product3_id end product3_id,
case when v4.product_id is not null then p.product4_id end product4_id,
case when v5.product_id is not null then p.product5_id end product5_id,
case when v6.product_id is not null then p.product6_id end product6_id,
case when v7.product_id is not null then p.product7_id end product7_id,
case when v8.product_id is not null then p.product8_id end product8_id,
case when v9.product_id is not null then p.product9_id end product9_id,
case when v10.product_id is not null then p.product10_id end product10_id
from USER_PRODUCT p
left join valid_product v1 on p.product1_id=v1.product_id
left join valid_product v2 on p.product2_id=v2.product_id
left join valid_product v3 on p.product3_id=v3.product_id
left join valid_product v4 on p.product4_id=v4.product_id
left join valid_product v5 on p.product5_id=v5.product_id
left join valid_product v6 on p.product6_id=v6.product_id
left join valid_product v7 on p.product7_id=v7.product_id
left join valid_product v8 on p.product8_id=v8.product_id
left join valid_product v9 on p.product9_id=v9.product_id
left join valid_product v10 on p.product10_id=v10.product_id;
Upvotes: 2