sabra2121
sabra2121

Reputation: 51

HIVE left join on nearest date

I am trying to join 2 tables in HIVE using a key and the nearest date in the 2 tables at the time of join. For example: Below are the 2 input tables

<----------TABLE A------------->            <------------TABLE B------------>
A_id    A_date      changed_col             B_id    B_date      B_value A_id
****    ******      ***********             ****    ******      ******* *****   
A01     2017-03-20      ABC                 B01     2017-04-02  200     A01
A01     2017-04-01      XYZ                 B01     2017-04-04  500     A01
A01     2017-04-05      LLL             

However when I LEFT JOIN table B with Table A, it should look for the nearest lowest date in table A for the same key(A_id). Below is the expected output table:

B_id    B_date          A_id        A_date      changed col   B_value
****    ******          ****        ******      ***********   *******
B01     2017-02-04      A01     2017-01-04      XYZ             200
B01     2017-04-04      A01     2017-01-04      XYZ             500

Any help is much appreciated. Thanks

Upvotes: 2

Views: 2026

Answers (4)

David דודו Markovitz
David דודו Markovitz

Reputation: 44941

select  B.B_id
       ,B.B_date
       ,B.A_id
       ,A.A_data.A_date         as A_date 
       ,A.A_data.changed_col    as changed_col
       ,B_value

from                B

        left join  (select      B.B_id
                               ,max (named_struct ('A_date',A_date,'changed_col',changed_col)) as A_data

                    from                B
                                join    A
                                on      A.A_id = B.A_id

                    where       A.A_date <= B.B_date 

                    group by    B.B_id
                    ) A

        on          A.B_id  =
                    B.B_id

+------+------------+------+------------+-------------+---------+
| b_id |   b_date   | a_id |   a_date   | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01  | 2017-04-02 | A01  | 2017-04-01 | XYZ         |     200 |
| B01  | 2017-04-04 | A01  | 2017-04-01 | XYZ         |     500 |
+------+------------+------+------------+-------------+---------+

Upvotes: 1

David דודו Markovitz
David דודו Markovitz

Reputation: 44941

select  B.B_id
       ,B.B_date
       ,B.A_id
       ,A.A_date
       ,A.changed_col
       ,B_value

from                B

        left join  (select  *

                    from   (select  B.B_id
                                   ,A.A_date
                                   ,A.changed_col

                                   ,row_number () over
                                    (
                                        partition by    B.B_id
                                        order by        A.A_date desc
                                    ) as rn
                            from            B
                                    join    A
                                    on      A.A_id = B.A_id
                            where   A.A_date <= B.B_date 
                            ) A

                    where rn = 1
                    ) A

        on          A.B_id  =
                    B.B_id

+------+------------+------+------------+-------------+---------+
| b_id |   b_date   | a_id |   a_date   | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01  | 2017-04-02 | A01  | 2017-04-01 | XYZ         |     200 |
| B01  | 2017-04-04 | A01  | 2017-04-01 | XYZ         |     500 |
+------+------------+------+------------+-------------+---------+

Upvotes: 2

David דודו Markovitz
David דודו Markovitz

Reputation: 44941

select  B_id
       ,dt                  as B_date
       ,A_id
       ,A_data.A_date       as A_date 
       ,A_data.changed_col
       ,B_value

from   (select  B_id,dt,B_value,A_id,tab               
               ,max 
                (   case 
                        when tab = 'A' 
                        then named_struct ('A_date',dt,'changed_col',changed_col) 
                    end
                ) over
                (
                    partition by    A_id
                    order by        dt,tab
                    rows            between unbounded preceding
                                    and     current row
                ) as A_data


        from   (select  B_id,B_date as dt,B_value,A_id
                       ,'B' as tab,null as changed_col
                from    B                

                union all

                select  null as B_id,A_date as dt,null as B_value,A_id
                       ,'A' as tab,changed_col
                from    A
                ) t
        ) t

where   tab = 'B'

+------+------------+------+------------+-------------+---------+
| b_id |   b_date   | a_id |   a_date   | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01  | 2017-04-02 | A01  | 2017-04-01 | XYZ         |     200 |
| B01  | 2017-04-04 | A01  | 2017-04-01 | XYZ         |     500 |
+------+------------+------+------------+-------------+---------+            

Upvotes: 1

Another alternative is:

select B.B_id, B.B_date, B.A_id, A1.A_id, A1.A_date, A1.changed_col, B.B_value
from 
TABLE_B as B,
(
select A_id, A_date, chaged_col
from TABLE_A as A
where A_date = (select min(A_date) from TABLE_A where A_id = A.A_id) ) as A1
where B.A_id = A.A_id

Upvotes: -1

Related Questions