007
007

Reputation: 2186

T-SQL : dedup and query latest unique records

DECLARE @Table TABLE
    (
        [ID]    INT
      , [Alias] NVARCHAR(50)
      , [Key]   INT
      , [Val]   NVARCHAR(10)
      , [RN]    INT
    )

INSERT INTO @Table ([ID], [Alias], [Key], [Val], [RN])
VALUES
    ( 1, N'A1', 1, N'One', 1 )
  , ( 1, N'A1', 2, N'Two', 1 )
  , ( 1, N'A1', 3, N'', 1 )
  , ( 1, N'A2', 1, N'One', 2 )
  , ( 1, N'A2', 2, N'', 2 )
  , ( 1, N'A2', 3, N'Three', 2 )
  , ( 1, N'A3', 1, N'One', 3 )
  , ( 1, N'A3', 2, N'Two', 3 )
  , ( 1, N'A3', 4, N'Four_New', 3 )
  , ( 1, N'A4', 4, N'Four', 4 )
  , ( 1, N'A4', 5, N'Five', 4 )

  , ( 2, N'B1', 1, N'', 1 )
  , ( 2, N'B1', 2, N'', 1 )
  , ( 2, N'B1', 3, N'', 1 )
  , ( 2, N'B2', 1, N'One', 2 )
  , ( 2, N'B2', 2, N'', 2 )
  , ( 2, N'B2', 3, N'', 2 )
  , ( 2, N'B3', 2, N'Two', 3 )
  , ( 2, N'B3', 4, N'Four', 3 )
  , ( 2, N'B4', 3, N'Three', 4 )
  , ( 2, N'B4', 6, N'Six', 4 )

/* OUTPUT # 1
    ID Alias Key    Val
    1  A5    1      One         -- Kept the Val from Alias = A1
    1  A5    2      Two         -- Kept the Val from Alias = A1
    1  A5    3                  -- Kept the Val from Alias = A1
    1  A5    4      Four_New    -- Taking the Key/Val from Alias = A3 and assigned it to Alias = A1 for ID = 1
    1  A5    5      Five        -- Taking the Key/Val from Alias = A4 and assigned it to Alias = A1 for ID = 1

    2  B5    1                  -- Kept the Val from Alias = B1
    2  B5    2                  -- Kept the Val from Alias = B1
    2  B5    3                  -- Kept the Val from Alias = B1
    2  B5    4      Four        -- Taking the Key/Val from Alias = B3 and assigned it to Alias = B1 for ID = 2
    2  B5    6      Six         -- Taking the Key/Val from Alias = B4 and assigned it to Alias = B1 for ID = 1
*/


/* OUTPUT #2
ID Alias Key    Val
1  A5    1      One         -- Kept the Val from Alias = A1
1  A5    2      Two         -- Kept the Val from Alias = A1
1  A5    3      Three       -- Taking the Val from Alias = A2 because it's the 1st (based on RN ASC) latest filled value for Key = 3 for ID = 1
1  A5    4      Four_New    -- Got the Key/Val from Alias = A3 and assigned it to Alias = A1
1  A5    5      Five        -- Got the Key/Val from Alias = A4 and assigned it to Alias = A1

2  B5    1      One         -- Taking the Val from Alias = B2 because it's the 1st (based on RN ASC) latest filled value for Key = 1 for ID = 2
2  B5    2      Two         -- Taking the Val from Alias = B3 because it's the 1st (based on RN ASC) latest filled value for Key = 2 for ID = 2
2  B5    3      Three       -- Taking the Val from Alias = B4 because it's the 1st (based on RN ASC) latest filled value for Key = 3 for ID = 2
2  B5    4      Four        -- Got the Key/Val from Alias = B3 and assigned it to Alias = B1
2  B5    6      Six         -- Got the Key/Val from Alias = B4 and assigned it to Alias = B1
*/

Desc: This is a sample data representation of a real table, which happens to be quite big.

Fields:

Goal:

TWO outputs

Output # 1 Combine all the matching person's records and create a brand new person that takes all key/val data form the latest entry of that person and adds any missing key/val from previous entries (always take the latest key/val).

Output # 2 Combine all the matching person's records and create a brand new person that takes all non(empty/null) key/val data from the latest entry of that person and adds any missing key/val from previous entries (always take the latest key/val)

Upvotes: 0

Views: 37

Answers (1)

George Joseph
George Joseph

Reputation: 5922

I would opt for ranking of the results as follows

for Output#1

with base_rows
  as (select *
            ,rank() over(partition by id,[key] order by rn asc) as rnk
            ,cast(max(substring(alias,2,len(alias))) over(partition by id)+1 as nvarchar(100)) as max_index
            ,max(substring(alias,1,1)) over(partition by id) as alias_char
        from t
      )
 select id
        ,concat(alias_char,max_index)
        ,[key]
        ,[val]
        ,[rn]
   from base_rows
  where rnk=1


+----+------------------+-----+----------+----+
| id | (No column name) | key |   val    | rn |
+----+------------------+-----+----------+----+
|  1 | A5               |   1 | One      |  1 |
|  1 | A5               |   2 | Two      |  1 |
|  1 | A5               |   3 |          |  1 |
|  1 | A5               |   4 | Four_New |  3 |
|  1 | A5               |   5 | Five     |  4 |
|  2 | B5               |   1 |          |  1 |
|  2 | B5               |   2 |          |  1 |
|  2 | B5               |   3 |          |  1 |
|  2 | B5               |   4 | Four     |  3 |
|  2 | B5               |   6 | Six      |  4 |
+----+------------------+-----+----------+----+

For output #2

Its the same query used as output#1, except we filter out all records which are empty in the base_rows block.

with base_rows
  as (select *
            ,rank() over(partition by id,[key] order by rn asc) as rnk
            ,cast(max(substring(alias,2,len(alias))) over(partition by id)+1 as nvarchar(100)) as max_index
            ,max(substring(alias,1,1)) over(partition by id) as alias_char
        from t
      where [val] <> ''  
      )
 select id
        ,concat(alias_char,max_index)
        ,[key]
        ,[val]
        ,[rn]
   from base_rows
 where rnk=1



+----+------------------+-----+----------+----+
| id | (No column name) | key |   val    | rn |
+----+------------------+-----+----------+----+
|  1 | A5               |   1 | One      |  1 |
|  1 | A5               |   2 | Two      |  1 |
|  1 | A5               |   3 | Three    |  2 |
|  1 | A5               |   4 | Four_New |  3 |
|  1 | A5               |   5 | Five     |  4 |
|  2 | B5               |   1 | One      |  2 |
|  2 | B5               |   2 | Two      |  3 |
|  2 | B5               |   3 | Three    |  4 |
|  2 | B5               |   4 | Four     |  3 |
|  2 | B5               |   6 | Six      |  4 |
+----+------------------+-----+----------+----+

Upvotes: 2

Related Questions