Reputation: 16981
I'm trying to implement Kimball data mart which uses -1 and -2 rows in dimension table for late arriving dimension and for NULL business key. I have an example code below which creates one staging table for fact and dimension data, two dimension tables and one fact table for data mart. Here is my example code with data in SQL:
--drop table stg_sales
--go
CREATE TABLE dbo.stg_sales
(
stg_sales_id INT IDENTITY(1, 1) NOT NULL,
sales_number INT NOT NULL,
sales_amt INT NULL,
cust_number INT NULL,
cust_firstname NVARCHAR(50) NULL,
cust_lastname NVARCHAR(100) NULL,
cust_address NVARCHAR(500) NULL,
salesperson_number INT NULL,
CONSTRAINT pk_stg_sales PRIMARY KEY (stg_sales_id)
)
go
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (123,
434,
2342,
'Jim',
'Moriaty',
'something',
23)
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (124,
234,
2342,
'Jim',
'Moriaty',
'something',
23)
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (125,
434,
4545,
'Joe',
'Esk',
'someother',
24)
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (126,
434,
5555,
'Daniel',
'Hart',
'Someaddr',
NULL) --salesperson_number business key missing here
INSERT stg_sales
(sales_number,
sales_amt,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
salesperson_number)
VALUES (127,
333,
4444,
'Pat',
'Smith',
'Someaddr',
30)
SELECT *
FROM stg_sales
--create a dimension and fact tables
--drop table dbo.dim_customer
--go
CREATE TABLE dbo.dim_customer
(
customer_wid INT IDENTITY(1, 1) NOT NULL,
cust_number INT NULL,
cust_firstname NVARCHAR(50) NULL,
cust_lastname NVARCHAR(100) NULL,
cust_address NVARCHAR(500) NULL,
date_insert DATETIME2 NOT NULL DEFAULT (Getdate()),
date_update DATETIME2 NULL,
is_current BIT NOT NULL
CONSTRAINT pk_dim_customer PRIMARY KEY (customer_wid)
CONSTRAINT chk_is_current CHECK (is_current IN (0, 1))
)
go
SET IDENTITY_INSERT dbo.dim_customer ON
INSERT dbo.dim_customer
(customer_wid,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
date_insert,
date_update,
is_current)
VALUES (-1,
-1,
'unknown',
'unknown',
'unknown',
Getdate(),
Getdate(),
1)
INSERT dbo.dim_customer
(customer_wid,
cust_number,
cust_firstname,
cust_lastname,
cust_address,
date_insert,
date_update,
is_current)
VALUES (-2,
-2,
'Error',
'Error',
'Error',
Getdate(),
Getdate(),
1)
SET IDENTITY_INSERT dbo.dim_customer OFF
--insert data into dimension table
INSERT dbo.dim_customer
(cust_number,
cust_firstname,
cust_lastname,
cust_address,
is_current)
SELECT DISTINCT cust_number,
cust_firstname,
cust_lastname,
cust_address,
1 AS is_current
FROM dbo.stg_sales
WHERE cust_number <> 4444 --left one record off to simulate the situation where you don't have corrensponding row in dim table (late arriving dimension)
SELECT *
FROM dbo.dim_customer
DROP TABLE dbo.dim_salesperson
--create salesperson table
CREATE TABLE dbo.dim_salesperson
(
salesperson_wid INT IDENTITY(1, 1) NOT NULL,
salesperson_number INT NULL,
salesperson_firstname NVARCHAR(50) NULL,
salesperson_lastname NVARCHAR(100) NULL,
salesperson_address NVARCHAR(500) NULL,
date_insert DATETIME2 NOT NULL DEFAULT (Getdate()),
date_update DATETIME2 NULL,
is_current BIT NOT NULL
CONSTRAINT pk_dim_salesperson PRIMARY KEY (salesperson_wid)
CONSTRAINT chk_dim_salesperson_is_current CHECK (is_current IN (0, 1))
)
go
SET IDENTITY_INSERT dbo.dim_salesperson ON
INSERT dbo.dim_salesperson
(salesperson_wid,
salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
date_insert,
date_update,
is_current)
VALUES (-1,
-1,
'Not available',
'Not available',
'Not available',
Getdate(),
Getdate(),
1)
INSERT dbo.dim_salesperson
(salesperson_wid,
salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
date_insert,
date_update,
is_current)
VALUES (-2,
-2,
'Error',
'Error',
'Error',
Getdate(),
Getdate(),
1)
SET IDENTITY_INSERT dbo.dim_salesperson OFF
--insert data into salesperson
INSERT dbo.dim_salesperson
(salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
is_current)
VALUES (23,
'John',
'Fox',
'something',
1)
INSERT dbo.dim_salesperson
(salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
is_current)
VALUES (24,
'Hadley',
'Fox',
'something',
1)
INSERT dbo.dim_salesperson
(salesperson_number,
salesperson_firstname,
salesperson_lastname,
salesperson_address,
is_current)
VALUES (30,
'Ashley',
'Fox',
'something',
1)
SELECT *
FROM dbo.dim_salesperson
SELECT *
FROM dbo.stg_sales
--create and populate the fact table
--drop table dbo.f_sales
--go
CREATE TABLE dbo.f_sales
(
sales_number INT NOT NULL,
customer_wid INT NOT NULL,
salesperson_wid INT NOT NULL,
sales_amt INT NULL
CONSTRAINT pk_f_sales PRIMARY KEY (sales_number)
CONSTRAINT fk_customer_wid FOREIGN KEY (customer_wid) REFERENCES
dbo.dim_customer(customer_wid),
CONSTRAINT fk_salesperson_wid FOREIGN KEY (salesperson_wid) REFERENCES
dbo.dim_salesperson(salesperson_wid)
)
--populate the fact table
INSERT dbo.f_sales
(sales_number,
customer_wid,
salesperson_wid,
sales_amt)
SELECT stg.sales_number,
Isnull(dimcust.customer_wid, -1) AS customer_wid,
--this is maybe correct way to assign -1 foreign key when there is no corresponding dimension row in the dim table
Isnull(dimsp.salesperson_wid, -2) AS salesperson_wid,
--NOT CORRECT, how to assign -2 foreign key when the business key is NULL in the source?
stg.sales_amt
FROM dbo.stg_sales AS stg
LEFT JOIN dbo.dim_customer AS dimcust
ON stg.cust_number = dimcust.cust_number
LEFT JOIN dbo.dim_salesperson AS dimsp
ON stg.salesperson_number = dimsp.salesperson_number
SELECT *
FROM dbo.f_sales
How can I assign -2 for the rows in which the business key is missing in the source system. You can read more about the theory behind this implementation from Kimball:
This is basically what I'm trying to achieve:
Handling NULL in dimensions and measures
Edit:
I think I can use a COALESCE
or ISNULL
in the left join, it seems to produce the correct result:
INSERT dbo.f_sales
(sales_number,
customer_wid,
salesperson_wid,
sales_amt)
SELECT stg.sales_number,
Isnull(dimcust.customer_wid, -1) AS customer_wid,
--this is maybe correct way to assign -1 foreign key when there is no corresponding dimension row in the dim table
dimsp.salesperson_wid,
stg.sales_amt
FROM dbo.stg_sales AS stg
LEFT JOIN dbo.dim_customer AS dimcust
ON COALESCE(stg.cust_number, -2) = dimcust.cust_number
LEFT JOIN dbo.dim_salesperson AS dimsp
ON COALESCE(stg.salesperson_number, -2) = dimsp.salesperson_number
Upvotes: 5
Views: 8362
Reputation: 245
I thought a late arriving fact was where a dimension related to the fact is slowly changing, and facts were late. Say you have a sales system, and there is a product called 'Opal Fruit' and the product changes the name to 'Starburst'. The company wants to track whether the change in name affects the sales, so they set up a type 2 slowly changing dimension (SCD). In the SCD, they track the product name, so each time the product name changes, they issue a new record for that product, with the same business(natural) key but a different surrogate key, and they record the valid from and to dates for that name. So, say 'Opal Fruit' has a valid from date of 1/1/1970 and a valid to date 28/3/1995. The version with product name 'Opal Fruit' has a valid from of '29/3/1995 and the valid to is null, as it is still valid.
Now, say on 1/4/1995, one day after the name change, they get some sales transactions whose transaction date is 1/4/1995 and some whose transaction date is 27/3/1995. Late arriving means that the transactions for a given dimension arrive after the product dimension changed. So, the transaction that arrived on 27/3/1995 need to have the surrogate key for the product dimension whose product name is 'Opal Fruitt' while those that arrived on 1/4/1995 need the surroagte key of the product dimension where the name is 'Starburst'. What you have to do is work out which version of the 'Opal Fruit / Starburst' product was current at the transaction date, and assign that product surrogate key to the transaction when it goes into the sale fact.
That is my understanding of late arriving facts. Fact that arrive before the dimensions elements to handle them have arrive are surely early arriving facts, not late. Maybe it's just a terminology thing.
Upvotes: 0
Reputation: 22187
Purely as a lookup technique
-- add nullable keys to the staging table
alter table dbo.stg_sales ADD
sales_person_wid integer null
, customer_wid integer null
;
-- insert to staging table here (as in your example)
-- lookup sales person key
update dbo.stg_sales
set sales_person_wid = p.sales_person_wid
from dbo.stg_sales as s , dbo.dim_salesperson as p
where s.salesperson_number = p.salesperson_number ;
-- decide what to do with missing business keys
update dbo.stg_sales
set sales_person_wid = -2
where sales_person_wid is null ;
-- do similar for customer
-- now all keys in staging table are not null
-- load to fact table
However, usual technique is to assign special (unknown, n/a, error) business keys to transactions during extraction or maybe cleaning. In other words that special Error
business key could have been assigned before the record landed into the staging table.
And in final note, late arriving dimension means that business key (salesperson_number
) is known to the operational system, but the transaction (sale fact) made it to warehouse before the dimension's data. So, salesperson_number
would be not null
, but would not exist in the dimenson table. You than have to save this transaction somewhere and try to update the FK (salesperson_wid) once the record arrives in the dimension; a day later or so.
Upvotes: 4