Reputation: 35149
data_source_1
tabledata_source_2
tabledata_sources_view
viewdata_source_1
:has no dups:
db=# select count(*) from (select distinct * from data_source_1);
count
--------
543243
(1 row)
db=# select count(*) from (select * from data_source_1);
count
--------
543243
(1 row)
data_source_2
:has no dups:
db=# select count(*) from (select * from data_source_2);
count
-------
5304
(1 row)
db=# select count(*) from (select distinct * from data_source_2);
count
-------
5304
(1 row)
data_sources_view
:has dups:
db=# select count(*) from (select distinct * from data_sources_vie);
count
--------
538714
(1 row)
db=# select count(*) from (select * from data_sources_view);
count
--------
548547
(1 row)
The view is simple as:
CREATE VIEW data_sources_view
AS SELECT *
FROM (
(
SELECT a, b, 'data_source_1' as source
FROM data_source_1
)
UNION ALL
(
SELECT a, b, 'data_source_2' as source
FROM data_source_2
)
);
'data_source_x' as source
eliminates the possibility of overlapping data.db# create table t1 as select * from data_sources_view;
SELECT
db=#
db=# create table t2 as select distinct * from data_sources_view;
SELECT
db=# create table t3 as select * from t1 minus select * from t2;
SELECT
db=# select 't1' as table_name, count(*) from t1 UNION ALL
db-# select 't2' as table_name, count(*) from t2 UNION ALL
db-# select 't3' as table_name, count(*) from t3;
table_name | count
------------+--------
t1 | 548547
t3 | 0
t2 | 538714
(3 rows)
Redshift (PostgreSQL
)
Upvotes: 2
Views: 59
Reputation: 1
UNION vs UNION ALL
OUTPUT
FOO
OUTPUT
FOO
FOO
Upvotes: 0
Reputation: 1270391
The reason is because your data sources have more than two columns. If you do these counts:
select count(*) from (select distinct a, b from data_source_1);
and
select count(*) from (select distinct a, b from data_source_2);
You should find that they are different from the count(*)
you get on the same table.
Upvotes: 2