· 5 years ago · Jun 25, 2020, 02:30 PM
1-- create schema
2-- create dim tables (shipping, customer, product, geo)
3-- fix data quality problem
4-- create sales_fact table
5-- match number of rows between staging and dw (business layer)
6
7
8
9
10
11
12create schema dw;
13
14
15
16--SHIPPING
17
18--creating a table
19drop table if exists dw.shipping_dim ;
20CREATE TABLE dw.shipping_dim
21(
22 ship_id serial NOT NULL,
23 shipping_mode varchar(14) NOT NULL,
24 CONSTRAINT PK_shipping_dim PRIMARY KEY ( ship_id )
25);
26
27--deleting rows
28truncate table dw.shipping_dim;
29
30--generating ship_id and inserting ship_mode from orders
31insert into dw.shipping_dim
32select 100+row_number() over(), ship_mode from (select distinct ship_mode from stg.orders ) a;
33--checking
34select * from dw.shipping_dim sd;
35
36
37
38
39--CUSTOMER
40
41drop table if exists dw.customer_dim ;
42CREATE TABLE dw.customer_dim
43(
44cust_id serial NOT NULL,
45customer_id varchar(8) NOT NULL, --id can't be NULL
46 customer_name varchar(22) NOT NULL,
47 CONSTRAINT PK_customer_dim PRIMARY KEY ( cust_id )
48);
49
50--deleting rows
51truncate table dw.customer_dim;
52--inserting
53insert into dw.customer_dim
54select 100+row_number() over(), customer_id, customer_name from (select distinct customer_id, customer_name from stg.orders ) a;
55--checking
56select * from dw.customer_dim cd;
57
58
59
60
61--GEOGRAPHY
62
63drop table if exists dw.geo_dim ;
64CREATE TABLE dw.geo_dim
65(
66 geo_id serial NOT NULL,
67 country varchar(13) NOT NULL,
68 city varchar(17) NOT NULL,
69 state varchar(20) NOT NULL,
70 postal_code varchar(20) NULL, --can't be integer, we lost first 0
71 CONSTRAINT PK_geo_dim PRIMARY KEY ( geo_id )
72);
73
74--deleting rows
75truncate table dw.geo_dim;
76--generating geo_id and inserting rows from orders
77insert into dw.geo_dim
78select 100+row_number() over(), country, city, state, postal_code from (select distinct country, city, state, postal_code from stg.orders ) a;
79--data quality check
80select distinct country, city, state, postal_code from dw.geo_dim
81where country is null or city is null or postal_code is null;
82
83-- City Burlington, Vermont doesn't have postal code
84update dw.geo_dim
85set postal_code = '05401'
86where city = 'Burlington' and postal_code is null;
87
88--also update source file
89update stg.orders
90set postal_code = '05401'
91where city = 'Burlington' and postal_code is null;
92
93
94select * from dw.geo_dim
95where city = 'Burlington'
96
97
98
99
100--PRODUCT
101
102--creating a table
103drop table if exists dw.product_dim ;
104CREATE TABLE dw.product_dim
105(
106 prod_id serial NOT NULL, --we created surrogated key
107 product_id varchar(50) NOT NULL, --exist in ORDERS table
108 product_name varchar(127) NOT NULL,
109 category varchar(15) NOT NULL,
110 sub_category varchar(11) NOT NULL,
111 segment varchar(11) NOT NULL,
112 CONSTRAINT PK_product_dim PRIMARY KEY ( prod_id )
113);
114
115--deleting rows
116truncate table dw.product_dim ;
117--
118insert into dw.product_dim
119select 100+row_number() over () as prod_id ,product_id, product_name, category, subcategory, segment from (select distinct product_id, product_name, category, subcategory, segment from stg.orders ) a;
120--checking
121select * from dw.product_dim cd;
122
123
124
125--CALENDAR use function instead
126-- examplehttps://tapoueh.org/blog/2017/06/postgresql-and-the-calendar/
127
128--creating a table
129drop table if exists dw.calendar_dim ;
130CREATE TABLE dw.calendar_dim
131(
132dateid serial NOT NULL,
133year int NOT NULL,
134quarter int NOT NULL,
135month int NOT NULL,
136week int NOT NULL,
137date date NOT NULL,
138week_day varchar(20) NOT NULL,
139leap varchar(20) NOT NULL,
140CONSTRAINT PK_calendar_dim PRIMARY KEY ( dateid )
141);
142
143--deleting rows
144truncate table dw.calendar_dim;
145--
146insert into dw.calendar_dim
147select
148to_char(date,'yyyymmdd')::int as date_id,
149 extract('year' from date)::int as year,
150 extract('quarter' from date)::int as quarter,
151 extract('month' from date)::int as month,
152 extract('week' from date)::int as week,
153 date::date,
154 to_char(date, 'dy') as week_day,
155 extract('day' from
156 (date + interval '2 month - 1 day')
157 ) = 29
158 as leap
159 from generate_series(date '2000-01-01',
160 date '2030-01-01',
161 interval '1 day')
162 as t(date);
163--checking
164select * from dw.calendar_dim;
165
166
167
168
169
170--METRICS
171
172--creating a table
173drop table if exists dw.sales_fact ;
174CREATE TABLE dw.sales_fact
175(
176 sales_id serial NOT NULL,
177 cust_id integer NOT NULL,
178 order_date_id integer NOT NULL,
179 ship_date_id integer NOT NULL,
180 prod_id integer NOT NULL,
181 ship_id integer NOT NULL,
182 geo_id integer NOT NULL,
183 order_id varchar(25) NOT NULL,
184 sales numeric(9,4) NOT NULL,
185 profit numeric(21,16) NOT NULL,
186 quantity int4 NOT NULL,
187 discount numeric(4,2) NOT NULL,
188 CONSTRAINT PK_sales_fact PRIMARY KEY ( sales_id ));
189
190
191insert into dw.sales_fact
192select
193 100+row_number() over() as sales_id
194 ,cust_id
195 ,to_char(order_date,'yyyymmdd')::int as order_date_id
196 ,to_char(ship_date,'yyyymmdd')::int as ship_date_id
197 ,p.prod_id
198 ,s.ship_id
199 ,geo_id
200 ,o.order_id
201 ,sales
202 ,profit
203 ,quantity
204 ,discount
205from stg.orders o
206inner join dw.shipping_dim s on o.ship_mode = s.shipping_mode
207inner join dw.geo_dim g on o.postal_code = g.postal_code and g.country=o.country and g.city = o.city and o.state = g.state --City Burlington doesn't have postal code
208inner join dw.product_dim p on o.product_name = p.product_name and o.segment=p.segment and o.subcategory=p.sub_category and o.category=p.category and o.product_id=p.product_id
209inner join dw.customer_dim cd on cd.customer_id=o.customer_id and cd.customer_name=o.customer_name
210
211
212--do you get 9994rows?
213select count(*) from dw.sales_fact sf
214inner join dw.shipping_dim s on sf.ship_id=s.ship_id
215inner join dw.geo_dim g on sf.geo_id=g.geo_id
216inner join dw.product_dim p on sf.prod_id=p.prod_id
217inner join dw.customer_dim cd on sf.cust_id=cd.cust_id;