Designing Data Systems Project - v0.2

You might also like

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 16

Project Report

Design Data Systems


Student: Tuan Anh Pham

1. Staging

1.1. Student provides a screenshot after extracting 6 Yelp files into staging schema. The
screenshot should have 6 tables with the correct respective row counts.

1.2. Student provides a screenshot after extracting 2 files into the staging schema. The
screenshot should have two tables - temperature and precipitation.

1.3. The student provides a diagram showing 8 files pointing to staging database to ODS
to DWH to Reporting.
2. Operational Data Store

2.1. Student provides SQL queries that transform staging to ODS.

#Create USER table in ODS


-- ODS-layer user table. user_id is the declared primary key; friends and
-- elite are stored as semi-structured ARRAY columns, everything else is a
-- scalar counter or attribute.
-- NOTE(review): confirm that friends/elite arrive as JSON arrays in staging.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."USER_ODS_TABLE" (
    user_id             STRING NOT NULL,
    name                STRING,
    review_count        INT,
    yelping_since       DATE,
    friends             ARRAY,
    useful              INT,
    funny               INT,
    cool                INT,
    fans                INT,
    elite               ARRAY,
    average_stars       FLOAT,
    compliment_hot      INT,
    compliment_more     INT,
    compliment_profile  INT,
    compliment_cute     INT,
    compliment_list     INT,
    compliment_note     INT,
    compliment_plain    INT,
    compliment_cool     INT,
    compliment_funny    INT,
    compliment_writer   INT,
    compliment_photos   INT,
    PRIMARY KEY (user_id)
)
COMMENT = 'User table in ODS layer';

#Copy data from USER table in Staging layer into USER table in ODS
-- Flatten each staged user JSON document (USERJSON) into the typed columns of
-- the ODS user table.
--   * The target is fully qualified so the load does not depend on the
--     session's current schema.
--   * The column list pins each value to a named column, not to position.
--   * Each JSON path is cast to the target column's declared type instead of
--     relying on implicit VARIANT coercion.
INSERT INTO "YELPRATINGPROJECT"."ODS"."USER_ODS_TABLE" (
    user_id,
    name,
    review_count,
    yelping_since,
    friends,
    useful,
    funny,
    cool,
    fans,
    elite,
    average_stars,
    compliment_hot,
    compliment_more,
    compliment_profile,
    compliment_cute,
    compliment_list,
    compliment_note,
    compliment_plain,
    compliment_cool,
    compliment_funny,
    compliment_writer,
    compliment_photos
)
SELECT
    USERJSON:user_id::STRING          AS user_id,
    USERJSON:name::STRING             AS name,
    USERJSON:review_count::INT        AS review_count,
    USERJSON:yelping_since::DATE      AS yelping_since,
    USERJSON:friends::ARRAY           AS friends,
    USERJSON:useful::INT              AS useful,
    USERJSON:funny::INT               AS funny,
    USERJSON:cool::INT                AS cool,
    USERJSON:fans::INT                AS fans,
    USERJSON:elite::ARRAY             AS elite,
    USERJSON:average_stars::FLOAT     AS average_stars,
    USERJSON:compliment_hot::INT      AS compliment_hot,
    USERJSON:compliment_more::INT     AS compliment_more,
    USERJSON:compliment_profile::INT  AS compliment_profile,
    USERJSON:compliment_cute::INT     AS compliment_cute,
    USERJSON:compliment_list::INT     AS compliment_list,
    USERJSON:compliment_note::INT     AS compliment_note,
    USERJSON:compliment_plain::INT    AS compliment_plain,
    USERJSON:compliment_cool::INT     AS compliment_cool,
    USERJSON:compliment_funny::INT    AS compliment_funny,
    USERJSON:compliment_writer::INT   AS compliment_writer,
    USERJSON:compliment_photos::INT   AS compliment_photos
FROM "YELPRATINGPROJECT"."STAGING"."USER_STAGING_TABLE";

#Create BUSINESS table in ODS


-- ODS-layer business table. business_id is the declared primary key.
-- attributes is kept as an OBJECT; categories and hours as ARRAY columns.
-- NOTE(review): confirm the staged JSON really carries categories/hours as
-- arrays; if they are strings or objects a different column type fits better.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."BUSINESS_ODS_TABLE" (
    business_id   STRING NOT NULL,
    name          STRING,
    address       STRING,
    city          STRING,
    state         STRING,
    postal_code   STRING,
    latitude      FLOAT,
    longitude     FLOAT,
    stars         FLOAT,
    review_count  INT,
    is_open       INT,
    attributes    OBJECT,
    categories    ARRAY,
    hours         ARRAY,
    PRIMARY KEY (business_id)
)
COMMENT = 'Business table in ODS layer';

#Copy data from BUSINESS table in Staging layer into BUSINESS table in ODS
-- Flatten each staged business JSON document (BUSINESSJSON) into the typed
-- columns of the ODS business table. The target is fully qualified, columns
-- are listed explicitly, and every JSON path is cast to the declared column
-- type rather than left to implicit VARIANT coercion.
INSERT INTO "YELPRATINGPROJECT"."ODS"."BUSINESS_ODS_TABLE" (
    business_id,
    name,
    address,
    city,
    state,
    postal_code,
    latitude,
    longitude,
    stars,
    review_count,
    is_open,
    attributes,
    categories,
    hours
)
SELECT
    BUSINESSJSON:business_id::STRING   AS business_id,
    BUSINESSJSON:name::STRING          AS name,
    BUSINESSJSON:address::STRING       AS address,
    BUSINESSJSON:city::STRING          AS city,
    BUSINESSJSON:state::STRING         AS state,
    BUSINESSJSON:postal_code::STRING   AS postal_code,
    BUSINESSJSON:latitude::FLOAT       AS latitude,
    BUSINESSJSON:longitude::FLOAT      AS longitude,
    BUSINESSJSON:stars::FLOAT          AS stars,
    BUSINESSJSON:review_count::INT     AS review_count,
    BUSINESSJSON:is_open::INT          AS is_open,
    BUSINESSJSON:attributes::OBJECT    AS attributes,
    BUSINESSJSON:categories::ARRAY     AS categories,
    BUSINESSJSON:hours::ARRAY          AS hours
FROM "YELPRATINGPROJECT"."STAGING"."BUSINESS_STAGING_TABLE";

#Create REVIEW table in ODS


-- ODS-layer review table. review_id is the declared primary key; user_id and
-- business_id are declared foreign keys to the ODS user and business tables.
-- The referenced tables are fully qualified so the DDL resolves them
-- regardless of the session's current database/schema.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."REVIEW_ODS_TABLE" (
    review_id    STRING NOT NULL PRIMARY KEY,
    user_id      STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."ODS"."USER_ODS_TABLE" (user_id),
    business_id  STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."ODS"."BUSINESS_ODS_TABLE" (business_id),
    stars        INT,
    date         DATE,
    text         STRING,
    useful       INT,
    funny        INT,
    cool         INT
)
COMMENT = 'Review table in ODS layer';

#Copy data from REVIEW table in Staging layer into REVIEW table in ODS
-- Flatten each staged review JSON document (REVIEWJSON) into the typed
-- columns of the ODS review table. The target is fully qualified, columns are
-- listed explicitly, and every JSON path is cast to the declared column type.
INSERT INTO "YELPRATINGPROJECT"."ODS"."REVIEW_ODS_TABLE" (
    review_id,
    user_id,
    business_id,
    stars,
    date,
    text,
    useful,
    funny,
    cool
)
SELECT
    REVIEWJSON:review_id::STRING    AS review_id,
    REVIEWJSON:user_id::STRING      AS user_id,
    REVIEWJSON:business_id::STRING  AS business_id,
    REVIEWJSON:stars::INT           AS stars,
    REVIEWJSON:date::DATE           AS date,
    REVIEWJSON:text::STRING         AS text,
    REVIEWJSON:useful::INT          AS useful,
    REVIEWJSON:funny::INT           AS funny,
    REVIEWJSON:cool::INT            AS cool
FROM "YELPRATINGPROJECT"."STAGING"."REVIEW_STAGING_TABLE";

#Create TIP table in ODS


-- ODS-layer tip table, keyed on (user_id, business_id), with declared foreign
-- keys to the ODS user and business tables. The referenced tables are fully
-- qualified so the DDL does not depend on the session's current schema.
-- NOTE(review): the composite key assumes at most one tip per user per
-- business; verify against the data before relying on it for de-duplication.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."TIP_ODS_TABLE" (
    user_id           STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."ODS"."USER_ODS_TABLE" (user_id),
    business_id       STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."ODS"."BUSINESS_ODS_TABLE" (business_id),
    text              STRING,
    date              DATE,
    compliment_count  INT,
    PRIMARY KEY (user_id, business_id)
)
COMMENT = 'Tip table in ODS layer';
#Copy data from TIP table in Staging layer into TIP table in ODS
-- Flatten each staged tip JSON document (TIPJSON) into the typed columns of
-- the ODS tip table. The target is fully qualified, columns are listed
-- explicitly, and every JSON path is cast to the declared column type.
INSERT INTO "YELPRATINGPROJECT"."ODS"."TIP_ODS_TABLE" (
    user_id,
    business_id,
    text,
    date,
    compliment_count
)
SELECT
    TIPJSON:user_id::STRING        AS user_id,
    TIPJSON:business_id::STRING    AS business_id,
    TIPJSON:text::STRING           AS text,
    TIPJSON:date::DATE             AS date,
    TIPJSON:compliment_count::INT  AS compliment_count
FROM "YELPRATINGPROJECT"."STAGING"."TIP_STAGING_TABLE";

#Create CHECKIN table in ODS


-- ODS-layer check-in table: one declared key (business_id) plus the raw
-- check-in date value kept as a STRING. The foreign-key target is fully
-- qualified so the DDL does not depend on the session's current schema.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."CHECKIN_ODS_TABLE" (
    business_id  STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."ODS"."BUSINESS_ODS_TABLE" (business_id),
    date         STRING,
    PRIMARY KEY (business_id)
)
COMMENT = 'Checkin table in ODS layer';

#Copy data from CHECKIN table in Staging layer into CHECKIN table in ODS
-- Flatten each staged check-in JSON document (CHECKINJSON) into the ODS
-- check-in table. The target is fully qualified, columns are listed
-- explicitly, and both JSON paths are cast to STRING to match the table.
INSERT INTO "YELPRATINGPROJECT"."ODS"."CHECKIN_ODS_TABLE" (
    business_id,
    date
)
SELECT
    CHECKINJSON:business_id::STRING  AS business_id,
    CHECKINJSON:date::STRING         AS date
FROM "YELPRATINGPROJECT"."STAGING"."CHECKIN_STAGING_TABLE";

#Create COVID table in ODS


-- ODS-layer COVID feature table, keyed on business_id; every attribute is
-- stored as a STRING. The foreign-key target is fully qualified so the DDL
-- does not depend on the session's current schema.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."COVID_ODS_TABLE" (
    business_id               STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."ODS"."BUSINESS_ODS_TABLE" (business_id),
    highlights                STRING,
    delivery_or_takeout       STRING,
    grubhub_enabled           STRING,
    call_to_action_enabled    STRING,
    request_a_quote_enabled   STRING,
    covid_banner              STRING,
    temporary_closed_until    STRING,
    virtual_services_offered  STRING,
    PRIMARY KEY (business_id)
)
COMMENT = 'Covid table in ODS layer';

#Copy data from COVID table in Staging layer into COVID table in ODS
-- Flatten each staged COVID JSON document (COVIDJSON) into the ODS COVID
-- table. The JSON keys contain spaces and mixed case, so they are addressed
-- with double-quoted path elements and mapped onto snake_case columns.
-- The target is fully qualified, columns are listed explicitly, and every
-- path is cast to STRING to match the table definition.
INSERT INTO "YELPRATINGPROJECT"."ODS"."COVID_ODS_TABLE" (
    business_id,
    highlights,
    delivery_or_takeout,
    grubhub_enabled,
    call_to_action_enabled,
    request_a_quote_enabled,
    covid_banner,
    temporary_closed_until,
    virtual_services_offered
)
SELECT
    COVIDJSON:"business_id"::STRING               AS business_id,
    COVIDJSON:"highlights"::STRING                AS highlights,
    COVIDJSON:"Delivery or takeout"::STRING       AS delivery_or_takeout,
    COVIDJSON:"Grubhub enabled"::STRING           AS grubhub_enabled,
    COVIDJSON:"Call To Action enabled"::STRING    AS call_to_action_enabled,
    COVIDJSON:"Request a Quote Enabled"::STRING   AS request_a_quote_enabled,
    COVIDJSON:"Covid Banner"::STRING              AS covid_banner,
    COVIDJSON:"Temporary Closed Until"::STRING    AS temporary_closed_until,
    COVIDJSON:"Virtual Services Offered"::STRING  AS virtual_services_offered
FROM "YELPRATINGPROJECT"."STAGING"."COVID_STAGING_TABLE";

#Create TEMPERATURE table in ODS


-- ODS-layer daily temperature table, keyed on date.
-- "min" and "max" are double-quoted, so they are case-sensitive lower-case
-- identifiers and must be referenced with the same quoting.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."TEMPERATURE_ODS_TABLE" (
    date        DATE NOT NULL,
    "min"       FLOAT,
    "max"       FLOAT,
    normal_min  FLOAT,
    normal_max  FLOAT,
    PRIMARY KEY (date)
)
COMMENT = 'Temperature table in ODS layer';

#Copy data from TEMPERATURE table in Staging layer into TEMPERATURE table in ODS
-- Load the ODS temperature table from staging, parsing the staged date from
-- its 'yyyymmdd' text form into a DATE.
-- The target is fully qualified and its columns are listed explicitly; the
-- target's "min"/"max" are quoted because the table declares them as
-- case-sensitive lower-case identifiers.
-- NOTE(review): the staging columns are referenced unquoted (min, max) as in
-- the original load; confirm that matches how the staging table was created.
INSERT INTO "YELPRATINGPROJECT"."ODS"."TEMPERATURE_ODS_TABLE" (
    date,
    "min",
    "max",
    normal_min,
    normal_max
)
SELECT
    to_date(date, 'yyyymmdd'),
    min,
    max,
    normal_min,
    normal_max
FROM "YELPRATINGPROJECT"."STAGING"."TEMPERATURE_STAGING_TABLE";

#Create PRECIPITATION_STAGING_TABLE_2 to convert date to DATE, precipitation to FLOAT, and precipitation_normal to FLOAT

-- Typed intermediate copy of the precipitation staging data: date as DATE and
-- both precipitation measures as FLOAT, keyed on date.
CREATE TABLE "YELPRATINGPROJECT"."STAGING"."PRECIPITATION_STAGING_TABLE_2" (
    date                  DATE PRIMARY KEY,
    precipitation         FLOAT,
    precipitation_normal  FLOAT
)
COMMENT = 'Precipitation table 2 in Staging layer';

#Copy data from PRECIPITATION_STAGING_TABLE to PRECIPITATION_STAGING_TABLE_2


-- Convert the raw precipitation staging rows to typed values.
--   * date is parsed from its 'yyyymmdd' text form.
--   * TRY_TO_DECIMAL yields NULL instead of an error when a value is not
--     numeric; those NULLs are filled later when the ODS table is loaded.
-- Columns are listed explicitly so the load does not depend on column order.
-- NOTE(review): scale 2 rounds to two decimal places before the value lands
-- in a FLOAT column; confirm that precision is sufficient for the source.
INSERT INTO "YELPRATINGPROJECT"."STAGING"."PRECIPITATION_STAGING_TABLE_2" (
    date,
    precipitation,
    precipitation_normal
)
SELECT
    to_date(date, 'yyyymmdd'),
    try_to_decimal(precipitation, 10, 2),
    try_to_decimal(precipitation_normal, 10, 2)
FROM "YELPRATINGPROJECT"."STAGING"."PRECIPITATION_STAGING_TABLE";
#Create PRECIPITATION table in ODS layer
-- ODS-layer daily precipitation table, keyed on date, with a declared foreign
-- key to the ODS temperature table's date. The referenced table is fully
-- qualified so the DDL does not depend on the session's current schema.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."PRECIPITATION_ODS_TABLE" (
    date                  DATE FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."ODS"."TEMPERATURE_ODS_TABLE" (date),
    precipitation         FLOAT,
    precipitation_normal  FLOAT,
    PRIMARY KEY (date)
)
COMMENT = 'Precipitation table in ODS layer';

#Copy data from PRECIPITATION_STAGING_TABLE_2 to PRECIPITATION table in ODS layer


-- Load the ODS precipitation table from the typed staging copy.
-- A NULL measure is replaced with the most recent earlier non-NULL value in
-- date order (LAG ... IGNORE NULLS); a NULL on the earliest date stays NULL
-- because there is no earlier row to take a value from.
-- The target is fully qualified and its columns are listed explicitly.
INSERT INTO "YELPRATINGPROJECT"."ODS"."PRECIPITATION_ODS_TABLE" (
    date,
    precipitation,
    precipitation_normal
)
SELECT
    date,
    COALESCE(
        precipitation,
        LAG(precipitation) IGNORE NULLS OVER (ORDER BY date)
    ),
    COALESCE(
        precipitation_normal,
        LAG(precipitation_normal) IGNORE NULLS OVER (ORDER BY date)
    )
FROM "YELPRATINGPROJECT"."STAGING"."PRECIPITATION_STAGING_TABLE_2";

2.2 Student provides SQL queries that use JSON functions to transform staging data from
a single JSON structure into multiple columns for ODS.

#Copy data from USER table in Staging layer into USER table in ODS
-- JSON-to-columns load for users: each path of the staged USERJSON document
-- is extracted with the colon path operator and cast to the target column's
-- type. The target is fully qualified and the column list is explicit, so the
-- load depends neither on the session schema nor on column order.
INSERT INTO "YELPRATINGPROJECT"."ODS"."USER_ODS_TABLE" (
    user_id,
    name,
    review_count,
    yelping_since,
    friends,
    useful,
    funny,
    cool,
    fans,
    elite,
    average_stars,
    compliment_hot,
    compliment_more,
    compliment_profile,
    compliment_cute,
    compliment_list,
    compliment_note,
    compliment_plain,
    compliment_cool,
    compliment_funny,
    compliment_writer,
    compliment_photos
)
SELECT
    USERJSON:user_id::STRING          AS user_id,
    USERJSON:name::STRING             AS name,
    USERJSON:review_count::INT        AS review_count,
    USERJSON:yelping_since::DATE      AS yelping_since,
    USERJSON:friends::ARRAY           AS friends,
    USERJSON:useful::INT              AS useful,
    USERJSON:funny::INT               AS funny,
    USERJSON:cool::INT                AS cool,
    USERJSON:fans::INT                AS fans,
    USERJSON:elite::ARRAY             AS elite,
    USERJSON:average_stars::FLOAT     AS average_stars,
    USERJSON:compliment_hot::INT      AS compliment_hot,
    USERJSON:compliment_more::INT     AS compliment_more,
    USERJSON:compliment_profile::INT  AS compliment_profile,
    USERJSON:compliment_cute::INT     AS compliment_cute,
    USERJSON:compliment_list::INT     AS compliment_list,
    USERJSON:compliment_note::INT     AS compliment_note,
    USERJSON:compliment_plain::INT    AS compliment_plain,
    USERJSON:compliment_cool::INT     AS compliment_cool,
    USERJSON:compliment_funny::INT    AS compliment_funny,
    USERJSON:compliment_writer::INT   AS compliment_writer,
    USERJSON:compliment_photos::INT   AS compliment_photos
FROM "YELPRATINGPROJECT"."STAGING"."USER_STAGING_TABLE";

#Copy data from BUSINESS table in Staging layer into BUSINESS table in ODS
-- JSON-to-columns load for businesses: each path of the staged BUSINESSJSON
-- document is extracted and cast to the target column's type. The target is
-- fully qualified and the column list is explicit.
INSERT INTO "YELPRATINGPROJECT"."ODS"."BUSINESS_ODS_TABLE" (
    business_id,
    name,
    address,
    city,
    state,
    postal_code,
    latitude,
    longitude,
    stars,
    review_count,
    is_open,
    attributes,
    categories,
    hours
)
SELECT
    BUSINESSJSON:business_id::STRING   AS business_id,
    BUSINESSJSON:name::STRING          AS name,
    BUSINESSJSON:address::STRING       AS address,
    BUSINESSJSON:city::STRING          AS city,
    BUSINESSJSON:state::STRING         AS state,
    BUSINESSJSON:postal_code::STRING   AS postal_code,
    BUSINESSJSON:latitude::FLOAT       AS latitude,
    BUSINESSJSON:longitude::FLOAT      AS longitude,
    BUSINESSJSON:stars::FLOAT          AS stars,
    BUSINESSJSON:review_count::INT     AS review_count,
    BUSINESSJSON:is_open::INT          AS is_open,
    BUSINESSJSON:attributes::OBJECT    AS attributes,
    BUSINESSJSON:categories::ARRAY     AS categories,
    BUSINESSJSON:hours::ARRAY          AS hours
FROM "YELPRATINGPROJECT"."STAGING"."BUSINESS_STAGING_TABLE";

#Copy data from REVIEW table in Staging layer into REVIEW table in ODS
-- JSON-to-columns load for reviews: each path of the staged REVIEWJSON
-- document is extracted and cast to the target column's type. The target is
-- fully qualified and the column list is explicit.
INSERT INTO "YELPRATINGPROJECT"."ODS"."REVIEW_ODS_TABLE" (
    review_id,
    user_id,
    business_id,
    stars,
    date,
    text,
    useful,
    funny,
    cool
)
SELECT
    REVIEWJSON:review_id::STRING    AS review_id,
    REVIEWJSON:user_id::STRING      AS user_id,
    REVIEWJSON:business_id::STRING  AS business_id,
    REVIEWJSON:stars::INT           AS stars,
    REVIEWJSON:date::DATE           AS date,
    REVIEWJSON:text::STRING         AS text,
    REVIEWJSON:useful::INT          AS useful,
    REVIEWJSON:funny::INT           AS funny,
    REVIEWJSON:cool::INT            AS cool
FROM "YELPRATINGPROJECT"."STAGING"."REVIEW_STAGING_TABLE";

#Copy data from TIP table in Staging layer into TIP table in ODS
-- JSON-to-columns load for tips: each path of the staged TIPJSON document is
-- extracted and cast to the target column's type. The target is fully
-- qualified and the column list is explicit.
INSERT INTO "YELPRATINGPROJECT"."ODS"."TIP_ODS_TABLE" (
    user_id,
    business_id,
    text,
    date,
    compliment_count
)
SELECT
    TIPJSON:user_id::STRING        AS user_id,
    TIPJSON:business_id::STRING    AS business_id,
    TIPJSON:text::STRING           AS text,
    TIPJSON:date::DATE             AS date,
    TIPJSON:compliment_count::INT  AS compliment_count
FROM "YELPRATINGPROJECT"."STAGING"."TIP_STAGING_TABLE";

#Copy data from CHECKIN table in Staging layer into CHECKIN table in ODS
-- JSON-to-columns load for check-ins: both paths of the staged CHECKINJSON
-- document are extracted and cast to STRING to match the table. The target is
-- fully qualified and the column list is explicit.
INSERT INTO "YELPRATINGPROJECT"."ODS"."CHECKIN_ODS_TABLE" (
    business_id,
    date
)
SELECT
    CHECKINJSON:business_id::STRING  AS business_id,
    CHECKINJSON:date::STRING         AS date
FROM "YELPRATINGPROJECT"."STAGING"."CHECKIN_STAGING_TABLE";

#Copy data from COVID table in Staging layer into COVID table in ODS
-- JSON-to-columns load for the COVID features: keys with spaces and mixed
-- case are addressed with double-quoted path elements, cast to STRING, and
-- mapped onto the table's snake_case columns. The target is fully qualified
-- and the column list is explicit.
INSERT INTO "YELPRATINGPROJECT"."ODS"."COVID_ODS_TABLE" (
    business_id,
    highlights,
    delivery_or_takeout,
    grubhub_enabled,
    call_to_action_enabled,
    request_a_quote_enabled,
    covid_banner,
    temporary_closed_until,
    virtual_services_offered
)
SELECT
    COVIDJSON:"business_id"::STRING               AS business_id,
    COVIDJSON:"highlights"::STRING                AS highlights,
    COVIDJSON:"Delivery or takeout"::STRING       AS delivery_or_takeout,
    COVIDJSON:"Grubhub enabled"::STRING           AS grubhub_enabled,
    COVIDJSON:"Call To Action enabled"::STRING    AS call_to_action_enabled,
    COVIDJSON:"Request a Quote Enabled"::STRING   AS request_a_quote_enabled,
    COVIDJSON:"Covid Banner"::STRING              AS covid_banner,
    COVIDJSON:"Temporary Closed Until"::STRING    AS temporary_closed_until,
    COVIDJSON:"Virtual Services Offered"::STRING  AS virtual_services_offered
FROM "YELPRATINGPROJECT"."STAGING"."COVID_STAGING_TABLE";

2.3 Student provides screenshot of a table with three columns: raw files, staging, and ODS.
Each column should record the size of the data in the respective format. The table should have
eight rows, one for each file.

#Create table to store data size


-- Bookkeeping table recording, per source file, its size in bytes as a raw
-- file, in the staging layer, and in the ODS layer.
-- The table comment is a single-quoted string literal: double quotes delimit
-- identifiers in Snowflake, so a double-quoted comment is not a valid string.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."METADATA_TABLE" (
    data_file_name  STRING,
    raw_size        INT,
    staging_size    INT,
    ods_size        INT
)
COMMENT = 'size in bytes of each file in 3 layers';

#Insert name and raw file size into the table


-- Seed one row per source file with its raw on-disk size in bytes.
-- staging_size and ods_size are left NULL here and filled in by later updates.
INSERT INTO "YELPRATINGPROJECT"."ODS"."METADATA_TABLE" (
    data_file_name,
    raw_size
)
VALUES
    ('Business',      124380583),
    ('User',          3684505303),
    ('Review',        6936678061),
    ('Tip',           230307244),
    ('Checkin',       398272056),
    ('Covid',         64835031),
    ('Temperature',   818712),
    ('Precipitation', 529121);

#Copy size of data in staging into the table


-- Fill staging_size from the BYTES column of INFORMATION_SCHEMA.TABLES.
-- Each metadata row is matched to the table named <FILE NAME>_STAGING_TABLE.
-- The match is restricted to the STAGING schema: INFORMATION_SCHEMA.TABLES
-- lists tables from every schema in the database, and a same-named table
-- elsewhere would make the update ambiguous.
UPDATE "YELPRATINGPROJECT"."ODS"."METADATA_TABLE" AS m
SET m.staging_size = t.BYTES
FROM "YELPRATINGPROJECT"."INFORMATION_SCHEMA".tables AS t
WHERE t.table_schema = 'STAGING'
  AND t.table_name = CONCAT(UPPER(m.data_file_name), '_STAGING_TABLE');

#Copy size of data in ods into the table

-- Fill ods_size from the BYTES column of INFORMATION_SCHEMA.TABLES.
-- Each metadata row is matched to the table named <FILE NAME>_ODS_TABLE.
-- The match is restricted to the ODS schema: INFORMATION_SCHEMA.TABLES lists
-- tables from every schema in the database, and a same-named table elsewhere
-- would make the update ambiguous.
UPDATE "YELPRATINGPROJECT"."ODS"."METADATA_TABLE" AS m
SET m.ods_size = t.BYTES
FROM "YELPRATINGPROJECT"."INFORMATION_SCHEMA".tables AS t
WHERE t.table_schema = 'ODS'
  AND t.table_name = CONCAT(UPPER(m.data_file_name), '_ODS_TABLE');

#here is the table

2.4 The student provides an ER diagram that includes all appropriate model information.
2.5 Submission should include a SQL query that show how the datasets are integrated.

# Create integrated table (Review, Temperature, Precipitation)


-- Integrated ODS table: every review column plus the temperature and
-- precipitation measures for the review's date. review_id is the declared
-- primary key. The foreign-key targets are fully qualified so the DDL does
-- not depend on the session's current schema.
-- Note: min/max are unquoted here (stored upper-case), unlike the quoted
-- "min"/"max" columns of the temperature table.
CREATE TABLE "YELPRATINGPROJECT"."ODS"."INTEGRATED_ODS_TABLE" (
    review_id             STRING NOT NULL PRIMARY KEY,
    user_id               STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."ODS"."USER_ODS_TABLE" (user_id),
    business_id           STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."ODS"."BUSINESS_ODS_TABLE" (business_id),
    stars                 INT,
    date                  DATE,
    text                  STRING,
    useful                INT,
    funny                 INT,
    cool                  INT,
    min                   FLOAT,
    max                   FLOAT,
    normal_min            FLOAT,
    normal_max            FLOAT,
    precipitation         FLOAT,
    precipitation_normal  FLOAT
)
COMMENT = 'Integrated table in ODS layer';

#Insert data into INTEGRATED_ODS_TABLE


-- Integrate reviews with the weather on the review date.
-- Both joins are inner joins, so a review is kept only when its date exists
-- in both the temperature and the precipitation ODS tables.
-- Explicit ANSI joins replace the comma join so each join condition sits next
-- to the table it belongs to; all tables are fully qualified and the target
-- columns are listed explicitly.
INSERT INTO "YELPRATINGPROJECT"."ODS"."INTEGRATED_ODS_TABLE" (
    review_id,
    user_id,
    business_id,
    stars,
    date,
    text,
    useful,
    funny,
    cool,
    min,
    max,
    normal_min,
    normal_max,
    precipitation,
    precipitation_normal
)
SELECT
    r.review_id,
    r.user_id,
    r.business_id,
    r.stars,
    r.date,
    r.text,
    r.useful,
    r.funny,
    r.cool,
    t."min",
    t."max",
    t.normal_min,
    t.normal_max,
    p.precipitation,
    p.precipitation_normal
FROM "YELPRATINGPROJECT"."ODS"."REVIEW_ODS_TABLE" AS r
INNER JOIN "YELPRATINGPROJECT"."ODS"."TEMPERATURE_ODS_TABLE" AS t
    ON t.date = r.date
INNER JOIN "YELPRATINGPROJECT"."ODS"."PRECIPITATION_ODS_TABLE" AS p
    ON p.date = r.date;

3. Datawarehouse
3.1 Student provides a diagram of star schema with dimensions and fact tables.

3.2 Student provides the SQL queries necessary to move the data from ODS to DWH

#Create user dimension table USER_DIM_TABLE


-- User dimension of the warehouse star schema. Column-for-column the same
-- shape as the ODS user table; user_id is the declared primary key.
CREATE TABLE "YELPRATINGPROJECT"."DATAWAREHOUSE"."USER_DIM_TABLE" (
    user_id             STRING NOT NULL,
    name                STRING,
    review_count        INT,
    yelping_since       DATE,
    friends             ARRAY,
    useful              INT,
    funny               INT,
    cool                INT,
    fans                INT,
    elite               ARRAY,
    average_stars       FLOAT,
    compliment_hot      INT,
    compliment_more     INT,
    compliment_profile  INT,
    compliment_cute     INT,
    compliment_list     INT,
    compliment_note     INT,
    compliment_plain    INT,
    compliment_cool     INT,
    compliment_funny    INT,
    compliment_writer   INT,
    compliment_photos   INT,
    PRIMARY KEY (user_id)
)
COMMENT = 'User dim table in DWH layer';

#Copy data from ODS to user dimension table USER_DIM_TABLE

-- Copy every user row from the ODS layer into the warehouse user dimension.
-- The target is fully qualified: it lives in the DATAWAREHOUSE schema while
-- the source is in ODS, so an unqualified name would resolve against whatever
-- schema the session happens to be in.
INSERT INTO "YELPRATINGPROJECT"."DATAWAREHOUSE"."USER_DIM_TABLE" (
    user_id,
    name,
    review_count,
    yelping_since,
    friends,
    useful,
    funny,
    cool,
    fans,
    elite,
    average_stars,
    compliment_hot,
    compliment_more,
    compliment_profile,
    compliment_cute,
    compliment_list,
    compliment_note,
    compliment_plain,
    compliment_cool,
    compliment_funny,
    compliment_writer,
    compliment_photos
)
SELECT
    user_id,
    name,
    review_count,
    yelping_since,
    friends,
    useful,
    funny,
    cool,
    fans,
    elite,
    average_stars,
    compliment_hot,
    compliment_more,
    compliment_profile,
    compliment_cute,
    compliment_list,
    compliment_note,
    compliment_plain,
    compliment_cool,
    compliment_funny,
    compliment_writer,
    compliment_photos
FROM "YELPRATINGPROJECT"."ODS"."USER_ODS_TABLE";

#Create business dimension table BUSINESS_DIM_TABLE

-- Business dimension of the warehouse star schema. Column-for-column the same
-- shape as the ODS business table; business_id is the declared primary key.
CREATE TABLE "YELPRATINGPROJECT"."DATAWAREHOUSE"."BUSINESS_DIM_TABLE" (
    business_id   STRING NOT NULL,
    name          STRING,
    address       STRING,
    city          STRING,
    state         STRING,
    postal_code   STRING,
    latitude      FLOAT,
    longitude     FLOAT,
    stars         FLOAT,
    review_count  INT,
    is_open       INT,
    attributes    OBJECT,
    categories    ARRAY,
    hours         ARRAY,
    PRIMARY KEY (business_id)
)
COMMENT = 'Business dim table in DWH layer';

#Copy data from ODS to business dimension table BUSINESS_DIM_TABLE

-- Copy every business row from the ODS layer into the warehouse business
-- dimension. The target is fully qualified: it lives in the DATAWAREHOUSE
-- schema while the source is in ODS, so an unqualified name would resolve
-- against whatever schema the session happens to be in.
INSERT INTO "YELPRATINGPROJECT"."DATAWAREHOUSE"."BUSINESS_DIM_TABLE" (
    business_id,
    name,
    address,
    city,
    state,
    postal_code,
    latitude,
    longitude,
    stars,
    review_count,
    is_open,
    attributes,
    categories,
    hours
)
SELECT
    business_id,
    name,
    address,
    city,
    state,
    postal_code,
    latitude,
    longitude,
    stars,
    review_count,
    is_open,
    attributes,
    categories,
    hours
FROM "YELPRATINGPROJECT"."ODS"."BUSINESS_ODS_TABLE";

#Create temperature and precipitation dimension table TEMPERATURE_PRECIPITATION_DIM_TABLE

-- Weather dimension of the warehouse star schema: one declared key (date)
-- carrying both the temperature and the precipitation measures.
-- "min" and "max" are double-quoted, so they are case-sensitive lower-case
-- identifiers and must be referenced with the same quoting.
CREATE TABLE "YELPRATINGPROJECT"."DATAWAREHOUSE"."TEMPERATURE_PRECIPITATION_DIM_TABLE" (
    date                  DATE NOT NULL,
    "min"                 FLOAT,
    "max"                 FLOAT,
    normal_min            FLOAT,
    normal_max            FLOAT,
    precipitation         FLOAT,
    precipitation_normal  FLOAT,
    PRIMARY KEY (date)
)
COMMENT = 'Temperature and Precipitationdim table in DWH layer';

#Copy data from ODS to temperature and precipitation dimension table TEMPERATURE_PRECIPITATION_DIM_TABLE

-- Build the weather dimension by pairing each ODS temperature row with the
-- ODS precipitation row for the same date. The join is an inner join, so only
-- dates present in both sources reach the dimension.
-- An explicit ANSI join replaces the comma join, and the target is fully
-- qualified because it lives in a different schema from its sources.
INSERT INTO "YELPRATINGPROJECT"."DATAWAREHOUSE"."TEMPERATURE_PRECIPITATION_DIM_TABLE" (
    date,
    "min",
    "max",
    normal_min,
    normal_max,
    precipitation,
    precipitation_normal
)
SELECT
    t.date,
    t."min",
    t."max",
    t.normal_min,
    t.normal_max,
    p.precipitation,
    p.precipitation_normal
FROM "YELPRATINGPROJECT"."ODS"."TEMPERATURE_ODS_TABLE" AS t
INNER JOIN "YELPRATINGPROJECT"."ODS"."PRECIPITATION_ODS_TABLE" AS p
    ON p.date = t.date;

#Create review fact table REVIEW_FACT_TABLE

-- Review fact table of the star schema: one star rating per row, with
-- declared foreign keys to the business, user and weather dimensions.
-- The referenced dimension tables are fully qualified so the DDL does not
-- depend on the session's current schema.
CREATE TABLE "YELPRATINGPROJECT"."DATAWAREHOUSE"."REVIEW_FACT_TABLE" (
    business_id  STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."DATAWAREHOUSE"."BUSINESS_DIM_TABLE" (business_id),
    user_id      STRING FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."DATAWAREHOUSE"."USER_DIM_TABLE" (user_id),
    date         DATE FOREIGN KEY REFERENCES "YELPRATINGPROJECT"."DATAWAREHOUSE"."TEMPERATURE_PRECIPITATION_DIM_TABLE" (date),
    stars        INT
)
COMMENT = 'Review fact table in DHW layer';

#Copy data from ODS to review fact table REVIEW_FACT_TABLE

-- Load the review fact table from the ODS reviews.
-- Each review is matched against the three warehouse dimension tables that
-- the fact table's foreign keys reference, so every fact row has a matching
-- dimension row. In particular the date is checked against
-- TEMPERATURE_PRECIPITATION_DIM_TABLE, not the ODS temperature table, because
-- that dimension only holds dates that have both temperature and
-- precipitation data.
-- Requires the three dimension tables to be loaded before this statement.
INSERT INTO "YELPRATINGPROJECT"."DATAWAREHOUSE"."REVIEW_FACT_TABLE" (
    business_id,
    user_id,
    date,
    stars
)
SELECT
    r.business_id,
    r.user_id,
    r.date,
    r.stars
FROM "YELPRATINGPROJECT"."ODS"."REVIEW_ODS_TABLE" AS r
INNER JOIN "YELPRATINGPROJECT"."DATAWAREHOUSE"."BUSINESS_DIM_TABLE" AS b
    ON b.business_id = r.business_id
INNER JOIN "YELPRATINGPROJECT"."DATAWAREHOUSE"."USER_DIM_TABLE" AS u
    ON u.user_id = r.user_id
INNER JOIN "YELPRATINGPROJECT"."DATAWAREHOUSE"."TEMPERATURE_PRECIPITATION_DIM_TABLE" AS d
    ON d.date = r.date;

3.3 Student provides SQL queries that report business name, temperature, precipitation, and
ratings.

-- Report: average star rating per business name and date, alongside that
-- date's min/max temperature and precipitation.
-- Explicit ANSI joins replace the comma join, the tables are fully qualified,
-- and the aggregate gets a stable column name (avg_stars).
-- NOTE(review): grouping is by business name, so distinct businesses that
-- share a name are averaged together; confirm that is the intended grain.
SELECT
    b.name,
    f.date,
    t."min",
    t."max",
    t.precipitation,
    AVG(f.stars) AS avg_stars
FROM "YELPRATINGPROJECT"."DATAWAREHOUSE"."REVIEW_FACT_TABLE" AS f
INNER JOIN "YELPRATINGPROJECT"."DATAWAREHOUSE"."BUSINESS_DIM_TABLE" AS b
    ON b.business_id = f.business_id
INNER JOIN "YELPRATINGPROJECT"."DATAWAREHOUSE"."TEMPERATURE_PRECIPITATION_DIM_TABLE" AS t
    ON t.date = f.date
GROUP BY
    b.name,
    f.date,
    t."min",
    t."max",
    t.precipitation
ORDER BY
    b.name,
    f.date;

Result query:

You might also like