YZAK8Kr2

· 6 years ago · Mar 19, 2019, 09:58 PM
1// ===========================================================================
2// Welcome! 
3// 
4// Log in to your Snowflake instance, create a new worksheet, then
5// copy and paste the contents of this file into your worksheet.
6//
7// Make sure you're using the ACCOUNTADMIN role before we get started 
8// (see the context in the upper right corner of your worksheet)
9//
10// Next, head over to the Partner Connect page (link in the top right of the snowflake toolbar).
11//
12// We're going to start by setting up a Talend Stitch account using the built in Snowflake connection. 
13// This will give us everything we need to start securely ingesting data.
14// ===========================================================================
15
16
17
18
19// ===========================================================================
20// Next, let's manually setup PowerBI similarly to how Partner Connect set
21// Talend Stitch up for us before.
22// ===========================================================================
23// Create the DB, user, role, and warehouse that powerbi will use to securely connect
24CREATE DATABASE IF NOT EXISTS POWERBI;
25CREATE USER IF NOT EXISTS POWERBI_USER_ACCOUNT PASSWORD="my super cool password." MUST_CHANGE_PASSWORD=false; // use your own password, dummy
26CREATE ROLE IF NOT EXISTS POWERBI_ROLE;
27CREATE WAREHOUSE IF NOT EXISTS POWERBI_WH WAREHOUSE_SIZE=XSMALL;
28
29// Attach permissions
30GRANT USAGE ON DATABASE POWERBI TO ROLE POWERBI_ROLE;
31GRANT USAGE ON WAREHOUSE POWERBI_WH TO ROLE POWERBI_ROLE;
32ALTER USER POWERBI_USER_ACCOUNT SET DEFAULT_ROLE=POWERBI_ROLE; // you must do this because the PowerBI connector doesn't let you specify a role.
33// ===========================================================================
34
35
36
37
38// ===========================================================================
39// Now, setup ingestion from the Stitch side of things and wait for data to flow.
40//
41// Let's explore the raw data.
42// ===========================================================================
43// Initial look at data
44SELECT * FROM PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA LIMIT 10;
45
46// let's make the table easier to use in queries
47SET SPOTIFY = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA';
48SELECT * FROM TABLE($SPOTIFY) LIMIT 10;
49
50// we can also set context to do this
51USE DATABASE PC_STITCH_DB;
52USE SCHEMA AWS_S3_CSV_BUCKET;
53SELECT * FROM SPOTIFY_DATA LIMIT 10;
54
55// NOTE: I prefer the variable approach for worksheets
56// where you'll be in multiple databases. It prevents mistakes
57// when running different parts of your worksheet 
58
59// let's find out what the top songs are
60SELECT * FROM TABLE($SPOTIFY) ORDER BY STREAMS DESC LIMIT 10;
61
62// let's see what date ranges we're working with
63SELECT
64    MIN(DATE) AS startDate,
65    MAX(DATE) AS endDate
66FROM
67    TABLE($SPOTIFY);
68
69// looks like we're essentially playing with top daily streams for all of 2017.
70// Let's see which artist had the most streams over the whole data set
71SELECT 
72    ARTIST, SUM(STREAMS) as streamCount
73FROM 
74    TABLE($SPOTIFY)
75GROUP BY
76    ARTIST
77ORDER BY
78    streamCount DESC
79LIMIT 10;
80
81// hmm, these numbers seem high. Maybe we need to explore those region codes
82SELECT DISTINCT REGION FROM TABLE($SPOTIFY);
83
84// maybe we should just use global? I think aggregating with regions AND global
85// is resulting in double counting. Lets find out using Drake's streams.
86SET globalDrakeSteams =         (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION='global');
87SET sumOfRegionalDrakeStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION!='global');
88SELECT $globalDrakeSteams, $sumOfRegionalDrakeStreams;
89
90// what about Kendrick Lamar ("DAMN." came out in the spring of 2017 and it is a masterpiece)
91SET globalKendrickSteams =         (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION='global');
92SET sumOfRegionalKendrickStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION!='global');
93SELECT $globalKendrickSteams, $sumOfRegionalKendrickStreams;
94
95// it's not exact, but close enough to convince me to filter by global from now on. (check with other artists if you like)
96// The easiest way to do that is with a view. Let's make one now
97CREATE VIEW IF NOT EXISTS 
98  PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL
99AS
100  SELECT * FROM TABLE($SPOTIFY) WHERE REGION='global';
101  
102// Now grab the view name as a variable
103SET SPOTIFY_GLOBAL = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL';
104
105// Cool, now we're working with some cleaner data.
106// Let's revisit our top artists.
107SELECT 
108    ARTIST, SUM(STREAMS) as streamCount
109FROM 
110    TABLE($SPOTIFY_GLOBAL)
111GROUP BY
112    ARTIST
113ORDER BY
114    streamCount DESC
115LIMIT 10;
116
117// now top songs
118SELECT 
119    ARTIST, TRACK_NAME, SUM(STREAMS) as streamCount
120FROM 
121    TABLE($SPOTIFY_GLOBAL)
122GROUP BY
123    ARTIST, TRACK_NAME
124ORDER BY
125    streamCount DESC
126LIMIT 10;
127
128// wow, Ed Sheeran was killing it in 2017. Good for him.
129//
130// I think we have enough context to maybe build a good BI table...
131// ===========================================================================
132
133
134
135
136// ===========================================================================
137// Now, let's send some clean, BI-ready data to our PowerBI DB.
138//
139// Our goal from here will be to transform the data in PC_STITCH_DB to a clean,
140// BI-ready form inside of the POWERBI database for analytics consumption
141// ===========================================================================
142// let's create our spotify schema in the powerbi db
143CREATE SCHEMA IF NOT EXISTS POWERBI.SPOTIFY;
144
145// I think it would be fun to see the top 5 songs every
146// month for US streaming.
147
148// start by creating a temp table with stream counts for each song by month.
149// We'll filter to only US data and we'll ignore the partial January 2018 data
150CREATE OR REPLACE TEMPORARY TABLE
151  POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
152AS (
153SELECT 
154  ARTIST, 
155  TRACK_NAME, 
156  SUM(STREAMS) AS STREAM_COUNT, 
157  DATE_TRUNC(MONTH, DATE) AS MONTH
158FROM
159  TABLE($SPOTIFY)
160WHERE
161  REGION='us' AND YEAR(DATE) = 2017
162GROUP BY 
163  MONTH, ARTIST, TRACK_NAME
164);
165
166// now we'll get song ranks within each month using a nested
167// query with a window function
168CREATE OR REPLACE TABLE
169    POWERBI.SPOTIFY.US_MONTHLY_TOP_5_SONGS
170AS (
171SELECT * FROM 
172(
173    SELECT
174      *,
175      RANK() OVER (PARTITION BY MONTH ORDER BY STREAM_COUNT DESC) AS MONTHLY_RANK
176    FROM
177      POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
178)
179WHERE
180  MONTHLY_RANK <= 5
181);
182
183// now that we're finished with all that messiness, let's grant read access
184// on our new schema and table to our powerbi user
185GRANT USAGE ON SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;
186GRANT SELECT ON ALL TABLES IN SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;
187
188
189// Great job! You should be able to make some cool visuazliations in PBI now.
190// ===========================================================================