· 7 years ago · Jan 14, 2019, 01:20 PM
1-- Creating a ETL script by passing run_date value
2
3-- HIVE CONF
4set hive.exec.dynamic.partition.mode=nonstrict;
5
6-- external parameters
7-- set RUN_DATE=2019-01-05;
8-- USAGE:
9-- hive -hiveconf RUN_DATE='2019-01-05' -f script.hql
10
11
12CREATE SCHEMA IF NOT EXISTS nik;
13CREATE SCHEMA IF NOT EXISTS prod;
14
15CREATE EXTERNAL TABLE IF NOT EXISTS prod.clean_access_log(
16 ip STRING,
17 request_type STRING,
18 uri STRING,
19 protocol STRING,
20 bytes_sent STRING,
21 referer STRING,
22 useragent STRING
23)
24PARTITIONED BY (dte STRING)
25STORED AS PARQUET
26LOCATION 's3://nikita-ds-playground/prod/clean_access_log/';
27
28DROP TABLE nik.access_log_temp;
29
30CREATE EXTERNAL TABLE IF NOT EXISTS nik.access_log_temp
31(log STRING)
32LOCATION 's3://nikita-ds-playground/raw/access-log/${hiveconf:RUN_DATE}/';
33
34
35INSERT OVERWRITE TABLE prod.clean_access_log PARTITION(dte)
36select
37split(log, " ")[0] as ip,
38substr(split(log, " ")[5], 2) as request_type,
39split(log, " ")[6] as uri,
40cast(split(log, " ")[8] as int) as protocol,
41cast(split(log, " ")[9] as bigint) as bytes_sent,
42substr( split(log, " ")[10],2, length( split(log, " ")[10])-2 ) as referrer,
43substr(split(log, "\\\" \\\"")[1], 1, length(split(log, "\\\" \\\"")[1]) - 1 ) as useragent,
44'${hiveconf:RUN_DATE}' as dte
45from nik.access_log_temp ;
46
47DROP TABLE nik.access_log_temp;
48
49-- verify data
50-- select dte, count(1) from prod.clean_access_log group by dte;