-- Pasted snippet metadata: Oct 28, 2019, 11:52 AM
--Hive queries template:
--1) DDL
--CSV

-- Delimited-text (CSV-style) table template, '~'-separated fields.
-- EXTERNAL added: a managed table with an explicit S3 LOCATION deletes the
-- underlying S3 data on DROP TABLE; EXTERNAL drops metadata only.
CREATE EXTERNAL TABLE IF NOT EXISTS <schemaname>.CSV_TABLE
(
  uuid         VARCHAR(50),
  flag         CHAR(1),
  bigid        BIGINT,
  record_dt    DATE,
  amount       DECIMAL(20,2),   -- exact decimal; never FLOAT for monetary values
  record_tmstp TIMESTAMP
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '~'
LINES TERMINATED BY '\n'         -- '\n' is the only terminator Hive supports here
STORED AS TEXTFILE
LOCATION 's3://<bucket>/<prefix>/';

--ORC

-- ORC table template: columnar storage with SNAPPY block compression.
-- STORED AS ORC is exact shorthand for the OrcSerde + OrcInputFormat +
-- OrcOutputFormat triple the original spelled out.
CREATE EXTERNAL TABLE IF NOT EXISTS `<schemaname>.ORC_TABLE`(
  `uuid` VARCHAR(50),
  `flag` CHAR(1),
  `bigid` BIGINT,
  `record_dt` DATE,
  `amount` DECIMAL(20,2),
  `record_tmstp` TIMESTAMP)
STORED AS ORC
LOCATION
  's3://<bucket>/<prefix>'
TBLPROPERTIES (
  'orc.compress'='SNAPPY');   -- terminator was missing in the original

--AVRO
--The schema is taken from the Avro schema file referenced by avro.schema.url.
-- Avro table template, partitioned.
-- NOTE(review): with 'avro.schema.url' set, the AvroSerDe reads the table
-- schema from the .avsc file; the inline column list below must be kept in
-- sync with it (on drift the .avsc typically wins — confirm for your Hive
-- version before relying on the inline columns).
CREATE EXTERNAL TABLE IF NOT EXISTS `<schemaname>.AVRO_TABLE`(
  `uuid` VARCHAR(50),
  `flag` CHAR(1),
  `bigid` BIGINT,
  `record_dt` DATE,
  `amount` DECIMAL(20,2),
  `record_tmstp` TIMESTAMP)
PARTITIONED BY (
  `partition` int)              -- backticks required: "partition" is reserved
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION
  's3://<bucket>/<prefix>'
TBLPROPERTIES (
  'avro.schema.url'='s3://<bucket>/<prefix2>/file.avsc');

--PARQUET

--JSON
-- JSON table template using the HCatalog JsonSerDe.
-- STORED AS TEXTFILE is exact shorthand for TextInputFormat +
-- HiveIgnoreKeyTextOutputFormat. The 'from deserializer' comments were
-- SHOW CREATE TABLE artifacts, not authored documentation.
CREATE EXTERNAL TABLE IF NOT EXISTS `<schemaname>.JSON_TABLE`(
  `uuid` string,
  `firstname` string,
  `lastname` string)
ROW FORMAT SERDE
  'org.apache.hive.hcatalog.data.JsonSerDe'
WITH SERDEPROPERTIES (
  'paths'='uuid,firstname,lastname')  -- no spaces: entries must match JSON keys exactly
STORED AS TEXTFILE
LOCATION
  's3://<bucket>/<prefix>';            -- terminator was missing in the original

--CSV TABLE WITH BUCKETS

-- Bucketed delimited-text table template.
-- Fix: CLUSTERED BY must reference a declared column — the original used
-- `bucketid`, which is not in the column list, so the CREATE fails with a
-- SemanticException. Bucketing on `primarykey` here; swap in your real key.
-- EXTERNAL added so DROP TABLE does not delete the S3 data.
CREATE EXTERNAL TABLE IF NOT EXISTS `<schemaname>.CSV_TABLE`(
  `id` string,
  `id2` string,
  `primarykey` string,
  `noid` string,
  `type` string,
  `source` string)
COMMENT 'Data from source'
CLUSTERED BY (
  primarykey)
INTO 100 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','  -- equivalent to LazySimpleSerDe with field.delim=','
STORED AS TEXTFILE
LOCATION
  's3://<bucket>/<prefix>';

--WITH PARTITION

-- Partitioned + bucketed delimited-text table template.
-- Fixes: (1) Hive grammar requires PARTITIONED BY before CLUSTERED BY — the
-- original had them reversed; (2) CLUSTERED BY must reference a declared
-- column — `bucketid` is not in the column list, so the CREATE fails.
-- Bucketing on `primarykey`; swap in your real bucketing key.
CREATE EXTERNAL TABLE IF NOT EXISTS `<schemaname>.CSV_TABLE_WITH_PARTITION`(
  `id` string,
  `id2` string,
  `primarykey` string,
  `noid` string,
  `type` string,
  `source` string)
COMMENT 'Data from source'
PARTITIONED BY (
  `partition` int)              -- backticks required: "partition" is reserved
CLUSTERED BY (
  primarykey)
INTO 100 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','  -- equivalent to LazySimpleSerDe with field.delim=','
STORED AS TEXTFILE
LOCATION
  's3://<bucket>/<prefix>';

--COMPRESSION TYPES
--CTAS
--
--2) DML
--INSERT INTO BUCKETED TABLE
--INSERT INTO PARTITIONED TABLE - DYNAMIC PARTITION FLAGS
--CSV GZIP - MENTION COMPRESSION FLAGS
--INSERT OVERWRITE TO TABLE
--
--3) EXTERNAL TABLE
--MSCK REPAIR TABLE
--
--4) STATS collection for partitioned and non-partitioned columns.