· 7 years ago · Nov 08, 2018, 01:26 AM
1#!/bin/bash
2
3##################################################
4# Constants #
5##################################################
6
7delimiter='~'
8soft_delimiter=","
9column_limit=9
10
11##################################################
12# Variables #
13##################################################
14
15#cluster specific:
16alias imp=''
17partitions_table='example.partition_analysis'
18partitions_table_location='/example/partition_analysis'
19partitions_table_view='example.partition_analysis_conformed'
20
21#instance specific:
22db='db_navbi_curated'
23output_file='./eric.csv'
24
25#safety values:
26table_limit=2500
27partition_limit=50000
28
29##################################################
30# Functions #
31##################################################
32
33#write to stdout with a timestamp
34log () {
35 echo "[$(date +%s)] $1"
36}
37
38#write to an output file
39write () {
40 echo "$1" >> ${output_file}
41}
42
43#split a string by its delimiter, reverse it, and recompose it with that delimiter
44reverse_by_delimiter () {
45 echo "$1" | perl -lne 'print join "'${delimiter}'", reverse split/\'${delimiter}'/;'
46}
47
48#replace all instances of a delmiter with a soft delmiter after a certain amount of delimiters
49trim_columns () {
50 echo "$1" | sed "s/${delimiter}/${soft_delimiter}/${column_limit}g"
51}
52
53##################################################
54# Main script #
55##################################################
56
57#set up:
58log "Setting up..."
59rm -f ${output_file} 2> /dev/null
60hdfs dfs -mkdir -p ${partitions_table_location} 2> /dev/null
61
62#loop through all impala tables in $db:
63log "Collecting information on up to ${table_limit} tables..."
64imp -Bq "use ${db}; show tables;" 2> /dev/null | head -n ${table_limit} | while read table; do
65 log "Fetching stats for ${db}.${table}..."
66 partitions="$(imp --output_delimiter='~' -Bq 'show partitions '${db}'.'${table}';' 2> /dev/null)"
67 echo "${partitions}" | grep -v "Total${delimiter}" | head -n ${partition_limit} | while read partition; do
68 write "${db}.${table}${delimiter}$(trim_columns "$(reverse_by_delimiter "${partition}")")"
69 done
70done
71
72#create the impala table if it doesn't yet exist:
73imp -Bq '
74 DROP TABLE IF EXISTS '${partitions_table}';
75 CREATE EXTERNAL TABLE IF NOT EXISTS '${partitions_table}' (
76 p_TABLE STRING,
77 P_LOCATION STRING,
78 P_INCREMENTAL BOOLEAN,
79 P_FORMAT STRING,
80 P_REPLICATION STRING,
81 P_CACHE STRING,
82 P_SIZE STRING,
83 P_FILES INT,
84 P_ROWS INT,
85 P_PARTITION STRING
86 )
87 ROW FORMAT DELIMITED
88 FIELDS TERMINATED BY '"'${delimiter}'"'
89 STORED AS TEXTFILE
90 LOCATION '"'${partitions_table_location}'"'
91 '
92
93#insert into the impala table:
94log 'Loading data into Impala...'
95hdfs dfs -put -f ${output_file} ${partitions_table_location}
96
97#refresh the impala table:
98log 'Refreshing Impala...'
99imp -Bq "REFRESH ${partitions_table}"
100
101log "Load complete."
102
103#create a view where the size data is meaningful:
104imp -Bq '
105 DROP VIEW IF EXISTS '${partitions_table_view}'
106 CREATE VIEW IF NOT EXISTS '${partitions_table_view}' AS SELECT
107 CASE p_size_unit
108 WHEN 'KB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 AS INT)
109 WHEN 'MB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 AS INT)
110 WHEN 'GB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 AS INT)
111 WHEN 'TB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 * 1024 AS INT)
112 ELSE CAST(p_size AS INT) END p_size_bytes,
113 p_table,
114 p_location,
115 p_size,
116 p_files,
117 p_rows,
118 p_partition
119 FROM (
120 SELECT
121 substring(p_size, 1, length(p_size) - 2) p_size_raw,
122 substring(p_size, length(p_size) - 1, 2) p_size_unit,
123 p_table,
124 p_location,
125 p_size,
126 p_files,
127 p_rows,
128 p_partition
129 FROM
130 db_navbi.partition_analysis
131 WHERE p_size IS NOT NULL AND length(p_size) > 2
132 ) partition_analysis_split
133;
134'
135
136#example query:
137# show the tables with the lowest file size with a reasonable number of files:
138imp -q '
139select * from (select p_table, avg(avg_file_size) avg_file_size_table, sum(p_files) sum_files, count(*) count_partitions from (select p_table, p_files, (p_size_bytes / p_files) as avg_file_size from db_navbi.partition_analysis_conformed) avgs group by p_table) tbl_avgs where sum_files > 25 order by avg_file_size_table limit 25;
140'