iqyXqeRR

· 7 years ago · Nov 08, 2018, 01:26 AM
1#!/bin/bash
2
3##################################################
4#                   Constants                    #
5##################################################
6
7delimiter='~'
8soft_delimiter=","
9column_limit=9
10
11##################################################
12#                   Variables                    #
13##################################################
14
15#cluster specific:
16alias imp=''
17partitions_table='example.partition_analysis'
18partitions_table_location='/example/partition_analysis'
19partitions_table_view='example.partition_analysis_conformed'
20
21#instance specific:
22db='db_navbi_curated'
23output_file='./eric.csv'
24
25#safety values:
26table_limit=2500
27partition_limit=50000
28
29##################################################
30#                   Functions                    #
31##################################################
32
33#write to stdout with a timestamp
34log () {
35  echo "[$(date +%s)] $1"
36}
37
38#write to an output file
39write () {
40  echo "$1" >> ${output_file}
41}
42
43#split a string by its delimiter, reverse it, and recompose it with that delimiter
44reverse_by_delimiter () {
45  echo "$1" | perl -lne 'print join "'${delimiter}'", reverse split/\'${delimiter}'/;'
46}
47
48#replace all instances of a delmiter with a soft delmiter after a certain amount of delimiters
49trim_columns () {
50  echo "$1" | sed "s/${delimiter}/${soft_delimiter}/${column_limit}g"
51}
52
53##################################################
54#                 Main script                    #
55##################################################
56
57#set up:
58log "Setting up..."
59rm -f ${output_file} 2> /dev/null
60hdfs dfs -mkdir -p ${partitions_table_location} 2> /dev/null
61
62#loop through all impala tables in $db:
63log "Collecting information on up to ${table_limit} tables..."
64imp -Bq "use ${db}; show tables;" 2> /dev/null | head -n ${table_limit} | while read table; do
65  log "Fetching stats for ${db}.${table}..."
66  partitions="$(imp --output_delimiter='~' -Bq 'show partitions '${db}'.'${table}';' 2> /dev/null)"
67  echo "${partitions}" | grep -v "Total${delimiter}" | head -n ${partition_limit} | while read partition; do
68	write "${db}.${table}${delimiter}$(trim_columns "$(reverse_by_delimiter "${partition}")")"
69  done
70done
71
72#create the impala table if it doesn't yet exist:
73imp -Bq '
74  DROP TABLE IF EXISTS '${partitions_table}';
75  CREATE EXTERNAL TABLE IF NOT EXISTS '${partitions_table}' (
76    p_TABLE STRING,
77	P_LOCATION STRING,
78    P_INCREMENTAL BOOLEAN,
79    P_FORMAT STRING,
80    P_REPLICATION STRING,
81    P_CACHE STRING,
82    P_SIZE STRING,
83    P_FILES INT,
84    P_ROWS INT,
85    P_PARTITION STRING
86  ) 
87  ROW FORMAT DELIMITED 
88  FIELDS TERMINATED BY '"'${delimiter}'"'
89  STORED AS TEXTFILE
90  LOCATION '"'${partitions_table_location}'"'
91  '
92
93#insert into the impala table:
94log 'Loading data into Impala...'
95hdfs dfs -put -f ${output_file} ${partitions_table_location}
96
97#refresh the impala table:
98log 'Refreshing Impala...'
99imp -Bq "REFRESH ${partitions_table}"
100
101log "Load complete."
102  
103#create a view where the size data is meaningful:
104imp -Bq '
105 DROP VIEW IF EXISTS '${partitions_table_view}'
106 CREATE VIEW IF NOT EXISTS '${partitions_table_view}' AS SELECT 
107  CASE p_size_unit 
108   WHEN 'KB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 AS INT)
109   WHEN 'MB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 AS INT)
110   WHEN 'GB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 AS INT)
111   WHEN 'TB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 * 1024 AS INT)
112   ELSE CAST(p_size AS INT) END p_size_bytes,
113   p_table,
114   p_location,
115   p_size,
116   p_files,
117   p_rows,
118   p_partition
119  FROM (
120   SELECT 
121     substring(p_size, 1, length(p_size) - 2) p_size_raw,
122     substring(p_size, length(p_size) - 1, 2) p_size_unit,
123 	p_table,
124     p_location,
125     p_size,
126     p_files,
127     p_rows,
128     p_partition
129   FROM 
130 	db_navbi.partition_analysis
131   WHERE p_size IS NOT NULL AND length(p_size) > 2
132 ) partition_analysis_split
133;
134'
135
136#example query:
137# show the tables with the lowest file size with a reasonable number of files:
138imp -q '
139select * from (select p_table, avg(avg_file_size) avg_file_size_table, sum(p_files) sum_files, count(*) count_partitions from (select p_table, p_files, (p_size_bytes / p_files) as avg_file_size from db_navbi.partition_analysis_conformed) avgs group by p_table) tbl_avgs where sum_files > 25 order by avg_file_size_table limit 25;
140'