1#!/bin/bash
2### Author: Arik Rozenman (arik.rozenman@nokia.com)
3### Currently supported CBIS versions: 19A and 20
4### All Rights Reserved
5
6
7### colors
8NC='\033[0m'
9RED='\033[0;31m'
10GREEN='\033[32m'
11ORANGE='\033[33m'
12BLUE='\033[34m'
13BOLD='\e[1m'
14MAGENTA='\e[35m'
15CYAN='\e[36m'
16UL='\e[4m'
17LRB='\e[101m'
18BB='\e[44m'
19BLINK='\e[5m'
20
21
22### variables
23hv_cbis_admin_password=password
24ssh_params="-q -o LogLevel=error -o GlobalKnownHostsFile=/dev/null -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
25FAILURE_COUNTER=0
26SCRIPT_VERSION="8.9.5"
27
28
29logs_dir="/home/stack/system_health_validation_logs"
30if [ -d "$logs_dir/" ]
31then
32 logs_count=$(ls -l $logs_dir/ | grep -c system_health_report | xargs -i expr {} + 1)
33else
34 echo -e "${CYAN}creating the directory $logs_dir/${NC}"
35 mkdir -p $logs_dir/
36fi
37
38
39date=$(date +"%x %X %Z %Y")
40echo -e "${BLUE}\nWELCOME TO THE SYSTEM HEALTH VALIDATION VERSION $SCRIPT_VERSION SCRIPT ($date)${NC}"
41echo -e "${BLUE}THE SCRIPT OUTPUT WILL RESIDE AT: $logs_dir/ DIRECTORY${NC}\n\n"
42echo -e "${ORANGE}for script usage, run the script with -h (system_health_validation.sh -h)${NC}\n\n"
43echo -e "${MAGENTA}changes in version 8.9.5:\n1. added availability zones status validation${NC}\n"
44echo -e "${MAGENTA}changes in version 8.9.4:\n1. minor bug fixes\n2. revamp the docker CpusetCpus validation${NC}\n"
45echo -e "${MAGENTA}changes in version 8.9.3:\n1. minor bug fixes\n2. added zabbix global macros validation\n3. added computes total memory size validation${NC}\n\n"
46
47
48trap ctrl_c INT
49function ctrl_c() {
50 if [[ $FAILURE_COUNTER = "0" ]]
51 then
52 echo -e "${GREEN}failures found: $FAILURE_COUNTER${NC}"
53 exit
54 else
55 echo -e "${RED}failures found: $FAILURE_COUNTER${NC}"
56 exit
57 fi
58}
59
60
61OPTIND=1
62while getopts ":h" opt
63do
64 case ${opt}
65 in
66 h) echo -e "Usage:\t-h\t\t\t\t- SHOW SCRIPT USAGE\n\t-d <'description'>\t\t- FILE DESCRIPTION (default is no description)\n\t-u <username>\t\t\t- CBIS MANAGER USERNAME (default is 'cbis-admin')\n\t-p <password>\t\t\t- CBIS MANAGER PASSWORD (default is 'password')\n\t-e <yes/no>\t\t\t- ESSENTIAL CHECKS ONLY (default is no)\n\t-c <yes/no>\t\t\t- CONNECTIVITY CHECK ONLY (default is no)\n\t-x <stack user password>\t- SET USER stack PASSWORD"
67 exit
68 esac
69done
70
71
72# OPTIND=1 is set because without it the second getopts call is ignored: every time getopts parses an option it increments OPTIND, and getopts starts reading arguments from the position OPTIND points to.
73# therefore, the OPTIND variable needs to be reset before every additional getopts loop.
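# illustrative sketch (comments only, not executed): without the reset a second getopts loop
# would start where the first one stopped, e.g.:
#   while getopts ":h" opt; do :; done            # leaves OPTIND pointing past the parsed args
#   OPTIND=1                                      # rewind so the next loop re-reads "$@" from the start
#   while getopts d:u:p:e:c:x: option; do :; done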
74OPTIND=1
75while getopts d:u:p:e:c:x: option
76do
77 case "${option}"
78 in
79 d) DESCRIPTION=${OPTARG};;
80 u) CBIS_MANAGER_USER=${OPTARG};;
81 p) CBIS_MANAGER_PASSWORD=${OPTARG};;
82 e) ESSENTIAL=${OPTARG};;
83 c) CONNECTIVITY=${OPTARG};;
84 x) STACK_PASSWORD=${OPTARG};;
85 esac
86done
87
88
89if [[ $DESCRIPTION == "" ]]
90then
91 echo -e "${RED}the script must run with -d <DESCRIPTION> (e.g, system_health_validation.sh -d 'after_clean_deploy_before_hardening'). exiting..${NC}"
92 exit
93else
94 if [[ "$DESCRIPTION" =~ \ |\' ]]
95 then
96 echo -e "${RED}the description can't have white-spaces in it. exiting..${NC}"
97 exit
98 fi
99fi
100
101
102if [[ $CBIS_MANAGER_USER == "" ]]
103then
104 CBIS_MANAGER_USER=cbis-admin
105 echo -e "${ORANGE}> no cbis manager username input (-u) was entered, using default username <cbis-admin>${NC}"
106fi
107if [[ $CBIS_MANAGER_PASSWORD == "" ]]
108then
109 CBIS_MANAGER_PASSWORD=password
110 echo -e "${ORANGE}> no cbis manager password input (-p) was entered, using default password <password>${NC}"
111fi
112cbis_manager_token=$(echo -n "$CBIS_MANAGER_USER:$CBIS_MANAGER_PASSWORD" | base64)
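# note: the token is a plain HTTP Basic auth credential (base64 of "user:password"); it is consumed
# later by the curl calls as: -H 'Authorization: Basic '$cbis_manager_token''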
113
114
115
116if [[ $ESSENTIAL == "yes" ]]
117then
118 echo -e "${ORANGE}> running only the script essential checks - to run all the checks, execute the script with [-e no] or simply don't use [-e]${NC}"
119 MODE=ESSENTIAL
120fi
121if [[ $ESSENTIAL == "no" ]]
122then
123 echo -e "${ORANGE}> running all the script checks - to run just the essential checks, execute the script with [-e yes]${NC}"
124 ESSENTIAL=no
125 MODE=FULL
126fi
127if [[ $ESSENTIAL == "" ]]
128then
129 echo -e "${ORANGE}> running all the script checks - to run just the essential checks, execute the script with [-e yes]${NC}"
130 ESSENTIAL=no
131 MODE=FULL
132fi
133if [[ $ESSENTIAL != "yes" && $ESSENTIAL != "no" ]]
134then
135 echo -e "${RED}-e accepts only [yes/no] input. exiting..${NC}"
136 exit
137fi
138
139
140if [[ $CONNECTIVITY == "" ]]
141then
142 CONNECTIVITY=no
143fi
144if [[ $CONNECTIVITY == "yes" ]]
145then
146 echo -e "> running the script with -c yes (connectivity check only)"
147fi
148if [[ $CONNECTIVITY != "yes" && $CONNECTIVITY != "no" ]]
149then
150 echo -e "${RED}-c accepts only [yes/no] input. exiting..${NC}"
151 exit
152fi
153
154
155###########################################################################################################
156main_function() {
157global_start=$(date +%s)
158start=$(date +%s)
159###########################################################################################################
160
161
162echo -e "\n\n${BLUE}system health validation script version: $SCRIPT_VERSION${NC}"
163echo -e "\n\n${UL}${BOLD}Colors Legend${NC}"
164echo -e "${GREEN}Success"
165echo -e "${RED}Failure (ideally, each failure equals bug)"
166echo -e "${MAGENTA}Action item for the user"
167echo -e "${ORANGE}Warning that should be read and acknowledged"
168echo -e "${NC}Information (for future debug purposes)"
169echo -e "${BLUE}Presentation (to make the script more eye appealing and organized)"
170echo -e "${CYAN}Presentation (to make the script more eye appealing and organized)\n"
171
172UNDERCLOUD_LOCAL_IP_ADDRESS=$(ip address show | grep 172.31.0.1/21)
173if [[ -z $UNDERCLOUD_LOCAL_IP_ADDRESS ]]
174then
175 echo -e "\n${RED}the system_health_validation.sh script must be executed only from the undercloud virtual machine. exiting..${NC}"
176 exit
177fi
178
179
180if [[ $CONNECTIVITY == "yes" ]]
181then
182 echo -e "${BLUE}\nINSTANCES CONNECTIVITY VALIDATION (VIA NETWORK NAMESPACE)${NC}"
183 nuage=$(grep nuage: user_config.yaml | awk '{print $2}' | column -t)
184 if [[ $nuage != "true" ]]
185 then
186 source ~/overcloudrc
187 # the reason behind working with the last index controller is to find issues related to controller replacement, e.g., an issue we faced where connectivity via the namespace to a VM failed only from the replaced controller.
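# the reverse sort below picks the highest-indexed controller hostname, e.g. overcloud-controller-2
# on a default three-controller deployment (hostname is illustrative; the prefix varies with cloud_name)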
188 last_index_controller=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort -r --uniq | awk NR==1)
189 cbis_version=$(openstack cbis version -f value | grep build | awk -F- '{print $2}')
190 instances=$(openstack server list --all -f value | wc -l)
191 if [[ $instances != "0" ]]
192 then
193 if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
194 then
195 networks=$(openstack server list --all --long -c Networks -f value | grep -E -i -v ':|avrs' | awk -F= '{print $1}' | sort -u)
196 if [[ $networks ]]
197 then
198 for network in $networks
199 do
200 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
201 if [[ -z $addresses ]]
202 then
203 echo -e "${RED}addresses variable didn't return any value${NC}"
204 fi
205 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
206 if [[ -z $network_id ]]
207 then
208 echo -e "${RED}network_id variable didn't return any value${NC}"
209 fi
210 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id nping -4 --tcp-connect -p 22 -c 3 $addresses\"${NC}"
211 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id nping -4 --tcp-connect -p 22 -c 3 $addresses" | grep 'Failed: [1-9]')
212 if [[ -z $result ]]
213 then
214 echo -e "${GREEN}network $network addresses replied successfully${NC}"
215 else
216 echo -e "${RED}$result${NC}"
217 fi
218 done
219 fi
220 networks=$(openstack server list --all --long -c Networks -f value | grep -i -v avrs | grep : | awk -F= '{print $1}' | sort -u)
221 if [[ $networks ]]
222 then
223 for network in $networks
224 do
225 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
226 if [[ -z $addresses ]]
227 then
228 echo -e "${RED}addresses variable didn't return any value${NC}"
229 fi
230 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
231 if [[ -z $network_id ]]
232 then
233 echo -e "${RED}network_id variable didn't return any value${NC}"
234 fi
235 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id nping -6 --tcp-connect -p 22 -c 3 $addresses\"${NC}"
236 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id nping -6 --tcp-connect -p 22 -c 3 $addresses" | grep 'Failed: [1-9]')
237 if [[ -z $result ]]
238 then
239 echo -e "${GREEN}network $network addresses replied successfully${NC}"
240 else
241 echo -e "${RED}$result${NC}"
242 fi
243 done
244 fi
245 else
246 networks=$(openstack server list --all --long -c Networks -f value | grep -v -i avrs)
247 ipv4_networks=$(echo "$networks" | grep -v : | awk -F= '{print $1}' | sort -u)
248 ipv6_networks=$(echo "$networks" | grep : | awk -F= '{print $1}' | sort -u)
249 if [[ $ipv4_networks ]]
250 then
251 for network in $ipv4_networks
252 do
253 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
254 if [[ -z $addresses ]]
255 then
256 echo -e "${RED}addresses variable didn't return any value${NC}"
257 fi
258 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
259 if [[ -z $network_id ]]
260 then
261 echo -e "${RED}network_id variable didn't return any value${NC}"
262 fi
263 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id fping $addresses'\"${NC}"
264 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id fping $addresses" | grep ^[0-9] | grep -v 'is alive')
265 if [[ $result ]]
266 then
267 echo -e "\n${RED}\n$result${NC}\n"
268 else
269 echo -e "${GREEN}all the addresses of network $network replied successfully${NC}"
270 fi
271 done
272 fi
273 if [[ $ipv6_networks ]]
274 then
275 for network in $ipv6_networks
276 do
277 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
278 if [[ -z $addresses ]]
279 then
280 echo -e "${RED}addresses variable didn't return any value${NC}"
281 fi
282 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
283 if [[ -z $network_id ]]
284 then
285 echo -e "${RED}network_id variable didn't return any value${NC}"
286 fi
287 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id fping6 $addresses$\"${NC}"
288 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id fping6 $addresses" | grep ^[0-9] | grep -v 'is alive')
289 if [[ $result ]]
290 then
291 echo -e "\n${RED}\n$result${NC}\n"
292 else
293 echo -e "${GREEN}all the addresses of network $network replied successfully${NC}"
294 fi
295 done
296 fi
297 fi
298 else
299 echo -e "${LRB}${BLINK}no instances are found on the system${NC}"
300 fi
301 elif [[ $nuage == "true" ]]
302 then
303 echo -e "${ORANGE}nuage/avrs instance aren't using the neutron dhcp namespace and therefore this check is irrelevant for nuage deployment${NC}"
304 fi
305 exit
306fi
307
308
309###########################################################################################################
310
311
312echo -e "\n${UL}${BLUE}GATHERING REQUIRED SYSTEM INFORMATION${NC}"
313
314echo -e "${CYAN}retrieve the cbis manager/hypervisor ip address${NC}"
315HypervisorURL=$(cat ~/user_config.yaml | grep -w hypervisor_cidr6)
316if [[ $HypervisorURL ]]
317then
318 HypervisorURL=$(cat ~/user_config.yaml | grep -w hypervisor_cidr | awk '{print $2}' | awk -F/ '{print $1}' | grep ^[0-9])
319 echo -e "${CYAN}retrieve the deployment ip stack${NC}"
320 ip_stack="IPv6/IPv4 Dual Stack"
321else
322 HypervisorURL=$(cat ~/user_config.yaml | grep -w hypervisor_cidr | awk '{print $2}' | awk -F/ '{print $1}' | grep ^[0-9])
323 echo -e "${CYAN}retrieve the deployment ip stack${NC}"
324 ip_stack="IPv4"
325fi
326
327echo -e "${CYAN}validate cbis manager credentials authenticity${NC}"
328cbis_manager_credentials_check=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X GET 'https://'$HypervisorURL'/api/pages' -H 'Authorization: Basic '$cbis_manager_token'' | grep RESP_CODE | awk -F: '{print $2}')
329if [[ $cbis_manager_credentials_check == "401" || $cbis_manager_credentials_check == "402" ]]
330then
331 echo -e "${RED}cbis manager authorization failure (http error $cbis_manager_credentials_check). make sure you entered the correct cbis manager user/password. exiting..${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
332 exit
333fi
334echo -e "${CYAN}retrieve cbis software version${NC}"
335cbis_version_full=$(openstack cbis version -f value)
336cbis_version_base=$(echo -e "$cbis_version_full" | grep build | awk '{print $NF}')
337cbis_version=$(echo -e "$cbis_version_full" | grep build | awk -F- '{print $2}')
338
339
340echo -e "${CYAN}retrieve cbis hotfix version and build${NC}"
341hotfix_install_success=$(/var/lib/cbis/cbis_hotfix list -f json 2>&1 | jq .[] 2>&1 | jq 'select(.state == "post-install-success")' 2>&1 | grep post-install-success)
342if [[ -z $hotfix_install_success ]]
343then
344 : # no successfully installed hotfix found; nothing to collect here
345else
346 hotfix_json=$(/var/lib/cbis/cbis_hotfix list -f json | jq .[] | jq 'select(.state == "post-install-success")')
347 hotfix_name=$(echo $hotfix_json | jq .hotfix_name | head -n1 | tr -d \")
348 build_number=$(echo $hotfix_json | jq .build_number | head -n1 | tr -d \")
349 hotfix_name_build="${hotfix_name}-${build_number}"
350fi
351
352echo -e "${CYAN}retrieve the setup hardware model${NC}"
353hw_model=$(grep hw_model_type user_config.yaml | awk '{print $2}')
354
355echo -e "${CYAN}retrieve the cloud name${NC}"
356cloud_name=$(cat user_config.yaml | grep cloud_name | awk '{print $2}')
357if [[ $cloud_name == "''" ]]
358then
359 cloud_name=""
360fi
361
362echo -e "${CYAN}retrieve the nfs backup mountpoint${NC}"
363backup_nfs_mountpoint=$(grep backup_nfs_mountpoint: user_config.yaml | awk '{print $NF}')
364
365echo -e "${CYAN}retrieve the undercloud vm external ip address${NC}"
366undercloud_vm_ip=$(sudo grep undercloud_cidr: user_config.yaml | awk '{print $2}' | awk -F/ '{print $1}')
367
368echo -e "${CYAN}retrieve the existing host-groups${NC}"
369aggregate_hosts=$(source ~/overcloudrc && openstack aggregate list --long -f value -c Name | grep [a-zA-Z]*Compute)
370
371echo -e "${CYAN}retrieve the setup platform (model) type${NC}"
372platform=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/installation/initial_page' -H 'Authorization: Basic '$cbis_manager_token'' | jq . | grep default | awk '{print $2}' | tr -d ,)
373
374echo -e "${CYAN}retrieve the entire cbis manager installation page${NC}"
375installation_page=$(curl -g -s -L -k -X POST 'https://'$HypervisorURL'/api/installation/status' -H 'Authorization: Basic '$cbis_manager_token'' -H 'content-type: application/json' --data '{"hardware":'$platform'}' | jq .)
376
377echo -e "${CYAN}retrieve openstack version${NC}"
378openstack_version_numerical=$(openstack --version 2>&1 | awk '{print $NF}')
379openstack_version_name=$(docker image list | grep '\-latest' | awk '{print $2}' | sort -u | awk -F- '{print $1}' | grep -v zabbix)
380
381echo -e "${CYAN}check if ceph backend is enabled or disabled${NC}"
382ceph_backend=$(grep ceph_backend_enabled: ~/user_config.yaml | awk '{print $2}')
383
384echo -e "${CYAN}check if the setup is deployed with nuage${NC}"
385nuage=$(grep nuage: user_config.yaml | awk '{print $2}' | column -t)
386
387echo -e "${CYAN}check if the setup is deployed external storage system${NC}"
388external_storage_system=$(cat user_config.yaml | grep external_storage_system | awk '{print $2}' | head -n1)
389
390echo -e "${CYAN}check if the setup is hci or non-hci${NC}"
391hci=$(grep ceph_hci: ~/user_config.yaml | awk '{print $2}')
392
393echo -e "${CYAN}check if the setup is deployed with/without ceph fast-pools${NC}"
394fast_pools=$(cat user_config.yaml | grep enable_fast_pool | awk '{print $2}')
395
396echo -e "${CYAN}retrieve the servers count (overcloud + undercloud vm)${NC}"
397ansible_all_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | awk '{print $1}' | sort --uniq | wc -l | xargs -i expr {} + 1)
398
399echo -e "${CYAN}retrieve controllers hostname${NC}"
400last_index_controller=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort --uniq | sort -n | tail -n1)
401first_index_controller=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort --uniq | awk NR==1)
402current_controllers=$(source ~/stackrc && openstack server list -c Name --no-name-lookup --name Controller -f value)
403current_controllers_piped=$(echo -e "$current_controllers" | paste -sd'|')
404
405echo -e "${CYAN}retrieve stackrc/overcloudrc openstack server list information${NC}"
406nova_overcloud_hosts_list=$(source ~/stackrc && openstack server list -f json | jq ".[]" | jq .Name | tr -d \" | sort | tr '[:upper:]' '[:lower:]')
407nova_overcloud_and_undercloud_hosts_count=$(echo -e "$nova_overcloud_hosts_list" | wc -l | xargs -i expr {} + 1)
408nova_instances=$(source ~/overcloudrc && openstack server list --long --all -f value -c ID -c Name -c Host -c Status -c "Power State" | column -t)
409nova_instances_count=$(echo -e "$nova_instances" | grep -c -E 'ACTIVE\s+Running')
410
411echo -e "${CYAN}retrieve overcloud images list${NC}"
412overcloud_images_list=$(source ~/overcloudrc && openstack image list -f value | column -t)
413
414echo -e "${CYAN}check if local storage is enabled/disabled per host-group${NC}"
415ovs_local_storage=$(awk '/OvsCompute:/,0' user_config.yaml | grep enable_local_storage: | head -n1 | awk '{print $2}')
416sriov_local_storage=$(awk '/SriovPerformanceCompute:/,0' user_config.yaml | grep enable_local_storage: | head -n1 | awk '{print $2}')
417dpdk_local_storage=$(awk '/DpdkPerformanceCompute:/,0' user_config.yaml | grep enable_local_storage: | head -n1 | awk '{print $2}')
418avrs_local_storage=$(awk '/AvrsCompute/,0' user_config.yaml | grep enable_local_storage: | head -n1 | awk '{print $2}')
419
420echo -e "${CYAN}retrieve the hostname of each host under each host-group from /etc/ansible/hosts${NC}"
421ansible_computes_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i compute | awk '{print $1}' | sort --uniq)
422ansible_controllers_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort --uniq)
423ansible_monitoring_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i monitoring | awk '{print $1}' | sort --uniq)
424ansible_storage_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i storage | awk '{print $1}' | sort --uniq)
425ansible_ovs_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i ovs | awk '{print $1}' | sort --uniq)
426ansible_sriov_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i sriov | awk '{print $1}' | sort --uniq)
427ansible_dpdk_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i dpdk | awk '{print $1}' | sort --uniq)
428ansible_avrs_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i avrs | awk '{print $1}' | sort --uniq)
429ansible_overcloud_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | awk '{print $1}' | sort --uniq)
430random_storage_hostname=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i storage | awk '{print $1}' | sort --uniq | shuf -n 1)
431random_compute_hostname=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i compute | awk '{print $1}' | sort --uniq | shuf -n 1)
432
433echo -e "${CYAN}retrieve the hosts count of each host-group${NC}"
434ansible_storage_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i storage | awk '{print $1}' | sort --uniq | wc -l)
435ansible_monitoring_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i monitoring | awk '{print $1}' | sort --uniq | wc -l)
436ansible_dpdk_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i dpdk | awk '{print $1}' | sort --uniq | wc -l)
437ansible_ovs_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i ovs | awk '{print $1}' | sort --uniq | wc -l)
438ansible_sriov_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i sriov | awk '{print $1}' | sort --uniq | wc -l)
439ansible_avrs_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i avrs | awk '{print $1}' | sort --uniq | wc -l)
440ansible_controllers_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort --uniq | wc -l)
441
442echo -e "${CYAN}retrieve $last_index_controller InternalApi, Tenant, Storage, StorageMgmt, ControlPlane (Provisioning) addresses${NC}"
443ip_addres_show=$(ansible $last_index_controller -b -m shell -a "ip addres show")
444internal_api_cidr=$(cat /home/stack/templates/network-environment.j2.yaml | grep InternalApiNetCidr | awk '{print $2}' | cut -d . -f 1-3)
445internal_api_controller_address=$(echo -e "$ip_addres_show" | grep $internal_api_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
446tenant_cidr=$(cat /home/stack/templates/network-environment.j2.yaml | grep TenantNetCidr | awk '{print $2}' | cut -d . -f 1-3)
447tenant_controller_address=$(echo -e "$ip_addres_show" | grep $tenant_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
448storage_cidr=$(cat /home/stack/templates/network-environment.j2.yaml | grep StorageNetCidr | awk '{print $2}' | cut -d . -f 1-3)
449storage_controller_address=$(echo -e "$ip_addres_show" | grep $storage_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
450storage_mgmt_cidr=$(cat /home/stack/templates/network-environment.j2.yaml | grep StorageMgmtNetCidr | awk '{print $2}' | cut -d . -f 1-3)
451storage_mgmt_controller_address=$(echo -e "$ip_addres_show" | grep $storage_mgmt_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
452provisioning_cidr=$(cat /home/stack/templates/network-environment.yaml | grep ControlPlaneDefaultRoute | awk '{print $2}' | cut -d . -f 1-3)
453provisioning_controller_address=$(echo -e "$ip_addres_show" | grep $provisioning_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
454
455echo -e "${CYAN}retrieve the hypervisor_dedicated_cpus value from the computes${NC}"
456ovs_hypervisor_dedicated_cpus=$(grep -w OvsCompute: user_config.yaml -A19 | grep hypervisor_dedicated_cpus: | awk '{print $2}')
457sriov_hypervisor_dedicated_cpus=$(grep -w SriovPerformanceCompute: user_config.yaml -A19 | grep hypervisor_dedicated_cpus: | awk '{print $2}')
458dpdk_hypervisor_dedicated_cpus=$(grep -w DpdkPerformanceCompute: user_config.yaml -A19 | grep hypervisor_dedicated_cpus: | awk '{print $2}')
459avrs_hypervisor_dedicated_cpus=$(grep -w AvrsCompute: user_config.yaml -A19 | grep hypervisor_dedicated_cpus: | awk '{print $2}')
460
461echo -e "${CYAN}check if the setup is deployed with ELK and which deployment type (remote or local)${NC}"
462elk=$(cat user_config.yaml | grep deploy_elk | awk '{print $2}')
463elk_deployment_type=$(cat user_config.yaml | grep elk_deployment_type | awk '{print $2}')
464
465echo -e "${CYAN}retrieve OS_AUTH_URL (public virtual IP address)${NC}"
466PublicURL=$(cat user_config.yaml | grep ip_range_start: | awk '{print $2}' | grep -v :)
467if [[ -z $PublicURL ]]
468then
469 PublicURL=$(cat user_config.yaml | grep ip_range_start: | awk '{print $2}' | sed 's/$/]/g' | sed 's/^/[/g')
470else
471 PublicURL=$(cat user_config.yaml | grep ip_range_start: | awk '{print $2}')
472fi
473
474echo -e "${CYAN}retrieve the horizon admin password${NC}"
475ADMIN_PASSWORD=$(grep -w admin_password: user_config.yaml | awk '{print $NF}')
476
477echo -e "${CYAN}retrieve zabbix username and password${NC}"
478zabbix_username=$(cat user_config.yaml | grep zabbix_username: | awk '{print $2}')
479zabbix_password=$(cat user_config.yaml | grep zabbix_password: | awk '{print $2}')
480
481echo -e "${CYAN}retrieve kibana user, password and basic authentication token${NC}"
482kibana_user=$(cat user_config.yaml | grep kibana_username | awk '{print $2}')
483kibana_password=$(cat user_config.yaml | grep kibana_password | awk '{print $2}')
484kibana_basic_auth=$(echo -n "$kibana_user:$kibana_password" | base64)
485
486echo -e "${CYAN}retrieve zabbix authentication token${NC}"
487zabbix_auth=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php?result=$result' \
488-H 'Content-Type: application/json' \
489-H 'Cookie: SERVERID='$last_index_controller'' \
490--data '{
491 "jsonrpc": "2.0",
492 "method": "user.login",
493 "params": {
494 "user": "'$zabbix_username'",
495 "password": "'$zabbix_password'"
496 },
497 "id": 1,
498 "auth": null
499}' | jq .result)
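# the call above is a standard Zabbix JSON-RPC "user.login" request; a successful reply typically
# looks like {"jsonrpc":"2.0","result":"0424bd59b807674191e7d77572075f33","id":1} (example token only),
# and jq .result keeps just the session token that the host.get call below sends in its "auth" field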
500
501echo -e "${CYAN}retrieve zabbix overcloud hosts list${NC}"
502zabbix_hosts_raw=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
503-H 'Content-Type: application/json-rpc' \
504-H 'Cookie: SERVERID='$last_index_controller'' \
505--data '{
506 "jsonrpc": "2.0",
507 "method": "host.get",
508 "params": {
509 "output": [
510 "hostid",
511 "host"
512 ],
513 "selectInterfaces": [
514 "interfaceid",
515 "ip"
516 ]
517 },
518 "id": 2,
519 "auth": '$zabbix_auth'
520}')
521zabbix_hosts=$(echo -e "$zabbix_hosts_raw" | jq -r .result | jq ".[].host" | tr -d \" | grep -E -v -i 'undercloud|Zabbix server|active-controller|switch' | sort | tr '[:upper:]' '[:lower:]')
522zabbix_hosts_and_ids=$(echo -e "$zabbix_hosts_raw" | jq .result[] | jq -r "[.host,.hostid]")
523
524if [[ $cbis_version != "18.0.0.1" && $cbis_version != "19.0.0.1" ]]
525then
526 echo -e "${CYAN}check if the setup is upgraded${NC}"
527 upgraded=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/cbis_upgrade/state' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' | jq .state | tr -d \")
528fi
529
530echo -e "${CYAN}retrieve the entire system installed rpms${NC}"
531sshpass -p $hv_cbis_admin_password ansible -k all -b -m shell -a "rpm -qa warn=False" > installed_rpms_$(date +%Y-%m-%d_%H-%M-%S)
532
533echo -e "${CYAN}retrieve the entire system bios version${NC}"
534bios=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "dmidecode -s bios-version" | tee BIOS_version)
535bios=$(echo -e "$bios" | grep -v SUCCESS | sort -u)
536
537echo -e "${CYAN}retrieve the entire system firmware version${NC}"
538firmware=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | tee firmware_version)
539firmware=$(echo -e "$firmware" | grep -v SUCCESS | sort -u)
540
541if [[ $nuage == "true" ]]
542then
543 echo -e "${CYAN}retrieve the vsd ip address${NC}"
544 vsd_ip=$(cat user_config.yaml | grep NeutronNuageVSDIp | awk '{print $NF}' | awk -F: '{print $1}')
545fi
546
547echo -e "${CYAN}retrieve the amount of time (in hours) from when the system was successfully deployed till now${NC}"
548deployment_ended_epoch_date=$(cat /var/log/cbis/overcloud_installation.log | grep ^20[2-9][1-9] | tail -n1 | awk -F, '{print $1}' | xargs -i date -d "{}" +%s)
549current_date=$(date +%s)
550uptime_hours=$(echo "scale=1;((($current_date-$deployment_ended_epoch_date) / 60 / 60))" | bc)
551uptime_days=$(echo -e "$uptime_hours" | awk -F. '{print $1}' | xargs -i echo "scale=1;(({} / 24))" | bc)
552
553echo -e "\n\n===================================================================================================="
554echo -e " SYSTEM SUMMARY"
555echo -e "===================================================================================================="
556fixed_platform=$(echo -e "$platform" | tr -d \")
557if [[ $fixed_platform ]]
558then
559 echo -e "PLATFORM \t\t\t\t\t\t= $fixed_platform"
560else
561 echo -e "${RED}WARNING!${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
562 echo -e "${RED}The platform value of cbis manager installation page returned empty value.\nIn most cases it happens after uninstalling CBIS manager.\nSome checks might be negativly affected by it. Review the results with discretion.${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
563 platform=$(cat user_config.yaml | grep hw_model_type: | awk '{print $2}' | sed 's/$/"/g' | sed 's/^/"/g')
564 fixed_platform=$(echo -e "$platform" | tr -d \")
565 echo -e "===================================================================================================="
566 echo -e "PLATFORM \t\t\t\t\t\t= $fixed_platform"
567fi
568echo -e "BASE CBIS VERSION \t\t\t\t\t= $cbis_version_base"
569if [[ $cbis_version != "18.0.0.1" && $cbis_version != "19.0.0.1" ]]
570then
571 if [[ $upgraded == "NEW" ]]
572 then
573 echo -e "CBIS UPGRADE STATE \t\t\t\t\t= $upgraded (UPGRADE NEVER STARTED)"
574 elif [[ $upgraded == "UNKNOWN" ]]
575 then
576 echo -e "CBIS UPGRADE STATE \t\t\t\t\t= $upgraded (UPGRADE NEVER STARTED OR UPGRADE FAILED)"
577 else
578 echo -e "CBIS UPGRADE STATE \t\t\t\t\t= $upgraded"
579 fi
580fi
581echo -e "OPENSTACK VERSION \t\t\t\t\t= $openstack_version_numerical ($openstack_version_name)"
582if [[ $fixed_platform != 'airframe' && $fixed_platform != 'dell-730' && $fixed_platform != 'hp-slg7_OVS' && $fixed_platform != 'hp-slg7_OVS_SSD_single_nic' && $fixed_platform != 'hp-c7kg8' && $fixed_platform != 'hp-c7kg9' ]]
583then
584 mlx_ofed_version=$(ansible compute -b -m shell -a "/usr/bin/ofed_info | head -n1" | awk '{print $1}' | grep -v overcloud | sort -u | grep -v /bin/sh)
585 if [[ $mlx_ofed_version == "/bin/sh:" ]]
586 then
587 echo -e "${RED}unable to find installed mellanox ofed on this $fixed_platform system (/usr/bin/ofed_info)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
588 nics_type="Intel"
589 else
590 nics_firmware=$(ansible compute -b -m shell -a "ovs-appctl bond/list | grep tenant-bond | awk '{print \$NF}' | sort -u | xargs -i ethtool -i {}" | grep firmware-version: | head -n1 | awk '{print $2,$3}')
591 if [[ -z $nics_firmware ]]
592 then
593 nics_firmware=$(ansible compute -b -m shell -a "cat /proc/net/bonding/tenant-bond | grep 'Slave Interface:' | awk '{print \$NF}' | sort -u | xargs -i ethtool -i {}" | grep firmware-version: | head -n1 | awk '{print $2,$3}')
594 fi
595 echo -e "NICS TYPE \t\t\t\t\t\t= $mlx_ofed_version"
596 echo -e "NICS FIRMWARE \t\t\t\t\t\t= $nics_firmware"
597 nics_firmware=$(echo -e "$nics_firmware" | awk '{print $1}')
598 nics_type="$mlx_ofed_version"
599 nics_type+="-$nics_firmware"
600 fi
601else
602 nics_type="Intel"
603 echo -e "NICS TYPE \t\t\t\t\t\t= $nics_type"
604fi
605echo -e "eSW \t\t\t\t\t\t\t= $bios (BIOS), $firmware (firmware)"
606if [[ $hotfix_install_success ]]
607then
608 echo -e "SUCCESSFULLY DEPLOYED PATCH(S) \t\t\t\t= $hotfix_name_build"
609 hotfix_deployment_type=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/deployment.log" | grep 'Hooks found')
610 if [[ $hotfix_deployment_type ]]
611 then
612 echo -e "PATCH DEPLOYMENT TYPE \t\t\t\t\t= Scratch Deployment"
613 else
614 echo -e "PATCH DEPLOYMENT TYPE \t\t\t\t\t= Patch Management"
615 fi
616else
617 echo -e "SUCCESSFULLY DEPLOYED PATCH(S) \t\t\t\t= None"
618fi
619echo -e "IP STACK \t\t\t\t\t\t= $ip_stack"
620if [[ $nuage == "true" ]]
621then
622 nuage_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /usr/share/cbis/nuage-version" | grep ^[0-9] | sort --uniq)
623 echo -e "NUAGE \t\t\t\t\t\t\t= $nuage_version"
624
625else
626 echo -e "NUAGE \t\t\t\t\t\t\t= false"
627fi
628if [[ $external_storage_system != "null" ]]
629then
630 echo -e "EXTERNAL STORAGE SYSTEM \t\t\t\t= $external_storage_system"
631 storage_type="$external_storage_system"
632fi
633echo -e "ELK (ElasticSearch, Logstash & Kibana) \t\t\t= $elk ($elk_deployment_type)"
634if [[ $ceph_backend == "true" ]]
635then
636 echo -e "CEPH BACKEND \t\t\t\t\t\t= true"
637 if [[ $hci == "true" ]]
638 then
639 echo -e "HCI \t\t\t\t\t\t\t= true"
640 storage_type="HCI"
641 elif [[ $hci == "false" && $fast_pools == "false" ]]
642 then
643 echo -e "MULTI POOLS \t\t\t\t\t\t= true"
644 storage_type="Multi_Pools"
645 echo -e "STORAGE HOSTS \t\t\t\t\t\t= $ansible_storage_hosts_count"
646 else
647 echo -e "FAST POOLS \t\t\t\t\t\t= true"
648 storage_type="Fast_Pools"
649 echo -e "STORAGE HOSTS \t\t\t\t\t\t= $ansible_storage_hosts_count"
650 fi
651else
652 echo -e "CEPH BACKEND \t\t\t\t\t\t= false"
653fi
654echo -e "CONTROLLER HOSTS \t\t\t\t\t= $ansible_controllers_hosts_count"
655if [[ $ansible_monitoring_hosts_count != "0" ]]
656then
657 echo -e "MONITORING HOSTS \t\t\t\t\t= $ansible_monitoring_hosts_count"
658fi
659if [[ $ansible_dpdk_hosts_count != "0" ]]
660then
661 echo -e "DPDK HOSTS \t\t\t\t\t\t= $ansible_dpdk_hosts_count"
662fi
663if [[ $ansible_ovs_hosts_count != "0" ]]
664then
665 echo -e "OVS HOSTS \t\t\t\t\t\t= $ansible_ovs_hosts_count"
666fi
667if [[ $ansible_sriov_hosts_count != "0" ]]
668then
669 echo -e "SRIOV HOSTS \t\t\t\t\t\t= $ansible_sriov_hosts_count"
670fi
671if [[ $ansible_avrs_hosts_count != "0" ]]
672then
673 echo -e "AVRS HOSTS \t\t\t\t\t\t= $ansible_avrs_hosts_count"
674fi
675if [[ $ansible_dpdk_hosts_count != "0" && $dpdk_local_storage != "false" ]]
676then
677 echo -e "DPDK LOCAL STORAGE \t\t\t\t\t\t= $dpdk_local_storage"
678fi
679if [[ $ansible_ovs_hosts_count != "0" && $ovs_local_storage != "false" ]]
680then
681 echo -e "OVS LOCAL STORAGE \t\t\t\t\t= $ovs_local_storage"
682fi
683if [[ $ansible_sriov_hosts_count != "0" && $sriov_local_storage != "false" ]]
684then
685 echo -e "SRIOV LOCAL STORAGE \t\t\t\t\t\t= $sriov_local_storage"
686fi
687if [[ $ansible_avrs_hosts_count != "0" && $avrs_local_storage != "false" ]]
688then
689 echo -e "AVRS LOCAL STORAGE \t\t\t\t\t\t= $avrs_local_storage"
690fi
691echo -e "DEPLOYMENT UPTIME \t\t\t\t\t= $uptime_hours hours ($uptime_days days)"
692echo -e "===================================================================================================="
693elapsed_time_seconds=$(expr $(date +%s) - $global_start)
694
695
696####################################################################################################
697
698
699start=$(date +%s)
700STEPS_COUNTER=$((STEPS_COUNTER+1))
701echo -e "${BLUE}\n\n$STEPS_COUNTER) SET USER stack PASSWORD (`date '+%T'`)${NC}"
702if [[ $STACK_PASSWORD != "" ]]
703then
704 echo -e "$STACK_PASSWORD\n$STACK_PASSWORD" | sudo passwd stack
705 echo -e "${GREEN}updated user stack password to <$STACK_PASSWORD>${NC}"
706else
707 echo -e "${ORANGE}skipped user stack password modification${NC}"
708fi
709elapsed_time_seconds=$(expr $(date +%s) - $global_start)
710
711
712####################################################################################################
713
714
715if [[ $ESSENTIAL == "yes" || $ESSENTIAL == "no" ]]
716then
717 start=$(date +%s)
718 STEPS_COUNTER=$((STEPS_COUNTER+1))
719 echo -e "${BLUE}\n\n$STEPS_COUNTER) SHOW HOTFIXES HISTORY (+$elapsed_time_seconds `date '+%T'`)${NC}"
720 hotfix_existance_check=$(/var/lib/cbis/cbis_hotfix list-all -f json 2>/dev/null)
721 if [[ $hotfix_existance_check ]]
722 then
723 hotfix_history_state=$(/var/lib/cbis/cbis_hotfix list-all -f json 2>/dev/null | jq .[].state | grep -v post-install-success)
724 hotfix_history=$(/var/lib/cbis/cbis_hotfix list-all -f json 2>/dev/null | jq .[] | jq '{hotfix_name,state}')
725 if [[ $hotfix_history_state ]]
726 then
727 echo -e "${ORANGE}$hotfix_history${NC}"
728 else
729 echo -e "${GREEN}$hotfix_history${NC}"
730 fi
731 else
732 echo -e "${ORANGE}the system was never deployed with any hotfix${NC}"
733 fi
734 elapsed_time_seconds=$(expr $(date +%s) - $start)
735
736
737 ####################################################################################################
738
739
740 start=$(date +%s)
741 STEPS_COUNTER=$((STEPS_COUNTER+1))
742 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR ISSUES IN THE SRE ICE HEALTH CHECK REPORT (+$elapsed_time_seconds `date '+%T'`)${NC}"
743 if [ -d "/home/stack/HealthCheckFiles/" ]
744 then
745 ISSUES=$(ls -lrt /home/stack/HealthCheckFiles/ | grep \.log | awk '{print $NF}' | tail -n 1 | xargs -i cat /home/stack/HealthCheckFiles/{} | grep -E '\s+No\s+' | sort | uniq -c | tr -s '[:space:]' | awk -F\| '{print $1,$NF}')
746 if [[ $ISSUES ]]
747 then
748 echo -e "${RED}$ISSUES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
749 else
750 echo -e "${GREEN}no issues found in the ice health check report (excluding warnings)${NC}"
751 fi
752 INSTALLED_ICE_VERSION=$(cat ~/ice/ice_version 2>&1)
753 LATEST_ISP=$(curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/ | grep -E 'ice-support-package-[0-9]' | tail -n 1 | awk -F\" '{print $2}')
754 LATEST_ISP_VALIDATION=$(echo -e "$LATEST_ISP" | grep -w "$INSTALLED_ICE_VERSION")
755 if [[ -z $LATEST_ISP_VALIDATION ]]
756 then
757 echo -e "\n${MAGENTA}it was found that the latest ice service package build is not used. current version: $INSTALLED_ICE_VERSION, latest version: $LATEST_ISP\nto obtain and execute the latest build perform the following:${NC}\n"
758 echo -e "${MAGENTA}${UL}from the undercloud physical server:${NC}"
759 echo -e "${MAGENTA}cd /root/${NC}"
760 echo -e "${MAGENTA}rm -rf /root/sre/${NC}"
761 echo -e "${MAGENTA}mkdir /root/sre/${NC}"
762 echo -e "${MAGENTA}cd /root/sre/${NC}"
763 echo -e "${MAGENTA}ISP=\$(curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/ | grep -E 'ice-support-package-[0-9]' | tail -n 1 | awk -F'\"' '{print \$2}')${NC}"
764 echo -e "${MAGENTA}curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/\$ISP -o \$ISP${NC}"
765 echo -e "${MAGENTA}chmod +x \$ISP${NC}"
766 echo -e "${MAGENTA}./\$ISP${NC}"
767 echo -e "${MAGENTA}cd /root/sre/ice-support-package/Installer/${NC}"
768 echo -e "${MAGENTA}python IceInstaller.py${NC}"
769 echo -e "\n${MAGENTA}${UL}from the undercloud vm:${NC}"
770 echo -e "${MAGENTA}cd ~/${NC}"
771 echo -e "${MAGENTA}. icerc${NC}"
772 echo -e "${MAGENTA}ice healthcheck${NC}"
773 echo -e "\n${MAGENTA}re-run the system health validation script to see the results of the ice healthcheck execution${NC}"
774 fi
775 if [[ $ISSUES ]]
776 then
777 echo -e "\n\n${ORANGE}note: once you fixed any of the above failures, re-run the ice health check\nthe system validation script always reads from the last health check report and will keep presenting the previous failures until a new report is created${NC}"
778 fi
779 else
780 echo -e "${MAGENTA}/home/stack/HealthCheckFiles/ is not found which means that the ice health check was never executed${NC}"
781 echo -e "${MAGENTA}to obtain and execute the latest ice service package build perform the following:${NC}\n"
782 echo -e "${MAGENTA}${UL}from the undercloud physical server:${NC}"
783 echo -e "${MAGENTA}cd /root/${NC}"
784 echo -e "${MAGENTA}rm -rf /root/sre/${NC}"
785 echo -e "${MAGENTA}mkdir /root/sre/${NC}"
786 echo -e "${MAGENTA}cd /root/sre/${NC}"
787 echo -e "${MAGENTA}ISP=\$(curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/ | grep -E 'ice-support-package-[0-9]' | tail -n 1 | awk -F'\"' '{print \$2}')${NC}"
788 echo -e "${MAGENTA}curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/\$ISP -o \$ISP${NC}"
789 echo -e "${MAGENTA}chmod +x \$ISP${NC}"
790 echo -e "${MAGENTA}./\$ISP${NC}"
791 echo -e "${MAGENTA}cd /root/sre/ice-support-package/Installer/${NC}"
792 echo -e "${MAGENTA}python IceInstaller.py${NC}"
793 echo -e "\n${MAGENTA}${UL}from the undercloud vm:${NC}"
794 echo -e "${MAGENTA}cd ~/${NC}"
795 echo -e "${MAGENTA}. icerc${NC}"
796 echo -e "${MAGENTA}ice healthcheck${NC}"
797 echo -e "\n${MAGENTA}re-run the system health validation script to see the results of the ice healthcheck execution${NC}"
798 fi
799 elapsed_time_seconds=$(expr $(date +%s) - $start)
800
801
802 ####################################################################################################
803
804
805 start=$(date +%s)
806 STEPS_COUNTER=$((STEPS_COUNTER+1))
807 echo -e "${BLUE}\n\n$STEPS_COUNTER) SEARCH FOR IPMI HOSTS LEFTOVERS IN hosts.yaml and hosts_config.yaml (+$elapsed_time_seconds `date '+%T'`)${NC}"
808 cat hosts.yaml | grep pm_addr | awk '{print $2}' | sort -n > hosts_yaml_ipmi_addresses.txt
809 cat hosts_config.yaml | grep -E ^'\s+\-\s+[0-9]' | awk '{print $NF}' | sort -n > hosts_config_yaml_ipmi_addresses.txt
810 source ~/stackrc && openstack baremetal node list --long | grep ipmi_address | awk -F"ipmi_address" '{print $2}' | awk '{print $2}' | tr -d "'u," | sort -n > ironic_ipmi_addresses.txt
811 LEFTOVER_IPMI_HOSTS=$(cat hosts_yaml_ipmi_addresses.txt ironic_ipmi_addresses.txt hosts_config_yaml_ipmi_addresses.txt | sort -n | uniq -c | column -t | grep -v -E '^3\s+[0-9]' | awk '{print $NF}' | paste -sd'|')
812 if [[ $LEFTOVER_IPMI_HOSTS ]]
813 then
814 LEFTOVER_IPMI_HOSTS_DETAILED=$(strings -f hosts_yaml_ipmi_addresses.txt hosts_config_yaml_ipmi_addresses.txt | grep -E $LEFTOVER_IPMI_HOSTS)
815 echo -e "${RED}$LEFTOVER_IPMI_HOSTS_DETAILED${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
816 else
817 echo -e "${GREEN}no diff found between ironic, hosts.yaml and hosts_config.yaml${NC}"
818 fi
819 elapsed_time_seconds=$(expr $(date +%s) - $start)
820
821
822 ####################################################################################################
823
824
825 start=$(date +%s)
826 STEPS_COUNTER=$((STEPS_COUNTER+1))
827 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR HIGH DENTRY VALUES (sar) (+$elapsed_time_seconds `date '+%T'`)${NC}"
828 ### each iteration compares two consecutive dentry samples (sar collects a sample every 10 minutes)
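### illustrative 'sar -v' sample (column layout may vary between sysstat versions):
###   12:00:01 AM  dentunusd   file-nr  inode-nr  pty-nr
###   12:10:01 AM     183424      8096    215936       2
### assuming the default 12-hour locale, the AM/PM timestamp occupies two fields, so awk '{print $3}'
### below extracts the dentunusd column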
829 for controller in $current_controllers
830 do
831 echo -e "${CYAN}$controller${NC}"
832 TAIL=10
833 FIRST=1
834 SECOND=2
835 EXIT_LOOP=0
836 FAILURE=0
837 SAR=$(ansible $controller -b -m shell -a "sar -v" | grep -v -E -i 'dentunusd|Average|Linux|overcloud-controller' | sed '/^$/d' | tail -n $TAIL)
838 while true
839 do
840 LAST_DENTRY=$(echo -e "$SAR" | tail -n $FIRST | awk NR==1 | awk '{print $3}')
841 SECOND_LAST_DENTRY=$(echo -e "$SAR" | tail -n $SECOND | awk NR==1 | awk '{print $3}')
842 RESULT=$(expr $LAST_DENTRY - $SECOND_LAST_DENTRY)
843 FIRST=$((FIRST+1))
844 SECOND=$((SECOND+1))
845 EXIT_LOOP=$((EXIT_LOOP+1))
846 if [ $RESULT -gt 100000 ]
847 then
848 echo -e "${RED}$RESULT${NC}"
849 FAILURE=$((FAILURE+1))
850 fi
851 if [ $EXIT_LOOP -gt $TAIL ]
852 then
853 break
854 fi
855 done
856 if [ $FAILURE -gt 0 ]
857 then
858 echo -e "\n\n${RED}$SAR${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
859 echo -e "${ORANGE}the above is fixed in CBIS 19A SP4-PP4, 20 PP4 and 22 - expect failures if you are using an older CBIS version (CBIS-16043, CBIS-16051)${NC}\n\n"
860 else
861 echo -e "${GREEN}no high dentry values are found${NC}\n"
862 fi
863 done
864 elapsed_time_seconds=$(expr $(date +%s) - $start)
865
866
867 ####################################################################################################
868
869
870 start=$(date +%s)
871 STEPS_COUNTER=$((STEPS_COUNTER+1))
872 CACHE_SIZE=100000
873 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE KERNEL SLAB TABLES WITH MORE THEN $CACHE_SIZE MB (+$elapsed_time_seconds `date '+%T'`)${NC}"
874 slabtop=$(ansible controller -b -m shell -a "slabtop -o -s c 2>&1 | grep ^[1-9] -B 1 | column -t | head -n 2")
875 high_memory_cache_table=$(ansible controller -b -m shell -a "slabtop -o -s c 2>&1 | grep ^[1-9] | tr -d K | awk NR==1 | awk 'NF{NF-=1};1' | awk '{print \$NF}' | xargs -i expr {} \/ 1024" | grep ^[0-9] | sort -n | tail -n 1)
876 if [ $high_memory_cache_table -gt $CACHE_SIZE ]
877 then
878 echo -e "${RED}$slabtop${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
879 else
880 echo -e "${GREEN}no kernel cache table that is using more then $CACHE_SIZE MB is found\n\n$slabtop${NC}"
881 fi
882 elapsed_time_seconds=$(expr $(date +%s) - $start)
883
884
885 ####################################################################################################
886
887
888 start=$(date +%s)
889 STEPS_COUNTER=$((STEPS_COUNTER+1))
890 echo -e "\n\n${BLUE}$STEPS_COUNTER) CHECK FOR CBIS MANAGER PAGES IN IN-PROGRESS STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
891 pages=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/pages' -H 'Authorization: Basic '$cbis_manager_token'' | jq . | grep name | grep -v \, | awk '{print $NF}' | tr -d \")
892 COUNTER=0
893 for page in $pages
894 do
895 state=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/'$page'/state' -H 'Authorization: Basic '$cbis_manager_token'' | jq .state | tr -d \")
896 if [[ $state == "IN_PROGRESS" ]]
897 then
898 echo -e "${RED}$page is in $state state${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
899 COUNTER=$((COUNTER+1))
900 fi
901 done
902 if [[ $COUNTER -eq 0 ]]
903 then
904 echo -e "${GREEN}no pages in CBIS manager with IN_PROGRESS state${NC}"
905 fi
906 elapsed_time_seconds=$(expr $(date +%s) - $global_start)
907
908
909 ####################################################################################################
910
911
912 start=$(date +%s)
913 STEPS_COUNTER=$((STEPS_COUNTER+1))
914 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR CBIS MANAGER PAGES IN PARTIAL STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
915 pages=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/pages' -H 'Authorization: Basic '$cbis_manager_token'' | jq . | grep name | grep -v \, | awk '{print $NF}' | tr -d \")
916 COUNTER=0
917 for page in $pages
918 do
919 state=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/'$page'/state' -H 'Authorization: Basic '$cbis_manager_token'' | jq .state | tr -d \")
920 if [[ $state == "PARTIAL" ]]
921 then
922 echo -e "${ORANGE}$page is in $state state${NC}"
923 echo -e "\n${ORANGE}note: currently, the only known PARTIAL state scenario is when the deployment smoke test failed or the deployment was executed with smoke test disabled${NC}"
924 COUNTER=$((COUNTER+1))
925 fi
926 done
927 if [[ $COUNTER -eq 0 ]]
928 then
929 echo -e "${GREEN}no pages in CBIS manager with PARTIAL state${NC}"
930 fi
931 elapsed_time_seconds=$(expr $(date +%s) - $global_start)
932
933
934 ####################################################################################################
935
936
937 start=$(date +%s)
938 STEPS_COUNTER=$((STEPS_COUNTER+1))
939 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE SYSTEMD SERVICES STATUS INCONCITIENCIES BETWEEN THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
940 unique_enabled_services=$(ansible controller -b -m shell -a "systemctl list-unit-files --state=enabled" | grep enabled | grep -v cbis_update_ceph_pgs.service |sort | uniq -c | grep -v '3 ' | awk '{print $2}' | paste -sd"|")
941 if [[ $unique_enabled_services ]]
942 then
943 services_mismatch=$(ansible controller -b -m shell -a "systemctl list-unit-files | grep -E '$unique_enabled_services'")
944 echo -e "${RED}$services_mismatch${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
945 echo -e "\n\n${ORANGE}CBIS-16369 (19A) - mount_cephfs_share is not configured on replaced controllers${NC}"
946 else
947 echo -e "${GREEN}no inconsistencies found between the systemd services of the controllers${NC}"
948 fi
949 elapsed_time_seconds=$(expr $(date +%s) - $start)
950
951
952 ####################################################################################################
953
954
955 start=$(date +%s)
956 STEPS_COUNTER=$((STEPS_COUNTER+1))
957 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THE OVERCLOUD BACKUP db_backup.enc IS CREATED FROM ALL THE CONTROLLERS UNDER THE CONFIGURED BACKUP NFS DIRECTORY (+$elapsed_time_seconds `date '+%T'`)${NC}"
958 deployment_date=$(sudo cat /var/log/cbis/overcloud_installation.log | grep -E ^202[1-9]-[0-9] | tail -n1 | awk '{print $1}')
959 current_date_for_skip_check=$(date +%Y-%m-%d)
960 current_date_for_backup_directories=$(date +%Y.%m.%d)
961 db_backup_directory_check=$(sudo du -ha /mnt/backup/overcloud-controller-*/$current_date_for_backup_directories* 2>&1)
962 db_backup=$(echo -e "$db_backup_directory_check" | grep db_backup.enc -c)
963 no_file_or_directory_error=$(echo -e "$db_backup_directory_check" | grep 'No such file or directory')
964 # if the deployment date and the current date are the same, the test expects no overcloud backup. the overcloud backup is created by default each night at 02:00 AM, so if the setup was deployed on the same day this check runs there will not be any overcloud backup yet, and the check is skipped by design.
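# expected layout when the nightly backup has already run (illustrative paths; dates and times vary):
#   /mnt/backup/overcloud-controller-0/2021.08.18-02.00/db_backup.enc
#   /mnt/backup/overcloud-controller-1/2021.08.18-02.00/db_backup.enc
#   /mnt/backup/overcloud-controller-2/2021.08.18-02.00/db_backup.enc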
965 if [[ $no_file_or_directory_error && $deployment_date == $current_date_for_skip_check ]]
966 then
967 echo -e "${ORANGE}since only today the setup was deployed, no overcloud backups are expected under /mnt/backup/${NC}"
968 else
969 if [[ $db_backup != "3" ]]
970 then
971 echo -e "${RED}unable to find 3 db_backup.enc backup files (one per controller) under /mnt/backup/overcloud-controller-*/$current_date_for_backup_directories*${NC}\n"
972 echo -e "${RED}$db_backup_directory_check${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
973 else
974 echo -e "${GREEN}found 3 db_backup.enc files under the /mnt/backup/overcloud-controller-*/$current_date_for_backup_directories directories${NC}"
975 echo -e "\n${GREEN}$db_backup_directory_check${NC}"
976 fi
977 fi
978 elapsed_time_seconds=$(expr $(date +%s) - $start)
979
980
981 ####################################################################################################
982
983
984 start=$(date +%s)
985 STEPS_COUNTER=$((STEPS_COUNTER+1))
986 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE REMOVED HOSTS FINGERPRINTS LEFTOVERS IN /home/stack/.ssh/known_hosts (+$elapsed_time_seconds `date '+%T'`)${NC}"
987 LEFTOVER_KNOWN_HOSTS=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "ssh-keyscan -t ecdsa localhost" | grep ^localhost | awk '{print $3}' | sed 's/.$//' | awk -F'/' '{print $NF}' | awk -F'+' '{print $NF}' | paste -sd'|' | xargs -i sudo grep -E -v '{}' /home/stack/.ssh/known_hosts)
988 if [[ $LEFTOVER_KNOWN_HOSTS ]]
989 then
990 echo -e "${RED}$LEFTOVER_KNOWN_HOSTS${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
991 echo -e "\n\n${ORANGE}CBIS-15479 (20) / CBIS-15712 (19A) - hanging known_hosts entries${NC}"
992 else
993 echo -e "${GREEN}no leftover hosts fingerprints are found under /home/stack/.ssh/known_hosts${NC}"
994 fi
995 elapsed_time_seconds=$(expr $(date +%s) - $start)
996
997
998 ####################################################################################################
999
1000
1001 start=$(date +%s)
1002 STEPS_COUNTER=$((STEPS_COUNTER+1))
1003 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE REMOVED CONTROLLERS RESIDUE IN FILES UNDER /var/lib/config-data/puppet-generated/ AND /etc/ (+$elapsed_time_seconds `date '+%T'`)${NC}"
1004 source ~/stackrc
1005 cloud_name=$(cat user_config.yaml | grep cloud_name | awk '{print $2}' | tr -d \')
1006 controllers=$(openstack server list --flavor Controller -f value -c Name | sort)
1007 controllers_index=$(echo -e "$controllers" | awk -F- '{print $NF}' | sort)
1008 echo -e "${CYAN}${UL}controllers found:${NC}\n${CYAN}$controllers${NC}\n"
1009 missing_index=$(echo -e "$controllers_index" | awk '{ for (i = prev + 0; i < $1; i++) {print i} } { prev = $1 + 1 }')
1010 if [[ $missing_index ]]
1011 then
1012 controller_full_name=$(openstack server list --flavor Controller -f value -c Name | tr -d [0-9] | uniq)
1013 for index in $missing_index
1014 do
1015 removed_controller_leftovers=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "grep -R -w $controller_full_name$index /var/lib/config-data/puppet-generated/ /etc/ | grep -E -v '.conf.[0-9]|.conf.bck|.cfg.[0-9]'" | grep -v -E 'No such file or directory|rc=[1-9]' | grep ^/ -B 1)
1016 if [[ $removed_controller_leftovers ]]
1017 then
1018 echo -e "\n${RED}found $controller_full_name$index entries within one or more files under /var/lib/config-data/puppet-generated/ or /etc/${NC}"
1019 echo -e "\n${RED}$removed_controller_leftovers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1020 echo -e "\n\n${ORANGE}CBIS-16370 (19A) - the old replaced controllers are still presented in /etc/ssh/ssh_known_hosts on the existing controllers${NC}"
1021 else
1022 echo -e "${GREEN}$controller_full_name$index residue is not found under /var/lib/config-data/puppet-generated/*${NC}"
1023 fi
1024 done
1025 else
1026 echo -e "${GREEN}couldn't find removed controller in the system${NC}"
1027 fi
1028 elapsed_time_seconds=$(expr $(date +%s) - $start)
1029
1030
1031 ####################################################################################################
1032
1033
1034 start=$(date +%s)
1035 STEPS_COUNTER=$((STEPS_COUNTER+1))
1036 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE REMOVED CONTROLLER RESIDUE IN /var/log/* IN THE PAST 10 MINUTES (+$elapsed_time_seconds `date '+%T'`)${NC}"
1037 source ~/stackrc
1038 cloud_name=$(cat user_config.yaml | grep cloud_name | awk '{print $2}' | tr -d \')
1039 hour1=$(date +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
1040 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
1041 controllers=$(openstack server list --flavor Controller -f value -c Name | sort)
1042 controllers_index=$(echo -e "$controllers" | awk -F- '{print $NF}' | sort)
1043 echo -e "${CYAN}${UL}controllers found:${NC}\n${CYAN}$controllers${NC}\n"
1044 missing_index=$(echo -e "$controllers_index" | awk '{ for (i = prev + 0; i < $1; i++) {print i} } { prev = $1 + 1 }')
1045 if [[ $missing_index ]]
1046 then
1047 controller_full_name=$(openstack server list --flavor Controller -f value -c Name | tr -d [0-9] | uniq)
1048 for index in $missing_index
1049 do
1050 removed_controller_leftovers=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -E -v '/var/log/filebeat/filebeat|Invoked with warn=True|/var/log/cbis/ansible/ansible.log' | grep -w $controller_full_name$index | awk -F: '{print \$1}' | uniq -c | column -t" | grep ^[1-9] -B 1)
1051 if [[ $removed_controller_leftovers ]]
1052 then
1053 echo -e "${RED}found $controller_full_name$index entries within one or more log files in the past 10 minutes${NC}"
1054 echo -e "\n\n${RED}$removed_controller_leftovers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1055 else
1056 echo -e "${GREEN}$controller_full_name$index residue is not found under /var/log/*${NC}"
1057 fi
1058 done
1059 else
1060 echo -e "${GREEN}couldn't find removed controller in the system${NC}"
1061 fi
1062 elapsed_time_seconds=$(expr $(date +%s) - $start)
1063
1064
1065 ####################################################################################################
1066
1067
1068 start=$(date +%s)
1069 STEPS_COUNTER=$((STEPS_COUNTER+1))
1070 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK INSTANCES STATE (OPENSTACK) (+$elapsed_time_seconds `date '+%T'`)${NC}"
1071 vms=$(source ~/overcloudrc && openstack server list --all --long -f value | grep -w -v 'ACTIVE None Running')
1072 if [[ $vms ]]
1073 then
1074 echo -e "${RED}$vms${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1075 else
1076 echo -e "${GREEN}all instances are in active/running state${NC}"
1077 fi
1078 elapsed_time_seconds=$(expr $(date +%s) - $start)
1079
1080
1081 ####################################################################################################
1082
1083
1084 start=$(date +%s)
1085 STEPS_COUNTER=$((STEPS_COUNTER+1))
1086 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK INSTANCES STATE (VIRSH) (+$elapsed_time_seconds `date '+%T'`)${NC}"
1087 virsh_instances=$(ansible compute -b -m shell -a "virsh list --all")
1088 inactive_virsh_instances=$(echo -e "$virsh_instances" | grep -v running | grep instance- -B 3)
1089 running_virsh_instances=$(echo -e "$virsh_instances" | grep running)
1090 if [[ $inactive_virsh_instances ]]
1091 then
1092 echo -e "${RED}$inactive_virsh_instances${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1093 elif [[ -z $inactive_virsh_instances && -z $running_virsh_instances ]]
1094 then
1095 echo -e "${ORANGE}no instances are found on the system!${NC}"
1096 else
1097 echo -e "${GREEN}all instances are in running state${NC}"
1098 fi
1099 elapsed_time_seconds=$(expr $(date +%s) - $start)
1100
1101
1102 ####################################################################################################
1103
1104
1105 start=$(date +%s)
1106 STEPS_COUNTER=$((STEPS_COUNTER+1))
1107 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE RUNNING OPENSTACK INSTANCES AND RUNNING VIRSH INSTANCES COUNT IS IDENTICAL (+$elapsed_time_seconds `date '+%T'`)${NC}"
1108 running_virsh_instances=$(echo -e "$virsh_instances" | grep -c running)
1109 if [[ $nova_instances_count != $running_virsh_instances ]]
1110 then
1111 echo -e "${RED}$running openstack instances ($nova_instances_count), virsh running instances ($running_virsh_instances)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1112 else
1113 echo -e "${GREEN}running openstack instances ($nova_instances_count), virsh running instances ($running_virsh_instances)${NC}"
1114 fi
1115 elapsed_time_seconds=$(expr $(date +%s) - $start)
1116
1117
1118 ####################################################################################################
1119
1120
1121 start=$(date +%s)
1122 STEPS_COUNTER=$((STEPS_COUNTER+1))
1123 echo -e "${BLUE}\n\n$STEPS_COUNTER) INSTANCES CONNECTIVITY VALIDATION (VIA NETWORK NAMESPACE) (+$elapsed_time_seconds `date '+%T'`)${NC}"
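### each tenant network is probed from inside its qdhcp-<network_id> namespace on the controller:
### nping --tcp-connect to port 22 (IPv4/IPv6) on current versions, fping/fping6 on 18.0.0.1 and
### 19.0.0.1; avrs networks are skipped since they are not served by the neutron dhcp namespace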
1124 if [[ $nuage != "true" ]]
1125 then
1126 source ~/overcloudrc
1127 instances=$(openstack server list --all -f value | wc -l)
1128 if [[ $instances != "0" ]]
1129 then
1130 if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
1131 then
1132 networks=$(openstack server list --all --long -c Networks -f value | grep -E -i -v ':|avrs' | awk -F= '{print $1}' | sort -u)
1133 if [[ $networks ]]
1134 then
1135 for network in $networks
1136 do
1137 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
1138 if [[ -z $addresses ]]
1139 then
1140 echo -e "${RED}addresses variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1141 fi
1142 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
1143 if [[ -z $network_id ]]
1144 then
1145 echo -e "${RED}network_id variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1146 fi
1147 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id nping -4 --tcp-connect -p 22 -c 3 $addresses\"${NC}"
1148 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id nping -4 --tcp-connect -p 22 -c 3 $addresses" | grep 'Failed: [1-9]')
1149 if [[ -z $result ]]
1150 then
1151 echo -e "${GREEN}network $network addresses replied successfully${NC}"
1152 else
1153 echo -e "${RED}$result${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1154 fi
1155 done
1156 fi
1157 networks=$(openstack server list --all --long -c Networks -f value | grep -i -v avrs | grep : | awk -F= '{print $1}' | sort -u)
1158 if [[ $networks ]]
1159 then
1160 for network in $networks
1161 do
1162 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
1163 if [[ -z $addresses ]]
1164 then
1165 echo -e "${RED}addresses variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1166 fi
1167 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
1168 if [[ -z $network_id ]]
1169 then
1170 echo -e "${RED}network_id variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1171 fi
1172 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id nping -6 --tcp-connect -p 22 -c 3 $addresses\"${NC}"
1173 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id nping -6 --tcp-connect -p 22 -c 3 $addresses" | grep 'Failed: [1-9]')
1174 if [[ -z $result ]]
1175 then
1176 echo -e "${GREEN}network $network addresses replied successfully${NC}"
1177 else
1178 echo -e "${RED}$result${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1179 fi
1180 done
1181 fi
1182 else
1183 networks=$(openstack server list --all --long -c Networks -f value | grep -v -i avrs)
1184 ipv4_networks=$(echo "$networks" | grep -v : | awk -F= '{print $1}' | sort -u)
1185 ipv6_networks=$(echo "$networks" | grep : | awk -F= '{print $1}' | sort -u)
1186 if [[ $ipv4_networks ]]
1187 then
1188 for network in $ipv4_networks
1189 do
1190 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
1191 if [[ -z $addresses ]]
1192 then
1193 echo -e "${RED}addresses variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1194 fi
1195 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
1196 if [[ -z $network_id ]]
1197 then
1198 echo -e "${RED}network_id variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1199 fi
1200 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id fping $addresses'\"${NC}"
1201 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id fping $addresses" | grep ^[0-9] | grep -v 'is alive')
1202 if [[ $result ]]
1203 then
1204 echo -e "\n${RED}\n$result${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1205 else
1206 echo -e "${GREEN}all the addresses of network $network replied successfully${NC}"
1207 fi
1208 done
1209 fi
1210 if [[ $ipv6_networks ]]
1211 then
1212 for network in $ipv6_networks
1213 do
1214 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
1215 if [[ -z $addresses ]]
1216 then
1217 echo -e "${RED}addresses variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1218 fi
1219 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
1220 if [[ -z $network_id ]]
1221 then
1222 echo -e "${RED}network_id variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1223 fi
1224 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id fping6 $addresses$\"${NC}"
1225 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id fping6 $addresses" | grep ^[0-9] | grep -v 'is alive')
1226 if [[ $result ]]
1227 then
1228 echo -e "\n${RED}\n$result${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1229 else
1230 echo -e "${GREEN}all the addresses of network $network replied successfully${NC}"
1231 fi
1232 done
1233 fi
1234 fi
1235 else
1236 echo -e "${ORANGE}no instances are found on the system!${NC}"
1237 fi
1238 elif [[ $nuage == "true" ]]
1239 then
1240 echo -e "${ORANGE}nuage/avrs instance aren't using the neutron dhcp namespace and therefore this check is irrelevant for nuage deployment${NC}"
1241 fi
1242 elapsed_time_seconds=$(expr $(date +%s) - $start)
1243
1244
1245 ####################################################################################################
1246
1247 start=$(date +%s)
1248 STEPS_COUNTER=$((STEPS_COUNTER+1))
1249 echo -e "${BLUE}\n\n$STEPS_COUNTER) SEARCH FOR BIG GAPS BETWEEN THE SYSTEM CLOCK AND THE HARDWARE CLOCK (+$elapsed_time_seconds `date '+%T'`)${NC}"
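### hwclock and the system clock are sampled back-to-back and truncated to HH:MM; any host where
### the two values differ (a gap of a minute or more between the RTC and the kernel clock)
### produces two distinct single-count lines in the uniq -c output and is reported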
1250 clock_gap=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "hwclock | awk '{print \$5}' | sed 's/...$//' && date +%I:%M:%S | sed 's/...$//'" | uniq -c | column -t | grep -E '^1\s+[0-9][0-9]:[0-9][0-9]:[0-9]' -B 1)
1251 if [[ $clock_gap ]]
1252 then
1253 clock_gap=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "hwclock | awk '{print \$5}' && date +%I:%M:%S" | uniq -c | column -t | grep -E '^1\s+[0-9][0-9]:[0-9][0-9]:[0-9]' -B 1)
1254 echo -e "${RED}$clock_gap${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1255 else
1256 echo -e "${GREEN}no substantial gap is found between the hardware clock (real time clock - A.K.A the RTC, CMOS clock) to the system clock (A.K.A the kernel clock or software clock)${NC}"
1257 fi
1258 elapsed_time_seconds=$(expr $(date +%s) - $start)
1259
1260
1261 ####################################################################################################
1262
1263
1264 start=$(date +%s)
1265 STEPS_COUNTER=$((STEPS_COUNTER+1))
1266 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT overcloud-full.qcow2 IS NOT CORRUPTED (+$elapsed_time_seconds `date '+%T'`)${NC}"
1267 ### based on ICE-2453
1268 overcloud_image_validation=$(sudo qemu-img check /home/stack/images/overcloud-full.qcow2)
1269 overcloud_image_validation_result=$(echo -e "$overcloud_image_validation" | grep 'No errors were found on the image')
1270 if [[ $overcloud_image_validation_result ]]
1271 then
1272 echo -e "${GREEN}$overcloud_image_validation_result${NC}"
1273 else
1274 echo -e "${RED}$overcloud_image_validation${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1275 fi
1276 elapsed_time_seconds=$(expr $(date +%s) - $start)
1277
1278
1279 ####################################################################################################
1280
1281
1282 start=$(date +%s)
1283 STEPS_COUNTER=$((STEPS_COUNTER+1))
1284 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE LEFTOVERS OF REMOVED COMPUTES IN THE UNDERCLOUD AND CONTROLLERS CONF FILES (+$elapsed_time_seconds `date '+%T'`)${NC}"
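### the name of the last scaled-in compute is parsed from /var/log/cbis/remove_node.log on the
### hypervisor; if one is found, the puppet-generated config files on the undercloud and the
### controllers are grepped for references to it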
1285 scaled_in_compute=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/remove_node.log | grep 'node_names with value' | tr \" \" \"\n\"" | grep overcloud- | tr -d "u',[]" | head -n1)
1286 if [[ $scaled_in_compute ]]
1287 then
1288 compute_leftover=$(ansible localhost,controller -b -m shell -a "grep -R -i $scaled_in_compute /var/lib/config-data/puppet-generated/ 2>&1 | grep -v 'No such file or directory'" | grep -i $scaled_in_compute -B 1)
1289 if [[ $compute_leftover ]]
1290 then
1291 echo -e "${RED}$compute_leftover${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1292 else
1293 echo -e "${GREEN}couldn't find left-overs in /var/lib/config-data/puppet-generated for the scaled-in compute $scaled_in_compute${NC}"
1294 fi
1295 else
1296 echo -e "${GREEN}according to /var/log/cbis/remove_node.log on the undercloud physical server, scale-in was never performed on this system${NC}"
1297 fi
1298 elapsed_time_seconds=$(expr $(date +%s) - $start)
1299
1300
1301 ####################################################################################################
1302
1303
1304 start=$(date +%s)
1305 STEPS_COUNTER=$((STEPS_COUNTER+1))
1306 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE SEGMENTATIONS FAULT IN /var/log/messages (+$elapsed_time_seconds `date '+%T'`)${NC}"
1307 fault=$(sshpass -p $hv_cbis_admin_password ansible -k all -b -m shell -a "grep -R -i fault /var/log/message*" | grep -i signal -B 1)
1308 if [[ $fault ]]
1309 then
1310 echo -e "${RED}$fault${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1311 echo -e "\n\n${ORANGE}CBIS-16119 (20) - periodic ceph dashboard segmentation faults on the controllers ${NC}"
1312 else
1313 echo -e "${GREEN}no segmentation faults found in /var/log/messages${NC}"
1314 fi
1315 elapsed_time_seconds=$(expr $(date +%s) - $start)
1316
1317
1318 ####################################################################################################
1319
1320
1321 start=$(date +%s)
1322 STEPS_COUNTER=$((STEPS_COUNTER+1))
1323 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE SOFT/HARD LOCKUPS IN /var/log/messages (+$elapsed_time_seconds `date '+%T'`)${NC}"
1324 soft_lockup=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "grep -i -E 'soft lockup|hard lockup' /var/log/messages | grep -v ansible-command" | grep -E -i 'soft lockup|hard lockup' -B 1)
1325 if [[ $soft_lockup ]]
1326 then
1327 echo -e "${RED}$soft_lockup${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1328 else
1329 echo -e "${GREEN}no soft/hard lockups found in /var/log/messages${NC}"
1330 fi
1331 elapsed_time_seconds=$(expr $(date +%s) - $start)
1332
1333
1334 ####################################################################################################
1335
1336
1337 start=$(date +%s)
1338 STEPS_COUNTER=$((STEPS_COUNTER+1))
1339 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE Traceback LOG LINES (case-insensitive) WITHIN /var/log/ UP TO 10 MINUTES EARLIER (+$elapsed_time_seconds `date '+%T'`)${NC}"
1340 hour1=$(date +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
1341 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
1342 traceback=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -i 'Traceback' | grep -E -v 'ansible-command: Invoked with warn|ansible.log|filebeat' | awk -F: '{print \$1}' | sort | uniq -c | column -t" | grep ^[1-9] -B 1)
1343 if [[ $traceback ]]
1344 then
1345 echo -e "${RED}$traceback${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1346 echo -e "\n\n${ORANGE}CBIS-13698 (19) / CBIS-16371 (19A) - every 10 minutes aodh exception in /var/log/vitrage/collector.log${NC}"
1347 else
1348 echo -e "${GREEN}no traceback logs lines found under /var/log/${NC}"
1349 fi
1350 elapsed_time_seconds=$(expr $(date +%s) - $start)
1351
1352
1353 ####################################################################################################
1354
1355
1356 start=$(date +%s)
1357 STEPS_COUNTER=$((STEPS_COUNTER+1))
1358 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE error, err_code, failure or fatal LOG LINES (case-insensitive) WITHIN /var/log/ UP TO 10 MINUTES EARLIER (+$elapsed_time_seconds `date '+%T'`)${NC}"
1359 hour1=$(date -d "-0 hour" +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
1360 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
1361 errors=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -E -i 'failure|error|err_code|fatal' | grep -E -v 'ansible-command: Invoked with warn|ansible.log|filebeat|, 0 errors,' | awk -F: '{print \$1}' | sort | uniq -c | column -t" | grep ^[1-9] -B 1)
1362 if [[ $errors ]]
1363 then
1364 echo -e "${RED}$errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1365 echo -e "\n\n${ORANGE}CBIS-16403 (19A) - barbican_wsgi_main_error.log continually reports 'access to /var/www/cgi-bin/barbican/main failed'${NC}"
1366 echo -e "${ORANGE}CBIS-16655 (20) - /var/log/rhsm/rhsm.log in the UC throws Certificate update using daemon failed error${NC}"
1367 else
1368 echo -e "${GREEN}no failure, error, err_code or fatal log lines were found under /var/log/${NC}"
1369 fi
1370 elapsed_time_seconds=$(expr $(date +%s) - $start)
1371
1372
1373 ####################################################################################################
1374
1375
1376 start=$(date +%s)
1377 STEPS_COUNTER=$((STEPS_COUNTER+1))
1378 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE Permission denied LOG LINES (case-insensitive) WITHIN /var/log/ UP TO 10 MINUTES EARLIER (+$elapsed_time_seconds `date '+%T'`)${NC}"
1379 hour1=$(date -d "-0 hour" +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
1380 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
1381 errors=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -E -i 'Permission denied' | grep -E -v 'ansible-command: Invoked with warn|ansible.log|filebeat' | awk -F: '{print \$1}' | sort | uniq -c | column -t" | grep ^[1-9] -B 1)
1382 if [[ $errors ]]
1383 then
1384 echo -e "${RED}$errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1385 else
1386 echo -e "${GREEN}no permission denied log lines were found under /var/log/${NC}"
1387 fi
1388 elapsed_time_seconds=$(expr $(date +%s) - $start)
1389
1390
1391 ####################################################################################################
1392
1393
1394 start=$(date +%s)
1395 STEPS_COUNTER=$((STEPS_COUNTER+1))
1396 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR ERRORS IN THE NOVA LOCAL STORAGE SERVICE cbis_local_storage_filesystem_remount${NC}"
1397 # local_enable_check=$(cat user_config.yaml | grep 'enable_local_storage: true')
1398 for host in $aggregate_hosts
1399 do
1400 user_config_json=$(cat /home/stack/templates/user_config.json | jq .CBIS[].$host)
1401 enable_local_storage=$(echo -e "$user_config_json" | grep '"enable_local_storage": true,')
1402 if [[ $enable_local_storage ]]
1403 then
1404 local_storage_devices=$(echo -e "$user_config_json" | grep local_storage_devices -A 1 | awk NR==2 | tr -d \" | column -t)
1405 echo -e "${CYAN}local storage is enabled on the $host host-group on disk $local_storage_devices${NC}"
1406 local_storage_enabled=true
1407 fi
1408 done
1409 if [[ $local_storage_enabled = "true" ]]
1410 then
1411 local_storage_nodes=$(ansible compute -b -m shell -a "systemctl --all | grep cbis_local_storage_filesystem_remount.service" | grep SUCCESS | awk '{print $1}' | paste -sd ,)
1412 if [[ $local_storage_nodes ]]
1413 then
1414 local_storage=$(ansible $local_storage_nodes -b -m shell -a "journalctl -axu cbis_local_storage_filesystem_remount.service | grep -E -i 'Error|Failed|cannot open|cannot stat'" | grep -E -v 'non-zero return code|FAILED')
1415 if [[ $local_storage ]]
1416 then
1417 echo -e "${RED}$local_storage${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1418 else
1419 echo -e "${GREEN}no errors found in journalctl -u cbis_local_storage_filesystem_remount.service${NC}"
1420 fi
1421 else
1422 echo -e "${RED}couldn't find the systemd service cbis_local_storage_filesystem_remount on the computes which has local storage enabled\nconnect them computes and check /var/log/cbis/cbis.pre_deploy for local storage configuration exception(s)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1423 fi
1424 else
1425 echo -e "${GREEN}nova local storage is not enabled for any host-group (according to the user_config.yaml)${NC}"
1426 fi
1427
1428
1429 ####################################################################################################
1430
1431
1432 start=$(date +%s)
1433 STEPS_COUNTER=$((STEPS_COUNTER+1))
1434 echo -e "${BLUE}\n\n$STEPS_COUNTER) SEARCH FOR DUPLICATED LINES IN haproxy.cfg WITHIN THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1435 DUPLICATED_HAPROXY_LINES=$(ansible controller -m shell -b -a "cat /var/lib/config-data/puppet-generated/haproxy/etc/haproxy/haproxy.cfg | grep -E '^\s+server' | sort | uniq -c" | grep -E '^\s+[2-9]' -B 1)
1436 if [[ $DUPLICATED_HAPROXY_LINES ]]
1437 then
1438 echo -e "${RED}$DUPLICATED_HAPROXY_LINES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1439 else
1440 echo -e "${GREEN}no duplicated lines found in haproxy.cfg within the controllers${NC}"
1441 fi
1442 elapsed_time_seconds=$(expr $(date +%s) - $start)
1443
1444
1445 ####################################################################################################
1446
1447
1448 start=$(date +%s)
1449 STEPS_COUNTER=$((STEPS_COUNTER+1))
1450 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE OVERCLOUD HEAT STACK(S) STATUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1451 stack_status=$(source ~/overcloudrc && openstack stack list)
1452 if [[ $stack_status ]]
1453 then
1454 stack_status=$(source ~/overcloudrc && openstack stack list -f value | grep -v COMPLETE)
1455 if [[ -z $stack_status ]]
1456 then
1457 echo -e "${GREEN}all the heat stacks are in COMPLETED status${NC}"
1458 else
1459 stack_status=$(source ~/overcloudrc && openstack stack list)
1460 echo -e "${RED}$stack_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1461 fi
1462 else
1463 echo -e "${ORANGE}no heat stacks found in the overcloud${NC}"
1464 fi
1465 elapsed_time_seconds=$(expr $(date +%s) - $start)
1466
1467
1468 ####################################################################################################
1469
1470
1471 start=$(date +%s)
1472 STEPS_COUNTER=$((STEPS_COUNTER+1))
1473 echo -e "${BLUE}\n\n$STEPS_COUNTER) PACEMAKER STATUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1474 pcs_status=$(ansible $last_index_controller -b -m shell -a "pcs resource" | grep -E -i 'DISABLED|Stopped|Stopping|unmanaged|FAILED|blocked|OFFLINE|promote|Recover|Starting|error|Monitoring' | grep -v $last_index_controller)
1475 if [[ -z $pcs_status ]]
1476 then
1477 echo -e "${GREEN}no DISABLED, Stopped, Stopping, unmanaged, FAILED, blocked, OFFLINE, promote, Recover, Starting, error or Monitoring keywords found in pcs status${NC}"
1478 else
1479 echo -e "${RED}$pcs_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1480 fi
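### on top of the keyword scan above, the galera and redis bundles are verified explicitly:
### galera is expected to report 3 masters (one per controller) and redis exactly 1 master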
1481 if [[ $cbis_version != "18.0.0.1" ]]
1482 then
1483 galera_masters_count=$(ansible $last_index_controller -b -m shell -a "pcs resource show galera-bundle | grep masters= | awk '{print \$3}' | awk -F= '{print \$2}'" | grep ^[0-9])
1484 if [[ $galera_masters_count == "3" ]]
1485 then
1486 echo -e "${GREEN}all the 3 galera members are showing master state as expected${NC}"
1487 else
1488 echo -e "${RED}one or more galera members are not in master state${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1489 fi
1490 redis_masters_count=$(ansible $last_index_controller -b -m shell -a "pcs resource show redis-bundle | grep masters= | awk '{print \$3}' | awk -F= '{print \$2}'" | grep ^[0-9])
1491 if [[ $redis_masters_count == "1" ]]
1492 then
1493 echo -e "${GREEN}1 redis member is showing master state as expected${NC}"
1494 else
1495 echo -e "${RED}expecting 1 redis master but got $redis_masters_count masters${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1496 fi
1497 fi
1498 if [[ $cbis_version == "18.0.0.1" ]]
1499 then
1500 galera_masters_count=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep 'galera-master' -A1 | grep Masters: | awk '{OFS=RS;\$1=\$1}1'" | grep -c overcloud-controller)
1501 if [[ $galera_masters_count == "3" ]]
1502 then
1503 echo -e "${GREEN}all the 3 galera members are showing master state as expected${NC}"
1504 else
1505 echo -e "${RED}one or more galera members are not in master state${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1506 fi
1507 redis_masters_count=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep 'redis-master' -A1 | grep Masters: | awk '{OFS=RS;\$1=\$1}1'" | grep -c overcloud-controller)
1508 if [[ $redis_masters_count == "1" ]]
1509 then
1510 echo -e "${GREEN}1 redis member is showing master state as expected${NC}"
1511 else
1512 echo -e "${RED}expecting 1 redis master but got $redis_masters_count masters${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1513 fi
1514 fi
1515 elapsed_time_seconds=$(expr $(date +%s) - $start)
1516
1517
1518 ####################################################################################################
1519
1520
1521 start=$(date +%s)
1522 STEPS_COUNTER=$((STEPS_COUNTER+1))
1523 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DISABLED RESOURCES IN PACEMAKER CONSTRAINTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1524 pcs_disabled_constraints=$(ansible $last_index_controller -b -m shell -a "pcs constraint" | grep -i Disabled -B 1)
1525 if [[ $pcs_disabled_constraints ]]
1526 then
1527 echo -e "${RED}$pcs_disabled_constraints${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1528 else
1529 echo -e "${GREEN}no disabled pacemaker resources found${NC}"
1530 fi
1531
1532
1533 ####################################################################################################
1534
1535
1536 start=$(date +%s)
1537 STEPS_COUNTER=$((STEPS_COUNTER+1))
1538 echo -e "${BLUE}\n\n$STEPS_COUNTER) PACEMAKER RESOURCES FAILED ACTIONS HISTORY (+$elapsed_time_seconds `date '+%T'`)${NC}"
1539 pcs_failed_actions=$(ansible $last_index_controller -b -m shell -a "pcs status | awk '/Failed Actions:/,/Daemon Status:/' | grep -v 'Daemon Status:'" | grep -v -E 'FAILED|non-zero return code|SUCCESS')
1540 if [[ $pcs_failed_actions ]]
1541 then
1542 echo -e "${RED}$pcs_failed_actions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1543 else
1544 echo -e "${GREEN}couldn't find failed actions in pcs status${NC}"
1545 fi
1546
1547
1548 ####################################################################################################
1549
1550
1551 start=$(date +%s)
1552 STEPS_COUNTER=$((STEPS_COUNTER+1))
1553 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT SELINUX IS ENABLED AND ENFORCING (ALSO CHECKS IF SECURITY HARDENING IS ENABLED) (+$elapsed_time_seconds `date '+%T'`)${NC}"
1554 selinux=$(ansible all --limit '!hypervisor' -b -m shell -a "sestatus | grep -E 'SELinux status:|Current mode:'" | grep -Ev 'SELinux status:\s+enabled|Current mode:\s+enforcing|SUCCESS')
1555 if [[ $selinux ]]
1556 then
1557 selinux=$(ansible all --limit '!hypervisor' -b -m shell -a "sestatus | grep -E 'SELinux status:\s+disabled|Current mode:\s+permissive'" | grep -E -v 'FAILED|non-zero return code')
1558 echo -e "${RED}selinux permissive mode is usually a case of not applying security hardening on the host(s)${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1559 echo -e "${RED}$selinux${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1560 else
1561 echo -e "${GREEN}selinux is enabled and enforcing on all the hosts${NC}"
1562 fi
1563 elapsed_time_seconds=$(expr $(date +%s) - $start)
1564
1565
1566 ####################################################################################################
1567
1568
1569 start=$(date +%s)
1570 STEPS_COUNTER=$((STEPS_COUNTER+1))
1571 date=$(date '+%x %T' | cut -d: -f1-2 | sed 's/.$//')
1572 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE SELINUX Permission denied ERRORS IN /var/log/audit/audit.log WITH THE TIMESTAMPS ["$date"] WITHIN THE UNDERCLOUD VM (+$elapsed_time_seconds `date '+%T'`)${NC}"
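### ausearch is limited to AVC/SELinux error record types and to entries whose interpreted
### timestamp falls inside the current 10-minute window ($date), grouping the offending
### processes (proctitle) with a count per process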
1573 ausearch=$(ausearch -m AVC,USER_AVC,SELINUX_ERR,USER_SELINUX_ERR -i 2>&1 | grep 'Permission denied' -B 1 | grep "$date" | awk -F'proctitle=' '{print $2}' | column | sort | uniq -c)
1574 if [[ $ausearch ]]
1575 then
1576 echo -e "${RED}$ausearch${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1577 else
1578 echo -e "${GREEN}no Permission denied errors found in the audit logs${NC}"
1579 fi
1580 elapsed_time_seconds=$(expr $(date +%s) - $start)
1581
1582
1583 ####################################################################################################
1584
1585
1586 start=$(date +%s)
1587 STEPS_COUNTER=$((STEPS_COUNTER+1))
1588 echo -e "${BLUE}\n\n$STEPS_COUNTER) DNS RESOLUTION VALIDATION (+$elapsed_time_seconds `date '+%T'`)${NC}"
1589 dns=$(python /usr/share/cbis/undercloud/tools/dns_validation.py)
1590 if [[ -z $dns ]]
1591 then
1592 echo -e "${GREEN}DNS resolution succeeded${NC}"
1593 else
1594 echo -e "${RED}DNS resolution failed${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1595 echo -e "${RED}$dns${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1596 fi
1597 elapsed_time_seconds=$(expr $(date +%s) - $start)
1598
1599
1600 ####################################################################################################
1601
1602
1603 start=$(date +%s)
1604 STEPS_COUNTER=$((STEPS_COUNTER+1))
1605 echo -e "${BLUE}\n\n$STEPS_COUNTER) NTP (timedatectl) SYNCHRONIZATION CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
1606 timedatectl=$(ansible all --limit '!hypervisor' -b -m shell -a "timedatectl" | grep 'NTP synchronized: no' -B 6 | grep SUCCESS | awk '{print $1}')
1607 if [[ $timedatectl ]]
1608 then
1609 echo -e "${RED}timedatectl returned \"NTP synchronized: no\" for the following hosts:\n$timedatectl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1610 else
1611 echo -e "${GREEN}all hosts are synchronized (timedatectl)${NC}"
1612 fi
1613 elapsed_time_seconds=$(expr $(date +%s) - $start)
1614
1615
1616 ####################################################################################################
1617
1618
1619 start=$(date +%s)
1620 STEPS_COUNTER=$((STEPS_COUNTER+1))
1621 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK BOND INTERFACES STATE ON EACH OVERCLOUD HOST (+$elapsed_time_seconds `date '+%T'`)${NC}"
1622 echo -e "${CYAN}validating all the openvswitch bonds (ovs-appctl bond/list)${NC}"
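### every ovs bond is listed and its members' states are collected; any member state other than
### "enabled" is treated as a failure (linux bonds are checked separately below via
### /proc/net/bonding, where every member must report "MII Status: up")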
1623 if [[ $nuage != "true" ]]
1624 then
1625 bond_status=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl bond/list | grep - | awk '{print \$1}' | xargs -i ovs-appctl bond/show {}" | grep ^slave | awk '{print $3}' | sort -u)
1626 if [[ $bond_status != "enabled" ]]
1627 then
1628 bond_status=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl bond/list | grep - | awk '{print \$1}' | xargs -i ovs-appctl bond/show {} | grep ^slave")
1629 echo -e "${RED}$bond_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1630 else
1631 echo -e "${GREEN}all openvswitch bond interfaces returned enabled${NC}"
1632 fi
1633 else
1634 echo -e "${ORANGE}openvswitch bond interfaces are invalid when the setup is deployed with nuage${NC}"
1635 fi
1636 echo -e "${CYAN}validating all the linux bonds (cat /proc/net/bonding/..)${NC}"
1637 bond_status=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ls /proc/net/bonding/ | grep -v bond0 | xargs -i cat /proc/net/bonding/{}" | grep 'MII Status:' | awk '{print $3}' | sort -u)
1638 if [[ $bond_status != "up" ]]
1639 then
1640 bond_status=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ls /proc/net/bonding/ | grep -v bond0 | xargs -i cat /proc/net/bonding/{}" | grep 'MII Status:')
1641 echo -e "${RED}$bond_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1642 else
1643 echo -e "${GREEN}all openvswitch bond interfaces returned enabled${NC}"
1644 fi
1645 elapsed_time_seconds=$(expr $(date +%s) - $start)
1646
1647
1648 ####################################################################################################
1649
1650
1651 start=$(date +%s)
1652 STEPS_COUNTER=$((STEPS_COUNTER+1))
1653 echo -e "${BLUE}\n\n$STEPS_COUNTER) CEPH HEALTH CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
1654 if [[ $ceph_backend == "true" ]]
1655 then
1656 ceph_health_detail=$(ansible $last_index_controller -b -m shell -a "ceph health detail" | grep -v $last_index_controller)
1657 if [[ $ceph_health_detail == "HEALTH_OK" ]]
1658 then
1659 echo -e "${GREEN}ceph health is ok${NC}"
1660 else
1661 ceph_status=$(ansible $last_index_controller -b -m shell -a "ceph -s" | grep -v $last_index_controller)
1662 echo -e "${RED}$ceph_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1663 fi
1664 else
1665 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
1666 fi
1667 elapsed_time_seconds=$(expr $(date +%s) - $start)
1668
1669
1670 ####################################################################################################
1671
1672
1673 start=$(date +%s)
1674 STEPS_COUNTER=$((STEPS_COUNTER+1))
1675 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK CONNECTVITY TO ALL THE INFRA ADDRESSES (+$elapsed_time_seconds `date '+%T'`)${NC}"
1676 if [[ $internal_api_controller_address ]]
1677 then
1678 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $internal_api_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1679 if [[ -z $ping ]]
1680 then
1681 echo -e "${GREEN}$internal_api_controller_address is reachable from all the overcloud servers${NC}"
1682 else
1683 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $internal_api_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1684 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1685 fi
1686 else
1687 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1688 fi
1689 if [[ $tenant_controller_address ]]
1690 then
1691 if [[ $ceph_backend == "true" ]]
1692 then
1693 ping=$(ansible all --limit '!hypervisor,!localhost,!CephStorage' -b -m shell -a "ping -c 2 $tenant_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1694 if [[ -z $ping ]]
1695 then
1696 echo -e "${GREEN}$tenant_controller_address is reachable from all the overcloud servers${NC}"
1697 else
1698 ping=$(ansible all --limit '!hypervisor,!localhost,!CephStorage' -b -m shell -a "ping -c 2 $tenant_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1699 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1700 fi
1701 else
1702 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $tenant_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1703 if [[ -z $ping ]]
1704 then
1705 echo -e "${GREEN}$tenant_controller_address is reachable from all the overcloud servers${NC}"
1706 else
1707 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $tenant_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1708 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1709 fi
1710 fi
1711 else
1712 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1713 fi
1714 if [[ $storage_controller_address ]]
1715 then
1716 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $storage_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1717 if [[ -z $ping ]]
1718 then
1719 echo -e "${GREEN}$storage_controller_address is reachable from all the overcloud servers${NC}"
1720 else
1721 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $storage_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1722 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1723 fi
1724 else
1725 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1726 fi
1727 if [[ $storage_mgmt_controller_address ]]
1728 then
1729 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $storage_mgmt_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1730 if [[ -z $ping ]]
1731 then
1732 echo -e "${GREEN}$storage_mgmt_controller_address is reachable from all the overcloud servers${NC}"
1733 else
1734 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $storage_mgmt_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1735 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1736 fi
1737 else
1738 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1739 fi
1740 if [[ $provisioning_controller_address ]]
1741 then
1742 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $provisioning_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1743 if [[ -z $ping ]]
1744 then
1745 echo -e "${GREEN}$provisioning_controller_address is reachable from all the overcloud servers${NC}"
1746 else
1747 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $provisioning_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1748 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1749 fi
1750 else
1751 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1752 fi
1753 elapsed_time_seconds=$(expr $(date +%s) - $start)
1754
1755
1756 ####################################################################################################
1757
1758
1759 start=$(date +%s)
1760 STEPS_COUNTER=$((STEPS_COUNTER+1))
1761 echo -e "${BLUE}\n\n$STEPS_COUNTER) KIBANA DASHBOARDS CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
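### the kibana saved_objects API is queried through the public VIP; on current versions every
### returned dashboard title must match the fixed_dashboards whitelist below, while on
### 18.0.0.1/19.0.0.1 only basic accessibility of the dashboards endpoint is verified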
1762 if [[ $elk == "true" && $elk_deployment_type == "local" ]]
1763 then
1764 if [[ $cbis_version != "18.0.0.1" && $cbis_version != "19.0.0.1" ]]
1765 then
1766 fixed_dashboards=(Cloud-ErrorsDashboard CephOverview Instance-Spawn-Fail-All-Clouds-Table-Cumulative-Sum [MetricbeatSystem]Hostoverview Openstack-Overview Requests-Dashboard Instance-Spawning-Failure Instance-Vtop Cloud-Usage ipmitoolforcloud [MetricbeatSystem]Overview Openstack-InstanceView [MetricbeatHAProxy]Backend [MetricbeatHAProxy]Frontend [MetricbeatHAProxy]HTTPbackend [MetricbeatHAProxy]HTTPfrontend [MetricbeatHAProxy]HTTPserver [MetricbeatHAProxy]Overview)
1767 dashboards=$(curl -g -s -L -X GET 'https://'$PublicURL'/api/saved_objects/?type=dashboard&' -H 'Content-Type: application/json, text/plain, */*' -H 'Authorization: Basic '$kibana_basic_auth'' -H 'Cookie: SERVERID='$last_index_controller'.internalapi.localdomain' --data '' | jq . | grep \"title\": | awk -F: '{print $2}' | tr -d '," ' | paste -sd " ")
1768 for dashboard in $dashboards
1769 do
1770 if [[ " ${fixed_dashboards[@]} " =~ "$dashboard" ]]
1771 then
1772 echo -e "${GREEN}$dashboard${NC}"
1773 else
1774 echo -e "${RED}$dashboard${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1775 fi
1776 done
1777 else
1778 dashboards=$(curl -g -s -L -X GET 'https://'$PublicURL'/api/saved_objects/?type=dashboard&' -H 'Content-Type: application/json, text/plain, */*' -H 'Authorization: Basic '$kibana_basic_auth'' -H 'Cookie: SERVERID='$last_index_controller'.internalapi.localdomain' --data '' | jq . | grep \"page\": | awk '{print $2}' | tr -d ,)
1779 if [[ $dashboards == "1" ]]
1780 then
1781 echo -e "${GREEN}kibana dashboard is accessible${NC}"
1782 else
1783 dashboards=$(curl -g -s -L -X GET 'https://'$PublicURL'/api/saved_objects/?type=dashboard&' -H 'Content-Type: application/json, text/plain, */*' -H 'Authorization: Basic '$kibana_basic_auth'' -H 'Cookie: SERVERID='$last_index_controller'.internalapi.localdomain' --data '' | jq .)
1784 echo -e "${RED}$dashboard${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1785 fi
1786 fi
1787 else
1788 echo -e "${ORANGE}CBIS is deployed without ELK or ELK type is remote${NC}"
1789 fi
1790 elapsed_time_seconds=$(expr $(date +%s) - $start)
1791
1792
1793 ####################################################################################################
1794
1795
1796 start=$(date +%s)
1797 STEPS_COUNTER=$((STEPS_COUNTER+1))
1798 echo -e "${BLUE}\n\n$STEPS_COUNTER) ZABBIX ALARMS (+$elapsed_time_seconds `date '+%T'`)${NC}"
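### trigger.get is called over the zabbix JSON-RPC API for triggers currently in problem state
### (value: 1); known benign triggers (/etc/passwd changed, host information changed, host
### restarted) and the dummy hosts are filtered out with jq before anything is reported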
1799 zabbix_problem_triggers=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
1800 -H 'Content-Type: application/json-rpc' \
1801 -H 'Cookie: SERVERID='$last_index_controller'' \
1802 --data '{
1803 "jsonrpc": "2.0",
1804 "method": "trigger.get",
1805 "params": {
1806 "output": [
1807 "description"
1808 ],
1809 "filter": {
1810 "value": 1
1811 },
1812 "sortfield": "hostname",
1813 "sortorder": "DESC"
1814 },
1815 "auth": '$zabbix_auth',
1816 "id": 1
1817 }' | jq .[] | grep -v '^[0-9]' | grep -v '^"'| jq .[] | jq 'select(.description != "/etc/passwd has been changed on {HOST.NAME}")' | jq 'select(.description != "Host information was changed on {HOST.NAME}")' | jq 'select(.description != "{HOST.NAME} has just been restarted")' | jq 'select(.hostname != "dummy")' | jq 'select(.hostname != "dummy_switch")')
1818 if [[ $zabbix_problem_triggers ]]
1819 then
1820 echo -e "${RED}$zabbix_problem_triggers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1821 echo -e "\n\n${ORANGE}CBIS-15886 - zabbix alarms still existing after scale-in/replace-controller operations (19.0)${NC}"
1822 else
1823 echo -e "${GREEN}no active alarms found in zabbix${NC}"
1824 fi
1825 zabbix_problem_triggers_count=$(echo -e "$zabbix_problem_triggers" | grep -c description)
1826 elapsed_time_seconds=$(expr $(date +%s) - $start)
1827
1828
1829 ####################################################################################################
1830
1831
1832 start=$(date +%s)
1833 STEPS_COUNTER=$((STEPS_COUNTER+1))
1834 hours="48"
1835 echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT ZABBIX EVENT PROBLEMS FROM THE PAST $hours HOURS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1836 epoch_time_from=$(date +%s -d "-$hours hour")
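### problem.get returns the zabbix problem events since $epoch_time_from (i.e. the past $hours
### hours); the event names are de-duplicated and the recurring /etc/passwd notification is dropped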
1837 zabbix_history=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
1838 -H 'Content-Type: application/json-rpc' \
1839 -H 'Cookie: SERVERID='$last_index_controller'.internalapi.localdomain' \
1840 --data '{
1841 "jsonrpc": "2.0",
1842 "method": "problem.get",
1843 "params": {
1844 "output": "extend",
1845 "selectAcknowledges": "extend",
1846 "selectTags": "extend",
1847 "time_from": "'$epoch_time_from'",
1848 "selectSuppressionData": "extend"
1849 },
1850 "auth": '$zabbix_auth',
1851 "id": 1
1852 }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '",' | sed 's/^[[:space:]]\+//' | sort -u | grep -v '/etc/passwd has been changed')
1853 if [[ $zabbix_history ]]
1854 then
1855 echo -e "${RED}$zabbix_history\n\n\n${ORANGE}Please log-in to the zabbix portal and acknowledge the problems history under Monitoring > Problems > History, set the filter timestamps as required and Apply" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1856 else
1857 echo -e "${GREEN}no problem events were found in the past $hours hours${NC}"
1858 fi
1859 elapsed_time_seconds=$(expr $(date +%s) - $start)
1860
1861
1862 ####################################################################################################
1863
1864
1865 start=$(date +%s)
1866 STEPS_COUNTER=$((STEPS_COUNTER+1))
1867 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT /var/log/zabbix/metrics/last_run.status AND /var/log/zabbix/metrics/last_KPIs_run.status ARE EMPTY (+$elapsed_time_seconds `date '+%T'`)${NC}"
1868 metrics_status=$(ansible controller -b -m shell -a "du -b /var/log/zabbix/metrics/last_KPIs_run.status /var/log/zabbix/metrics/last_run.status | awk '{ if ( \$1 != 0 ) print }'" | grep -E ^[1-9] -B 1)
1869 if [[ $metrics_status ]]
1870 then
1871 echo -e "${RED}$metrics_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1872 else
1873 echo -e "${GREEN}/var/log/zabbix/metrics/last_run.status and /var/log/zabbix/metrics/last_KPIs_run.status are empty (no erros) on all the controllers${NC}"
1874 fi
1875 elapsed_time_seconds=$(expr $(date +%s) - $start)
1876
1877
1878 ####################################################################################################
1879
1880
1881 start=$(date +%s)
1882 STEPS_COUNTER=$((STEPS_COUNTER+1))
1883 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT THE ZABBIX zbx_metrics AND THE ZABBIX KPI FILES ARE CREATED ON TIME AND DOESN'T CONTAIN ERRORS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1884 zabbix_server_controller=$(ansible $last_index_controller -b -m shell -a "pcs resource" | grep zabbix-server | awk '{print $NF}')
1885 echo -e "${CYAN}checking the zbx_metrics files on $zabbix_server_controller${NC}"
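### the newest zbx_metrics file (excluding *inprogress*) is located on the zabbix-server
### controller; its mtime must be no older than 15 minutes and its content must be free of
### error keywords, otherwise the exporter is considered broken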
1886 latest_zabbix_metrics_exporter_file=$(ansible $zabbix_server_controller -b -m shell -a "ls -lrt /var/log/zabbix/metrics/ | grep zbx_metrics | grep -v inprogress | tail -n1" | grep zbx_metrics)
1887 if [[ -z $latest_zabbix_metrics_exporter_file ]]
1888 then
1889 echo -e "${RED}can't find any zbx_metrics.xml files under /var/log/zabbix/metrics/ in $zabbix_server_controller${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1890 else
1891 latest_zabbix_metrics_exporter_file_name=$(echo -e "$latest_zabbix_metrics_exporter_file" | awk '{print $NF}')
1892 latest_zabbix_metrics_exporter_file_date=$(echo -e "$latest_zabbix_metrics_exporter_file" | awk '{print $6,$7,$8}' | xargs -i date -d '{}' +%s)
1893 current_epoch_date=$(date +%s)
1894 delta_minutes=$(expr $current_epoch_date - $latest_zabbix_metrics_exporter_file_date | xargs -i expr {} / 60)
1895 if [ $delta_minutes -gt 15 ]
1896 then
1897 echo -e "${RED}$latest_zabbix_metrics_exporter_file_name is from before $delta_minutes minutes${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1898 else
1899 errors=$(ansible $zabbix_server_controller -b -m shell -a "cat /var/log/zabbix/metrics/$latest_zabbix_metrics_exporter_file_name" | grep -E -iw 'unable|error|fatal|fail|exception|traceback|denied|warning')
1900 if [[ $errors ]]
1901 then
1902 echo -e "${RED}found errors in $latest_zabbix_metrics_exporter_file_name\n\n$errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1903 echo -e "\n\n${ORANGE}CBIS-15330 (19.0) - OSError: [Errno 2] No such file or directory: /var/lib/cbis/inventory${NC}"
1904 echo -e "${ORANGE}CBIS-16385 (19A) - zbx_metrics.csv error: /var/log/zabbix/services_names.txt: Permission denied${NC}"
1905
1906 else
1907 echo -e "${GREEN}$latest_zabbix_metrics_exporter_file_name was created in the expected time and doesn't contain errors${NC}"
1908 fi
1909 fi
1910 fi
1911 if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
1912 then
1913 echo -e "${CYAN}checking the KPIs files on $zabbix_server_controller${NC}"
1914 latest_zabbix_kpi_file=$(ansible $zabbix_server_controller -b -m shell -a "ls -lrt /var/log/zabbix/metrics/ | grep KPIs | grep -v inprogress | tail -n1" | grep KPIs)
1915 if [[ -z $latest_zabbix_kpi_file ]]
1916 then
1917 echo -e "${RED}can't find any KPIs files under /var/log/zabbix/metrics/ in $zabbix_server_controller${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1918 else
1919 latest_zabbix_kpi_file_name=$(echo -e "$latest_zabbix_kpi_file" | awk '{print $NF}')
1920 latest_zabbix_kpi_file_date=$(echo -e "$latest_zabbix_kpi_file" | awk '{print $6,$7,$8}' | xargs -i date -d '{}' +%s)
1921 current_epoch_date=$(date +%s)
1922 delta_hours=$(expr $current_epoch_date - $latest_zabbix_kpi_file_date | xargs -i expr {} / 60 / 60)
1923 if [ $delta_hours -gt 24 ]
1924 then
1925 echo -e "${RED}$latest_zabbix_kpi_file_name is from before $delta_hours hours${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1926 else
1927 errors=$(ansible $zabbix_server_controller -b -m shell -a "cat /var/log/zabbix/metrics/$latest_zabbix_kpi_file_name" | grep -E -iw 'unable|error|fatal|fail|exception|traceback|denied|warning')
1928 if [[ $errors ]]
1929 then
1930 echo -e "${RED}found errors in $latest_zabbix_kpi_file_name\n\n$errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1931 else
1932 echo -e "${GREEN}$latest_zabbix_kpi_file_name was created in the expected time and doesn't contain errors${NC}"
1933 fi
1934 fi
1935 fi
1936 fi
1937 elapsed_time_seconds=$(expr $(date +%s) - $start)
1938
1939
1940 ####################################################################################################
1941
1942
1943 start=$(date +%s)
1944 STEPS_COUNTER=$((STEPS_COUNTER+1))
1945 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE GLOBAL MACRO PRESENTED IN ZABBIX ARE AS CONFIGURED IN THE SYSTEM${NC}"
1946 ZABBIX_GLOBAL_MACROS_MYSQL=$(ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT * FROM zabbixdb.globalmacro\"" | column -t)
1947 ALL_CONTROLLERS_IP_ADDRESSES=$(ansible controller -b -m shell -a "ip address show" | grep -o '[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}' | sort | uniq)
1948 CONTROLLER_DNS_ADDRESSES=$(ansible $last_index_controller -b -m shell -a "cat /etc/resolv.conf" | grep -E '^nameserver\s+[0-2]' | awk '{print $NF}')
1949 ZABBIX_IP_MANAGEMENT_MACRO_ADDRESSES=$(echo -e "$ZABBIX_GLOBAL_MACROS_MYSQL" | grep IP_MANAGEMENT | awk '{print $NF}' | sort -n | uniq)
1950 ZABBIX_DNS_MACRO_ADDRESSES=$(echo -e "$ZABBIX_GLOBAL_MACROS_MYSQL" | grep DNS | awk '{print $NF}' | sort -n | uniq)
1951 for DNS in $ZABBIX_DNS_MACRO_ADDRESSES
1952 do
1953 IP_CHECK=$(echo -e "$CONTROLLER_DNS_ADDRESSES" | grep $DNS)
1954 if [[ -z $IP_CHECK ]]
1955 then
1956 echo -e "${RED}the global DNS macro $DNS is not found under /etc/resolv.conf of $last_index_controller${NC}"
1957 else
1958 echo -e "${GREEN}the global DNS macro $DNS is found under /etc/resolv.conf of $last_index_controller${NC}"
1959 fi
1960 done
1961 for IP_MANAGEMENT in $ZABBIX_IP_MANAGEMENT_MACRO_ADDRESSES
1962 do
1963 IP_CHECK=$(echo -e "$ALL_CONTROLLERS_IP_ADDRESSES" | grep $IP_MANAGEMENT)
1964 if [[ -z $IP_CHECK ]]
1965 then
1966 echo -e "${RED}the global IP_MANAGEMENT macro $IP_MANAGEMENT is not found in any of the controllers (ip a)${NC}"
1967 else
1968 echo -e "${GREEN}the global IP_MANAGEMENT macro $IP_MANAGEMENT is found in the controllers (ip a)${NC}"
1969 fi
1970 done
1971 elapsed_time_seconds=$(expr $(date +%s) - $start)
1972
1973
1974 ####################################################################################################
1975
1976
1977 start=$(date +%s)
1978 STEPS_COUNTER=$((STEPS_COUNTER+1))
1979 echo -e "${BLUE}\n\n$STEPS_COUNTER) SYSTEMD SERVICES CHECK ON THE OVERCLOUD EXCLUDING 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd|openstack-ironic|cloud-final' (+$elapsed_time_seconds `date '+%T'`)${NC}"
1980 if [[ ! -f "initial_overcloud_servers_systemctl_output.txt" ]]
1981 then
1982 ansible all --limit '!hypervisor,!localhost' -b -m shell -a "systemctl list-units --all --no-pager" > initial_overcloud_servers_systemctl_output.txt
1983 ansible all --limit '!hypervisor,!localhost' -b -m shell -a "systemctl list-units --all --no-pager > initial_systemctl_output.txt" > /dev/null
1984 fi
1985 systemd_overcloud=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "systemctl list-units --all --no-pager | grep failed | grep -E -v 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd|openstack-ironic|cloud-final'" | grep -E -v 'FAILED|non-zero return code|overcloud|localhost')
1986 if [[ -z $systemd_overcloud ]]
1987 then
1988 echo -e "${GREEN}no failed systemd services found${NC}"
1989 else
1990 systemd_overcloud=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "systemctl list-units --all --no-pager | grep failed | grep -E -v 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd|openstack-ironic|cloud-final'" | grep -E -v 'FAILED|non-zero return code')
1991 echo -e "${RED}$systemd_overcloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1992 echo -e "\n\n${ORANGE}CBIS-14217 - httpd service on the controllers after security hardening (19.0)${NC}"
1993 fi
1994 elapsed_time_seconds=$(expr $(date +%s) - $start)
1995
1996
1997 ####################################################################################################
1998
1999
2000 start=$(date +%s)
2001 STEPS_COUNTER=$((STEPS_COUNTER+1))
2002 echo -e "${BLUE}\n\n$STEPS_COUNTER) SYSTEMD SERVICES CHECK ON THE UNDERCLOUD EXCLUDING 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd' (+$elapsed_time_seconds `date '+%T'`)${NC}"
2003 if [[ ! -f "initial_undercloud_systemctl_output.txt" ]]
2004 then
2005 sudo systemctl list-units --all --no-pager > initial_undercloud_systemctl_output.txt
2006 fi
2007 systemd_undercloud=$(sudo systemctl list-units --all --no-pager | grep failed | grep -E -v 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd')
2008 if [[ -z $systemd_undercloud ]]
2009 then
2010 echo -e "${GREEN}no failed systemd services found${NC}"
2011 else
2012 systemd_undercloud=$(sudo systemctl list-units --all --no-pager | grep failed | grep -E -v 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd')
2013 echo -e "${RED}$systemd_undercloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2014 fi
2015 elapsed_time_seconds=$(expr $(date +%s) - $start)
2016
2017
2018 ####################################################################################################
2019
2020
2021 start=$(date +%s)
2022 STEPS_COUNTER=$((STEPS_COUNTER+1))
2023 echo -e "${BLUE}\n\n$STEPS_COUNTER) DOCKER CONTAINERS STATUS CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2024 if [[ ! -f "initial_docker_containers_output.txt" ]]
2025 then
2026 ansible all --limit '!hypervisor' -b -m shell -a "docker ps" > initial_docker_containers_output.txt
2027 fi
2028 if [[ $ceph_backend == "false" ]]
2029 then
2030 failed_docker_containers=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps -a | grep -E -i 'Failed|unhealthy|restarting|starting|Exited' | grep -Fv -e 'ceilometer' -e 'gnocchi' -e 'aodh' -e 'Exited (0)' -e 'elk-curator' -e 'manila' -e 'Exited (255)'" | grep ^[0-9,a-f] -B 1)
2031 if [[ $failed_docker_containers ]]
2032 then
2033 echo -e "${RED}$failed_docker_containers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2034 else
2035 echo -e "${GREEN}no unhealthy, Exited, restarting, starting docker containers are found${NC}"
2036 fi
2037 elif [[ $ceph_backend == "true" ]]
2038 then
2039 failed_docker_containers=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps -a | grep -E -i 'Failed|unhealthy|restarting|starting|Exited' | grep -Fv -e 'Exited (0)' -e 'ceilometer' -e 'gnocchi' -e 'aodh' -e 'elk-curator' -e 'Exited (255)'" | grep ^[0-9,a-f] -B 1)
2040 if [[ $failed_docker_containers ]]
2041 then
2042 echo -e "${RED}$failed_docker_containers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2043 else
2044 echo -e "${GREEN}no unhealthy, Exited, restarting, starting docker containers are found${NC}"
2045 fi
2046 fi
2047 elapsed_time_seconds=$(expr $(date +%s) - $start)
2048
2049
2050 ####################################################################################################
2051
2052
2053 start=$(date +%s)
2054 STEPS_COUNTER=$((STEPS_COUNTER+1))
2055 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE FLAPPING (CONTAINERS THAT GOES UP AND DOWN RAPIDLY) DOCKER CONTAINERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2056 # note: this check was added after seeing, on one of the storage nodes, an OSD container that went up and down every 15 seconds without any in-between status such as restarting.
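### docker ps is sampled ~300 times at 0.1s intervals on every host and the container counts are
### de-duplicated; more than one distinct count means containers appeared/disappeared during the
### sampling window, i.e. a container is flapping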
2057 containers_changes=$(ansible all --limit '!hypervisor' -b -m shell -a "for i in {0..300}; do sleep 0.1 ; docker ps | wc -l; done > docker_containers_real_time_changes.log && cat docker_containers_real_time_changes.log | sort | uniq | wc -l" | grep ^[2-9] -B 1)
2058 if [[ $containers_changes ]]
2059 then
2060 echo -e "${RED}$containers_changes\n\nSSH to the failed server(s) and run the following to try and spot the elusive failed containers:\nwatch -d -n 0.1 \"sudo docker ps | grep -v -E 'Up [1-9] days|[1-9] hours'\"${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2061 else
2062 echo -e "${GREEN}no elusive (flapping) docker containers were found${NC}"
2063 fi
2064 elapsed_time_seconds=$(expr $(date +%s) - $start)
2065
2066
2067 ####################################################################################################
2068
2069
2070 start=$(date +%s)
2071 STEPS_COUNTER=$((STEPS_COUNTER+1))
2072 HIGH_CPU_COUNTER=0
2073 PERCENTAGE=100.0
2074 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CONSTANT $PERCENTAGE%+ CPU USAGE DOCKER CONTAINER (+$elapsed_time_seconds `date '+%T'`)${NC}"
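### containers above $PERCENTAGE% cpu are re-sampled every ~3 seconds; only containers that stay
### above the threshold for more than 5 consecutive samples are counted as failures, so short
### cpu bursts are tolerated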
2075 high_cpu_containers_ids=$(ansible all --limit '!hypervisor' -b -m shell -a "docker stats --no-stream | tr -d % | awk '{ if ( \$2 > $PERCENTAGE ) print \$0 }'" | grep ^[a-f,0-9] | awk '{print $1}' | paste -sd'|')
2076 if [[ $high_cpu_containers_ids ]]
2077 then
2078 while true
2079 do
2080 sleep 3
2081 high_cpu_containers_ids_recheck=$(ansible all --limit '!hypervisor' -b -m shell -a "docker stats --no-stream | tr -d % | awk '{ if ( \$2 > $PERCENTAGE ) print \$0 }' | grep -E '$high_cpu_containers_ids'" | grep ^[a-f,0-9] | awk '{print $1}' | paste -sd'|')
2082 if [[ $high_cpu_containers_ids_recheck ]]
2083 then
2084 HIGH_CPU_COUNTER=$((HIGH_CPU_COUNTER+1))
2085 high_cpu_containers_names=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps | grep -E '$high_cpu_containers_ids_recheck'" | grep ^[a-f,0-9] -B 1)
2086 echo -e "${ORANGE}$high_cpu_containers_names${NC}\n"
2087 else
2088 echo -e "${GREEN}couldn't find constant $PERCENTAGE%+ cpu usage docker containers${NC}"
2089 break
2090 fi
2091 if [ $HIGH_CPU_COUNTER -gt 5 ]
2092 then
2093 echo -e "${RED}$high_cpu_containers_names${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2094 break
2095 fi
2096 done
2097 else
2098 echo -e "${GREEN}couldn't find constant $PERCENTAGE%+ cpu usage docker containers${NC}"
2099 fi
2100 elapsed_time_seconds=$(expr $(date +%s) - $start)
2101
2102
2103 ####################################################################################################
2104
2105
2106 start=$(date +%s)
2107 STEPS_COUNTER=$((STEPS_COUNTER+1))
2108 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DOCKER CONTAINER WITH 90.0%+ MEMORY USAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2109 high_mem_containers_ids=$(ansible all --limit '!hypervisor' -b -m shell -a "docker stats --no-stream | tr -d % | awk '{ if ( \$8 > 90.0 ) print \$0 }'" | grep ^[a-f,0-9] | awk '{print $1}' | paste -sd'|')
2110 if [[ $high_mem_containers_ids ]]
2111 then
2112 high_mem_containers_names=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps | grep -E '$high_mem_containers_ids'" | grep ^[a-f,0-9] -B 1)
2113 echo -e "${RED}$high_mem_containers_names${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2114 else
2115 echo -e "${GREEN}didn't find docker containers with more than 90.0% memory usage${NC}"
2116 fi
2117 elapsed_time_seconds=$(expr $(date +%s) - $start)
2118
2119
2120 ####################################################################################################
2121
2122
2123 start=$(date +%s)
2124 STEPS_COUNTER=$((STEPS_COUNTER+1))
2125 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CONTAINERS WITH NON-DEFAULT MEMORY HARD-LIMIT ON THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2126 TOTAL_CONTAINERS_MEMORY=$(ansible $last_index_controller -b -m shell -a "docker info" | grep 'Total Memory:' | awk '{print $3}')
2127 CEPH_CONTAINERS=$(ansible $last_index_controller -b -m shell -a "docker ps" | grep -E 'ceph-mon|ceph-rgw|ceph-mds|ceph-mgr' | awk '{print $1}' | paste -sd'|')
2128 PROBLEMATIC_CONTAINERS=$(ansible $last_index_controller -b -m shell -a "docker stats --no-stream" | grep -v $TOTAL_CONTAINERS_MEMORY | grep -v -E $CEPH_CONTAINERS | grep ^[0-9a-f])
2129 PROBLEMATIC_CONTAINERS_ID=$(echo -e "$PROBLEMATIC_CONTAINERS" | grep ^[0-9a-f] | awk '{print $1}' | paste -sd'|')
2130 PROBLEMATIC_CONTAINERS_NAME=$(ansible $last_index_controller -b -m shell -a "docker ps | grep -E \"$PROBLEMATIC_CONTAINERS_ID\"" | grep ^[0-9a-f])
2131 if [[ $PROBLEMATIC_CONTAINERS ]]
2132 then
2133 echo -e "${RED}$PROBLEMATIC_CONTAINERS\n\n$PROBLEMATIC_CONTAINERS_NAME${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2134 echo -e "\n\n${ORANGE}CBIS-16351 (19A) - horizon container is limited to 2 gigabytes until a stack update is executed (fixed in SP4 PP4)${NC}"
2135 else
2136 echo -e "${GREEN}couldn't find containers with unexpected memory limit ("$TOTAL_CONTAINERS_MEMORY" GB)${NC}"
2137 fi
2138 elapsed_time_seconds=$(expr $(date +%s) - $start)
2139
2140
2141 ####################################################################################################
2142
2143
2144 start=$(date +%s)
2145 STEPS_COUNTER=$((STEPS_COUNTER+1))
2146 echo -e "${BLUE}\n\n$STEPS_COUNTER) OOM-KILLER CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2147 oom_killer=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -i 'Killed process|Out of memory' /var/log/dmesg /var/log/messages* | grep -v 'Invoked with warn'" | grep -E -v 'FAILED|non-zero return code')
2148 if [[ -z $oom_killer ]]
2149 then
2150 echo -e "${GREEN}no out-of-memory (OOM killer) log lines were found in /var/log/messages or /var/log/dmesg on any of the hosts${NC}"
2151 else
2152 echo -e "${RED}$oom_killer${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2153 fi
2154 elapsed_time_seconds=$(expr $(date +%s) - $start)
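### a quick manual equivalent on a single host (dmesg -T prints human-readable timestamps), assuming the
### kernel ring buffer still holds the event:
#   sudo dmesg -T | grep -i -E 'killed process|out of memory'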
2155
2156
2157 ####################################################################################################
2158
2159
2160 start=$(date +%s)
2161 STEPS_COUNTER=$((STEPS_COUNTER+1))
2162 echo -e "${BLUE}\n\n$STEPS_COUNTER) RABBITMQ CLUSTER HEALTH CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2163 if [[ $cbis_version == "18.0.0.1" ]]
2164 then
2165 rabbitmqctl=$(ansible controller -b -m shell -a "rabbitmqctl node_health_check" | grep -c 'Health check passed')
2166 if [[ $rabbitmqctl == "3" ]]
2167 then
2168 echo -e "${GREEN}rabbitmq node health check passed${NC}"
2169 else
2170 rabbitmqctl=$(ansible controller -b -m shell -a "rabbitmqctl node_health_check")
2171 echo -e "${RED}$rabbitmqctl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2172 fi
2173 elif [[ $cbis_version != "18.0.0.1" ]]
2174 then
2175 rabbitmqctl=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl node_health_check" | grep -c 'Health check passed')
2176 if [[ $rabbitmqctl == "3" ]]
2177 then
2178 echo -e "${GREEN}rabbitmq node health check passed${NC}"
2179 else
2180 rabbitmqctl=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl node_health_check")
2181 echo -e "${RED}$rabbitmqctl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2182 fi
2183 fi
2184 elapsed_time_seconds=$(expr $(date +%s) - $start)
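### to inspect rabbitmq manually on one controller, the same containerized rabbitmqctl used above can print the
### full cluster view (containerized deployments only):
#   sudo docker exec $(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl cluster_status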
2185
2186
2187 ####################################################################################################
2188
2189
2190 start=$(date +%s)
2191 STEPS_COUNTER=$((STEPS_COUNTER+1))
2192 echo -e "${BLUE}\n\n$STEPS_COUNTER) RABBITMQ CLUSTER STATUS CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2193 if [[ $cbis_version == "18.0.0.1" ]]
2194 then
2195 rabbitmqctl=$(ansible controller -b -m shell -a "rabbitmqctl cluster_status | grep running_nodes -A2 | wc -l" | grep ^[0-9] | sort -u)
2196 if [[ $rabbitmqctl == "3" ]]
2197 then
2198 echo -e "${GREEN}all 3 rabbitmq members are running${NC}"
2199 else
2200 rabbitmqctl=$(ansible controller -b -m shell -a "rabbitmqctl cluster_status")
2201 echo -e "${RED}$rabbitmqctl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2202 fi
2203 elif [[ $cbis_version != "18.0.0.1" ]]
2204 then
2205 rabbitmqctl=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl cluster_status | grep running_nodes -A2 | wc -l" | grep ^[0-9] | sort | uniq -c | column -t)
2206 if [[ $rabbitmqctl == "3 3" || $rabbitmqctl = "1 3" ]]
2207 then
2208
2209 echo -e "${GREEN}all rabbitmq member(s) are running${NC}"
2210 else
2211 rabbitmqctl=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl node_health_check")
2212 echo -e "${RED}$rabbitmqctl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2213 fi
2214 fi
2215 elapsed_time_seconds=$(expr $(date +%s) - $start)
2216
2217
2218 ####################################################################################################
2219
2220
2221 start=$(date +%s)
2222 STEPS_COUNTER=$((STEPS_COUNTER+1))
2223 echo -e "\n\n${BLUE}$STEPS_COUNTER) VALIDATE THAT ALL THE RABBITMQ CONFIGURED USERS HAVE A RUNNING CONNECTION AGAINST RABBITMQ (+$elapsed_time_seconds `date '+%T'`)${NC}"
2224bash <<"%EOF%"
2225 NC='\033[0m'
2226 RED='\033[0;31m'
2227 GREEN='\033[32m'
2228 rabbitmq_users_raw=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_users")
2229 rabbitmq_users_sorted=$(echo -e "$rabbitmq_users_raw" | grep -v -E 'keystone|glance' | awk '{print $1}' | grep -v -E 'Listing|overcloud-controller' | sort)
2230 rabbitmq_users=$(echo -e "$rabbitmq_users_sorted" | uniq)
2231 rabbitmq_users_per_controller=$(echo -e "$rabbitmq_users_sorted" | uniq -c | awk '{print $1}' | uniq)
2232 # keystone and glance are removed from the check as they always return a false negative. keystone and glance do not use AMQP since they are single-service projects with only an API, and therefore they don't need to interact with anything by themselves (approved by Smigielski, Radoslaw)
2233 rabbitmq_connections_by_user=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_connections" | awk '{print $1}' | grep -v -E 'Listing|overcloud-controller' | sort | uniq)
2234 if [[ $rabbitmq_users == $rabbitmq_connections_by_user ]]
2235 then
2236 echo -e "${GREEN}all the rabbitmq configured users have a running connection in the rabbitmq connections list${NC}"
2237 else
2238 rabbitmq_users_to_connections_comparison=$(comm -23 <(echo $rabbitmq_users | tr ' ' '\n' | sort) <(echo $rabbitmq_connections_by_user | tr ' ' '\n' | sort))
2239 echo -e "${RED}the following rabbitmq configured users do not appear in the rabbitmq running connections list:\n${NC}"
2240 echo -e "${RED}$rabbitmq_users_to_connections_comparison${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2241 fi
2242 if [[ $rabbitmq_users_per_controller != "3" ]]
2243 then
2244 echo -e "\n\n${RED}one or more controllers are missing one or more rabbitmq users:\n${NC}"
2245 echo -e "${RED}$rabbitmq_users_raw${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2246 fi
2247%EOF%
2248 elapsed_time_seconds=$(expr $(date +%s) - $start)
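### note on the comparison above: comm -23 prints lines that appear only in the first (sorted) input,
### i.e. configured users that have no matching connection. a tiny illustration with placeholder user names:
#   comm -23 <(printf 'cinder\nneutron\nnova\n') <(printf 'cinder\nnova\n')   # prints: neutron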
2249
2250
2251 ####################################################################################################
2252
2253
2254 start=$(date +%s)
2255 STEPS_COUNTER=$((STEPS_COUNTER+1))
2256 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK RABBITMQ USERS PERMISSIONS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2257 rabbitmq_permissions_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_permissions | awk -F\. '{print \$2,\$3,\$4,\$5}' | column -t" | grep -v -w -E '* * *|rc=0')
2258 if [[ $rabbitmq_permissions_check ]]
2259 then
2260 rabbitmq_permissions_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_permissions")
2261 echo -e "${RED}$rabbitmq_permissions_check${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2262 else
2263 echo -e "${GREEN}all the rabbitmq users have full access (.* .* .*)${NC}"
2264 fi
2265 elapsed_time_seconds=$(expr $(date +%s) - $start)
2266
2267
2268 ####################################################################################################
2269
2270
2271 start=$(date +%s)
2272 STEPS_COUNTER=$((STEPS_COUNTER+1))
2273 echo -e "${BLUE}\n\n$STEPS_COUNTER) GALERA CLUSTER CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2274 if [[ $cbis_version == "18.0.0.1" ]]
2275 then
2276 galera_cluster_check=$(ansible controller -b -m shell -a "clustercheck" | grep -c 'Galera cluster node is synced')
2277 if [[ $galera_cluster_check == "3" ]]
2278 then
2279 echo -e "${GREEN}galera cluster is synced${NC}"
2280 else
2281 galera_cluster_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=galera-bundle -q) clustercheck")
2282 echo -e "${RED}$galera_cluster_check${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2283 fi
2284 elif [[ $cbis_version != "18.0.0.1" ]]
2285 then
2286 galera_cluster_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=galera-bundle -q) clustercheck" | grep -c 'Galera cluster node is synced')
2287 if [[ $galera_cluster_check == "3" ]]
2288 then
2289 echo -e "${GREEN}galera cluster is synced${NC}"
2290 else
2291 galera_cluster_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=galera-bundle -q) clustercheck")
2292 echo -e "${RED}$galera_cluster_check${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2293 fi
2294 fi
2295 elapsed_time_seconds=$(expr $(date +%s) - $start)
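### for a manual look at the galera quorum (assumption: the mysql client inside the galera-bundle container
### with socket authentication; wsrep_cluster_size should equal the number of controllers):
#   sudo docker exec $(sudo docker ps -f name=galera-bundle -q) mysql -e "SHOW GLOBAL STATUS LIKE 'wsrep_cluster_size'"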
2296
2297
2298 ####################################################################################################
2299
2300
2301 start=$(date +%s)
2302 STEPS_COUNTER=$((STEPS_COUNTER+1))
2303 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT resume_guests_state_on_host_boot IS NOT SET TO false IN nova.conf (+$elapsed_time_seconds `date '+%T'`)${NC}"
2304 resume_vms_on_boot=$(ansible compute --limit '!localhost' -m shell -b -a "grep -E ^resume_guests_state_on_host_boot=[fF]alse /var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.conf" | grep -v -E 'non-zero return code|rc=[1-9]')
2305 if [[ $resume_vms_on_boot ]]
2306 then
2307 echo -e "${RED}$resume_vms_on_boot${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2308 else
2309 echo -e "${GREEN}resume_guests_state_on_host_boot is not set to false on any of the computes${NC}"
2310 fi
2311 elapsed_time_seconds=$(expr $(date +%s) - $start)
2312
2313
2314 ####################################################################################################
2315
2316
2317 start=$(date +%s)
2318 STEPS_COUNTER=$((STEPS_COUNTER+1))
2319 echo -e "${BLUE}\n\n$STEPS_COUNTER) NTP (ntpstat) SYNCHRONIZATION CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2320 ntpstat=$(ansible all --limit '!hypervisor' -b -m shell -a "ntpstat" | grep unsynchronised -B 1 | grep overcloud- | awk '{print $1}')
2321 if [[ $ntpstat ]]
2322 then
2323 echo -e "${RED}ntpstat returned unsynchronised for the following hosts:\n$ntpstat${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2324 else
2325 echo -e "${GREEN}all hosts are synchronized (ntpstat)${NC}"
2326 fi
2327 elapsed_time_seconds=$(expr $(date +%s) - $start)
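### ntpq gives more detail than ntpstat when a host reports unsynchronised (a '*' in the first column marks
### the peer the host is actually synced to):
#   ntpq -pn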
2328
2329
2330 ####################################################################################################
2331
2332
2333 start=$(date +%s)
2334 STEPS_COUNTER=$((STEPS_COUNTER+1))
2335 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT THE OVERCLOUD GLANCE IMAGES ARE IN active STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2336 overcloud_images=$(source ~/overcloudrc && openstack image list -f value | grep -v active | column -t)
2337 if [[ $overcloud_images ]]
2338 then
2339 echo -e "${RED}$overcloud_images${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2340 else
2341 overcloud_images=$(source ~/overcloudrc && openstack image list -f value | wc -l)
2342 if [[ $overcloud_images == "0" ]]
2343 then
2344 echo -e "${GREEN}no images found${NC}"
2345 else
2346 echo -e "${GREEN}all images are in active state${NC}"
2347 fi
2348 fi
2349 elapsed_time_seconds=$(expr $(date +%s) - $start)
2350
2351
2352 ####################################################################################################
2353
2354
2355 start=$(date +%s)
2356 STEPS_COUNTER=$((STEPS_COUNTER+1))
2357 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT THE OVERCLOUD CINDER VOLUMES ARE IN available/in-use STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2358 overcloud_volumes=$(source ~/overcloudrc && openstack volume list -f value | grep -E -v 'available|in-use' | column -t)
2359 if [[ $overcloud_volumes ]]
2360 then
2361 echo -e "${RED}$overcloud_volumes${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2362 else
2363 overcloud_volumes=$(source ~/overcloudrc && openstack volume list -f value | wc -l)
2364 if [[ $overcloud_volumes == "0" ]]
2365 then
2366 echo -e "${GREEN}no volumes found${NC}"
2367 else
2368 echo -e "${GREEN}all volumes are in available/in-use state${NC}"
2369 fi
2370 fi
2371 elapsed_time_seconds=$(expr $(date +%s) - $start)
2372
2373
2374 ####################################################################################################
2375
2376
2377 start=$(date +%s)
2378 STEPS_COUNTER=$((STEPS_COUNTER+1))
2379 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK VOLUME SERVICES CHECK ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2380 volume_services=$(source ~/overcloudrc && openstack volume service list --long -f value | grep -E -i ' XXX | DOWN ' | column -t)
2381 if [[ -z $volume_services ]]
2382 then
2383 echo -e "${GREEN}all volume services are enabled and up${NC}"
2384 else
2385 echo -e "${RED}$volume_services${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2386 echo -e "\n\n${ORANGE}CBIS-14283 (19A) - cinder services for host tripleo_ceph_volumes-fast is down (19.100.1)${NC}"
2387 echo -e "${ORANGE}CBIS-9424 (19A) - removed controllers still showing under openstack volume service list (19.100.1)${NC}"
2388 fi
2389 elapsed_time_seconds=$(expr $(date +%s) - $start)
2390
2391
2392 ####################################################################################################
2393
2394
2395 start=$(date +%s)
2396 STEPS_COUNTER=$((STEPS_COUNTER+1))
2397 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK NETWORK AGENTS LIST ON THE UNDERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2398 network_agents_undercloud=$(source ~/stackrc && openstack network agent list -f value | grep -E -i 'XXX|DOWN')
2399 if [[ -z $network_agents_undercloud ]]
2400 then
2401 echo -e "${GREEN}all network agents are alive and up${NC}"
2402 else
2403 echo -e "${RED}$network_agents_undercloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2404 echo -e "\n\n${ORANGE}CBIS-14294 - after upgrading from 19.x to 19A, there are old/unused/down undercloud services while the new services running on undercloud.localdomain${NC}"
2405 echo -e "${ORANGE}CBIS-16114,CBIS-15655,CBIS-13670 - after replace controller the network agent of the old controller still shows (19.0, 19A, 20)${NC}"
2406 fi
2407 elapsed_time_seconds=$(expr $(date +%s) - $start)
2408
2409
2410 ####################################################################################################
2411
2412
2413 start=$(date +%s)
2414 STEPS_COUNTER=$((STEPS_COUNTER+1))
2415 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK NETWORK AGENTS LIST ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2416 network_agents_overcloud=$(source ~/overcloudrc && openstack network agent list -f value | grep -E -i 'XXX|DOWN')
2417 if [[ -z $network_agents_overcloud ]]
2418 then
2419 echo -e "${GREEN}all network agents are alive and up${NC}"
2420 else
2421 echo -e "${RED}$network_agents_overcloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2422 fi
2423 elapsed_time_seconds=$(expr $(date +%s) - $start)
2424
2425
2426 ####################################################################################################
2427
2428
2429 start=$(date +%s)
2430 STEPS_COUNTER=$((STEPS_COUNTER+1))
2431 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK COMPUTE SERVICES ON THE UNDERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2432 compute_services_undercloud=$(source ~/stackrc && openstack compute service list -f value | grep -E -i 'XXX|DOWN|disabled')
2433 if [[ -z $compute_services_undercloud ]]
2434 then
2435 echo -e "${GREEN}all compute services are enabled and up${NC}"
2436 else
2437 echo -e "${RED}$compute_services_undercloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2438 fi
2439 elapsed_time_seconds=$(expr $(date +%s) - $start)
2440
2441
2442 ####################################################################################################
2443
2444
2445 start=$(date +%s)
2446 STEPS_COUNTER=$((STEPS_COUNTER+1))
2447 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK COMPUTE SERVICES ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2448 compute_services_overcloud=$(source ~/overcloudrc && openstack compute service list -f value | grep -E -i 'XXX|DOWN|disabled')
2449 if [[ -z $compute_services_overcloud ]]
2450 then
2451 echo -e "${GREEN}all compute services are enabled and up${NC}"
2452 else
2453 echo -e "${RED}$compute_services_overcloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2454 fi
2455 elapsed_time_seconds=$(expr $(date +%s) - $start)
2456
2457
2458 ####################################################################################################
2459
2460 start=$(date +%s)
2461 STEPS_COUNTER=$((STEPS_COUNTER+1))
2462 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK IF ANY AVAILABILITY-ZONE IS NOT IN available STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2463 AVAILABILITY_ZONES=$(source ~/overcloudrc && openstack availability zone list --long -f value)
2464 AVAILABILITY_ZONES_FAILURES=$(echo -e "$AVAILABILITY_ZONES" | grep -v available)
2465
2466 if [[ $AVAILABILITY_ZONES_FAILURES ]]
2467 then
2468 echo -e "${RED}$AVAILABILITY_ZONES_FAILURES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2469 else
2470 echo -e "${GREEN}all the availability zones are available${NC}"
2471 fi
2472 elapsed_time_seconds=$(expr $(date +%s) - $start)
2473
2474
2475 ####################################################################################################
2476
2477
2478 start=$(date +%s)
2479 STEPS_COUNTER=$((STEPS_COUNTER+1))
2480 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE LEFTOVER CONTROLLERS IN OPENSTACK ORCHESTRATION SERVICES (heat) ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2481 leftover_controllers=$(source ~/overcloudrc && openstack orchestration service list -c Hostname -c Binary -c Topic -c Status -f value | grep -w -E -v "$current_controllers_piped" | sort | uniq -c | column -t)
2482 if [[ $leftover_controllers ]]
2483 then
2484 echo -e "${RED}$leftover_controllers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2485 echo -e "\n\n${ORANGE}CBIS-16372 (19A) - leftover controllers in heat services list${NC}"
2486 else
2487 echo -e "${GREEN}no leftover controllers found${NC}"
2488 fi
2489 elapsed_time_seconds=$(expr $(date +%s) - $start)
2490
2491
2492 ####################################################################################################
2493
2494
2495 start=$(date +%s)
2496 STEPS_COUNTER=$((STEPS_COUNTER+1))
2497 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK ORCHESTRATION SERVICES (heat) ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2498 non_up_heat_engines=$(source ~/overcloudrc && openstack orchestration service list -c Hostname -c Binary -c Topic -c Status -f value | grep -w -E "$current_controllers_piped" | grep -v 'engine up' | sort | uniq -c)
2499 if [[ $non_up_heat_engines ]]
2500 then
2501 echo -e "${RED}$non_up_heat_engines${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2502 echo -e "\n\n${ORANGE}CBIS-16373 (19A) / CBIS-16374 (20) - several heat engine services shows down${NC}"
2503 else
2504 echo -e "${GREEN}no heat engines in down status found${NC}"
2505 fi
2506 elapsed_time_seconds=$(expr $(date +%s) - $start)
2507
2508
2509 ####################################################################################################
2510
2511
2512 start=$(date +%s)
2513 STEPS_COUNTER=$((STEPS_COUNTER+1))
2514 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK ORCHESTRATION SERVICES (heat) ON THE UNDERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2515 non_up_heat_engines=$(source ~/stackrc && openstack orchestration service list -c Hostname -c Binary -c Topic -c Status -f value | grep -v 'engine up' | sort | uniq -c)
2516 if [[ $non_up_heat_engines ]]
2517 then
2518 echo -e "${RED}$non_up_heat_engines${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2519 else
2520 echo -e "${GREEN}no heat engines in down status found${NC}"
2521 fi
2522 elapsed_time_seconds=$(expr $(date +%s) - $start)
2523
2524
2525 ####################################################################################################
2526
2527
2528 start=$(date +%s)
2529 STEPS_COUNTER=$((STEPS_COUNTER+1))
2530 echo -e "${BLUE}\n\n$STEPS_COUNTER) NEUTRON PORTS STATUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2531 ports_status=$(source ~/overcloudrc && openstack port list -c ID -c Status -f value | grep -v ACTIVE)
2532 if [[ $ports_status ]]
2533 then
2534 echo -e "${RED}$ports_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2535 else
2536 echo -e "${GREEN}the status of all the ports returned active${NC}"
2537 fi
2538 elapsed_time_seconds=$(expr $(date +%s) - $start)
2539
2540
2541
2542 ####################################################################################################
2543
2544
2545 start=$(date +%s)
2546 STEPS_COUNTER=$((STEPS_COUNTER+1))
2547 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DUPLICATED NEUTRON PORTS IN MYSQL (+$elapsed_time_seconds `date '+%T'`)${NC}"
2548 duplicated_neutron_ports=$(ansible controller -b -m shell -a "mysql -e \"select port_id from ovs_neutron.ml2_port_bindings\"" | sort | uniq -c | column -t | grep ^[4-9] | awk '{print $NF}')
2549 if [[ $duplicated_neutron_ports ]]
2550 then
2551 echo -e "${RED}the following neutron ports have duplicated entries in the ml2_port_bindings table under the ovs_neutron database in mysql:\n\n$duplicated_neutron_ports${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2552 echo -e "\n\n${ORANGE}CBIS-15400 (19A) - duplicate inactive port is created in mysql after live-migrate failure${NC}"
2553 else
2554 echo -e "${GREEN}no duplicated ports found in the ml2_port_bindings table under the ovs_neutron database in mysql${NC}"
2555 fi
2556 elapsed_time_seconds=$(expr $(date +%s) - $start)
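### the same duplication can be confirmed directly in mysql on any controller; this groups the bindings
### table by port (a sketch, assuming socket access to the local mysql instance):
#   mysql -e "SELECT port_id, COUNT(*) AS bindings FROM ovs_neutron.ml2_port_bindings GROUP BY port_id HAVING COUNT(*) > 1;"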
2557
2558
2559 ####################################################################################################
2560
2561
2562 start=$(date +%s)
2563 STEPS_COUNTER=$((STEPS_COUNTER+1))
2564 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK IF ANY COMPUTE IS MARKED AS FORCED DOWN (+$elapsed_time_seconds `date '+%T'`)${NC}"
2565 force_down=$(nova service-list | awk '{print $18}' | tr -d '\|' | column -t | sort | uniq)
2566 if [[ $force_down == "False" ]]
2567 then
2568 echo -e "${GREEN}none of the computes are marked as nova force down${NC}"
2569 else
2570 force_down=$(nova service-list)
2571 echo -e "${RED}$force_down${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2572 echo -e "\n\n${ORANGE}a compute marked as forced down = True is normally caused by the auto-evacuate process and requires human intervention to set it back to False${NC}"
2573 fi
2574 elapsed_time_seconds=$(expr $(date +%s) - $start)
2575
2576
2577 ####################################################################################################
2578
2579
2580 start=$(date +%s)
2581 STEPS_COUNTER=$((STEPS_COUNTER+1))
2582 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT /elk IS MOUNTED (+$elapsed_time_seconds `date '+%T'`)${NC}"
2583 if [[ $elk == "true" && $elk_deployment_type == "local" ]]
2584 then
2585 elk_disk=$(cat user_config.yaml | grep elk_disk | awk '{print $2}' | grep -v sda)
2586 if [[ $elk_disk ]]
2587 then
2588 elk_partition_validation=$(ansible controller -b -m shell -a "df -h | grep '/elk'")
2589 elk_results=$(echo -e "$elk_partition_validation" | grep $elk_disk | grep -c '/elk')
2590 if [[ $elk_results != "3" ]]
2591 then
2592 echo -e "${RED}the /elk partition is not found under any $elk_disk partition on one or more controllers:\n\n$elk_partition_validation${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2593 else
2594 echo -e "${GREEN}/elk partition is found on all the controllers on the expected disk $elk_disk${NC}"
2595 fi
2596 else
2597 echo -e "${GREEN}/elk is deployed on sda and therefore uses the root partition${NC}"
2598 fi
2599 else
2600 echo -e "${ORANGE}CBIS is deployed without ELK or ELK type is remote${NC}"
2601 fi
2602 elapsed_time_seconds=$(expr $(date +%s) - $start)
2603
2604
2605 ####################################################################################################
2606
2607
2608 start=$(date +%s)
2609 STEPS_COUNTER=$((STEPS_COUNTER+1))
2610 RETENTION=$(cat user_config.yaml | grep elk_keep_data | awk '{print $NF}')
2611 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THERE ARE NO ELK INDICES OLDER THAN $RETENTION DAYS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2612 RESIDUE_INDICES=$(ansible controller -m shell -b -a "find /elk/esdata/nodes/0/indices/ -maxdepth 1 -type d -daystart -mtime +$RETENTION" | grep ^/elk/esdata/nodes/0/indices/ -B 1)
2613 if [[ $RESIDUE_INDICES ]]
2614 then
2615 echo -e "${RED}$RESIDUE_INDICES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2616 else
2617 echo -e "${GREEN}there are no ELK indices under /elk (of the controllers) older than $RETENTION days${NC}"
2618 fi
2619 elapsed_time_seconds=$(expr $(date +%s) - $start)
2620
2621
2622 ####################################################################################################
2623
2624
2625 start=$(date +%s)
2626 STEPS_COUNTER=$((STEPS_COUNTER+1))
2627 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE FAULTY DISKS ATTRIBUTES USING SMARTCTL (+$elapsed_time_seconds `date '+%T'`)${NC}"
2628 disks_attributes=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep -E -w 'Erase_Fail_Count|CRC_Error_Count|Uncorrectable_Error_Cnt' | awk '{ if ( \$NF > 0 ) print \$0 }'" | grep ^[0-9] -B 1)
2629 if [[ $disks_attributes ]]
2630 then
2631 echo -e "${RED}$disks_attributes${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2632 echo -e "\n\n${ORANGE}Erase_Fail_Count = the number of times an erase operation on the flash memory failed\nUncorrectable_Error_Cnt = the number of times a data transfer within the drive had an error that could not be corrected by the ECC (error checking) algorithm\nCRC_Error_Count = the number of errors detected by the cyclic redundancy check (CRC), an error-detecting code commonly used in digital networks and storage devices to detect accidental changes to raw data${NC}"
2633
2634
2635 else
2636 echo -e "${GREEN}all tested disks attributes returned 0${NC}"
2637 fi
2638 elapsed_time_seconds=$(expr $(date +%s) - $start)
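### to re-check a single disk by hand (the device name is a placeholder; -A prints the vendor attribute table):
#   sudo smartctl -A /dev/sda | grep -E -w 'Erase_Fail_Count|CRC_Error_Count|Uncorrectable_Error_Cnt'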
2639
2640
2641 ####################################################################################################
2642
2643
2644 start=$(date +%s)
2645 STEPS_COUNTER=$((STEPS_COUNTER+1))
2646 echo -e "${BLUE}\n\n$STEPS_COUNTER) PERFORM SMARTCTL HEALTH CHECK ON THE DISKS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2647 disk_health_test=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -H /dev/{} | grep -E 'SMART overall-health self-assessment test result|SMART Health Status'")
2648 disk_health_test_filtered=$(echo -e "$disk_health_test" | grep -E -v 'OK|PASSED|rc=|non-zero return code')
2649 if [[ $disk_health_test_filtered ]]
2650 then
2651 echo -e "${RED}$disk_health_test${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2652 else
2653 echo -e "${GREEN}all tested disks health check passed${NC}"
2654 fi
2655 elapsed_time_seconds=$(expr $(date +%s) - $start)
2656
2657
2658 ####################################################################################################
2659
2660
2661 start=$(date +%s)
2662 STEPS_COUNTER=$((STEPS_COUNTER+1))
2663 echo -e "${BLUE}\n\n$STEPS_COUNTER) PERFORM SMARTCTL SHORT OFFLINE TEST ON ALL THE DISKS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2664 UNTESTED_HOSTS=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep 'Short offline'" | grep -E 'FAILED|rc=[1-9]' | awk '{print $1}' | paste -sd',')
2665 if [[ $UNTESTED_HOSTS ]]
2666 then
2667 sshpass -p $hv_cbis_admin_password ansible -k $UNTESTED_HOSTS -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -t short /dev/{}" > /dev/null
2668 IN_PROGRESS_CHECK=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep 'Short offline' | head -n 1" | grep 'in progress' -B 1)
2669 while [[ $IN_PROGRESS_CHECK ]]
2670 do
2671 sleep 10
2672 IN_PROGRESS_CHECK=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep 'Short offline' | head -n 1" | grep 'in progress' -B 1)
2673 done
2674 FAILURE_CHECK=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep 'Short offline' | head -n 1 | grep -v -E 'Completed without error\s+00%'" | grep ^\# -B 1)
2675 if [[ $FAILURE_CHECK ]]
2676 then
2677 echo -e "${RED}$FAILURE_CHECK${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2678 else
2679 echo -e "${GREEN}the latest smartctl self test on all the physical disks returned: 'Completed without error'${NC}"
2680 fi
2681 else
2682 echo -e "${GREEN}the latest smartctl self test on all the physical disks returned: 'Completed without error'${NC}"
2683 fi
2684 elapsed_time_seconds=$(expr $(date +%s) - $start)
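### to run and review the short self-test on one disk manually (device name is a placeholder; the short
### test normally completes within a couple of minutes):
#   sudo smartctl -t short /dev/sda
#   sudo smartctl -l selftest /dev/sda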
2685
2686
2687 ####################################################################################################
2688
2689
2690 start=$(date +%s)
2691 STEPS_COUNTER=$((STEPS_COUNTER+1))
2692 SIZE=25
2693 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE PROCESSES WITH $SIZE%+ MEMORY CONSUMPTION (ps) (+$elapsed_time_seconds `date '+%T'`)${NC}"
2694 processes_memory_consumption=$(ansible all --limit '!hypervisor' -b -m shell -a "ps -eo %mem,comm,%cpu,pid --sort=-%mem | head -n 2 | column -t | grep ^[0-9] | awk '{ if ( \$1 > $SIZE ) print \$0 }'" | grep ^[0-9] -B 1)
2695 if [[ -z $processes_memory_consumption ]]
2696 then
2697 echo -e "${GREEN}no process with $SIZE%+ memory consumption was found${NC}"
2698 else
2699 echo -e "${RED}$processes_memory_consumption${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2700 fi
2701 elapsed_time_seconds=$(expr $(date +%s) - $start)
2702
2703
2704 ####################################################################################################
2705
2706
2707 start=$(date +%s)
2708 STEPS_COUNTER=$((STEPS_COUNTER+1))
2709 SIZE=90
2710 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE PROCESSES WITH $SIZE%+ CPU CONSUMPTION (top) (+$elapsed_time_seconds `date '+%T'`)${NC}"
2711 processes_cpu_consumption=$(ansible all --limit '!hypervisor' -b -m shell -a "top -b -n 1 -o %CPU | head -n 10 | grep -E '^[0-9]|^\s+[0-9]' | awk '{ if ( \$9 > $SIZE ) print \$1,\$2,\$9,\$12 }' | grep -E -v 'top|fp-rte|ovs-vswi'" | grep ^[0-9] -B 1)
2712 if [[ -z $processes_cpu_consumption ]]
2713 then
2714 echo -e "${GREEN}no process with $SIZE%+ cpu consumption found${NC}"
2715 else
2716 echo -e "${RED}$processes_cpu_consumption${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2717 fi
2718 elapsed_time_seconds=$(expr $(date +%s) - $start)
2719
2720
2721 ####################################################################################################
2722
2723
2724 start=$(date +%s)
2725 STEPS_COUNTER=$((STEPS_COUNTER+1))
2726 load_average_treshold="30.0"
2727 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE HOSTS LOAD AVERAGE AND REPORT FAILURE IF ABOVE $load_average_treshold (+$elapsed_time_seconds `date '+%T'`)${NC}"
2728 load_average=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /proc/loadavg | awk '{ if ( \$1 > $load_average_treshold ) print \$1 }'" | grep ^[0-9] -B 1)
2729 if [[ $load_average ]]
2730 then
2731 echo -e "${RED}$load_average${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2732 else
2733 echo -e "${GREEN}no host has load average greater than $load_average_treshold${NC}"
2734 fi
2735 elapsed_time_seconds=$(expr $(date +%s) - $start)
2736
2737
2738 ####################################################################################################
2739
2740
2741 start=$(date +%s)
2742 STEPS_COUNTER=$((STEPS_COUNTER+1))
2743 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK EACH HOST SWAP MEMORY USAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2744 swap_memory=$(ansible all --limit '!hypervisor' -b -m shell -a "free | grep Swap: | awk '{ if ( \$3 > 0 ) print \$3 }'" | grep ^[0-9])
2745 if [[ $swap_memory ]]
2746 then
2747 swap_memory=$(ansible all --limit '!hypervisor' -b -m shell -a "free | grep Swap: | awk '{ if ( \$3 > 0 ) print \$3 }'" | grep ^[0-9] -B 1)
2748 echo -e "${RED}$swap_memory${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2749 else
2750 echo -e "${GREEN}none of the servers are using swap memory${NC}"
2751 fi
2752 elapsed_time_seconds=$(expr $(date +%s) - $start)
2753
2754
2755 ####################################################################################################
2756
2757
2758 start=$(date +%s)
2759 STEPS_COUNTER=$((STEPS_COUNTER+1))
2760 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT EACH SRIOV HOST HAS THE EXPECTED NUMBER OF VIRTUAL FUNCTIONS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2761 if [[ $ansible_sriov_hosts ]]
2762 then
2763 expected_vfs_num=$(grep -w SriovPerformanceComputeExtraConfig /home/stack/templates/sriov-info.yaml -A 6 | grep tripleo::host::sriov::number_of_vfs: | awk '{OFS=RS;$1=$1}1' | grep -E [0-9]+ |awk -F: '{print $2}' | tr -d "\'\]\," | paste -sd+ | bc)
2764 for host in $ansible_sriov_hosts
2765 do
2766 current_vfs_num=$(ansible $host -b -m shell -a "ip link show | grep -c 'vf '" | grep ^[0-9])
2767 if [[ $expected_vfs_num == $current_vfs_num ]]
2768 then
2769 echo -e "${GREEN}expected vfs number on $host ($expected_vfs_num), current vfs number on $host ($current_vfs_num)${NC}"
2770 else
2771 echo -e "${RED}expected vfs number on $host ($expected_vfs_num), current vfs number on $host ($current_vfs_num)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2772 fi
2773 done
2774 else
2775 echo -e "${ORANGE}no sriov computes were found${NC}"
2776 fi
2777 elapsed_time_seconds=$(expr $(date +%s) - $start)
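### per-interface follow-up on an sriov compute (the interface name is a placeholder): the configured VF count
### is exposed in sysfs and should match what 'ip link show' reports for that PF:
#   cat /sys/class/net/<interface>/device/sriov_numvfs
#   ip link show <interface> | grep -c 'vf '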
2778
2779
2780 ####################################################################################################
2781
2782
2783 start=$(date +%s)
2784 STEPS_COUNTER=$((STEPS_COUNTER+1))
2785 echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY THAT THE NEGOTIATED DUPLEX VALUES OF THE PHYSICAL INTERFACES ARE AS EXPECTED (+$elapsed_time_seconds `date '+%T'`)${NC}"
2786 duplex=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {}" | grep Duplex: | awk '{print $2}' | uniq)
2787 if [[ $duplex != "Full" ]]
2788 then
2789 duplex=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {} | grep -E 'Settings for |Duplex:'")
2790 echo -e "${RED}$duplex${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2791 else
2792 echo -e "${GREEN}all physical ports negotiated duplex set to full${NC}"
2793 fi
2794 elapsed_time_seconds=$(expr $(date +%s) - $start)
2795
2796
2797 ####################################################################################################
2798
2799
2800 start=$(date +%s)
2801 STEPS_COUNTER=$((STEPS_COUNTER+1))
2802 echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY THAT ALL THE PHYSICAL INTERFACES ARE UP (+$elapsed_time_seconds `date '+%T'`)${NC}"
2803 physical_interfaces=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | wc -l" | grep ^[0-9] | sort | md5sum | awk '{print $1}')
2804 up_physical_interfaces=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'state UP mode' -c" | grep ^[0-9] | sort | md5sum | awk '{print $1}')
2805 if [[ $physical_interfaces != $up_physical_interfaces ]]
2806 then
2807 down_interfaces=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep DOWN")
2808 echo -e "${RED}$down_interfaces${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2809 else
2810 echo -e "${GREEN}no physical interfaces are down${NC}"
2811 fi
2812 elapsed_time_seconds=$(expr $(date +%s) - $start)
2813
2814
2815 ####################################################################################################
2816
2817
2818 start=$(date +%s)
2819 STEPS_COUNTER=$((STEPS_COUNTER+1))
2820 echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY THAT THE NEGOTIATED LINK SPEED VALUES OF THE PHYSICAL INTERFACES ARE AS EXPECTED (+$elapsed_time_seconds `date '+%T'`)${NC}"
2821 link_speed=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {}" | grep Speed: | awk '{print $2}' | uniq)
2822 if [[ $fixed_platform == "airframe" ]]
2823 then
2824 if [[ $link_speed != "10000Mb/s" ]]
2825 then
2826 link_speed=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {} | grep -E 'Settings for |Speed:'")
2827 echo -e "${RED}$link_speed${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2828 else
2829 echo -e "${GREEN}all physical ports negotiated speed set to 10000Mb/s${NC}"
2830 fi
2831 else
2832 if [[ $link_speed != "25000Mb/s" ]]
2833 then
2834 link_speed=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {} | grep -E 'Settings for |Speed:'")
2835 echo -e "${RED}$link_speed${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2836 else
2837 echo -e "${GREEN}all physical ports negotiated speed set to 25000Mb/s${NC}"
2838 fi
2839 fi
2840 elapsed_time_seconds=$(expr $(date +%s) - $start)
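### to inspect a single NIC by hand (applies to both the duplex and the speed checks above; the interface
### name is a placeholder):
#   sudo ethtool <interface> | grep -E 'Speed:|Duplex:'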
2841
2842
2843 ####################################################################################################
2844
2845
2846 start=$(date +%s)
2847 STEPS_COUNTER=$((STEPS_COUNTER+1))
2848 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE warning LOG LINES (case-insensitive) WITHIN /var/log/ FROM THE CURRENT 10-MINUTE WINDOW (+$elapsed_time_seconds `date '+%T'`)${NC}"
2849 hour1=$(date -d "-0 hour" +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
2850 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
2851 warning=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -i warning | grep -E -v 'ansible-command: Invoked with warn|ansible.log|filebeat|DeprecationWarning|level=warning|deprecated' | awk -F: '{print \$1}' | sort | uniq -c | column -t" | grep ^[1-9] -B 1)
2852 if [[ $warning ]]
2853 then
2854 echo -e "${ORANGE}$warning${NC}"
2855 else
2856 echo -e "${GREEN}no warning log lines were found under /var/log/${NC}"
2857 fi
2858 elapsed_time_seconds=$(expr $(date +%s) - $start)
2859
2860
2861 ####################################################################################################
2862
2863
2864 start=$(date +%s)
2865 STEPS_COUNTER=$((STEPS_COUNTER+1))
2866 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE CBIS MANAGER PAGES RETURN HTTP 200 OK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2867 echo -e "${CYAN}get zabbix, ceph, kibana and horizon components status${NC}"
2868 if [[ $ceph_backend == "false" && $elk == "false" ]]
2869 then
2870 installed_status=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/components/getComponents' -H 'Authorization: Basic '$cbis_manager_token'' | jq .[][] | jq 'select(.display != "Ceph")' | jq 'select(.display != "Kibana")' | jq 'select(.status == "notInstalled")')
2871 elif [[ $ceph_backend == "false" && $elk == "true" ]]
2872 then
2873 installed_status=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/components/getComponents' -H 'Authorization: Basic '$cbis_manager_token'' | jq .[][] | jq 'select(.display != "Ceph")' | jq 'select(.status == "notInstalled")')
2874 elif [[ $ceph_backend == "true" && $elk == "false" ]]
2875 then
2876 installed_status=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/components/getComponents' -H 'Authorization: Basic '$cbis_manager_token'' | jq .[][] | jq 'select(.display != "Kibana")' | jq 'select(.status == "notInstalled")')
elif [[ $ceph_backend == "true" && $elk == "true" ]]
then
# cover the case where both ceph and elk are deployed, so the component status is still validated
installed_status=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/components/getComponents' -H 'Authorization: Basic '$cbis_manager_token'' | jq .[][] | jq 'select(.status == "notInstalled")')
2877 fi
2878 if [[ $installed_status ]]
2879 then
2880 echo -e "${RED}$installed_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2881 else
2882 echo -e "${GREEN}all external components are installed${NC}"
2883 fi
2884 echo -e "${CYAN}access plugins page${NC}"
2885 http_status=$(curl -g -s -L -k -w 'RESP_CODE:%{response_code}' -X GET 'https://'$HypervisorURL'/api/plugins' -H 'Authorization: Basic '$cbis_manager_token'' | grep RESP_CODE | awk -F: '{print $2}')
2886 if [[ $http_status != "200" ]]
2887 then
2888 echo -e "${RED}expected http code 200 and got $http_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2889 else
2890 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2891 fi
2892 echo -e "${CYAN}access installation page${NC}"
2893 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/installation/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"hardware":'$platform'}' | grep RESP_CODE | awk -F: '{print $2}')
2894 if [[ $http_status != "200" ]]
2895 then
2896 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2897 else
2898 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2899 fi
2900 echo -e "${CYAN}access custom templates page${NC}"
2901 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/add_host_group/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"hardware":'$platform'}' | grep RESP_CODE | awk -F: '{print $2}')
2902 if [[ $http_status != "200" ]]
2903 then
2904 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2905 else
2906 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2907 fi
2908 if [[ $cbis_version != "19.0.0.1" ]]
2909 then
2910 echo -e "${CYAN}access upgrade (upgrade) page${NC}"
2911 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/cbis_upgrade/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_upgrade":"Upgrade"}' | grep RESP_CODE | awk -F: '{print $2}')
2912 if [[ $http_status != "200" ]]
2913 then
2914 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2915 else
2916 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2917 fi
2918 echo -e "${CYAN}access upgrade (resume) page${NC}"
2919 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/cbis_upgrade/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_upgrade":"Resume"}' | grep RESP_CODE | awk -F: '{print $2}')
2920 if [[ $http_status != "200" ]]
2921 then
2922 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2923 else
2924 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2925 fi
2926 echo -e "${CYAN}access upgrade (rollback) page${NC}"
2927 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/cbis_upgrade/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_upgrade":"Rollback"}' | grep RESP_CODE | awk -F: '{print $2}')
2928 if [[ $http_status != "200" ]]
2929 then
2930 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2931 else
2932 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2933 fi
2934 fi
2935 if [[ $cbis_version != "19.0.0.1" ]]
2936 then
2937 echo -e "${CYAN}access novl page${NC}"
2938 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/novl/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"test":"new test"}' | grep RESP_CODE | awk -F: '{print $2}')
2939 if [[ $http_status != "200" ]]
2940 then
2941 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2942 else
2943 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2944 fi
2945 fi
2946 echo -e "${CYAN}access scale-out page${NC}"
2947 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
2948 then
2949 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/add_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"current_hardware":'$platform',"hardware":'$platform'}' | grep RESP_CODE | awk -F: '{print $2}')
2950 else
2951 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/add_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"current_hardware":'$platform',"new_hardware":'$platform'}' | grep RESP_CODE | awk -F: '{print $2}')
2952 fi
2953 if [[ $http_status != "200" ]]
2954 then
2955 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2956 else
2957 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2958 fi
2959 echo -e "${CYAN}access scale-in page${NC}"
2960 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/remove_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
2961 if [[ $http_status != "200" ]]
2962 then
2963 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2964 else
2965 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2966 fi
2967 if [[ $cbis_version != "19.0.0.1" ]]
2968 then
2969 echo -e "${CYAN}access reboot servers page${NC}"
2970 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/reboot_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
2971 if [[ $http_status != "200" ]]
2972 then
2973 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2974 else
2975 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2976 fi
2977 fi
2978 if [[ $cbis_version != "19.0.0.1" ]]
2979 then
2980 echo -e "${CYAN}access maintenance mode (set) page${NC}"
2981 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
2982 then
2983 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_maintenance":"Set in Maintenance"}' | grep RESP_CODE | awk -F: '{print $2}')
2984 else
2985 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_to_maintenance":"Set to Maintenance"}' | grep RESP_CODE | awk -F: '{print $2}')
2986 fi
2987 if [[ $http_status != "200" ]]
2988 then
2989 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2990 else
2991 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2992 fi
2993 echo -e "${CYAN}access maintenance mode (unset) page${NC}"
2994 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
2995 then
2996 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_maintenance":"Unset from Maintenance"}' | grep RESP_CODE | awk -F: '{print $2}')
2997 else
2998 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_to_maintenance":"Unset from Maintenance"}' | grep RESP_CODE | awk -F: '{print $2}')
2999 fi
3000 if [[ $http_status != "200" ]]
3001 then
3002 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
3003 then
3004 error=$(curl -g -s -k -L -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_maintenance":"Unset from Maintenance"}' | jq .Error | tr -d '\"\.')
3005 else
3006 error=$(curl -g -s -k -L -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_to_maintenance":"Unset from Maintenance"}' | jq .error_message | tr -d '\"\.')
3007 fi
3008 if [[ $error == "There are currently no computes to unset from maintenance mode" ]]
3009 then
3010 echo -e "${GREEN}There are currently no computes to unset from maintenance mode${NC}"
3011 else
3012 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3013 fi
3014 elif [[ $http_status == "200" ]]
3015 then
3016 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3017 fi
3018 fi
3019 echo -e "${CYAN}access undercloud vm backup/restore (backup) page${NC}"
3020 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/undercloud_backup/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"operation":"Backup","backup_directory":"'$backup_nfs_mountpoint'"}' | grep RESP_CODE | awk -F: '{print $2}')
3021 if [[ $http_status != "200" ]]
3022 then
3023 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3024 else
3025 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3026 fi
3027 echo -e "${CYAN}access undercloud vm backup/restore (restore) page${NC}"
3028 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/undercloud_backup/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"operation":"Restore","backup_directory":"'$backup_nfs_mountpoint'"}' | grep RESP_CODE | awk -F: '{print $2}')
3029 if [[ $http_status != "200" ]]
3030 then
3031 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3032 else
3033 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3034 fi
3035 echo -e "${CYAN}access patch management page${NC}"
3036 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/patch_management/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3037 if [[ $http_status != "200" ]]
3038 then
3039 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3040 else
3041 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3042 fi
3043 if [[ $cbis_version != "19.0.0.1" ]]
3044 then
3045 echo -e "${CYAN}access controller replacement page${NC}"
3046 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/replace_controller/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3047 if [[ $http_status != "200" ]]
3048 then
3049 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3050 else
3051 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3052 fi
3053 fi
3054 if [[ $cbis_version != "19.0.0.1" ]]
3055 then
3056 echo -e "${CYAN}access overcloud database restore page${NC}"
3057 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/overcloud_db_restore/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3058 if [[ $http_status != "200" ]]
3059 then
3060 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3061 else
3062 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3063 fi
3064 fi
3065 if [[ $cbis_version != "19.0.0.1" ]]
3066 then
3067 echo -e "${CYAN}access security hardening (install) page${NC}"
3068 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_hardening/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3069 if [[ $http_status != "200" ]]
3070 then
3071 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3072 else
3073 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3074 fi
3075 echo -e "${CYAN}access security hardening (rollback) page${NC}"
3076 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_hardening_rollback/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3077 if [[ $http_status != "200" ]]
3078 then
3079 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3080 else
3081 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3082 fi
3083 echo -e "${CYAN}access secured communication (ipsec) page${NC}"
3084 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_secured_communication/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3085 if [[ $http_status != "200" ]]
3086 then
3087 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3088 else
3089 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3090 fi
3091 echo -e "${CYAN}access key management page${NC}"
3092 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_key_management/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3093 if [[ $http_status != "200" ]]
3094 then
3095 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3096 else
3097 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3098 fi
3099 echo -e "${CYAN}access platform secret update page${NC}"
3100 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_platform_secrets_update/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3101 if [[ $http_status != "200" ]]
3102 then
3103 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3104 else
3105 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3106 fi
3107 echo -e "${CYAN}access ldap deployment parameters page${NC}"
3108 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_ldap_deployment/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3109 if [[ $http_status != "200" ]]
3110 then
3111 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3112 else
3113 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3114 fi
3115 fi
3116 echo -e "${CYAN}access multi cbis management page${NC}"
3117 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/manage_multi/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3118 if [[ $http_status != "200" ]]
3119 then
3120 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3121 else
3122 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3123 fi
3124 echo -e "${CYAN}access remote patch management page${NC}"
3125 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/multi_vim/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3126 if [[ $http_status != "200" ]]
3127 then
3128 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3129 else
3130 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3131 fi
3132 echo -e "${CYAN}validate that the deployment log returns 'CBIS Installation Finished Successfully or 'Post install Sanity tests completed'${NC}"
3133 deployment_log_success_check=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/log/deployment.log' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep -E 'CBIS Installation Finished Successfully|Post install Sanity tests completed')
3134 if [[ $deployment_log_success_check ]]
3135 then
3136 echo -e "${GREEN}returned response 'CBIS Installation Finished Successfully' or 'Post install Sanity tests completed'${NC}"
3137 else
3138 echo -e "${RED}didn't get 'CBIS Installation Finished Successfully' or 'Post install Sanity tests completed' as expected. log-in into cbis manager UI and verify that the CBIS installation log is showing${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3139 fi
3140 elapsed_time_seconds=$(expr $(date +%s) - $start)
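 ### NOTE: illustrative sketch only, never called by this script. The repeated "POST to a CBIS
 ### manager /api/<page>/main route and expect HTTP 200" pattern above could be wrapped in one
 ### small helper; the page name and api path are passed in by the caller:
 check_manager_page() {
  local page_name="$1" api_path="$2" http_status
  echo -e "${CYAN}access $page_name page${NC}"
  http_status=$(curl -g -s -k -L -o /dev/null -w '%{response_code}' -X POST 'https://'$HypervisorURL'/api/'$api_path'/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}')
  if [[ $http_status != "200" ]]
  then
   echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
  else
   echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
  fi
 }
 ### example usage (commented out): check_manager_page "patch management" "patch_management"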
3141
3142
3143 ####################################################################################################
3144
3145
3146 start=$(date +%s)
3147 STEPS_COUNTER=$((STEPS_COUNTER+1))
3148 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE HORIZON LOGO PAGE AUTHENTICITY (+$elapsed_time_seconds `date '+%T'`)${NC}"
3149 ### this check was added after seeing permission issues (403 forbidden errors) on most of the horizon pages on the newyork setup (19A SP4 PP3), which corrupted the horizon view completely.
3150 horizon_logo_error_code=$(curl -g -s -k -L -w '\nRESP_CODE:%{response_code}\n' -X GET https://$PublicURL/dashboard/static/themes/nokia/img/logo.svg | grep RESP_CODE: | awk -F: '{print $2}')
3151 if [[ $horizon_logo_error_code != "200" ]]
3152 then
3153 echo -e "${RED}horizon logo page (logo.svg) returned error code $horizon_logo_error_code${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3154 else
3155 echo -e "${GREEN}horizon logo page (logo.svg) returned error code $horizon_logo_error_code${NC}"
3156 fi
3157 elapsed_time_seconds=$(expr $(date +%s) - $start)
3158
3159
3160 ####################################################################################################
3161
3162
3163 start=$(date +%s)
3164 STEPS_COUNTER=$((STEPS_COUNTER+1))
3165 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK ALL HORIZON PAGES VALIDITY (+$elapsed_time_seconds `date '+%T'`)${NC}"
3166 curl -g -k -s -L -c cookies.txt -b cookies.txt -o output.1.html 'https://'$PublicURL'/dashboard/auth/login/'
3167 CSRFTOKEN=$(cat cookies.txt | grep -w csrftoken | awk '{print $NF}')
3168 if [[ -z $CSRFTOKEN ]]
3169 then
3170 echo -e "${RED}cookies.txt doesn't contain the csrftoken output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3171 fi
3172 csrfmiddlewaretoken=$(curl -g -s -k -L 'https://'$PublicURL'/dashboard/auth/login/' | grep csrfmiddlewaretoken | awk -F'value=' '{print $2}' | awk -F\' '{print $2}')
3173 if [[ -z $csrfmiddlewaretoken ]]
3174 then
3175 echo -e "${RED}didn't receive the csrfmiddlewaretoken output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3176 fi
3177 HORIZON_REGION=$(curl -g -s -k -L 'https://'$PublicURL'/dashboard/auth/login/' | grep region | awk -F'value="' '{print $2}' | awk -F\" '{print $1}')
3178 if [[ -z $HORIZON_REGION ]]
3179 then
3180 echo -e "${RED}didn't receive the region output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3181 fi
3182 DATA="username=admin&password=$ADMIN_PASSWORD®ion=$HORIZON_REGION&csrfmiddlewaretoken=$CSRFTOKEN"
3183 if [[ -z $DATA ]]
3184 then
3185 echo -e "${RED}didn't receive the data output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3186 fi
3187 curl -g -k -s -L -c cookies.txt -b cookies.txt --output /dev/null -s -d "$DATA" --referer 'https://'$PublicURL'/dashboard/' 'https://'$PublicURL'/dashboard/auth/login/'
3188 SESSIONID=$(cat cookies.txt | grep sessionid | sed 's/^.*sessionid\s*//')
3189 if [[ -z $SESSIONID ]]
3190 then
3191 echo -e "${RED}didn't receive the sessionid output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3192 fi
3193 pages="identity identity/users identity/groups identity/groups identity/roles admin admin/hypervisors admin/aggregates admin/instances admin/flavors admin/images admin/volumes admin/snapshots admin/networks admin/routers admin/floating_ips admin/trunks admin/defaults admin/metadata_defs admin/info admin/vitrageadmindashboard admin/vitrageadminalarms admin/vitrageadminentities admin/vitrageadmintemplates admin/shares admin/share_snapshots admin/share_types admin/share_networks admin/security_services admin/share_servers admin/share_instances admin/share_groups admin/share_group_snapshots admin/share_group_types project/api_access project project/instances project/images project/key_pairs project/server_groups project/volumes project/snapshots project/volume_groups project/vg_snapshots project/network_topology project/networks project/routers project/security_groups project/floating_ips project/trunks project/stacks project/resource_types project/template_versions project/vitragedashboard project/vitragealarms project/vitrageentities project/vitragetemplates project/shares project/share_snapshots project/share_networks project/security_services project/share_groups project/share_group_snapshots settings"
3194 for page in $pages
3195 do
3196 echo -e "${CYAN}checking the $page page${NC}"
3197 OUTPUT=$(curl -g -k -L -s 'https://'$PublicURL'/dashboard/'$page'/' -H 'Cookie: csrftoken='$CSRFTOKEN'; sessionid='$SESSIONID'')
3198 if [[ $OUTPUT ]]
3199 then
3200 ERROR=$(echo -e "$OUTPUT" | grep -E 'Error:|unexpected error|Server error')
3201 if [[ $ERROR ]]
3202 then
3203 echo -e "${RED}$ERROR${NC}" | awk '{$1=$1};1' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3204 else
3205 echo -e "${GREEN}the $page page returned with no errors${NC}"
3206 fi
3207 else
3208 echo -e "${RED}OUTPUT variable returned no ouput${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3209 fi
3210 done
3211 rm -rf cookies.txt output.1.html
3212 elapsed_time_seconds=$(expr $(date +%s) - $start)
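 ### NOTE: illustrative sketch only, not called by this script. The loop above greps each page body
 ### for error strings; a complementary check (re-using the same $CSRFTOKEN/$SESSIONID cookies) would
 ### be to also assert that every dashboard page returns HTTP 200, e.g.:
 # for page in $pages
 # do
 #  page_code=$(curl -g -k -L -s -o /dev/null -w '%{response_code}' 'https://'$PublicURL'/dashboard/'$page'/' -H 'Cookie: csrftoken='$CSRFTOKEN'; sessionid='$SESSIONID'')
 #  [[ $page_code == "200" ]] || echo -e "${RED}$page returned HTTP $page_code${NC}"
 # done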
3213
3214
3215 ####################################################################################################
3216fi
3217 ####################################################################################################
3218
3219
3220if [[ $ESSENTIAL == "no" ]]
3221then
3222 start=$(date +%s)
3223 STEPS_COUNTER=$((STEPS_COUNTER+1))
3224 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE COMPUTES HAS THE SAME TOTAL MEMORY SIZE (+$elapsed_time_seconds `date '+%T'`)${NC}"
3225 HYPERVISOR_LIST=$(source ~/overcloudrc && openstack hypervisor list --long -c "Hypervisor Hostname" -c "Memory MB")
3226 TOTAL_RAM=$(echo -e "$HYPERVISOR_LIST" | grep overcloud | awk '{print $4}' | sort | uniq | wc -l)
3227 if [ $TOTAL_RAM -gt 1 ]
3228 then
3229 echo -e "${RED}found one or more computes with different total memory size\n\n$HYPERVISOR_LIST${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3230 else
3231 echo -e "${GREEN}all the computes have the same total memory size${NC}"
3232 fi
3233 elapsed_time_seconds=$(expr $(date +%s) - $start)
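 ### NOTE: illustrative sketch only, not called by this script. When the memory check above fails,
 ### grouping the hypervisor list by the "Memory MB" column makes the odd host easy to spot:
 # echo -e "$HYPERVISOR_LIST" | grep overcloud | awk '{print $4}' | sort | uniq -c | sort -n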
3234
3235
3236 ####################################################################################################
3237
3238
3239 start=$(date +%s)
3240 STEPS_COUNTER=$((STEPS_COUNTER+1))
3241 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CPU MODEL INCONSISTENCIES INSIDE EACH SERVER INDIVIDUALLY (+$elapsed_time_seconds `date '+%T'`)${NC}"
3242 inconsistence_servers=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sed 's/^ *//g' | sort | uniq | wc -l" | grep ^[2-9] -B 1 | grep overcloud- | awk '{print $1}' | paste -sd',')
3243 if [[ $inconsistence_servers ]]
3244 then
3245 inconsistence_servers=$(sshpass -p $hv_cbis_admin_password ansible -k $inconsistence_servers --limit '!localhost' -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sed 's/^ *//g' | sort | uniq -c")
3246 echo -e "${RED}$inconsistence_servers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3247 else
3248 echo -e "${GREEN}no inconsistencies found between the many CPUs of each server${NC}"
3249 fi
3250 elapsed_time_seconds=$(expr $(date +%s) - $start)
3251
3252
3253 ####################################################################################################
3254
3255
3256 start=$(date +%s)
3257 STEPS_COUNTER=$((STEPS_COUNTER+1))
3258 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CPU MODEL INCONSISTENCIES BETWEEN THE OVERCLOUD SERVERS + HYPERVISOR (+$elapsed_time_seconds `date '+%T'`)${NC}"
3259 echo -e "${CYAN}checking the CPU model of the computes and controllers${NC}"
3260 inconsistence_servers=$(sshpass -p $hv_cbis_admin_password ansible -k controller,compute,hypervisor -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sed 's/^ *//g' | sort | uniq" | grep CPU | sort | uniq | wc -l)
3261 if [[ $inconsistence_servers != "1" ]]
3262 then
3263 inconsistence_servers=$(sshpass -p $hv_cbis_admin_password ansible -k controller,compute,hypervisor -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sort | uniq")
3264 echo -e "${RED}$inconsistence_servers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3265 else
3266 echo -e "${GREEN}no inconsistencies found between the CPUs of the controllers, computes and hypervisor${NC}"
3267 fi
3268 if [[ $ansible_storage_hosts ]]
3269 then
3270 echo -e "\n${CYAN}checking the CPU model of the computes and controllers${NC}"
3271 inconsistence_servers=$(ansible CephStorage -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sed 's/^ *//g' | sort | uniq" | grep CPU | sort | uniq | wc -l)
3272 if [[ $inconsistence_servers != "1" ]]
3273 then
3274 inconsistence_servers=$(ansible CephStorage -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sort | uniq")
3275 echo -e "${RED}$inconsistence_servers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3276 else
3277 echo -e "${GREEN}no inconsistencies found between the CPUs of the storage nodes${NC}"
3278 fi
3279 fi
3280 elapsed_time_seconds=$(expr $(date +%s) - $start)
3281
3282
3283 ####################################################################################################
3284
3285
3286 start=$(date +%s)
3287 STEPS_COUNTER=$((STEPS_COUNTER+1))
3288 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CPU FLAGS INCONCITIENCIES (+$elapsed_time_seconds `date '+%T'`)${NC}"
3289 lscpu=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "lscpu")
3290 uniqe_flags=$(echo -e "$lscpu" | grep Flags: | tr -s ' ' '\n' | grep -v Flags: | sort | uniq -c | column -t | awk '{print $1}' | sort -n | uniq | wc -l)
3291 if [ $uniqe_flags != "1" ]
3292 then
3293 uniqe_missing_flags=$(echo -e "$lscpu" | grep Flags: | tr -s ' ' '\n' | grep -v Flags: | sort | uniq -c | column -t | awk '{print $1}' | sort -n | uniq | head -n -1 | tail -n1)
3294 missing_flags_names=$(echo -e "$lscpu" | grep Flags: | tr -s ' ' '\n' | grep -v Flags: | sort | uniq -c | column -t | grep ^[1-$uniqe_missing_flags] | awk '{print $2}')
3295 missing_flags_paste=$(echo -e "$missing_flags_names" | paste -sd"|")
3296 flags_per_compute=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "lscpu | grep Flags: | tr -s ' ' '\n' | grep -E '$missing_flags_paste' | wc -l")
3297 echo -e "${RED}the following flags are missing from one or more computes:\n\n$missing_flags_names\n\n$flags_per_compute${NC}"
3298 else
3299 echo -e "${GREEN}the CPU flags are identical on all the compute hosts${NC}"
3300 fi
3301 elapsed_time_seconds=$(expr $(date +%s) - $start)
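 ### NOTE: illustrative sketch only, not called by this script. A quick way to see which hosts carry a
 ### different CPU flag set is to checksum the sorted flag list per host and count the distinct sums:
 # ansible all --limit '!localhost,!hypervisor' -b -m shell -a "lscpu | grep ^Flags: | tr ' ' '\n' | sort | sha1sum" | grep ^[0-9a-f] | sort | uniq -c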
3302
3303
3304 ####################################################################################################
3305
3306
3307 start=$(date +%s)
3308 STEPS_COUNTER=$((STEPS_COUNTER+1))
3309 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT AUTO EVACUATE IS ENABLED (+$elapsed_time_seconds `date '+%T'`)${NC}"
3310 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
3311 then
3312 auto_evacuate=$(ansible $last_index_controller -b -m shell -a "cat /etc/vitrage/vitrage.conf" | grep enable_host_evacuate | awk '{print $3}')
3313 if [[ $auto_evacuate == "False" ]]
3314 then
3315 echo -e "${ORANGE}auto-evacuate is disabled${NC}"
3316 else
3317 echo -e "${GREEN}auto-evacuate is enabled${NC}"
3318 fi
3319 else
3320 auto_evacuate=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
3321 -H 'Content-Type: application/json-rpc' \
3322 -H 'Cookie: SERVERID='$last_index_controller'' \
3323 --data '{
3324 "jsonrpc": "2.0",
3325 "method": "action.get",
3326 "params": {
3327 "output": "extend",
3328 "selectOperations": "extend",
3329 "selectRecoveryOperations": "extend",
3330 "selectFilter": "extend",
3331 "filter": {
3332 "eventsource": 0
3333 }
3334 },
3335 "auth": '$zabbix_auth',
3336 "id": 2
3337 }' | jq . | grep auto-evacuate -A 5 | grep status | awk '{print $NF}' | tr -d '\"\,')
3338 if [[ $auto_evacuate == "1" ]]
3339 then
3340 echo -e "${ORANGE}auto-evacuate is disabled${NC}"
3341 elif [[ $auto_evacuate == "0" ]]
3342 then
3343 echo -e "${GREEN}auto-evacuate is enabled${NC}"
3344 fi
3345 fi
3346 elapsed_time_seconds=$(expr $(date +%s) - $start)
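 ### NOTE: illustrative sketch only, not called by this script. Instead of "grep auto-evacuate -A 5",
 ### the same answer can be pulled from the zabbix action.get reply with a jq filter (assuming the
 ### action is really named "auto-evacuate" on the setup):
 # ... | jq -r '.result[] | select(.name == "auto-evacuate") | .status'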
3347
3348
3349 ####################################################################################################
3350
3351
3352 start=$(date +%s)
3353 STEPS_COUNTER=$((STEPS_COUNTER+1))
3354 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CHANGES IN CONF FILES ON THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3355 src_dir="/var/lib/config-data/puppet-generated"
3356 dst_dir="/home/cbis-admin/configuration_files_backup"
3357 dir_check=$(ansible $last_index_controller -b -m shell -a "ls $dst_dir" | grep -E -v -c 'overcloud|cannot access')
3358 if [ $dir_check != "9" ]
3359 then
3360 ansible controller -b -m shell -a "cp -u mkdir -p "$dst_dir" warn=False" > /dev/null
3361 ansible controller -b -m shell -a "cp -u "$src_dir"/cinder/etc/cinder/cinder.conf "$dst_dir"" > /dev/null
3362 ansible controller -b -m shell -a "cp -u "$src_dir"/nova_placement/etc/nova/nova.conf "$dst_dir"" > /dev/null
3363 ansible controller -b -m shell -a "cp -u "$src_dir"/glance_api/etc/glance/glance-api.conf "$dst_dir"" > /dev/null
3364 ansible controller -b -m shell -a "cp -u "$src_dir"/haproxy/etc/haproxy/haproxy.cfg "$dst_dir"" > /dev/null
3365 ansible controller -b -m shell -a "cp -u "$src_dir"/heat/etc/heat/heat.conf "$dst_dir"" > /dev/null
3366 ansible controller -b -m shell -a "cp -u "$src_dir"/keystone/etc/keystone/keystone.conf "$dst_dir"" > /dev/null
3367 ansible controller -b -m shell -a "cp -u "$src_dir"/neutron/etc/neutron/neutron.conf "$dst_dir"" > /dev/null
3368 ansible controller -b -m shell -a "cp -u "$src_dir"/rabbitmq/etc/rabbitmq/rabbitmq.config "$dst_dir"" > /dev/null
3369 ansible controller -b -m shell -a "cp -u "$src_dir"/ceph/etc/ceph/ceph.conf "$dst_dir"" > /dev/null
3370 fi
3371 cinder=$(ansible controller -b -m shell -a "diff "$src_dir"/cinder/etc/cinder/cinder.conf "$dst_dir"/cinder.conf | grep -v transport_url" | grep -E '^>|^<')
3372 nova=$(ansible controller -b -m shell -a "diff "$src_dir"/nova_placement/etc/nova/nova.conf "$dst_dir"/nova.conf | grep -v transport_url" | grep -E '^>|^<')
3373 glance_api=$(ansible controller -b -m shell -a "diff "$src_dir"/glance_api/etc/glance/glance-api.conf "$dst_dir"/glance-api.conf | grep -v transport_url" | grep -E '^>|^<')
3374 haproxy=$(ansible controller -b -m shell -a "diff "$src_dir"/haproxy/etc/haproxy/haproxy.cfg "$dst_dir"/haproxy.cfg | grep -v transport_url" | grep -E '^>|^<')
3375 heat=$(ansible controller -b -m shell -a "diff "$src_dir"/heat/etc/heat/heat.conf "$dst_dir"/heat.conf | grep -v transport_url" | grep -E '^>|^<')
3376 keystone=$(ansible controller -b -m shell -a "diff "$src_dir"/keystone/etc/keystone/keystone.conf "$dst_dir"/keystone.conf | grep -v transport_url" | grep -E '^>|^<')
3377 neutron=$(ansible controller -b -m shell -a "diff "$src_dir"/neutron/etc/neutron/neutron.conf "$dst_dir"/neutron.conf | grep -v transport_url" | grep -E '^>|^<')
3378 rabbitmq=$(ansible controller -b -m shell -a "diff "$src_dir"/rabbitmq/etc/rabbitmq/rabbitmq.config "$dst_dir"/rabbitmq.config | grep -v transport_url" | grep -E '^>|^<')
3379 ceph=$(ansible controller -b -m shell -a "diff "$src_dir"/ceph/etc/ceph/ceph.conf "$dst_dir"/ceph.conf | grep -v transport_url" | grep -E '^>|^<')
3380 if [[ $cinder ]]
3381 then
3382 echo -e "${LRB}cinder${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3383 echo -e "${RED}$cinder${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3384 fi
3385 if [[ $nova ]]
3386 then
3387 echo -e "${LRB}nova${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3388 echo -e "${RED}$nova${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3389 else
3390 echo -e "${GREEN}nova original and current conf files are identical${NC}"
3391 fi
3392 if [[ $glance_api ]]
3393 then
3394 echo -e "${LRB}glance_api${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3395 echo -e "${RED}$glance_api${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3396 else
3397 echo -e "${GREEN}glance_api original and current conf files are identical${NC}"
3398 fi
3399 if [[ $haproxy ]]
3400 then
3401 echo -e "${LRB}haproxy${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3402 echo -e "${RED}$haproxy${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3403 else
3404 echo -e "${GREEN}haproxy original and current conf files are identical${NC}"
3405 fi
3406 if [[ $heat ]]
3407 then
3408 echo -e "${LRB}heat${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3409 echo -e "${RED}$heat${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3410 else
3411 echo -e "${GREEN}heat original and current conf files are identical${NC}"
3412 fi
3413 if [[ $keystone ]]
3414 then
3415 echo -e "${LRB}keystone${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3416 echo -e "${RED}$keystone${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3417 else
3418 echo -e "${GREEN}keystone original and current conf files are identical${NC}"
3419 fi
3420 if [[ $neutron ]]
3421 then
3422 echo -e "${LRB}neutron${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3423 echo -e "${RED}$neutron${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3424 else
3425 echo -e "${GREEN}neutron original and current conf files are identical${NC}"
3426 fi
3427 if [[ $rabbitmq ]]
3428 then
3429 echo -e "${LRB}rabbitmq${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3430 echo -e "${RED}$rabbitmq${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3431 else
3432 echo -e "${GREEN}rabbitmq original and current conf files are identical${NC}"
3433 fi
3434 if [[ $ceph ]]
3435 then
3436 echo -e "${LRB}ceph${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3437 echo -e "${RED}$ceph${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3438 else
3439 echo -e "${GREEN}ceph original and current conf files are identical${NC}"
3440 fi
3441 elapsed_time_seconds=$(expr $(date +%s) - $start)
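 ### NOTE: illustrative sketch only, not called by this script. The per-service diff blocks above all
 ### follow the same pattern; a table-driven loop keeps the logic in one place (the service:path pairs
 ### below are the same ones used by the backup step above):
 # for entry in cinder:cinder/etc/cinder/cinder.conf nova:nova_placement/etc/nova/nova.conf \
 #              glance_api:glance_api/etc/glance/glance-api.conf haproxy:haproxy/etc/haproxy/haproxy.cfg \
 #              heat:heat/etc/heat/heat.conf keystone:keystone/etc/keystone/keystone.conf \
 #              neutron:neutron/etc/neutron/neutron.conf rabbitmq:rabbitmq/etc/rabbitmq/rabbitmq.config \
 #              ceph:ceph/etc/ceph/ceph.conf
 # do
 #  service=${entry%%:*} ; conf_path=${entry#*:}
 #  diff_output=$(ansible controller -b -m shell -a "diff $src_dir/$conf_path $dst_dir/$(basename $conf_path) | grep -v transport_url" | grep -E '^>|^<')
 #  if [[ $diff_output ]]
 #  then
 #   echo -e "${LRB}$service${NC}" ; echo -e "${RED}$diff_output${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 #  else
 #   echo -e "${GREEN}$service original and current conf files are identical${NC}"
 #  fi
 # done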
3442
3443
3444 ####################################################################################################
3445
3446
3447 start=$(date +%s)
3448 STEPS_COUNTER=$((STEPS_COUNTER+1))
3449 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT OPENSTACK CAN ISSUE TOKEN FROM BOTH stackrc AND overcloudrc (+$elapsed_time_seconds `date '+%T'`)${NC}"
3450 stackrc_token=$(source ~/stackrc && openstack token issue 2>&1 > /dev/null)
3451 if [[ $stackrc_token ]]
3452 then
3453 echo -e "${RED}$stackrc_token${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3454 else
3455 echo -e "${GREEN}token was created sucessfully for user admin (stackrc)${NC}"
3456 fi
3457 overcloudrc_token=$(source ~/overcloudrc && openstack token issue 2>&1 > /dev/null)
3458 if [[ $overcloudrc_token ]]
3459 then
3460 echo -e "\n${RED}$overcloudrc_token${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3461 else
3462 echo -e "${GREEN}token was created sucessfully for user admin (overcloudrc)${NC}"
3463 fi
3464 elapsed_time_seconds=$(expr $(date +%s) - $start)
3465
3466
3467 ####################################################################################################
3468
3469
3470 start=$(date +%s)
3471 if [[ $nuage == "true" ]]
3472 then
3473 STEPS_COUNTER=$((STEPS_COUNTER+1))
3474 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE BETWEEN THE NUAGE VERSION OF ALL THE OVERCLOUD HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3475 nuage_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /usr/share/cbis/nuage-version" | grep ^[0-9] | sort --uniq | wc -l)
3476 if [[ $nuage_version != "1" ]]
3477 then
3478 nuage_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /usr/share/cbis/nuage-version")
3479 echo -e "${RED}nuage version mismatch found between one or more hosts${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3480 echo -e "${RED}$nuage_version${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3481 else
3482 echo -e "${GREEN}nuage version is: $nuage_version${NC} "
3483 fi
3484 fi
3485 elapsed_time_seconds=$(expr $(date +%s) - $start)
3486
3487
3488 ####################################################################################################
3489
3490
3491 start=$(date +%s)
3492 if [[ $nuage == "true" ]]
3493 then
3494 STEPS_COUNTER=$((STEPS_COUNTER+1))
3495 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE BETWEEN THE Open vSwitch NUAGE VERSION OF ALL THE OVERCLOUD HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3496 nuage_ovs_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl --version" | grep nuage | awk '{print $ NF}' | cut -d - -f 1-2 | sort --uniq | wc -l)
3497 if [[ $nuage_ovs_version != "1" ]]
3498 then
3499 nuage_ovs_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl --version | grep nuage")
3500 echo -e "${RED}nuage Open vSwitch version mismatch found between one or more hosts${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3501 echo -e "${RED}$nuage_ovs_version${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3502 else
3503 echo -e "${GREEN}nuage Open vSwitch version is: $nuage_ovs_version${NC} "
3504 fi
3505 fi
3506 elapsed_time_seconds=$(expr $(date +%s) - $start)
3507
3508
3509 ####################################################################################################
3510
3511
3512 start=$(date +%s)
3513 STEPS_COUNTER=$((STEPS_COUNTER+1))
3514 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE HOSTS HAS IDENTICAL MELLANOX OFED AND DRIVER VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
3515 if [[ $fixed_platform != 'airframe' && $fixed_platform != 'dell-730' && $fixed_platform != 'hp-slg7_OVS' && $fixed_platform != 'hp-slg7_OVS_SSD_single_nic' && $fixed_platform != 'hp-c7kg8' && $fixed_platform != 'hp-c7kg9' ]]
3516 then
3517 if [[ $mlx_ofed_version != "/bin/sh:" ]]
3518 then
3519 mlx_version_comparison=$(echo -e "$mlx_ofed_version" | wc -l)
3520 if [[ $mlx_version_comparison != "1" ]]
3521 then
3522 echo -e "${RED}found multiple mellanox ofed versions:\n\n$mlx_ofed_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3523 else
3524 echo -e "${GREEN}all the hosts has the same mellanox ofed version ($mlx_ofed_version)${NC}"
3525 echo -e "MELLANOX OFED VERSION = $mlx_ofed_version" > $logs_dir/mellanox_details
3526 mlxfwmanager=$(ansible $last_index_controller -b -m shell -a "mlxfwmanager")
3527 mlx_firmware=$(echo -e "mlxfwmanager" | grep FW | awk '{print $2}' | sort -u)
3528 echo -e "\n\n$MELLANOX INTERFACE FIRMWARE = $mlx_firmware" >> $logs_dir/mellanox_details
3529 mlx_device_type=$(echo -e "mlxfwmanager" | grep 'Device Type:' | awk '{print $2}' | sort -u)
3530 echo -e "\n\n$MELLANOX DEVICE TYPE = $mlx_device_type" >> $logs_dir/mellanox_details
3531 echo -e "\n\n$MELLANOX RPMS INFO: = $mlx_firmware" >> $logs_dir/mellanox_details
3532 ansible $last_index_controller -b -m shell -a "/usr/bin/ofed_rpm_info" >> $logs_dir/mellanox_details
3533 fi
3534 else
3535 echo -e "${RED}$fixed_platform platform should use mellanox interfaces but no mellanox interface are to be found (/usr/bin/ofed_info)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3536 fi
3537 else
3538 echo -e "${ORANGE}$fixed_platform platform blueprint is without mellanox interfaces${NC}"
3539 fi
3540 elapsed_time_seconds=$(expr $(date +%s) - $start)
3541
3542
3543 ####################################################################################################
3544
3545
3546 start=$(date +%s)
3547 STEPS_COUNTER=$((STEPS_COUNTER+1))
3548 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE /etc/ansible/hosts.hfx Compute GROUP CONTAINS ONLY COMPUTES (ICE-3141) (+$elapsed_time_seconds `date '+%T'`)${NC}"
3549 NON_COMPUTE_HOSTS=$(cat /etc/ansible/hosts.hfx | awk '/\[Compute\]/,/\[compute\]/' | grep overcloud- | grep -v compute)
3550 if [[ $NON_COMPUTE_HOSTS ]]
3551 then
3552 echo -e "${RED}the following hosts appear under the [Compute] group in /etc/ansible/hosts.hfx :${NC}\n\n"
3553 echo -e "${RED}$NON_COMPUTE_HOSTS${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3554 else
3555 echo -e "${GREEN}no non-computes are found under the [Compute] group in /etc/ansible/hosts.hfx${NC}"
3556 fi
3557 elapsed_time_seconds=$(expr $(date +%s) - $start)
3558
3559
3560 ####################################################################################################
3561
3562
3563 start=$(date +%s)
3564 STEPS_COUNTER=$((STEPS_COUNTER+1))
3565 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DOCKER CONTAINERS ACTIVLY USING OLD/REMOVED IMAGES (+$elapsed_time_seconds `date '+%T'`)${NC}"
3566 REMOVE_STATE_DOCKER_IMAGES=$(ansible all --limit '!hypervisor' -b -m shell -a "docker image list | grep -w -E 'remove\s+' | awk '{print \$3}'" | grep ^[0-9a-f] | sort | uniq | paste -sd'|')
3567 if [[ $REMOVE_STATE_DOCKER_IMAGES ]]
3568 then
3569 CONTAINERS_USING_REMOVED_IMAGES=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps -a | grep -E \"$REMOVE_STATE_DOCKER_IMAGES\"" | grep ^[0-9a-f] -B 1)
3570 if [[ $CONTAINERS_USING_REMOVED_IMAGES ]]
3571 then
3572 echo -e "${RED}$CONTAINERS_USING_REMOVED_IMAGES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3573 echo -e "\n\n${ORANGE}CBIS-16623/CBIS-16630 (19A/20) - post deploying the priority pack the zabbix docker container still using the old zabbix image that should have been removed and replaced by a new image${NC}"
3574 else
3575 echo -e "${GREEN}no docker containers that are using images that in remove state are found${NC}"
3576 fi
3577 else
3578 echo -e "${GREEN}no docker containers that are using images that in remove state are found${NC}"
3579 fi
3580 elapsed_time_seconds=$(expr $(date +%s) - $start)
3581
3582
3583 ####################################################################################################
3584
3585
3586 start=$(date +%s)
3587 STEPS_COUNTER=$((STEPS_COUNTER+1))
3588 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DOCKER IMAGES WITH REMOVE STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
3589 REMOVE_STATE_DOCKER_IMAGES=$(ansible all --limit '!hypervisor' -b -m shell -a "docker image list | grep -w -E 'remove\s+'" | grep -w -E 'remove\s+' -B 1)
3590 if [[ $REMOVE_STATE_DOCKER_IMAGES ]]
3591 then
3592 echo -e "${RED}$REMOVE_STATE_DOCKER_IMAGES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3593 echo -e "\n\n${ORANGE}CBIS-16623/CBIS-16630 (19A/20) - post deploying the priority pack the zabbix docker container still using the old zabbix image that should have been removed and replaced by a new image${NC}"
3594 else
3595 echo -e "${GREEN}no docker images with remove state are found${NC}"
3596 fi
3597 elapsed_time_seconds=$(expr $(date +%s) - $start)
3598
3599
3600 ####################################################################################################
3601
3602
3603 start=$(date +%s)
3604 STEPS_COUNTER=$((STEPS_COUNTER+1))
3605 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE UNUSED DOCKER IMAGES (+$elapsed_time_seconds `date '+%T'`)${NC}"
3606 if [[ $cbis_version != "18.0.0.1" ]]
3607 then
3608 unused_docker_images=$(ansible all --limit '!hypervisor' -b -m shell -a "docker image ls --all | grep \<none\> | awk '{print \$1}' | sort | uniq -c | grep -E -v '^[[:space:]]+[0-1]' | grep -v \<none\>" | grep -E -v 'FAILED|non-zero return code')
3609 if [[ $unused_docker_images ]]
3610 then
3611 echo -e "${RED}$unused_docker_images${NC}"
3612 else
3613 echo -e "${GREEN}no unused docker images were found${NC}"
3614 fi
3615 else
3616 echo -e "${ORANGE}no docker containers in CBIS 18.0.0.1${NC}"
3617 fi
3618 elapsed_time_seconds=$(expr $(date +%s) - $start)
3619
3620
3621 ####################################################################################################
3622
3623
3624 start=$(date +%s)
3625 STEPS_COUNTER=$((STEPS_COUNTER+1))
3626 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE UNUSED OVERCLOUD GLANCE IMAGES (+$elapsed_time_seconds `date '+%T'`)${NC}"
3627 used_overcloud_image=$(source ~/stackrc && openstack server list --long -c "Image ID" -f value | sort --uniq | paste -sd '|')
3628 unused_overcloud_images=$(source ~/stackrc && openstack image list -f value | grep overcloud-full_[0-9] | grep -E -v "$used_overcloud_image")
3629 unused_overcloud_images_parsed=$(echo -e "$unused_overcloud_images" | awk '{print $2}' | paste -sd' ')
3630 if [[ $unused_overcloud_images ]]
3631 then
3632 echo -e "${RED}$unused_overcloud_images\n\n${MAGENTA}it is safe to delete the above image(s) in order to free required disk space from the / partition in the UC\nfrom the undercloud vm: source ~/stackrc && openstack image delete $unused_overcloud_images_parsed${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3633 else
3634 echo -e "${GREEN}no unused overcloud images${NC}"
3635 fi
3636 elapsed_time_seconds=$(expr $(date +%s) - $start)
3637
3638
3639 ####################################################################################################
3640
3641
3642 start=$(date +%s)
3643 STEPS_COUNTER=$((STEPS_COUNTER+1))
3644 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE NO DOCKER CONTAINER IS USING ALL THE COMPUTE CPUS (SHOULD ONLY USE THE ISOLATED CPUS) (+$elapsed_time_seconds `date '+%T'`)${NC}"
3645 UNRESTRAINED_CONTAINERS=$(ansible compute -b -m shell -a "docker ps | awk '{print \$NF}' | xargs -i docker inspect {} | grep -E '\"Name\": \"/|\"CpusetCpus\": \"\",' | grep '\"CpusetCpus\": \"\",' -B 1" | grep -v 'Error: No such object: NAMES')
3646 if [[ $UNRESTRAINED_CONTAINERS ]]
3647 then
3648 echo -e "${RED}one or more containers are using all the compute cpus while it should only use the isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3649 else
3650 echo -e "${GREEN}couldn't find containers that using all the compute cpus${NC}"
3651 fi
3652 elapsed_time_seconds=$(expr $(date +%s) - $start)
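 ### NOTE: illustrative sketch only, not called by this script. "docker inspect" can print the container
 ### name and its CpusetCpus pinning directly with a Go template, which makes the offending containers
 ### easy to list per compute:
 # ansible compute -b -m shell -a "docker ps -q | xargs -r docker inspect --format '{{.Name}} {{.HostConfig.CpusetCpus}}' | awk '\$2 == \"\"'"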
3653
3654
3655 ####################################################################################################
3656
3657
3658 start=$(date +%s)
3659 STEPS_COUNTER=$((STEPS_COUNTER+1))
3660 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE AMOUNT OF HOSTS PRESENTED BY ANSIBLE EQUALS TO THE AMOUNT OF HOSTS PRESENTED BY NOVA (+$elapsed_time_seconds `date '+%T'`)${NC}"
3661 if [[ $ansible_all_hosts_count != $nova_overcloud_and_undercloud_hosts_count ]]
3662 then
3663 echo -e "${RED}openstack server list overcloud + undercloud vm hosts count is ($nova_overcloud_and_undercloud_hosts_count) while there are ($ansible_all_hosts_count) hosts under /etc/ansible/hosts${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3664 else
3665 echo -e "${GREEN}/etc/ansible/hosts and openstack server list has the same hosts count${NC}"
3666 fi
3667 elapsed_time_seconds=$(expr $(date +%s) - $start)
3668
3669
3670 ####################################################################################################
3671
3672
3673 if [[ $nuage == "true" ]]
3674 then
3675 start=$(date +%s)
3676 STEPS_COUNTER=$((STEPS_COUNTER+1))
3677 echo -e "${BLUE}\n\n$STEPS_COUNTER) MONITOR THE NUAGE SERVICE ON THE VSD $vsd_ip (+$elapsed_time_seconds `date '+%T'`)${NC}"
3678 monit_summary=$(sshpass -p 'Alcateldc' ssh root@$vsd_ip monit summary | grep -E -v 'Running|Accessible|Status ok')
3679 if [[ $monit_summary ]]
3680 then
3681 echo -e "${RED}$monit_summary${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3682 else
3683 echo -e "${GREEN}monit summary returned Running, Accessible and Status ok as expected${NC}"
3684 fi
3685 fi
3686 elapsed_time_seconds=$(expr $(date +%s) - $start)
3687
3688
3689
3690 ####################################################################################################
3691
3692
3693 if [[ $nuage == "true" ]]
3694 then
3695 start=$(date +%s)
3696 STEPS_COUNTER=$((STEPS_COUNTER+1))
3697 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE EJABBERED LICENSE ON THE VSD $vsd_ip (+$elapsed_time_seconds `date '+%T'`)${NC}"
3698 ejabbered_license=$(sshpass -p 'Alcateldc' ssh root@$vsd_ip /opt/ejabberd/bin/ejabberdctl license_info | grep expired)
3699 if [[ $ejabbered_license ]]
3700 then
3701 ejabbered_license=$(sshpass -p 'Alcateldc' ssh root@$vsd_ip /opt/ejabberd/bin/ejabberdctl license_info)
3702 echo -e "${RED}$ejabbered_license${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3703 else
3704 echo -e "${GREEN}the ejabbered license is still valid${NC}"
3705 fi
3706 elapsed_time_seconds=$(expr $(date +%s) - $start)
3707 fi
3708
3709
3710 ####################################################################################################
3711
3712
3713 start=$(date +%s)
3714 STEPS_COUNTER=$((STEPS_COUNTER+1))
3715 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE OVERCLOUD STACK STATUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3716 stack_status=$(source ~/stackrc && openstack stack list -c 'Stack Status' | grep COMPLETE | awk '{print $2}')
3717 if [[ $stack_status ]]
3718 then
3719 echo -e "${GREEN}overcloud stack status is $stack_status${NC}"
3720 else
3721 stack_status=$(source ~/stackrc && openstack stack list)
3722 echo -e "${RED}$stack_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3723 fi
3724 elapsed_time_seconds=$(expr $(date +%s) - $start)
3725
3726
3727 ####################################################################################################
3728
3729
3730 start=$(date +%s)
3731 STEPS_COUNTER=$((STEPS_COUNTER+1))
3732 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE SYSTEM IS CONFIGURED WITH THE EXPECTED NUMBER OF FAST OSDS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3733 if [[ $hci == "false" && $fast_pools == "true" ]]
3734 then
3735 fast_disks=$(cat user_config.yaml | grep fast_pool_device: -A6 | grep -c /dev/)
3736 expected_fast_osds=$(expr $ansible_storage_hosts_count \* $fast_disks)
3737 current_fast_osds=$(ansible $last_index_controller -b -m shell -a "ceph osd tree | sed -n -e '/root fast/,/root common/ p' | grep -c osd\." | grep ^[0-9])
3738 if [[ $expected_fast_osds == $current_fast_osds ]]
3739 then
3740 echo -e "${GREEN}found $expected_fast_osds fast osds as expected${NC}"
3741 else
3742 echo -e "${RED}expected $expected_fast_osds fast osds but received $current_fast_osds osds - execute \"ceph osd tree\" in one of the storage-nodes to check for inconsistencies" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3743 echo -e "\n\n${ORANGE}CBIS-16402 (19A) - OSDs are not configured for sdaa+ disks{NC}"
3744 fi
3745 elif [[ $ceph_backend == "true" && $hci == "false" && $fast_pools == "false" ]]
3746 then
3747 echo -e "${ORANGE}the setup is configured with multi-pools${NC}"
3748 elif [[ $ceph_backend == "true" && $hci == "true" ]]
3749 then
3750 echo -e "${ORANGE}the setup is configured with hci${NC}"
3751 elif [[ $ceph_backend == "false" ]]
3752 then
3753 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3754 fi
3755 elapsed_time_seconds=$(expr $(date +%s) - $start)
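 ### For reference, the expected count above is simply <number of storage nodes> * <fast_pool_device
 ### entries per node from user_config.yaml>, e.g. 3 storage nodes with 4 fast_pool devices each -> 12 fast osds.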
3756
3757
3758 ####################################################################################################
3759
3760
3761 start=$(date +%s)
3762 STEPS_COUNTER=$((STEPS_COUNTER+1))
3763 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THERE IS NO MIX OF BLOCK STORAGE TYPES (SSD, HDD and ETC..) BETWEEN THE CEPH FAST/COMMON POOLS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3764 if [[ $ceph_backend == "true" && $fast_pools == "true" ]]
3765 then
3766 echo -e "${CYAN}fast-pools osds check${NC}"
3767 osds=$(ansible $last_index_controller -b -m shell -a "ceph osd tree -f json | jq .nodes[].name" | awk '/fast/,/common/' | grep osd\. | tr -d '"' | paste -sd "|")
3768 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree | grep -E -w '$osds' | awk '{print \$2}' | sort --uniq | wc -l" | grep ^[0-9])
3769 if [[ $osds_class != "1" ]]
3770 then
3771 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/fast/,/common/')
3772 echo -e "${RED}$osds_class${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3773 else
3774 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/fast/,/common/' | grep osd\. | awk '{print $2}' | sort --uniq)
3775 echo -e "${GREEN}all fast osds using the same block storage device ($osds_class)${NC}"
3776 fi
3777 echo -e "${CYAN}common-pools osds check${NC}"
3778 osds=$(ansible $last_index_controller -b -m shell -a "ceph osd tree -f json | jq .nodes[].name" | awk '/common/,0' | grep osd\. | tr -d '"' | paste -sd "|")
3779 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree | grep -E -w '$osds' | awk '{print \$2}' | sort --uniq | wc -l" | grep ^[0-9])
3780 if [[ $osds_class != "1" ]]
3781 then
3782 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/common/,0')
3783 echo -e "${RED}$osds_class${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3784 else
3785 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/common/,0' | grep osd\. | awk '{print $2}' | sort --uniq)
3786 echo -e "${GREEN}all common osds using the same block storage device ($osds_class)${NC}"
3787 fi
3788 echo -e "${CYAN}common-pools osds check${NC}"
3789 osds=$(ansible $last_index_controller -b -m shell -a "ceph osd tree -f json | jq .nodes[].name" | awk '/common/,0' | grep osd\. | tr -d '"' | paste -sd "|")
3790 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree | grep -E -w '$osds' | awk '{print \$2}' | sort --uniq | wc -l" | grep ^[0-9])
3791 if [[ $osds_class != "1" ]]
3792 then
3793 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/common/,0')
3794 echo -e "${RED}$osds_class${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3795 else
3796 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/common/,0' | grep osd\. | awk '{print $2}' | sort --uniq)
3797 echo -e "${GREEN}all common osds using the same block storage device ($osds_class)${NC}"
3798 fi
3799 elif [[ $ceph_backend == "true" && $hci == "false" && $fast_pools == "false" ]]
3800 then
3801 echo -e "${ORANGE}the setup is configured with multi-pools${NC}"
3802 elif [[ $ceph_backend == "true" && $hci == "true" ]]
3803 then
3804 echo -e "${ORANGE}the setup is configured with hci${NC}"
3805 elif [[ $ceph_backend == "false" ]]
3806 then
3807 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3808 fi
3809 elapsed_time_seconds=$(expr $(date +%s) - $start)
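 ### NOTE: illustrative sketch only, not called by this script. With a recent ceph release the device
 ### class of every osd is also available as json, so the class spread can be summarised in one go:
 # ansible $last_index_controller -b -m shell -a "ceph osd tree -f json | jq -r '.nodes[] | select(.type == \"osd\") | .device_class' | sort | uniq -c"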
3810
3811
3812 ####################################################################################################
3813
3814
3815 start=$(date +%s)
3816 STEPS_COUNTER=$((STEPS_COUNTER+1))
3817 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE FAST POOL OSD'S BLOCK STORAGE TYPE ARE SSD/NVME (+$elapsed_time_seconds `date '+%T'`)${NC}"
3818 if [[ $ceph_backend == "true" && $fast_pools == "true" ]]
3819 then
3820 fast_osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/fast/,/common/' | grep osd\. | awk '{print $2}' | grep -v ssd | grep -v nvme | sort --uniq)
3821 if [[ -z $fast_osds_class ]]
3822 then
3823 echo -e "${GREEN}all fast osds using the same block storage device ($osds_class)${NC}"
3824 else
3825 echo -e "${RED}one or more osds under the fast-pool are from $fast_osds_class type while expecting the fast-pool osds to be from type nvme/ssd${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3826 fi
3827 elif [[ $ceph_backend == "true" && $hci == "false" && $fast_pools == "false" ]]
3828 then
3829 echo -e "${ORANGE}the setup is configured with multi-pools${NC}"
3830 elif [[ $ceph_backend == "true" && $hci == "true" ]]
3831 then
3832 echo -e "${ORANGE}the setup is configured with hci${NC}"
3833 elif [[ $ceph_backend == "false" ]]
3834 then
3835 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3836 fi
3837 elapsed_time_seconds=$(expr $(date +%s) - $start)
3838
3839
3840 ####################################################################################################
3841
3842
3843 start=$(date +%s)
3844 STEPS_COUNTER=$((STEPS_COUNTER+1))
3845 echo -e "${BLUE}\n\n$STEPS_COUNTER) CEPH OSDS IN/UP STATUS CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
3846 if [[ $cbis_version != "18.0.0.1" ]]
3847 then
3848 if [[ $ceph_backend == "true" ]]
3849 then
3850 osds_total=$(ansible $last_index_controller -b -m shell -a "ceph -s" | grep osd: | awk '{print $2}')
3851 osds_up=$(ansible $last_index_controller -b -m shell -a "ceph -s" | grep osd: | awk '{print $4}')
3852 osds_in=$(ansible $last_index_controller -b -m shell -a "ceph -s" | grep osd: | awk '{print $6}')
3853 if [[ $osds_total == $osds_up && $osds_total == $osds_in ]]
3854 then
3855 echo -e "${GREEN}all $osds_total osds are in and up${NC}"
3856 else
3857 ceph_status=$(ansible $last_index_controller -b -m shell -a "ceph -s")
3858 echo -e "${RED}found osds inconsistencies:${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3859 echo -e "${RED}total osds: $osds_total | osds up: $osds_up | osds in: $osds_in${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3860
3861 echo -e "\n${RED}$ceph_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3862 fi
3863 else
3864 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3865 fi
3866 elif [[ $cbis_version == "18.0.0.1" ]]
3867 then
3868 if [[ $ceph_backend == "true" ]]
3869 then
3870 osds_total=$(ansible $last_index_controller -b -m shell -a "ceph -s | grep osdmap" | grep -v $last_index_controller | awk '{print $3}')
3871 osds_up=$(ansible $last_index_controller -b -m shell -a "ceph -s | grep osdmap" | grep -v $last_index_controller | awk '{print $5}')
3872 osds_in=$(ansible $last_index_controller -b -m shell -a "ceph -s | grep osdmap" | grep -v $last_index_controller | awk '{print $7}')
3873 if [[ $osds_total == $osds_up && $osds_total == $osds_in ]]
3874 then
3875 echo -e "${GREEN}all $osds_total osds are in and up${NC}"
3876 else
3877 echo -e "${RED}found osds inconsistencies${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3878 echo -e "${RED}$osds_total total osds, $osds_up up osds, $osds_in in osds${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3879 ceph_status=$(ansible $last_index_controller -b -m shell -a "ceph -s")
3880 echo -e "${RED}$ceph_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3881 fi
3882 else
3883 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3884 fi
3885 fi
3886 elapsed_time_seconds=$(expr $(date +%s) - $start)
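 ### NOTE: illustrative sketch only, not called by this script. Newer ceph releases can also report the
 ### osd totals as json (field names vary slightly between releases), which avoids parsing "ceph -s":
 # ansible $last_index_controller -b -m shell -a "ceph osd stat -f json" | grep num_osds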
3887
3888
3889 ####################################################################################################
3890
3891
3892 start=$(date +%s)
3893 STEPS_COUNTER=$((STEPS_COUNTER+1))
3894 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK EACH HCI COMPUTE/STORAGE NODE HAS THE SAME NUMBER OF OSDS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3895 if [[ $hci == "true" ]]
3896 then
3897 ceph_osds_count_per_server=$(ansible compute -b -m shell -a "docker ps | grep ceph-osd | wc -l" | grep ^[0-9] | sort -u | wc -l)
3898 if [[ $ceph_osds_count_per_server == "1" ]]
3899 then
3900 echo -e "${GREEN}all the servers has the same number of osds docker containers${NC}"
3901 else
3902 ceph_osds_count_per_server=$(ansible compute -b -m shell -a "docker ps | grep ceph-osd | wc -l")
3903 echo -e "${RED}$ceph_osds_count_per_server${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3904 fi
3905 elif [[ $hci == "false" && $ceph_backend == "true" ]]
3906 then
3907 ceph_osds_count_per_server=$(ansible cephstorage -b -m shell -a "docker ps | grep ceph-osd | wc -l" | grep ^[0-9] | sort -u | wc -l)
3908 if [[ $ceph_osds_count_per_server == "1" ]]
3909 then
3910 echo -e "${GREEN}all the servers has the same number of osds docker containers${NC}"
3911 else
3912 ceph_osds_count_per_server=$(ansible cephstorage -b -m shell -a "docker ps | grep ceph-osd | wc -l")
3913 echo -e "${RED}$ceph_osds_count_per_server${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3914 fi
3915 elif [[ $ceph_backend == "false" ]]
3916 then
3917 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3918 fi
3919 elapsed_time_seconds=$(expr $(date +%s) - $start)
3920
3921
3922 ####################################################################################################
3923
3924
3925 start=$(date +%s)
3926 STEPS_COUNTER=$((STEPS_COUNTER+1))
3927 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CEPH SLOW REQUESTS IN /var/log/messages /var/log/ceph/(+$elapsed_time_seconds `date '+%T'`)"
3928 if [[ $ceph_backend == "true" ]]
3929 then
3930 ceph_slow_requests=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "grep -i -R -E 'REQUEST_SLOW|slow requests' /var/log/messages /var/log/ceph/ | grep -v ansible-command" | grep -i -E 'REQUEST_SLOW|slow requests' -B 1)
3931 if [[ $ceph_slow_requests ]]
3932 then
3933 echo -e "${RED}$ceph_slow_requests${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3934 else
3935
3936 echo -e "${GREEN}no ceph slow requests in /var/log/messages and /var/log/ceph/ are found${NC}"
3937 fi
3938 else
3939 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3940 fi
3941 elapsed_time_seconds=$(expr $(date +%s) - $start)
3942
3943
3944 ####################################################################################################
3945
3946
3947 start=$(date +%s)
3948 STEPS_COUNTER=$((STEPS_COUNTER+1))
3949 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT CEPH FSID IS IDENTICAL BETWEEN ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)"
3950 if [[ $ceph_backend == "true" ]]
3951 then
3952 oc_fsid=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ceph fsid" | grep ^[0-9,a-f] | sort -u)
3953 uc_fsid=$(grep CephClusterFSID templates/storage-environment.yaml | awk '{print $2}' | tr -d \')
3954 if [[ $oc_fsid == $uc_fsid ]]
3955 then
3956 echo -e "${GREEN}all the hosts has the same ceph fsid ($uc_fsid)${NC}"
3957 else
3958 echo -e "${RED}ceph fsid presented in templates/storage-environment.yaml (undercloud vm):\n($uc_fsid)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3959 echo -e "\n${RED}ceph fsid result of the ceph fsid command (overcloud):\n($oc_fsid)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3960 echo -e "\n\n${ORANGE}CBIS-15830 (19A) - Ceph returns failed to bind the UNIX domain socket warning ${NC}"
3961 fi
3962 else
3963 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3964 fi
3965 elapsed_time_seconds=$(expr $(date +%s) - $start)
3966
3967
3968 ####################################################################################################
3969
3970
3971 start=$(date +%s)
3972 STEPS_COUNTER=$((STEPS_COUNTER+1))
3973 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE NOVA LOCAL STORAGE PARTITION IS CONFIGURED (df) (+$elapsed_time_seconds `date '+%T'`)"
3974 if [[ $ovs_local_storage == "true" || $avrs_local_storage == "true" || $sriov_local_storage == "true" || $dpdk_local_storage == "true" ]]
3975 then
3976 if [[ $ovs_local_storage == "true" ]]
3977 then
3978 echo -e "${CYAN}checking OvsCompute partitions${NC}"
3979 nova_local_storage=$(ansible *overcloud-[oO]vs* -b -m shell -a "df | grep /var/lib/nova/instances")
3980 if [[ $nova_local_storage ]]
3981 then
3982 echo -e "${GREEN}$nova_local_storage${NC}"
3983 else
3984 echo -e "${GREEN}while the compute host-group is configured with local storage, the partition /var/lib/nova/instances is missing${NC}"
3985 fi
3986 fi
3987 if [[ $avrs_local_storage == "true" ]]
3988 then
3989 echo -e "${CYAN}checking AvrsCompute partitions${NC}"
3990 nova_local_storage=$(ansible *overcloud-[aA]vrs* -b -m shell -a "df | grep /var/lib/nova/instances")
3991 if [[ $nova_local_storage ]]
3992 then
3993 echo -e "${GREEN}$nova_local_storage${NC}"
3994 else
3995 echo -e "${GREEN}while the compute host-group is configured with local storage, the partition /var/lib/nova/instances is missing${NC}"
3996 fi
3997 fi
3998 if [[ $sriov_local_storage == "true" ]]
3999 then
4000 echo -e "${CYAN}checking SriovPerformanceCompute partitions${NC}"
4001 nova_local_storage=$(ansible *overcloud-[sS]riov* -b -m shell -a "df | grep /var/lib/nova/instances")
4002 if [[ $nova_local_storage ]]
4003 then
4004 echo -e "${GREEN}$nova_local_storage${NC}"
4005 else
4006 echo -e "${GREEN}while the compute host-group is configured with local storage, the partition /var/lib/nova/instances is missing${NC}"
4007 fi
4008 fi
4009 if [[ $dpdk_local_storage == "true" ]]
4010 then
4011 echo -e "${CYAN}checking SriovPerformanceCompute partitions${NC}"
4012 nova_local_storage=$(ansible *overcloud-[sS]riov* -b -m shell -a "df | grep /var/lib/nova/instances")
4013 if [[ $nova_local_storage ]]
4014 then
4015 echo -e "${GREEN}$nova_local_storage${NC}"
4016 else
4017 echo -e "${GREEN}while the compute host-group is configured with local storage, the partition /var/lib/nova/instances is missing${NC}"
4018 fi
4019 fi
4020 else
4021 echo -e "${ORANGE}no host with nova local storage enabled is found${NC}"
4022 fi
4023 elapsed_time_seconds=$(expr $(date +%s) - $start)
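 ### NOTE: illustrative sketch only, not called by this script. The four blocks above repeat the same df
 ### probe; a small loop over "<flag>:<ansible host pattern>" pairs would cover all host-groups at once:
 # for pair in "$ovs_local_storage:*overcloud-[oO]vs*" "$avrs_local_storage:*overcloud-[aA]vrs*" \
 #             "$sriov_local_storage:*overcloud-[sS]riov*" "$dpdk_local_storage:*overcloud-[dD]pdk*"
 # do
 #  [[ ${pair%%:*} == "true" ]] || continue
 #  ansible "${pair#*:}" -b -m shell -a "df | grep /var/lib/nova/instances"
 # done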
4024
4025
4026 ####################################################################################################
4027
4028
4029 start=$(date +%s)
4030 STEPS_COUNTER=$((STEPS_COUNTER+1))
4031 EXCEPTION=0
4032 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE INSTANCES LEFTOVERS IN /var/lib/nova/instances/ OF THE COMPUTES (+$elapsed_time_seconds `date '+%T'`)${NC}"
4033 for host in $ansible_computes_hosts
4034 do
4035 instances_id_dir=$(ansible $host -b -m shell -a "ls /var/lib/nova/instances/ | awk '{print \$NF}'" | grep '^[0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]-')
4036 for instance_id_dir in $instances_id_dir
4037 do
4038 check_if_id_in_nova=$(echo -e "$nova_instances" | grep $instance_id_dir)
4039 if [[ -z $check_if_id_in_nova ]]
4040 then
4041 echo -e "${RED}/var/lib/nova/instances/$instance_id_dir is not found in openstack server list${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4042 EXCEPTION=$((EXCEPTION+1))
4043 else
4044 echo -e "${GREEN}/var/lib/nova/instances/$instance_id_dir is found in openstack server list${NC}"
4045 fi
4046 done
4047 done
4048 if [ $EXCEPTION -gt 0 ]
4049 then
4050 echo -e "\n\n${ORANGE}CBIS-16393 (19A) - /var/lib/nova/instances/<instance> is not deleted after the instance was migrated${NC}"
4051 fi
4052 elapsed_time_seconds=$(expr $(date +%s) - $start)
4053
4054
4055 ####################################################################################################
4056
4057 start=$(date +%s)
4058 STEPS_COUNTER=$((STEPS_COUNTER+1))
4059 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT EACH HOST-GROUP HAS THE SAME PARTITIONS (df) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4060 echo -e "${CYAN}check the controllers${NC}"
4061 partitions=$(ansible controller -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4062 if [[ $partitions == "1" ]]
4063 then
4064 echo -e "${GREEN}all the controllers has the same partitions${NC}"
4065 else
4066 partitions=$(ansible controller -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|md[1][2][6-7]'")
4067 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4068 echo -e "\n\n${ORANGE}CBIS-16369 (19A) - mount_cephfs_share is not configured on replaced controllers${NC}"
4069 fi
4070 if [[ $ansible_sriov_hosts_count != "0" ]]
4071 then
4072 echo -e "${CYAN}check sriov computes${NC}"
4073 partitions=$(ansible *overcloud-[sS]riov* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|/var/lib/nova/instances|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4074 if [[ $partitions == "1" ]]
4075 then
4076 echo -e "${GREEN}all the sriov computes has the same partitions${NC}"
4077 else
4078 partitions=$(ansible *overcloud-[sS]riov* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4079 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4080 fi
4081 fi
4082 if [[ $ansible_ovs_hosts_count != "0" ]]
4083 then
4084 echo -e "${CYAN}check ovs computes${NC}"
4085 partitions=$(ansible *overcloud-[oO]vs* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|/var/lib/nova/instances|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4086 if [[ $partitions == "1" ]]
4087 then
4088 echo -e "${GREEN}all the ovs computes has the same partitions${NC}"
4089 else
4090 partitions=$(ansible *overcloud-[oO]vs* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4091 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4092 fi
4093 fi
4094 if [[ $ansible_dpdk_hosts_count != "0" ]]
4095 then
4096 echo -e "${CYAN}check dpdk computes${NC}"
4097 partitions=$(ansible *overcloud-[dD]pdk* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|/var/lib/nova/instances|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4098 if [[ $partitions == "1" ]]
4099 then
4100 echo -e "${GREEN}all the dpdk computes has the same partitions${NC}"
4101 else
4102 partitions=$(ansible *overcloud-[dD]pdk* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4103 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4104 fi
4105 fi
4106 if [[ $ansible_avrs_hosts_count != "0" ]]
4107 then
4108 echo -e "${CYAN}check avrs computes${NC}"
4109 partitions=$(ansible *overcloud-[aA]vrs* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|/var/lib/nova/instances|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4110 if [[ $partitions == "1" ]]
4111 then
4112 echo -e "${GREEN}all the avrs computes has the same partitions${NC}"
4113 else
4114 partitions=$(ansible *overcloud-[aA]vrs* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4115 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4116 fi
4117 fi
4118 if [[ $ansible_storage_hosts_count != "0" ]]
4119 then
4120 echo -e "${CYAN}check the storage nodes${NC}"
4121 partitions=$(ansible *overcloud-[Ss]torage* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4122 if [[ $partitions == "1" ]]
4123 then
4124 echo -e "${GREEN}all the storage nodes has the same partitions${NC}"
4125 else
4126 partitions=$(ansible *overcloud-[Ss]torage* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4127 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4128 fi
4129 fi
4130 if [[ $ansible_monitoring_hosts_count != "0" ]]
4131 then
4132 echo -e "${CYAN}check the monitoring nodes${NC}"
4133 partitions=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4134 if [[ $partitions == "1" ]]
4135 then
4136 echo -e "${GREEN}all the monitoring nodes has the same partitions${NC}"
4137 else
4138 partitions=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|md[1][2][6-7]'")
4139 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4140 fi
4141 fi
4142 elapsed_time_seconds=$(expr $(date +%s) - $start)
4143
4144
4145 ####################################################################################################
4146
4147
4148 start=$(date +%s)
4149 STEPS_COUNTER=$((STEPS_COUNTER+1))
4150 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT EACH HOST-GROUP HAS THE SAME AMOUNT OF PHYSICAL DISKS (lsblk) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4151 echo -e "${CYAN}check the controllers${NC}"
4152 disks_count=$(ansible controller -b -m shell -a "lsblk | grep disk | wc -l" | grep ^[0-9] | sort -u | wc -l)
4153 if [[ $disks_count == "1" ]]
4154 then
4155 echo -e "${GREEN}same number of disks for all the controllers${NC}"
4156 else
4157 disks=$(ansible controller -b -m shell -a "lsblk | grep disk" | grep ^s | awk '{print $1}' | sort | uniq -c | column -t | sort -k1 | grep -v ^$ansible_storage_hosts_count | awk '{print $2}' | tr -d \n | paste -sd ' ' | tr -s ' ' '|')
4158 missing_disks=$(ansible controller -b -m shell -a "lsblk -d | grep -E '$disks'")
4159 echo -e "${RED}$missing_disks${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4160 fi
4161 echo -e "${CYAN}check the computes${NC}"
4162 disks_count=$(ansible compute -b -m shell -a "lsblk | grep disk | wc -l" | grep ^[0-9] | sort -u | wc -l)
4163 if [[ $disks_count == "1" ]]
4164 then
4165 echo -e "${GREEN}same number of disks for all the computes${NC}"
4166 else
4167 disks=$(ansible compute -b -m shell -a "lsblk | grep disk" | grep ^s | awk '{print $1}' | sort | uniq -c | column -t | sort -k1 | grep -v ^$ansible_storage_hosts_count | awk '{print $2}' | tr -d \n | paste -sd ' ' | tr -s ' ' '|')
4168 missing_disks=$(ansible compute -b -m shell -a "lsblk -d | grep -E '$disks'")
4169 echo -e "${RED}$missing_disks${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4170 fi
4171 if [[ $ansible_storage_hosts_count != "0" ]]
4172 then
4173 echo -e "${CYAN}check the storage nodes${NC}"
4174 disks_count=$(ansible *overcloud-[Ss]torage* -b -m shell -a "lsblk | grep disk | wc -l" | grep ^[0-9] | sort -u | wc -l)
4175 if [[ $disks_count == "1" ]]
4176 then
4177 echo -e "${GREEN}same number of disks for all the storage nodes${NC}"
4178 else
4179 disks=$(ansible *overcloud-[Ss]torage* -b -m shell -a "lsblk | grep disk" | grep ^s | awk '{print $1}' | sort | uniq -c | column -t | sort -k1 | grep -v ^$ansible_storage_hosts_count | awk '{print $2}' | tr -d \n | paste -sd ' ' | tr -s ' ' '|')
4180 missing_disks=$(ansible *overcloud-[Ss]torage* -b -m shell -a "lsblk -d | grep -E '$disks'")
4181 echo -e "${RED}$missing_disks${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4182 fi
4183 fi
4184 if [[ $ansible_monitoring_hosts_count != "0" ]]
4185 then
4186 echo -e "${CYAN}check the monitoring nodes${NC}"
4187 disks_count=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "lsblk | grep disk | wc -l" | grep ^[0-9] | sort -u | wc -l)
4188 if [[ $disks_count == "1" ]]
4189 then
4190 echo -e "${GREEN}same number of disks for all the monitoring nodes${NC}"
4191 else
4192 disks=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "lsblk | grep disk" | grep ^s | awk '{print $1}' | sort | uniq -c | column -t | sort -k1 | grep -v ^$ansible_storage_hosts_count | awk '{print $2}' | tr -d \n | paste -sd ' ' | tr -s ' ' '|')
4193 missing_disks=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "lsblk -d | grep -E '$disks'")
4194 echo -e "${RED}$missing_disks${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4195 fi
4196 fi
4197 elapsed_time_seconds=$(expr $(date +%s) - $start)
4198
4199
4200 ####################################################################################################
4201
4202
4203 start=$(date +%s)
4204 STEPS_COUNTER=$((STEPS_COUNTER+1))
4205 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE DISKS OF THE CONTROLLERS AND COMPUTES ARE ARRANGED/ORDERED THE SAME WAY (+$elapsed_time_seconds `date '+%T'`)${NC}"
4206 disk_order=$(ansible controller,compute -b -m shell -a "lsblk -dn | awk '{print \$1}'| md5sum" | grep ^[0-9a-f] | awk '{print $1}' | uniq | wc -l)
4207 if [[ $disk_order != "1" ]]
4208 then
4209 disk_order=$(ansible controller,compute -b -m shell -a "lsblk -dn")
4210 echo -e "${RED}$disk_order${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4211 else
4212 echo -e "${GREEN}all the disks are ordered the same way${NC}"
4213 fi
4214 elapsed_time_seconds=$(expr $(date +%s) - $start)
4215
4216
4217 ####################################################################################################
4218
4219
4220 start=$(date +%s)
4221 STEPS_COUNTER=$((STEPS_COUNTER+1))
4222 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE MEGARAID CONFIGURATION BETWEEN THE STORAGE NODES (+$elapsed_time_seconds `date '+%T'`)${NC}"
4223 if [[ $ansible_storage_hosts_count != "0" ]]
4224 then
4225 raid_mismatch=$(ansible CephStorage -b -m shell -a "/opt/MegaRAID/storcli/storcli64 /c0/vall show" | grep RAID | awk '{print $1,$2}' | sort | uniq -c | column -t | grep -v ^$ansible_storage_hosts_count)
4226 if [[ $raid_mismatch ]]
4227 then
4228 raid_mismatch=$(ansible CephStorage -b -m shell -a "/opt/MegaRAID/storcli/storcli64 /c0/vall show | grep RAID")
4229 echo -e "${RED}$raid_mismatch${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4230 else
4231 echo -e "${GREEN}the storage nodes megaraid configuration is aligned${NC}"
4232 fi
4233 else
4234 echo -e "${ORANGE}no storage nodes found${NC}"
4235 fi
4236 elapsed_time_seconds=$(expr $(date +%s) - $start)
4237
4238
4239 ####################################################################################################
4240
4241
4242 start=$(date +%s)
4243 STEPS_COUNTER=$((STEPS_COUNTER+1))
4244 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE SOFTWARE RAID STATUS OF ANY SERVER THAT IS CONFIGURED WITH SOFTWARE RAID (+$elapsed_time_seconds `date '+%T'`)${NC}"
4245 swraid1_servers=$(ansible all --limit '!localhost,!hypervisor' -m shell -b -a "test -f /var/log/cbis/raid_setup_inital_setup.log; echo \$?" | grep ^0 -B 1 | grep ^overcloud- | awk '{print $1}' | paste -sd',')
4246 if [[ $swraid1_servers ]]
4247 then
4248 servers=$(echo -e "$swraid1_servers" | tr -s , '\n')
4249 echo -e "${CYAN}servers configured with software raid 1:\n$servers${NC}\n\n"
4250 swraid1_failed_servers=$(ansible $swraid1_servers -m shell -b -a "cat /proc/mdstat | grep -c '\[UU\]'" | grep '^[0-1]\|^[3-9]' -B 1 | grep ^overcloud- | awk '{print $1}' | paste -sd',')
4251 if [[ $swraid1_failed_servers ]]
4252 then
4253 swraid1_failure_output=$(ansible $swraid1_failed_servers -m shell -b -a "cat /proc/mdstat")
4254 echo -e "${RED}$swraid1_failure_output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4255 else
4256 echo -e "${GREEN}the software raid status is as expected on all the servres that are configured with software raid${NC}"
4257 fi
4258 else
4259 echo -e "${ORANGE}no server is configured with software raid - please investigate if that shouldn't be the case${NC}"
4260 fi
4261 elapsed_time_seconds=$(expr $(date +%s) - $start)
4262
4263
4264 ####################################################################################################
4265
4266
4267 start=$(date +%s)
4268 STEPS_COUNTER=$((STEPS_COUNTER+1))
4269 EXCEPTION=0
4270 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT EACH HOSTS HAS THE EXPECTED HYPERVISOR ISOLATED CPUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
4271 if [[ $ansible_avrs_hosts_count != "0" && $nuage == "true" ]]
4272 then
4273 echo -e "${CYAN}now checking AvrsCompute${NC}"
4274 physcpubind=$(ansible *overcloud-[aA]vrs* -b -m shell -a "numactl --show" | grep physcpubind | tr -s ' ' '\n' | grep ^[0-9] | sort --uniq | wc -l)
4275 if [[ $physcpubind == $avrs_hypervisor_dedicated_cpus ]]
4276 then
4277 echo -e "${GREEN}found the expected number of isolated cpus${NC}"
4278 else
4279 echo -e "${RED}according to the user_config.yaml $avrs_hypervisor_dedicated_cpus isolated cpus are expected while numactl --show returned $physcpubind isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4280 EXCEPTION=$((EXCEPTION+1))
4281 fi
4282 fi
4283 if [[ $ansible_ovs_hosts_count != "0" ]]
4284 then
4285 echo -e "${CYAN}now checking OvsCompute${NC}"
4286 physcpubind=$(ansible *overcloud-[oO]vs* -b -m shell -a "numactl --show" | grep physcpubind | tr -s ' ' '\n' | grep ^[0-9] | sort --uniq | wc -l)
4287 if [[ $physcpubind == $ovs_hypervisor_dedicated_cpus ]]
4288 then
4289 echo -e "${GREEN}found the expected number of isolated cpus${NC}"
4290 else
4291 echo -e "${RED}according to the user_config.yaml $ovs_hypervisor_dedicated_cpus isolated cpus are expected while numactl --show returned $physcpubind isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4292 EXCEPTION=$((EXCEPTION+1))
4293 fi
4294 fi
4295 if [[ $ansible_sriov_hosts_count != "0" ]]
4296 then
4297 echo -e "${CYAN}now checking SriovPerformanceCompute${NC}"
4298 physcpubind=$(ansible *overcloud-[sS]riov* -b -m shell -a "numactl --show" | grep physcpubind | tr -s ' ' '\n' | grep ^[0-9] | sort --uniq | wc -l)
4299 if [[ $physcpubind == $sriov_hypervisor_dedicated_cpus ]]
4300 then
4301 echo -e "${GREEN}found the expected number of isolated cpus${NC}"
4302 else
4303 echo -e "${RED}according to the user_config.yaml $sriov_hypervisor_dedicated_cpus isolated cpus are expected while numactl --show returned $physcpubind isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4304 EXCEPTION=$((EXCEPTION+1))
4305 fi
4306 fi
4307 if [[ $ansible_dpdk_hosts_count != "0" ]]
4308 then
4309 echo -e "${CYAN}now checking DpdkPerformanceCompute${NC}"
4310 physcpubind=$(ansible *overcloud-[dD]pdk* -b -m shell -a "numactl --show" | grep physcpubind | tr -s ' ' '\n' | grep ^[0-9] | sort --uniq | wc -l)
4311 if [[ $physcpubind == $dpdk_hypervisor_dedicated_cpus ]]
4312 then
4313 echo -e "${GREEN}found the expected number of isolated cpus${NC}"
4314 else
4315 echo -e "${RED}according to the user_config.yaml $dpdk_hypervisor_dedicated_cpus isolated cpus are expected while numactl --show returned $physcpubind isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4316 EXCEPTION=$((EXCEPTION+1))
4317 fi
4318 fi
4319 if [ $EXCEPTION -gt 0 ]
4320 then
4321 echo -e "\n\n${ORANGE}validate that the setup is not deployed with odd number in the hypervisor_dedicated_cpus parameter${NC}"
4322 fi
4323 elapsed_time_seconds=$(expr $(date +%s) - $start)
4324
4325
4326 ####################################################################################################
4327
4328
4329 if [[ $cbis_version != "18.0.0.1" ]]
4330 then
4331 start=$(date +%s)
4332 STEPS_COUNTER=$((STEPS_COUNTER+1))
4333 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE BETWEEN current_node_count AND intended_node_count IN /etc/ansible/hosts (+$elapsed_time_seconds `date '+%T'`)${NC}"
4334 current_node_count=$(cat /etc/ansible/hosts | grep current_node_count | awk -F= '{print $2}')
4335 intended_node_count=$(cat /etc/ansible/hosts | grep intended_node_count | awk -F= '{print $2}')
4336 if [[ $current_node_count == $intended_node_count ]]
4337 then
4338 echo -e "${GREEN}current_node_count ("$current_node_count") and intended_node_count ("$intended_node_count") are identical${NC}"
4339 else
4340 echo -e "${RED}current_node_count ("$current_node_count") and intended_node_count ("$intended_node_count") not identical${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4341 fi
4342 fi
4343 elapsed_time_seconds=$(expr $(date +%s) - $start)
4344
4345
4346 ####################################################################################################
4347
4348
4349 start=$(date +%s)
4350 STEPS_COUNTER=$((STEPS_COUNTER+1))
4351 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE BASIC HEALTH OF THE BMC (+$elapsed_time_seconds `date '+%T'`)${NC}"
4352 selftest=$(ansible all --limit '!localhost' -b -m shell -a "ipmitool mc selftest -v | grep -v -w 'Selftest: passed'" | grep Selftest -B 1)
4353 if [[ $selftest ]]
4354 then
4355 echo -e "${RED}$selftest${NC}"
4356 else
4357 echo -e "${GREEN}BMC selftest passed on all the servers${NC}"
4358 fi
4359 elapsed_time_seconds=$(expr $(date +%s) - $start)
4360
4361
4362 ####################################################################################################
4363
4364
4365 if [[ $cbis_version != "19.0.0.1" ]]
4366 then
4367 start=$(date +%s)
4368 STEPS_COUNTER=$((STEPS_COUNTER+1))
4369 retransmits_count=100
4370 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR HIGH PACKETS RETRANSMISIONS FROM RANDOM STORAGE/COMPUTE TO $storage_controller_address ON $last_index_controller (+$elapsed_time_seconds `date '+%T'`)${NC}"
4371 if [[ $cbis_version != "18.0.0.1" ]]
4372 then
4373 ansible $last_index_controller -b -m shell -a "killall iperf3" | grep -v SUCCESS | tr -d '\n' > /dev/null
4374 a=$(ansible $last_index_controller -b -m shell -a "iptables -S" | grep ACCEPT | awk -F'--dports' '{print $2}' | awk '{print $1}' | tr -s ',' '\n' | tr -s ':' ' ' | sort -n | uniq | grep -v ' ')
4375 b=$(ansible $last_index_controller -b -m shell -a "iptables -S" | grep ACCEPT | awk -F'--dports' '{print $2}' | awk '{print $1}' | tr -s ',' '\n' | tr -s ':' ' ' | sort -n | uniq | grep ' ')
4376 if [[ $b ]]
4377 then
4378 b=$(ansible $last_index_controller -b -m shell -a "iptables -S" | grep ACCEPT | awk -F'--dports' '{print $2}' | awk '{print $1}' | tr -s ',' '\n' | tr -s ':' ' ' | sort -n | uniq | grep ' ' | xargs -L 1 seq)
4379 fi
4380 c="${a} ${b}"
4381 printf "$c" | tr -s ' ' '\n' | sort -n | uniq | grep ^[0-9] > allowed_ports_on_'$last_index_controller'.txt
4382 e=$(ansible $last_index_controller -b -m shell -a "nmap -p 1-65535 127.0.0.1" | grep open | awk -F/ '{print $1}')
4383 f=$(ansible $last_index_controller -b -m shell -a "netstat -tuplen" | awk '{print $4}' | awk -F: '{print $2}' | sort -n | uniq | grep ^[0-9])
4384 g="${e} ${f}"
4385 printf "$g" | tr -s ' ' '\n' | sort -n | uniq | grep ^[0-9] > open_ports_on_'$last_index_controller'.txt
4386 random_unused_whitelist_port=$(diff allowed_ports_on_'$last_index_controller'.txt open_ports_on_'$last_index_controller'.txt | grep \< | awk '{print $2}' | shuf -n 1)
4387 ansible $last_index_controller -b -m shell -a "iperf3 -s -p $random_unused_whitelist_port -D" | grep -v SUCCESS | tr -d '\n'
4388 if [[ $hci == "false" && $ansible_storage_hosts ]]
4389 then
4390 retransmits=$(ansible $random_storage_hostname -b -m shell -a "iperf3 -c $storage_controller_address -p $random_unused_whitelist_port" | grep \/sec | grep -E -v 'sender|receiver' | tail -n+2 | awk -v limit="$retransmits_count" '{ if ( $9 > limit ) print $0 }')
4391 elif [[ $hci == "true" ]]
4392 then
4393 retransmits=$(ansible $random_compute_hostname -b -m shell -a "iperf3 -c $storage_controller_address -p $random_unused_whitelist_port" | grep \/sec | grep -E -v 'sender|receiver' | tail -n+2 | awk -v limit="$retransmits_count" '{ if ( $9 > limit ) print $0 }')
4394 fi
4395 if [[ $retransmits ]]
4396 then
4397 echo -e "${RED}$retransmits${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4398 else
4399 echo -e "${GREEN}the number of retransmits per connection is less then 100${NC}"
4400 fi
4401 ansible $last_index_controller -b -m shell -a "killall iperf3" | grep -v SUCCESS | tr -d '\n'
4402 else
4403 echo -e "${ORANGE}iperf3 is not installed on cbis 18.0.0.1${NC}"
4404 fi
4405 fi
4406 elapsed_time_seconds=$(expr $(date +%s) - $start)
4407
4408
4409 ####################################################################################################
4410
4411
4412 start=$(date +%s)
4413 STEPS_COUNTER=$((STEPS_COUNTER+1))
4414 echo -e "${BLUE}\n\n$STEPS_COUNTER) KIBANA LOCALHOST LOGIN CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
4415 if [[ $elk == "true" && $elk_deployment_type == "local" ]]
4416 then
4417 kibana=$(ansible controller -b -m shell -a "curl -g -s localhost:5601/api/status | jq '.status' warn=False" | grep 'state' | sort | uniq -c | grep ^[[:blank:]]*3 | awk '{print $1}')
4418 if [[ $kibana == "3" ]]
4419 then
4420 echo -e "${GREEN}kibana (localhost:5601/api/status) sucessfully replied from all the controllers${NC}"
4421 else
4422 kibana=$(ansible controller -b -m shell -a "curl -g localhost:5601/api/status warn=False")
4423 echo -e "${RED}$kibana${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4424 fi
4425 else
4426 echo -e "${ORANGE}CBIS is deployed without ELK or ELK type is remote${NC}"
4427 fi
4428 elapsed_time_seconds=$(expr $(date +%s) - $start)
4429
4430
4431 ####################################################################################################
4432
4433
4434 start=$(date +%s)
4435 STEPS_COUNTER=$((STEPS_COUNTER+1))
4436 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE BETWEEN THE HOSTS PRESENTED IN ZABBIX AND THE HOSTS PRESENTED BY OPENSTACK (+$elapsed_time_seconds `date '+%T'`)${NC}"
4437 if [[ $zabbix_hosts != $nova_overcloud_hosts_list ]]
4438 then
4439 echo -e "${RED}openstack overcloud hosts:\n$nova_overcloud_hosts_list\n\nzabbix overcloud hosts:\n$zabbix_hosts${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4440 else
4441 echo -e "${GREEN}the configured overcloud hosts in zabbix are identical to the nova overcloud hosts${NC}"
4442 fi
elapsed_time_seconds=$(expr $(date +%s) - $start)
4443
4444
4445 ####################################################################################################
4446
4447
4448 start=$(date +%s)
4449 STEPS_COUNTER=$((STEPS_COUNTER+1))
4450 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE ZABBIX DISCOVERY RULES WITH UNEXPECTED STATE OR STATUS VALUE(+$elapsed_time_seconds `date '+%T'`)${NC}"
4451 discovery_rules=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4452 -H 'Content-Type: application/json-rpc' \
4453 -H 'Cookie: SERVERID='$last_index_controller'' \
4454 --data '{
4455 "jsonrpc": "2.0",
4456 "method": "discoveryrule.get",
4457 "params": {
4458 "output": "extend",
4459 "sortfield": "name"
4460 },
4461 "auth": '$zabbix_auth',
4462 "id": 1
4463 }')
4464 wrong_status=$(echo -e "$discovery_rules" | tr '\r\n' ' ' | jq .result[] | jq 'select(.status != "0")' | jq -r "[.name,.hostid]")
4465 wrong_state=$(echo -e "$discovery_rules" | tr '\r\n' ' ' | jq .result[] | jq 'select(.state != "0")' | jq -r "[.name,.hostid]")
4466 if [[ $wrong_status || $wrong_state ]]
4467 then
4468 echo -e "${RED}$zabbix_hosts_and_ids\n\n\n$wrong_status\n\n\n$wrong_state${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4469 echo -e "\n${ORANGE}CBIS-16053 - Block devices discovery, KVM Network discovery, KVM Pool discovery, MD devices discovery, Network interface discovery, Hotfixes and Services Discovery triggers are disabled (CBIS 20)${NC}"
4470
4471 else
4472 echo -e "${GREEN}no zabbix discovery rules with unexpected status or state values are found${NC}\n"
4473 fi
4474 elapsed_time_seconds=$(expr $(date +%s) - $start)
4475
4476
4477 ####################################################################################################
4478
4479
4480 start=$(date +%s)
4481 STEPS_COUNTER=$((STEPS_COUNTER+1))
4482 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE ZABBIX TEMPLATES BETWEEN EACH HOST WITHIN EACH HOST-GROUP (+$elapsed_time_seconds `date '+%T'`)${NC}"
4483 if [[ $ansible_ovs_hosts ]]
4484 then
4485 echo -e "${CYAN}checking ovs computes${NC}"
4486 templates_total=""
4487 for host in $ansible_ovs_hosts
4488 do
4489 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4490 -H 'Content-Type: application/json-rpc' \
4491 -H 'Cookie: SERVERID='$last_index_controller'' \
4492 --data '{
4493 "jsonrpc": "2.0",
4494 "method": "host.get",
4495 "params": {
4496 "output": ["host"],
4497 "selectParentTemplates": [
4498 "templateid",
4499 "name"
4500 ],
4501 "filter": {
4502 "host": "'$host'"
4503 }
4504 },
4505 "id": 1,
4506 "auth": '$zabbix_auth'
4507 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4508 templates_total+="$templates\n"
4509 done
4510 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4511 if [[ $templates_result != "1" ]]
4512 then
4513 echo -e "${RED}found different number of templates between the ovs computes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4514 else
4515 echo -e "${GREEN}all the ovs computes has identical number of templates${NC}"
4516 fi
4517 fi
4518 if [[ $ansible_sriov_hosts ]]
4519 then
4520 echo -e "${CYAN}checking sriov computes${NC}"
4521 templates_total=""
4522 for host in $ansible_sriov_hosts
4523 do
4524 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4525 -H 'Content-Type: application/json-rpc' \
4526 -H 'Cookie: SERVERID='$last_index_controller'' \
4527 --data '{
4528 "jsonrpc": "2.0",
4529 "method": "host.get",
4530 "params": {
4531 "output": ["host"],
4532 "selectParentTemplates": [
4533 "templateid",
4534 "name"
4535 ],
4536 "filter": {
4537 "host": "'$host'"
4538 }
4539 },
4540 "id": 1,
4541 "auth": '$zabbix_auth'
4542 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4543 templates_total+="$templates\n"
4544 done
4545 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4546 if [[ $templates_result != "1" ]]
4547 then
4548 echo -e "${RED}found different number of templates between the sriov computes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4549 else
4550 echo -e "${GREEN}all the sriov computes has identical number of templates${NC}"
4551 fi
4552 fi
4553 if [[ $ansible_dpdk_hosts ]]
4554 then
4555 echo -e "${CYAN}checking dpdk computes${NC}"
4556 templates_total=""
4557 for host in $ansible_dpdk_hosts
4558 do
4559 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4560 -H 'Content-Type: application/json-rpc' \
4561 -H 'Cookie: SERVERID='$last_index_controller'' \
4562 --data '{
4563 "jsonrpc": "2.0",
4564 "method": "host.get",
4565 "params": {
4566 "output": ["host"],
4567 "selectParentTemplates": [
4568 "templateid",
4569 "name"
4570 ],
4571 "filter": {
4572 "host": "'$host'"
4573 }
4574 },
4575 "id": 1,
4576 "auth": '$zabbix_auth'
4577 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4578 templates_total+="$templates\n"
4579 done
4580 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4581 if [[ $templates_result != "1" ]]
4582 then
4583 echo -e "${RED}found different number of templates between the dpdk computes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4584 else
4585 echo -e "${GREEN}all the dpdk computes has identical number of templates${NC}"
4586 fi
4587 fi
4588 if [[ $ansible_avrs_hosts ]]
4589 then
4590 echo -e "${CYAN}checking avrs computes${NC}"
4591 templates_total=""
4592 for host in $ansible_avrs_hosts
4593 do
4594 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4595 -H 'Content-Type: application/json-rpc' \
4596 -H 'Cookie: SERVERID='$last_index_controller'' \
4597 --data '{
4598 "jsonrpc": "2.0",
4599 "method": "host.get",
4600 "params": {
4601 "output": ["host"],
4602 "selectParentTemplates": [
4603 "templateid",
4604 "name"
4605 ],
4606 "filter": {
4607 "host": "'$host'"
4608 }
4609 },
4610 "id": 1,
4611 "auth": '$zabbix_auth'
4612 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4613 templates_total+="$templates\n"
4614 done
4615 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4616 if [[ $templates_result != "1" ]]
4617 then
4618 echo -e "${RED}found different number of templates between the avrs computes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4619 else
4620 echo -e "${GREEN}all the avrs computes has identical number of templates${NC}"
4621 fi
4622 fi
4623 if [[ $ansible_storage_hosts ]]
4624 then
4625 echo -e "${CYAN}checking storage nodes${NC}"
4626 templates_total=""
4627 for host in $ansible_storage_hosts
4628 do
4629 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4630 -H 'Content-Type: application/json-rpc' \
4631 -H 'Cookie: SERVERID='$last_index_controller'' \
4632 --data '{
4633 "jsonrpc": "2.0",
4634 "method": "host.get",
4635 "params": {
4636 "output": ["host"],
4637 "selectParentTemplates": [
4638 "templateid",
4639 "name"
4640 ],
4641 "filter": {
4642 "host": "'$host'"
4643 }
4644 },
4645 "id": 1,
4646 "auth": '$zabbix_auth'
4647 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4648 templates_total+="$templates\n"
4649 done
4650 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4651 if [[ $templates_result != "1" ]]
4652 then
4653 echo -e "${RED}found different number of templates between the storage nodes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4654 else
4655 echo -e "${GREEN}all the storage nodes has identical number of templates${NC}"
4656 fi
4657 fi
4658 if [[ $ansible_monitoring_hosts ]]
4659 then
4660 echo -e "${CYAN}checking monitoring hosts${NC}"
4661 templates_total=""
4662 for host in $ansible_monitoring_hosts
4663 do
4664 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4665 -H 'Content-Type: application/json-rpc' \
4666 -H 'Cookie: SERVERID='$last_index_controller'' \
4667 --data '{
4668 "jsonrpc": "2.0",
4669 "method": "host.get",
4670 "params": {
4671 "output": ["host"],
4672 "selectParentTemplates": [
4673 "templateid",
4674 "name"
4675 ],
4676 "filter": {
4677 "host": "'$host'"
4678 }
4679 },
4680 "id": 1,
4681 "auth": '$zabbix_auth'
4682 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4683 templates_total+="$templates\n"
4684 done
4685 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4686 if [[ $templates_result != "1" ]]
4687 then
4688 echo -e "${RED}found different number of templates between the monitoring hosts. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4689 else
4690 echo -e "${GREEN}all the monitoring hosts has identical number of templates${NC}"
4691 fi
4692 fi
4693 if [[ $ansible_controllers_hosts ]]
4694 then
4695 echo -e "${CYAN}checking controllers${NC}"
4696 templates_total=""
4697 for host in $ansible_controllers_hosts
4698 do
4699 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4700 -H 'Content-Type: application/json-rpc' \
4701 -H 'Cookie: SERVERID='$last_index_controller'' \
4702 --data '{
4703 "jsonrpc": "2.0",
4704 "method": "host.get",
4705 "params": {
4706 "output": ["host"],
4707 "selectParentTemplates": [
4708 "templateid",
4709 "name"
4710 ],
4711 "filter": {
4712 "host": "'$host'"
4713 }
4714 },
4715 "id": 1,
4716 "auth": '$zabbix_auth'
4717 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4718 templates_total+="$templates\n"
4719 done
4720 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4721 if [[ $templates_result != "1" ]]
4722 then
4723 echo -e "${RED}found different number of templates between the controllers. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4724 else
4725 echo -e "${GREEN}all the controllers has identical number of templates${NC}"
4726 fi
4727 fi
4728 elapsed_time_seconds=$(expr $(date +%s) - $start)
4729
4730
4731 ####################################################################################################
4732
4733
4734 start=$(date +%s)
4735 STEPS_COUNTER=$((STEPS_COUNTER+1))
4736 echo -e "${BLUE}\n\n$STEPS_COUNTER) VITRAGE ALARMS (+$elapsed_time_seconds `date '+%T'`)${NC}"
4737 if [[ $cbis_version == *"2"* ]]
4738 then
4739 echo -e "${ORANGE}vitrage is deprecated from cbis-20.100.1 and onwards${NC}"
4740 elif [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
4741 then
4742 vitrage=$(ansible $last_index_controller -m shell -a "source ~/overcloudrc && vitrage alarm list -f value" | grep -E -v 'SUCCESS|/etc/passwd')
4743 if [[ -z $vitrage ]]
4744 then
4745 echo -e "${GREEN}no alarms found${NC}"
4746 else
4747 echo -e "${RED}$vitrage${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4748 fi
4749 fi
4750 elapsed_time_seconds=$(expr $(date +%s) - $start)
4751
4752
4753 ####################################################################################################
4754
4755
4756 start=$(date +%s)
4757 STEPS_COUNTER=$((STEPS_COUNTER+1))
4758 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE THE ALARMS COUNT BETWEEN ZABBIX AND VITRAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
4759 if [[ $cbis_version != *"2"* ]]
4760 then
4761 vitrage_alarms_count=$(ansible $last_index_controller -m shell -a "source ~/overcloudrc && vitrage alarm list -f value | grep -v '/etc/passwd has been changed' | wc -l" | grep ^[0-9])
4762 if [[ $zabbix_problem_triggers_count != $vitrage_alarms_count ]]
4763 then
4764 echo -e "${RED}zabbix alarms: "$zabbix_problem_triggers_count", vitrage alarms: "$vitrage_alarms_count" ${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4765 else
4766 echo -e "${GREEN}same number of zabbix and vitrage alarms${NC}"
4767 fi
4768 else
4769 echo -e "${ORANGE}vitrage is deprecated from cbis-20.100.1 and onwards${NC}"
4770 fi
4771 elapsed_time_seconds=$(expr $(date +%s) - $start)
4772
4773
4774 ####################################################################################################
4775
4776
4777 start=$(date +%s)
4778 STEPS_COUNTER=$((STEPS_COUNTER+1))
4779 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE THE ALARMS BETWEEN ZABBIX AND ALARM MANAGER (mysql) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4780 almadb_alarms=$(ansible $last_index_controller -b -m shell -a "mysql -s -N -e \"select text from alma_db.ALMAALARM\"" | grep -v rc=0 | awk '{$NF=""; print $0}' | sort | uniq)
4781 zabbixdb_alarms=$(ansible $last_index_controller -b -m shell -a "mysql -s -N -e \"SELECT description FROM zabbixdb.triggers WHERE value = 1\"" | grep -v rc=0 | awk '{$NF=""; print $0}' | sort | uniq)
4782 if [[ $almadb_alarms != $zabbixdb_alarms ]]
4783 then
4784 echo -e "${RED}zabbix alarms:\n\n"$zabbixdb_alarms"\n\nalarm manager alarms:\n\n"$almadb_alarms"${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4785 else
4786 echo -e "${GREEN}zabbix alarms and alarm manager alarms are identical${NC}"
4787 fi
4788 elapsed_time_seconds=$(expr $(date +%s) - $start)
4789
4790
4791 ####################################################################################################
4792
4793
4794 start=$(date +%s)
4795 STEPS_COUNTER=$((STEPS_COUNTER+1))
4796 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOG ALL haproxy DOWN LOGS AND COMPARE WITH PREVIOUS CHECK AND CURRENT CHECK TO FIGURE OUT PROBLEMATIC TRENDS (+$elapsed_time_seconds `date '+%T'`)${NC}"
4797 if [[ ! -f "previous_haproxy_down_logs.txt" ]]
4798 then
4799 ansible $first_index_controller -b -m shell -a "cat /var/log/containers/haproxy/haproxy.log" | grep -w DOWN | awk -F, '{print $1}' > previous_haproxy_down_logs.txt
4800 fi
4801
4802 if [[ ! -f "current_haproxy_down_logs.txt" ]]
4803 then
4804 ansible $first_index_controller -b -m shell -a "cat /var/log/containers/haproxy/haproxy.log" | grep -w DOWN | awk -F, '{print $1}' > current_haproxy_down_logs.txt
4805 fi
4806
4807 previous_current_diff=$(diff -s previous_haproxy_down_logs.txt current_haproxy_down_logs.txt | awk '{print $NF}')
4808 if [[ $previous_current_diff == "identical" ]]
4809 then
4810 echo -e "${GREEN}couldn't find new DOWN lines in haproxy.log${NC}"
4811 sudo rm -f current_haproxy_down_logs.txt
4812 else
4813 echo -e "${GREEN}$previous_current_diff${NC}"
4814 sudo cp current_httpd_down_logs.txt previous_httpd_down_logs.txt
4815 sudo rm -f current_httpd_down_logs
4816 fi
4817 elapsed_time_seconds=$(expr $(date +%s) - $start)
4818
4819
4820
4821 ####################################################################################################
4822
4823
4824 start=$(date +%s)
4825 STEPS_COUNTER=$((STEPS_COUNTER+1))
4826 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DOWN AND NOLB (no load-balancer) SERVICES WITHIN THE HAPROXY SOCKET STATS (+$elapsed_time_seconds `date '+%T'`)${NC}"
4827 haproxy_down_services=$(ansible $last_index_controller -m shell -b -a "echo 'show stat' | nc -U /var/lib/haproxy/stats | grep -E -w 'DOWN|NOLB' | awk -F, '{print \$1,\$2}'" | grep -v -E rc=[0-9])
4828 if [[ $haproxy_down_services ]]
4829 then
4830 echo -e "${RED}$haproxy_down_services${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4831 echo -e "\n\n${ORANGE}CBIS-16375 (19A) - aodh, gnocchi and panko services leftovers in haproxy and redis and ceph_dashboard not showing in full 3/3 HA${NC}"
4832 else
4833 echo -e "${GREEN}no services are reported as DOWN or NOLB (no load-balancer)${NC}"
4834 fi
4835 elapsed_time_seconds=$(expr $(date +%s) - $start)
4836
4837
4838 ####################################################################################################
4839
4840
4841 if [[ -f "$logs_dir/initial_undercloud_hostname" ]]
4842 then
4843 start=$(date +%s)
4844 STEPS_COUNTER=$((STEPS_COUNTER+1))
4845 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE THE UNDERCLOUD INITIAL AND CURRENT HOSTNAME (+$elapsed_time_seconds `date '+%T'`)${NC}"
4846 hostname > current_undercloud_hostname
4847 diff=$(diff $logs_dir/initial_undercloud_hostname /home/stack/current_undercloud_hostname)
4848 if [[ $diff ]]
4849 then
4850 echo -e "${RED}$diff${NC}" | sed 's/^>/INITIAL:/g' | sed 's/^</CURRENT:/g'
4851 else
4852 echo -e "${GREEN}no differences between the initial and current hostname are found${NC}"
4853 fi
4854 else
4855 start=$(date +%s)
4856 STEPS_COUNTER=$((STEPS_COUNTER+1))
4857 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE INITIAL AND CURRENT HOSTNAMES (+$elapsed_time_seconds `date '+%T'`)${NC}"
4858 hostname > $logs_dir/initial_undercloud_hostname
4859 echo -e "${GREEN}this is the initial undercloud hostname audit - the comparison will begin from the next script iteration${NC}"
4860 fi
elapsed_time_seconds=$(expr $(date +%s) - $start)
4861
4862
4863 ####################################################################################################
4864
4865
4866 start=$(date +%s)
4867 STEPS_COUNTER=$((STEPS_COUNTER+1))
4868 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK IF USERS root/stack ARE FAIL-LOCKED (+$elapsed_time_seconds `date '+%T'`)${NC}"
4869 # usually a user will become faillocked after several failed login attempts
4870 faillock=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor,localhost -b -m shell -a "faillock" | grep -w V -B 3)
4871 if [[ $faillock ]]
4872 then
4873 echo -e "${RED}$faillock${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4874 else
4875 echo -e "${GREEN}no fail-locked user found${NC}"
4876 fi
4877 elapsed_time_seconds=$(expr $(date +%s) - $start)
4878
4879
4880 ####################################################################################################
4881
4882
4883 # The Automatic Bug Reporting Tool, commonly abbreviated as ABRT, is a set of tools that is designed to help users detect and report application crashes.
4884 start=$(date +%s)
4885 STEPS_COUNTER=$((STEPS_COUNTER+1))
4886 echo -e "${BLUE}\n\n$STEPS_COUNTER) DETECT APPLICATION CRASHES USING RED-HAT ABRT (Automatic Bug Reporting Tool) ON THE HYPERVISOR (+$elapsed_time_seconds `date '+%T'`)${NC}"
4887 abrt=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "abrt-cli status" | grep -v SUCCESS)
4888 if [[ $abrt ]]
4889 then
4890 abrt=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "abrt-cli list 2> /dev/null | grep ^reason: | sort | uniq -c")
4891 echo -e "${ORANGE}$abrt${NC}"
4892 else
4893 echo -e "${GREEN}no application crashes detected${NC}"
4894 fi
4895 elapsed_time_seconds=$(expr $(date +%s) - $start)
4896
4897
4898 ####################################################################################################
4899
4900
4901 # libguestfs-test-tool is a test program shipped with libguestfs to allow you to check basic libguestfs functionality is working. This is needed because libguestfs occasionally breaks for reasons beyond our control: usually because of changes in the underlying qemu or kernel packages, or the host environment.
4902 # libguestfs is a set of tools for accessing and modifying virtual machine (VM) disk images. You can use this for viewing and editing files inside guests, scripting changes to VMs, monitoring disk used/free statistics, creating guests, P2V, V2V, performing backups, cloning VMs, building VMs, formatting disks, resizing disks, and much more.
4903 # Context: CBIS 20 PP3 deployment failed due to https://access.redhat.com/solutions/3416791 which I used libguestfs-test-tool to debug.
4904 start=$(date +%s)
4905 STEPS_COUNTER=$((STEPS_COUNTER+1))
4906 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT BASIC libguestfs FUNCTIONALITY IS WORKING (+$elapsed_time_seconds `date '+%T'`)${NC}"
4907 libguestfs_test_tool=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "libguestfs-test-tool 2>&1")
4908 libguestfs_test_tool_verdict=$(echo -e "$libguestfs_test_tool" | grep 'TEST FINISHED OK')
4909 if [[ $libguestfs_test_tool_verdict ]]
4910 then
4911 echo -e "${GREEN}libguestfs-test-tool returned: TEST FINISHED OK${NC}"
4912 else
4913 echo -e "${RED}$libguestfs_test_tool${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4914 fi
4915 elapsed_time_seconds=$(expr $(date +%s) - $start)
4916
4917
4918 ####################################################################################################
4919
4920
4921 start=$(date +%s)
4922 STEPS_COUNTER=$((STEPS_COUNTER+1))
4923 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT ALL THE IRONIC HOSTS POWER STATE IS power on, PROVISIONING STATE IS active AND MAINTENANCE IS False (+$elapsed_time_seconds `date '+%T'`)${NC}"
4924 maintenance=$(source ~/stackrc && openstack baremetal node list -f value -c UUID -c Name -c 'Power State' -c 'Provisioning State' -c Maintenance | column -t | grep -E -v 'power\s+on\s+active\s+False')
4925 if [[ $maintenance ]]
4926 then
4927 echo -e "${RED}$maintenance${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4928 else
4929 echo -e "${GREEN}no baremetal host(s) with unexpected status is found${NC}"
4930 fi
4931 elapsed_time_seconds=$(expr $(date +%s) - $start)
4932
4933
4934 ####################################################################################################
4935
4936
4937 start=$(date +%s)
4938 if [[ $cbis_version != "19.0.0.1" ]]
4939 then
4940 STEPS_COUNTER=$((STEPS_COUNTER+1))
4941 echo -e "${BLUE}\n\n$STEPS_COUNTER) RABBITMQ QUEUES CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
4942 if [[ $cbis_version == "18.0.0.1" ]]
4943 then
4944 rabbitmqctl_list_queues=$(ansible $last_index_controller -b -m shell -a "rabbitmqctl list_queues" | grep -E "[[:space:]]+[0-9]" | awk '($2!=0) {print $0}')
4945 sleep 60
4946 rabbitmqctl_list_queues_no_2=$(ansible $last_index_controller -b -m shell -a "rabbitmqctl list_queues" | grep -E "[[:space:]]+[0-9]" | awk '($2!=0) {print $0}')
4947 if [[ $rabbitmqctl_list_queues == $rabbitmqctl_list_queues_no_2 ]]
4948 then
4949 echo -e "${GREEN}no change in the queues were found in 30 seconds duration${NC}"
4950 else
4951 echo -e "${RED}found difference in the queues between the first rabbitmqctl list_queues check and the second check which is taken 30 seconds after the first one${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4952 echo -e "${RED}first check:\n"$rabbitmqctl_list_queues"${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4953 echo -e "${RED}second check:\n"$rabbitmqctl_list_queues_no_2"${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4954 fi
4955 elif [[ $cbis_version != "18.0.0.1" ]]
4956 then
4957 rabbitmqctl_list_queues=$(ansible $last_index_controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_queues" | grep -E "[[:space:]]+[0-9]" | awk '($2!=0) {print $0}')
4958 sleep 30
4959 rabbitmqctl_list_queues_no_2=$(ansible $last_index_controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_queues" | grep -E "[[:space:]]+[0-9]" | awk '($2!=0) {print $0}')
4960 if [[ $rabbitmqctl_list_queues == $rabbitmqctl_list_queues_no_2 ]]
4961 then
4962 echo -e "${GREEN}no change in the queues were found in 30 seconds duration${NC}"
4963 else
4964 echo -e "${RED}found difference in the queues between the first rabbitmqctl list_queues check and the second check which is taken 30 seconds after the first one${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4965 echo -e "${RED}first check:\n"$rabbitmqctl_list_queues"${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4966 echo -e "${RED}second check:\n"$rabbitmqctl_list_queues_no_2"${ORANGE}\n\n\nnote: a difference between the first and second checks isn't necessary a bug. please examine the output carefully"${NC} ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4967 fi
4968 fi
4969 elapsed_time_seconds=$(expr $(date +%s) - $start)
4970 fi
4971
4972
4973 ####################################################################################################
4974
4975
4976 start=$(date +%s)
4977 STEPS_COUNTER=$((STEPS_COUNTER+1))
4978 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE ROOT PARTITIONS WITH 90%+ DISK USAGE (df) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4979 partitions_size=$(ansible all -m shell -a "df -h | grep -w / | awk '{print \$5,\$6}' | sed 's/\%//g' | grep -E ^[0-9] | awk '{ if ( \$1 > 90 ) print \$1,\$2 }'" | grep ^[9] -B 1)
4980 if [[ -z $partitions_size ]]
4981 then
4982 echo -e "${GREEN}no partition with 90%+ usage found on the undercloud and on the overcloud servers${NC}"
4983 else
4984 echo -e "${RED}$partitions_size${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4985 fi
4986 elapsed_time_seconds=$(expr $(date +%s) - $start)
4987
4988
4989 ####################################################################################################
4990
4991
4992 start=$(date +%s)
4993 STEPS_COUNTER=$((STEPS_COUNTER+1))
4994 file_size="1"
4995 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE BIG LOG FILES (OVER "$file_size"G) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4996 big_files=$(ansible all --limit '!hypervisor' -b -m shell -a "find /var/log/ -size +"$file_size"G -exec ls -lh {} \+" | grep /var/log/ -B 1)
4997 if [[ -z $big_files ]]
4998 then
4999 echo -e "${GREEN}couldn't find files under /var/log/ which weights more then "$file_size"G${NC}"
5000 else
5001 echo -e "${RED}$big_files${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5002 fi
5003 elapsed_time_seconds=$(expr $(date +%s) - $start)
5004
5005
5006 ####################################################################################################
5007
5008
5009 start=$(date +%s)
5010 STEPS_COUNTER=$((STEPS_COUNTER+1))
5011 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE UNSUCCESSFUL LOG LINES IN /var/log/cbis/patches-applied.log ON ALL THE SERVERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5012 applied_patch_bad_logs=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /var/log/cbis/patches-applied.log | grep -v -E 'PATCH-SUCCESS|PATCH-START'" | grep -v -E 'rc=[1-9]|non-zero return code')
5013 if [[ $applied_patch_bad_logs ]]
5014 then
5015 echo -e "${RED}$applied_patch_bad_logs${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5016 echo -e "\n\n${ORANGE}CBIS-16038 scale out fails - the scaled out server doesn't have SSH iptables ACCEPT rule and thus ansible fails to access it\nthe sypthom was that on the scaled out compute the patch was not deployerd correctly and it this was seen under /var/log/cbis/patches-applied.log of the scaled out server(CBIS 20 PP3)${NC}"
5017 else
5018 echo -e "${GREEN}no unsuccessful log lines found under /var/log/cbis/patches-applied.log${NC}"
5019 fi
5020 elapsed_time_seconds=$(expr $(date +%s) - $start)
5021
5022
5023 ####################################################################################################
5024
5025
5026 start=$(date +%s)
5027 STEPS_COUNTER=$((STEPS_COUNTER+1))
5028 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT 'max memory size|open files|cpu time|virtual memory|file locks' CONFIURATIOSN ARE SAME FOR ALL HOSTS (ulimit -a) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5029 ulimit=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ulimit -a" | grep -E -w 'max memory size|open files|cpu time|virtual memory|file locks' | sort --uniq | awk '{print $NF}' | sort --uniq | paste -sd " ")
5030 if [[ $ulimit != "1024 unlimited" ]]
5031 then
5032 ulimit=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ulimit -a | grep -E -w 'max memory size|open files|cpu time|virtual memory|file locks'")
5033 echo -e "${RED}$ulimit${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5034 else
5035 echo -e "${GREEN}all the hosts returned the expected 'max memory size|open files|cpu time|virtual memory|file locks' values${NC}"
5036 fi
5037 elapsed_time_seconds=$(expr $(date +%s) - $start)
5038
5039
5040 ####################################################################################################
5041
5042
5043 start=$(date +%s)
5044 STEPS_COUNTER=$((STEPS_COUNTER+1))
5045 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE VLOCK PROCESSES (+$elapsed_time_seconds `date '+%T'`)${NC}"
5046 vlock=$(ansible all --limit '!hypervisor' -b -m shell -a "top -b -n 1 | grep vlock" | grep -E -v 'FAILED|non-zero return code')
5047 if [[ -z $vlock ]]
5048 then
5049 echo -e "${GREEN}no vlock processes found${NC}"
5050 else
5051 echo -e "${MAGENTA}$vlock${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5052 echo -e "\n\n${ORANGE}CBIS-7324 (CBIS 19.0)\nCBIS-7324 (CBIS 19A)${NC}"
5053 fi
5054 elapsed_time_seconds=$(expr $(date +%s) - $start)
5055
5056
5057 ####################################################################################################
5058
5059
5060 start=$(date +%s)
5061 STEPS_COUNTER=$((STEPS_COUNTER+1))
5062 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE ZOMBIE (DEFUNCT) PROCESSES (+$elapsed_time_seconds `date '+%T'`)${NC}"
5063 zombie=$(ansible all --limit '!hypervisor' -b -m shell -a "ps aux | grep [d]efunct | grep -v swift-object-au" | grep -E -v 'FAILED|non-zero return code')
5064 servers_with_zombie=$(echo -e "$zombie" | grep SUCCESS | awk '{print $1}' | paste -sd",")
5065 if [[ -z $zombie ]]
5066 then
5067 echo -e "${GREEN}no zombie processes found${NC}"
5068 else
5069 echo -e "${RED}the following PIDs are marked as defunct (zombie process):\n\n$zombie${NC}\n\n"
5070 zombie=$(ansible $servers_with_zombie -b -m shell -a "ps aux | grep [d]efunct | grep -v swift-object-au | awk '{print \$2}' | xargs -i pstree -aps {}")
5071 ### to avoid truncated lines use pstree -laps
5072 echo -e "${RED}$zombie${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5073 echo -e "\n\n${ORANGE}CBIS-11144 (19A) / CBIS-11245 (20) - neutron router causing zombie processes${NC}"
5074 echo -e "${ORANGE}CBIS-16391 (19A) - neutron-dhcp-agent zabbix alarm after deploying running default security hardening${NC}"
5075 fi
5076 elapsed_time_seconds=$(expr $(date +%s) - $start)
5077
5078
5079 ####################################################################################################
5080
5081
5082 start=$(date +%s)
5083 STEPS_COUNTER=$((STEPS_COUNTER+1))
5084 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR CRITICAL ERRORS IN DMESG (KERNEL MESSAGES) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5085 dmesg_alarms=$(ansible all --limit '!hypervisor' -b -m shell -a "dmesg -x --ctime --level crit --level alert --level emerg --nopager --decode --kernel --userspace" | grep : -B 1)
5086 if [[ -z $dmesg_alarms ]]
5087 then
5088 echo -e "${GREEN}no critical alarms found in dmesg${NC}"
5089 else
5090 echo -e "${RED}$dmesg_alarms${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5091 fi
5092 elapsed_time_seconds=$(expr $(date +%s) - $start)
5093
5094
5095 ####################################################################################################
5096
5097
5098 start=$(date +%s)
5099 STEPS_COUNTER=$((STEPS_COUNTER+1))
5100 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE OVERCLOUD HOSTS HAS THE SAME FIRMWARE VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
5101 echo -e "${CYAN}checking the computes${NC}"
5102 firmware_version=$(ansible compute -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u | wc -l)
5103 if [[ $firmware_version == "1" ]]
5104 then
5105 firmware_version=$(ansible compute -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u)
5106 echo -e "${GREEN}all servers using firmware revision $firmware_version${NC}"
5107 touch $logs_dir/firmware_version
5108 echo "$firmware_version" > $logs_dir/firmware_version
5109 else
5110 firmware_version=$(ansible compute -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'")
5111 echo -e "${RED}$firmware_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5112 fi
5113 echo -e "${CYAN}checking the controllers${NC}"
5114 firmware_version=$(ansible controller -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u | wc -l)
5115 if [[ $firmware_version == "1" ]]
5116 then
5117 firmware_version=$(ansible controller -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u)
5118 echo -e "${GREEN}all servers using firmware revision $firmware_version${NC}"
5119 else
5120 firmware_version=$(ansible controller -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'")
5121 echo -e "${RED}$firmware_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5122 fi
5123 if [[ $ansible_storage_hosts_count != "0" ]]
5124 then
5125 echo -e "${CYAN}checking the storage nodes${NC}"
5126 firmware_version=$(ansible cephstorage -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u | wc -l)
5127 if [[ $firmware_version == "1" ]]
5128 then
5129 firmware_version=$(ansible cephstorage -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u)
5130 echo -e "${GREEN}all servers using firmware revision $firmware_version${NC}"
5131 else
5132 firmware_version=$(ansible cephstorage -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'")
5133 echo -e "${RED}$firmware_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5134 fi
5135 fi
5136 echo -e "${CYAN}checking all the overcloud servers at once${NC}"
5137 firmware_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u | wc -l)
5138 if [[ $firmware_version == "1" ]]
5139 then
5140 firmware_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u)
5141 echo -e "${GREEN}all servers using firmware revision $firmware_version${NC}"
5142 else
5143 firmware_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'")
5144 echo -e "${RED}$firmware_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5145 echo "$firmware_version" > $logs_dir/firmware_version
5146 fi
5147 elapsed_time_seconds=$(expr $(date +%s) - $start)
5148
5149
5150 ####################################################################################################
5151
5152
5153 start=$(date +%s)
5154 STEPS_COUNTER=$((STEPS_COUNTER+1))
5155 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE HOSTS HAS THE SAME BIOS VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
5156 echo -e "${CYAN}checking the computes${NC}"
5157 bios_version=$(ansible compute -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u | wc -l)
5158 if [[ $bios_version == "1" ]]
5159 then
5160 bios_version=$(ansible compute -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u)
5161 echo -e "${GREEN}all servers using bios version $bios_version${NC}"
5162 touch $logs_dir/bios_version
5163 echo "$bios_version" > $logs_dir/bios_version
5164 else
5165 bios_version=$(ansible compute -b -m shell -a "dmidecode -s bios-version")
5166 echo -e "${RED}$bios_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5167 fi
5168 echo -e "${CYAN}checking the controllers${NC}"
5169 bios_version=$(ansible controller -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u | wc -l)
5170 if [[ $bios_version == "1" ]]
5171 then
5172 bios_version=$(ansible controller -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u)
5173 echo -e "${GREEN}all servers using bios version $bios_version${NC}"
5174 else
5175 bios_version=$(ansible controller -b -m shell -a "dmidecode -s bios-version")
5176 echo -e "${RED}$bios_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5177 fi
5178 if [[ $ansible_storage_hosts_count != "0" ]]
5179 then
5180 echo -e "${CYAN}checking the storage nodes${NC}"
5181 bios_version=$(ansible cephstorage -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u | wc -l)
5182 if [[ $bios_version == "1" ]]
5183 then
5184 bios_version=$(ansible cephstorage -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u)
5185 echo -e "${GREEN}all servers using bios version $bios_version${NC}"
5186 else
5187 bios_version=$(ansible cephstorage -b -m shell -a "dmidecode -s bios-version")
5188 echo -e "${RED}$bios_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5189 fi
5190 else
5191 echo -e "${CYAN}checking the storage nodes${NC}"
5192 echo -e "${ORANGE}couldn't find storage nodes in the system${NC}"
5193 fi
5194 echo -e "${CYAN}checking all the overcloud servers at once${NC}"
5195 bios_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u | wc -l)
5196 if [[ $bios_version == "1" ]]
5197 then
5198 bios_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u)
5199 echo -e "${GREEN}all servers using bios version $bios_version${NC}"
5200 else
5201 bios_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "dmidecode -s bios-version")
5202 echo -e "${RED}$bios_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5203 fi
5204 elapsed_time_seconds=$(expr $(date +%s) - $start)
5205
5206
5207 ####################################################################################################
5208
5209
5210 start=$(date +%s)
5211 STEPS_COUNTER=$((STEPS_COUNTER+1))
5212 echo -e "${BLUE}\n\n$STEPS_COUNTER) READ THE BMCS SENSORS AND LOCATE FAULTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5213 sensors=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "ipmitool sdr elist full | awk '{ if ( \$5 != ok ) print }' | grep -v -E '\| ok \||\| ns \|'" | grep ^[A-Z] -B 1)
5214 if [[ $sensors ]]
5215 then
5216 echo -e "${RED}$sensors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5217 echo -e "\n\n${ORANGE}lnr - Lower Non-Recoverable${NC}"
5218 echo -e "${ORANGE}lcr - Lower Critical${NC}"
5219 echo -e "${ORANGE}lnc - Lower Non-Critical${NC}"
5220 echo -e "${ORANGE}unc - Upper Non-Critical${NC}"
5221 echo -e "${ORANGE}ucr - Upper Critical${NC}"
5222 echo -e "${ORANGE}unr - Upper Non-Recoverable${NC}"
5223 echo -e "${ORANGE}nr - Non Recoverable${NC}"
5224 echo -e "${ORANGE}cr - Critical${NC}"
5225 echo -e "${ORANGE}nc - Non Critical${NC}"
5226 echo -e "${ORANGE}ns - Not Specified${NC}"
5227 echo -e "${ORANGE}na - Not Available${NC}"
5228 else
5229 echo -e "${GREEN}all sensors returned ok${NC}"
5230 fi
5231 elapsed_time_seconds=$(expr $(date +%s) - $start)
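# illustrative manual equivalent for a single host (requires root and a working local BMC interface):
# ipmitool sdr elist full                                    # list threshold ("full") sensors with status
# ipmitool sdr elist full | grep -v -E '\| ok \||\| ns \|'   # keep only rows that are not ok / not-specified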
5232
5233
5234 ####################################################################################################
5235
5236
5237 start=$(date +%s)
5238 STEPS_COUNTER=$((STEPS_COUNTER+1))
5239 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CRITICAL/NON-RECOVERABLE EVENT LOGS IN SYSTEM BMCS FROM CURRENT MONTH (+$elapsed_time_seconds `date '+%T'`)${NC}"
5240 this_month_dates=$(date +%m/[0-9][0-9]/%Y)
5241 events=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -m shell -b -a "ipmitool sel list | grep -E -i 'Critical|Non-Recoverable' | grep -E $this_month_dates" | grep -E -v 'FAILED \| rc=[1-9]|non-zero return code')
5242 if [[ $events ]]
5243 then
5244 echo -e "${RED}$events${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5245 else
5246 echo -e "${GREEN}none of the events of the BMCs returned critical or non-recoverable${NC}"
5247 fi
5248 elapsed_time_seconds=$(expr $(date +%s) - $start)
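# illustrative manual equivalent for a single host: list the SEL and keep only critical/non-recoverable
# entries from the current month (example pattern shown for August 2021):
# ipmitool sel list | grep -E -i 'Critical|Non-Recoverable' | grep -E '08/[0-9][0-9]/2021'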
5249
5250
5251 ####################################################################################################
5252
5253
5254 if [[ -f "$logs_dir/kernel" ]]
5255 then
5256 : # already checked in a previous run (':' no-op; 'continue' is only valid inside a loop)
5257 else
5258 start=$(date +%s)
5259 STEPS_COUNTER=$((STEPS_COUNTER+1))
5260 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE OVERCLOUD HOSTS HAVE THE SAME KERNEL VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
5261 kernel_version=$(ansible all --limit '!hypervisor' -b -m shell -a "uname -r" | grep -v SUCCESS | sort -u | wc -l)
5262 if [[ $kernel_version == "1" ]]
5263 then
5264 kernel_version=$(ansible all --limit '!hypervisor' -b -m shell -a "uname -r" | grep -v SUCCESS | sort -u)
5265 echo -e "${GREEN}all servers using kernel version $kernel_version${NC}"
5266 touch $logs_dir/kernel
5267 echo "$kernel_version" > $logs_dir/kernel
5268 else
5269 kernel_version=$(ansible all --limit '!hypervisor' -b -m shell -a "uname -r")
5270 echo -e "${RED}$kernel_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5271 fi
5272 fi
5273 elapsed_time_seconds=$(expr $(date +%s) - $start)
5274
5275
5276 ####################################################################################################
5277
5278
5279 if [[ -f "$logs_dir/cpu_model" ]]
5280 then
5281 : # already checked in a previous run
5282 else
5283 start=$(date +%s)
5284 STEPS_COUNTER=$((STEPS_COUNTER+1))
5285 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE OVERCLOUD HOSTS HAVE THE SAME CPU MODEL (+$elapsed_time_seconds `date '+%T'`)${NC}"
5286 echo -e "${CYAN}checking the computes${NC}"
5287 cpu_model=$(ansible compute -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u | wc -l)
5288 if [[ $cpu_model == "1" ]]
5289 then
5290 cpu_model=$(ansible compute -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u)
5291 echo -e "${GREEN}all servers using cpu model $cpu_model${NC}"
5292 touch $logs_dir/cpu_model
5293 echo "$cpu_model" > $logs_dir/cpu_model
5294 else
5295 cpu_model=$(ansible compute -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'")
5296 echo -e "${RED}$cpu_model${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5297 fi
5298 echo -e "${CYAN}checking the controllers${NC}"
5299 cpu_model=$(ansible controller -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u | wc -l)
5300 if [[ $cpu_model == "1" ]]
5301 then
5302 cpu_model=$(ansible controller -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u)
5303 echo -e "${GREEN}all servers using cpu model $cpu_model${NC}"
5304 else
5305 cpu_model=$(ansible controller -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'")
5306 echo -e "${RED}$cpu_model${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5307 fi
5308 if [[ $ansible_storage_hosts_count != "0" ]]
5309 then
5310 echo -e "${CYAN}checking the storage nodes${NC}"
5311 cpu_model=$(ansible cephstorage -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u | wc -l)
5312 if [[ $cpu_model == "1" ]]
5313 then
5314 cpu_model=$(ansible cephstorage -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u)
5315 echo -e "${GREEN}all servers using cpu model $cpu_model${NC}"
5316 else
5317 cpu_model=$(ansible cephstorage -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'")
5318 echo -e "${RED}$cpu_model${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5319 fi
5320 else
5321 echo -e "${CYAN}\nchecking the storage nodes${NC}"
5322 echo -e "${ORANGE}couldn't find storage nodes in the system${NC}"
5323 fi
5324 fi
5325 elapsed_time_seconds=$(expr $(date +%s) - $start)
5326
5327
5328 ####################################################################################################
5329
5330
5331 if [[ -f "$logs_dir/baseboard_product_name" ]]
5332 then
5333 : # already checked in a previous run
5334 else
5335 start=$(date +%s)
5336 STEPS_COUNTER=$((STEPS_COUNTER+1))
5337 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE OVERCLOUD HOSTS HAVE THE SAME BASEBOARD PRODUCT NAME (+$elapsed_time_seconds `date '+%T'`)${NC}"
5338 echo -e "${CYAN}checking the computes${NC}"
5339 baseboard_product_name=$(ansible compute -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u | wc -l)
5340 if [[ $baseboard_product_name == "1" ]]
5341 then
5342 baseboard_product_name=$(ansible compute -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u)
5343 echo -e "${GREEN}all servers using baseboard product name $baseboard_product_name${NC}"
5344 touch $logs_dir/baseboard_product_name
5345 echo "$baseboard_product_name" > $logs_dir/baseboard_product_name
5346 else
5347 baseboard_product_name=$(ansible compute -b -m shell -a "dmidecode -s baseboard-product-name")
5348 echo -e "${RED}$baseboard_product_name${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5349 fi
5350 echo -e "${CYAN}checking the controllers${NC}"
5351 baseboard_product_name=$(ansible controller -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u | wc -l)
5352 if [[ $baseboard_product_name == "1" ]]
5353 then
5354 baseboard_product_name=$(ansible controller -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u)
5355 echo -e "${GREEN}all servers using baseboard product name $baseboard_product_name${NC}"
5356 else
5357 baseboard_product_name=$(ansible controller -b -m shell -a "dmidecode -s baseboard-product-name")
5358 echo -e "${RED}$baseboard_product_name${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5359 fi
5360 if [[ $ansible_storage_hosts_count != "0" ]]
5361 then
5362 echo -e "${CYAN}checking the storage nodes${NC}"
5363 baseboard_product_name=$(ansible cephstorage -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u | wc -l)
5364 if [[ $baseboard_product_name == "1" ]]
5365 then
5366 baseboard_product_name=$(ansible cephstorage -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u)
5367 echo -e "${GREEN}all servers using baseboard product name $baseboard_product_name${NC}"
5368 else
5369 baseboard_product_name=$(ansible cephstorage -b -m shell -a "dmidecode -s baseboard-product-name")
5370 echo -e "${RED}$baseboard_product_name${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5371 fi
5372 else
5373 echo -e "${CYAN}\nchecking the storage nodes${NC}"
5374 echo -e "${ORANGE}couldn't find storage nodes in the system${NC}"
5375 fi
5376 fi
5377 elapsed_time_seconds=$(expr $(date +%s) - $start)
5378
5379
5380 ####################################################################################################
5381
5382
5383 if [[ $nuage != "true" ]]
5384 then
5385 start=$(date +%s)
5386 STEPS_COUNTER=$((STEPS_COUNTER+1))
5387 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT ALL CONTROLLERS HAVE THE SAME NUMBER OF NAMESPACES (+$elapsed_time_seconds `date '+%T'`)${NC}"
5388 missing_namespaces=$(ansible controller -b -m shell -a "ip netns | wc -l" | grep ^[0-9] | sort --uniq | wc -l)
5389 if [[ $missing_namespaces != "1" ]]
5390 then
5391 missing_namespaces=$(ansible controller -b -m shell -a "ip netns | wc -l")
5392 echo -e "${RED}$missing_namespaces${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5393 else
5394 echo -e "${GREEN}all controllers have an identical number of namespaces${NC}"
5395 fi
5396 fi
5397 elapsed_time_seconds=$(expr $(date +%s) - $start)
5398
5399
5400 ####################################################################################################
5401
5402
5403 if [[ $nuage != "true" ]]
5404 then
5405 STEPS_COUNTER=$((STEPS_COUNTER+1))
5406 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE CONTROLLERS HAVE THE EXPECTED NUMBER OF NAMESPACES (+$elapsed_time_seconds `date '+%T'`)${NC}"
5407 dhcp_true_networks=$(source ~/overcloudrc && openstack subnet list --long -f value | grep True | awk '{print $3}' | sort -u | wc -l)
5408 routers=$(source ~/overcloudrc && openstack router list -f value | wc -l)
5409 expected_namespaces=$(expr $dhcp_true_networks + $routers)
5410 namespaces=$(ansible controller -b -m shell -a "ip netns | wc -l" | grep ^[0-9] | sort -u)
5411 if [[ $namespaces != $expected_namespaces ]]
5412 then
5413 namespaces_diff=$(ansible controller -b -m shell -a "ip netns | wc -l")
5414 echo -e "${RED}all the controllers are expected to have $expected_namespaces namespaces\nexecute: ${MAGENTA}ansible controller -b -m shell -a \"ip netns\"${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5415 echo -e "${RED}$namespaces_diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5416 else
5417 echo -e "${GREEN}all the controllers have the expected number of namespaces${NC}"
5418 fi
5419 fi
5420 elapsed_time_seconds=$(expr $(date +%s) - $start)
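# note (illustrative): the expected count used above is derived as one qdhcp namespace per network that has
# at least one DHCP-enabled subnet, plus one qrouter namespace per router. manual sketch (assumes ~/overcloudrc is sourced):
# dhcp_nets=$(openstack subnet list --long -f value | grep True | awk '{print $3}' | sort -u | wc -l)
# routers=$(openstack router list -f value | wc -l)
# echo "expecting $((dhcp_nets + routers)) namespaces per controller (compare with: ip netns | wc -l)"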
5421
5422
5423 ####################################################################################################
5424
5425
5426 start=$(date +%s)
5427 STEPS_COUNTER=$((STEPS_COUNTER+1))
5428 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE THE NUMBER OF CRONTAB JOBS BETWEEN THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5429 cronjobs_comparison=$(ansible controller -b -m shell -a "crontab -l" | grep -v -E '\#|rabbit-drain-queues.py|AIDE integrity check run|SUCCESS' | sort | uniq -c | column -t | grep -v ^3 | awk '{print $NF}' | paste -sd'|' | xargs -i ansible controller -b -m shell -a "crontab -l | grep -E '{}'")
5430 if [[ $cronjobs_comparison ]]
5431 then
5432 echo -e "${RED}$cronjobs_comparison${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5433 else
5434 echo -e "${GREEN}the crontab jobs count is identical between the controllers${NC}"
5435 fi
5436 elapsed_time_seconds=$(expr $(date +%s) - $start)
5437
5438
5439 ####################################################################################################
5440
5441
5442 # 0 3 * * 6 bash /usr/share/cbis/utils/check_restart_horizon.sh
5443 # */15 * * * * source /home/cbis-admin/overcloudrc && python /usr/lib/zabbix/alertscripts/zbx_metrics_exporter.py > /var/log/zabbix/metrics/last_run.status
5444 # 0 0 * * * /usr/bin/find /var/log/zabbix/metrics/*.xml* -mtime +1 -exec rm -rf {} \; > /dev/null 2>&1
5445 # 0 0 * * * source /home/cbis-admin/overcloudrc && python /usr/lib/zabbix/alertscripts/zbx_KPIs_exporter.py &> /var/log/zabbix/metrics/last_KPIs_run.status
5446 # 1 * * * * sudo python /usr/bin/zabbix_db_partitions_manager.py --history=3 --trend=30
5447 # 0 * * * * sudo python /usr/share/cbis/overcloud/postdeploy/templates/zabbix/tools/clear_outdated_alma_alarms.py
5448 # 0 3 * * * /bin/docker start elk-curator
5449 # 0 1 * * * /usr/bin/find /backup/* -mtime +3 -exec rm -rf {} \; > /dev/null 2>&1
5450 # @daily sh /usr/local/bin/check_passwd_expiry.sh
5451 # @daily /usr/sbin/aide --check | /bin/mail -s "overcloud-controller-dublin-1 - AIDE integrity check run" root
5452 # @daily sh /usr/local/bin/create_cert_exp_alarm_oc.sh
5453 # 0 0 * * 0 python /usr/share/cbis/overcloud/postdeploy/scripts/rabbit-drain-queues.py --user guest --password '95eodeHJBpfbdXYAiaXc635bK' --host 172.17.1.11
5454 # 0 2 * * * python /root/backup/CbisOvercloudDatabaseBackup.py
5455
5456 # note: ignoring rabbit-drain-queues.py and AIDE as they are unique per controller.
5457
5458 start=$(date +%s)
5459 STEPS_COUNTER=$((STEPS_COUNTER+1))
5460 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE EXPECTED CRONTAB JOBS EXIST ON THE CONTROLLERS (BASED ON CBIS 20 PP3) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5461 crontab_list=$(ansible controller -b -m shell -a "crontab -l" | grep -v ^# | tr -d [0-9] | sort | uniq -c | column -t)
5462 declare -a cronjobs=(check_restart_horizon zbx_metrics_exporter /var/log/zabbix/metrics/*.xml zbx_KPIs_exporter zabbix_db_partitions_manager clear_outdated_alma_alarms elk-curator /backup/* check_passwd_expiry CbisOvercloudDatabaseBackup AIDE rabbit-drain-queues)
5463 for cronjob in "${cronjobs[@]}"
5464 do
5465 missing_cronjob=$(echo -e "$crontab_list" | grep $cronjob | grep -v ^3)
5466 if [[ $missing_cronjob ]]
5467 then
5468 controllers_count=$(echo -e "$missing_cronjob" | awk '{print $1}')
5469 echo -e "${RED}$cronjob crontab job is found only on $controllers_count controllers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5470 else
5471 echo -e "${GREEN}$cronjob crontab job is found on all 3 controllers${NC}"
5472 fi
5473 done
5474 elapsed_time_seconds=$(expr $(date +%s) - $start)
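# note (illustrative): the check above relies on 'uniq -c' - after stripping digits (host indexes, passwords)
# and sorting, each distinct cronjob line is prefixed with the number of controllers it appears on, so any
# count other than 3 points at a missing or duplicated entry. a sketch of the same idea, run manually:
# ansible controller -b -m shell -a "crontab -l" | grep -v ^# | tr -d [0-9] | sort | uniq -c | grep -v '^ *3 '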
5475
5476
5477 ####################################################################################################
5478
5479
5480 start=$(date +%s)
5481 STEPS_COUNTER=$((STEPS_COUNTER+1))
5482 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THE PRESENCE OF CRITICAL CRONTAB JOBS ON THE UNDERCLOUD VM (BASED ON CBIS 20 PP3) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5483 crontab_list=$(sudo crontab -l | grep -v ^# && crontab -l | grep -v ^#)
5484 if [[ $cbis_version == "19.100.1" || $cbis_version == "19.0.0.1" ]]
5485 then
5486 declare -a cronjobs=(backup_fetcher check_fsid_mismatch heat-manage purge_deleted AIDE)
5487 else
5488 declare -a cronjobs=(backup_fetcher check_fsid_mismatch heat-manage purge_deleted AIDE create_cert_exp_alarm_uc)
5489 fi
5490 for cronjob in "${cronjobs[@]}"
5491 do
5492 missing_cronjob=$(echo -e "$crontab_list" | grep $cronjob)
5493 if [[ -z $missing_cronjob ]]
5494 then
5495 echo -e "${RED}$cronjob crontab job is missing${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5496 else
5497 echo -e "${GREEN}$cronjob crontab job is found${NC}"
5498 fi
5499 done
5500 elapsed_time_seconds=$(expr $(date +%s) - $start)
5501
5502
5503 ####################################################################################################
5504
5505
5506 start=$(date +%s)
5507 STEPS_COUNTER=$((STEPS_COUNTER+1))
5508 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DUPLICATE CRONJOBS IN CRONTAB (+$elapsed_time_seconds `date '+%T'`)${NC}"
5509 duplicate_cronjob=$(ansible controller -b -m shell -a "crontab -l | grep -E -v '\#|AIDE' | sort | uniq -c | grep -E '^\s+[2-9] '" | grep -E -v 'non-zero return code|FAILED')
5510 if [[ $duplicate_cronjob ]]
5511 then
5512 echo -e "${RED}$duplicate_cronjob${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5513 echo -e "\n${ORANGE}CBIS-13672 - Duplicated zabbix metrics cronjob in controller (Detected in 19.0 MP4 PP1)${NC}"
5514 else
5515 echo -e "${GREEN}no duplicate cronjobs are found${NC}"
5516 fi
5517 elapsed_time_seconds=$(expr $(date +%s) - $start)
5518
5519
5520 ####################################################################################################
5521
5522
5523 start=$(date +%s)
5524 STEPS_COUNTER=$((STEPS_COUNTER+1))
5525 echo -e "${BLUE}\n\n$STEPS_COUNTER) PERFORM A LOGROTATE DRY RUN TO LOCATE ERRORS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5526 logrotate_linux=$(ansible all --limit '!hypervisor' -b -m shell -a "ls /etc/logrotate.d/ | tr -s ' ' /n | xargs -i logrotate -vdf /etc/logrotate.d/{} 2>&1 | grep -v 'No such file or directory' | grep -i error:" | grep -v -E 'rc=[1-9]|non-zero return code')
5527 logrotate_containers=$(ansible all --limit '!hypervisor' -b -m shell -a "logrotate -vdf /var/lib/config-data/puppet-generated/crond/etc/logrotate-crond.conf 2>&1 | grep -v 'No such file or directory' | grep -i error:" | grep -v -E 'rc=[1-9]|non-zero return code')
5528 if [[ $logrotate_linux || $logrotate_containers ]]
5529 then
5530 echo -e "${RED}$logrotate_linux\n$logrotate_containers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5531 echo -e "\n${ORANGE}CBIS-16153 - zabbix_ceph_racks, zabbix_cbis and zabbix-agent logrotate conf files have permission issues (20)${NC}"
5532 else
5533 echo -e "${GREEN}no errors were found in the logrotate dry run${NC}"
5534 fi
5535 elapsed_time_seconds=$(expr $(date +%s) - $start)
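# illustrative manual equivalent on a single host (-v verbose, -d debug/dry-run, -f force); the target
# config file is only an example and may differ per host:
# logrotate -vdf /etc/logrotate.d/syslog 2>&1 | grep -i error: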
5536
5537
5538 ####################################################################################################
5539
5540
5541 start=$(date +%s)
5542 STEPS_COUNTER=$((STEPS_COUNTER+1))
5543 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE LOGROTATE CRONJOB IS CONFIGURED WITHIN THE logrotate_crond DOCKER CONTAINER OF EACH SERVER (+$elapsed_time_seconds `date '+%T'`)${NC}"
5544 logrotate_crond_crontab=$(ansible all --limit '!hypervisor' -b -m shell -a "docker exec -i logrotate_crond crontab -l | grep '/usr/sbin/logrotate -s' -c" | grep ^[0-9] | sort | uniq)
5545 if [[ $logrotate_crond_crontab != "1" ]]
5546 then
5547 logrotate_crond_crontab=$(ansible all --limit '!hypervisor' -b -m shell -a "docker exec -i logrotate_crond crontab -l | grep '/usr/sbin/logrotate -s'")
5548 echo -e "${RED}$logrotate_crond_crontab${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5549 else
5550 echo -e "${GREEN}the logrotate cronjob is configured within the logrotate_crond docker container of each server${NC}"
5551 fi
5552 elapsed_time_seconds=$(expr $(date +%s) - $start)
5554
5555
5556 ####################################################################################################
5557
5558
5559 start=$(date +%s)
5560 STEPS_COUNTER=$((STEPS_COUNTER+1))
5561 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT LOG FILES UNDER /var/log/containers/ ARE LOG-ROTATED (+$elapsed_time_seconds `date '+%T'`)${NC}"
5562 servers_without_logrotation=$(ansible all --limit '!hypervisor' -b -m shell -a "find /var/log/containers/ -iname '*.log*' | grep -v -E log$ | xargs -i ls -l {}" | grep -e '^$' -B 1 | awk '{print $1}' | column -t | sort | grep overcloud-)
5563 if [[ $servers_without_logrotation ]]
5564 then
5565 echo -e "${RED}unable to find a single log-rotated log file under /var/log/containers/* for the following hosts:\n\n$servers_without_logrotation${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5566 echo -e "\n\n${ORANGE}CBIS-16404 (19A) - logrotate isn't working on all the overcloud servers - daily cronjob doesn't work${NC}"
5567 else
5568 echo -e "${GREEN}at least one log file is found log-rotated on each host${NC}"
5569 fi
5570 elapsed_time_seconds=$(expr $(date +%s) - $start)
5571
5572
5573 ####################################################################################################
5574
5575
5576 start=$(date +%s)
5577 STEPS_COUNTER=$((STEPS_COUNTER+1))
5578 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THERE ARE NO DIFFERENCES IN THE LOGROTATE CONFIG FILES BETWEEN THE HOSTS OF EACH HOST-GROUP (+$elapsed_time_seconds `date '+%T'`)${NC}"
5579 echo -e "${CYAN}checking controllers${NC}"
5580 logrotate_sha1=$(ansible controller -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -E '^\s+[1-2]')
5581 if [[ $logrotate_sha1 ]]
5582 then
5583 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5584 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5585 else
5586 echo -e "${GREEN}the logrotate configuration files are identical on all the controllers${NC}"
5587 fi
5588 if [ $ansible_avrs_hosts_count -gt 1 ]
5589 then
5590 echo -e "${CYAN}checking avrs computes${NC}"
5591 logrotate_sha1=$(ansible *overcloud-[aA]vrs* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_avrs_hosts_count'')
5592 if [[ $logrotate_sha1 ]]
5593 then
5594 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5595 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5596 else
5597 echo -e "${GREEN}the logrotate configuration files are identical on all the avrs computes${NC}"
5598 fi
5599 fi
5600 if [ $ansible_dpdk_hosts_count -gt 1 ]
5601 then
5602 echo -e "${CYAN}checking dpdk computes${NC}"
5603 logrotate_sha1=$(ansible *overcloud-[dD]pdk* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_dpdk_hosts_count'')
5604 if [[ $logrotate_sha1 ]]
5605 then
5606 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5607 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5608 else
5609 echo -e "${GREEN}the logrotate configuration files are identical on all the dpdk computes${NC}"
5610 fi
5611 fi
5612 if [ $ansible_ovs_hosts_count -gt 1 ]
5613 then
5614 echo -e "${CYAN}checking ovs computes${NC}"
5615 logrotate_sha1=$(ansible *overcloud-[oO]vs* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_ovs_hosts_count'')
5616 if [[ $logrotate_sha1 ]]
5617 then
5618 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5619 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5620 else
5621 echo -e "${GREEN}the logrotate configuration files are identical on all the ovs computes${NC}"
5622 fi
5623 fi
5624 if [ $ansible_sriov_hosts_count -gt 1 ]
5625 then
5626 echo -e "${CYAN}checking sriov computes${NC}"
5627 logrotate_sha1=$(ansible *overcloud-[sS]riov* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_sriov_hosts_count'')
5628 if [[ $logrotate_sha1 ]]
5629 then
5630 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5631 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5632 else
5633 echo -e "${GREEN}the logrotate configuration files are identical on all the sriov computes${NC}"
5634 fi
5635 fi
5636 if [ $ansible_storage_hosts_count -gt 1 ]
5637 then
5638 echo -e "${CYAN}checking storage nodes${NC}"
5639 logrotate_sha1=$(ansible *overcloud-[sS]torage* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_storage_hosts_count'')
5640 if [[ $logrotate_sha1 ]]
5641 then
5642 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5643 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5644 else
5645 echo -e "${GREEN}the logrotate configuration files are identical on all the storage nodes${NC}"
5646 fi
5647 fi
5648 elapsed_time_seconds=$(expr $(date +%s) - $start)
5649
5650
5651 ####################################################################################################
5652
5653
5654 if [[ -f "$logs_dir/sriov_vf_interfaces" ]]
5655 then
5656 : # already checked in a previous run
5657 else
5658 start=$(date +%s)
5659 STEPS_COUNTER=$((STEPS_COUNTER+1))
5660 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE 2 SRIOV NICS ARE MAPPED TO THE 2 NUMAS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5661 touch $logs_dir/sriov_vf_interfaces
5662 if [[ $hw_model == "airframe_or" || $hw_model == "hp-c7kg9" ]]
5663 then
5664 echo -e "${ORANGE}not applicable for airframe_or (OR17) and hp-c7kg9.\nwhen performing/designing tests around these hardware models, note that the sriov network interfaces are all mapped to a single numa (numa 0)${NC}"
5665 else
5666 if [[ $ansible_sriov_hosts_count == "0" ]]
5667 then
5668 echo -e "${ORANGE}couldn't find sriov computes${NC}"
5669 else
5670 sriov_numa_mapping=$(ansible Sriov -b -m shell -a "ip link show | grep 'vf 0' -B2 | grep ^[0-9] | awk -F: '{print \$2}' | xargs -i cat /sys/class/net/{}/device/numa_node | sort -u | wc -l" | grep ^[0-9] | sort -u)
5671 if [[ $sriov_numa_mapping == "2" ]]
5672 then
5673 echo -e "${GREEN}the sriov computes network interfaces are mapped to each individual NUMA as expected${NC}"
5674 echo -e "$sriov_numa_mapping" > $logs_dir/sriov_vf_interfaces
5675 else
5676 sriov_vf_interfaces=$(ansible Sriov -b -m shell -a "ip link show | grep 'vf 0' -B2 | grep ^[0-9] | awk -F: '{print \$2}' | xargs -i echo /sys/class/net/{}/device/numa_node")
5677 echo -e "${RED}$sriov_vf_interfaces${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5678
5679 fi
5680 fi
5681 fi
5682 fi
5683 elapsed_time_seconds=$(expr $(date +%s) - $start)
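# note (illustrative): the NUMA affinity of a network interface can be read directly from sysfs;
# replace <interface> with a real SR-IOV PF name on the compute (the name below is only an example):
# cat /sys/class/net/<interface>/device/numa_node     # prints 0, 1, or -1 when unknown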
5684
5685
5686 ####################################################################################################
5687
5688
5689 start=$(date +%s)
5690 STEPS_COUNTER=$((STEPS_COUNTER+1))
5691 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR DISK ERRORS USING SMARTCTL (+$elapsed_time_seconds `date '+%T'`)${NC}"
5692 disk_errors=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "smartctl --scan | cut -d' ' -f1-3 | while read line; do smartctl -x \${line} | awk '{ if ( \$4 > 10 ) print \$1,\$2,\$3,\$4 }'; done | grep 'Device Error Count'" | grep -E -v 'FAILED|non-zero return code')
5693 if [[ -z $disk_errors ]]
5694 then
5695 echo -e "${GREEN}no device errors found${NC}"
5696 else
5697 echo -e "${RED}$disk_errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5698 fi
5699 elapsed_time_seconds=$(expr $(date +%s) - $start)
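# illustrative manual equivalent on a single host: enumerate the disks smartctl can see and print the
# error counter of one of them; the check above flags counters greater than 10 (/dev/sda is only an example):
# smartctl --scan
# smartctl -x /dev/sda | grep 'Device Error Count'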
5700
5701
5702 ####################################################################################################
5703
5704
5705 start=$(date +%s)
5706 STEPS_COUNTER=$((STEPS_COUNTER+1))
5707 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE 'Too many open files' IN /var/log/messages ON ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5708 files=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -i -R 'Too many open files' /var/log/messages" | grep -E -v 'Invoked with warn=True|SUCCESS')
5709 if [[ $files ]]
5710 then
5711 files=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -i -R 'Too many open files' /var/log/messages | grep -v 'Invoked with warn=True'"| grep -E -v 'FAILED|non-zero return code')
5712 echo -e "${RED}$files${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5713 else
5714 echo -e "${GREEN}no 'Too many open files' lines found under /var/log/messages${NC}"
5715 fi
5716 elapsed_time_seconds=$(expr $(date +%s) - $start)
5717
5718
5719 ####################################################################################################
5720
5721
5722 start=$(date +%s)
5723 STEPS_COUNTER=$((STEPS_COUNTER+1))
5724 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK RPMS DIFFERENCES BETWEEN THE HOSTS OF EACH HOST-GROUP (+$elapsed_time_seconds `date '+%T'`)${NC}"
5725 echo -e "${CYAN}checking for rpms delta between all the > controllers < (to cover a possible rpms mismatch after controller replacement)${NC}"
5726 md5sum=$(ansible controller -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5727 if [[ $md5sum == "1" ]]
5728 then
5729 echo -e "${GREEN}no differences are found between the current installed rpms of the controllers${NC}"
5730 else
5731 echo -e "${RED}one or more controllers have a different md5 checksum. compare the latest_installed_rpms.txt file of each controller${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5732 md5sum=$(ansible controller -b -m shell -a "md5sum latest_installed_rpms.txt")
5733 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5734 fi
5735 if [[ $ansible_dpdk_hosts_count != "0" && $ansible_dpdk_hosts_count != "1" ]]
5736 then
5737 echo -e "${CYAN}checking for rpms delta between all the dpdk computes${NC}"
5738 md5sum=$(ansible Dpdk* -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5739 if [[ $md5sum == "1" ]]
5740 then
5741 echo -e "${GREEN}no differences are found between the current installed rpms of the dpdk computes${NC}"
5742 else
5743 echo -e "${RED}one or more computes have a different md5 checksum. compare the latest_installed_rpms.txt file of each compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5744 md5sum=$(ansible Dpdk* -b -m shell -a "md5sum latest_installed_rpms.txt")
5745 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5746 fi
5747 fi
5748 if [[ $ansible_sriov_hosts_count != "0" && $ansible_sriov_hosts_count != "1" ]]
5749 then
5750 echo -e "${CYAN}checking for rpms delta between all the sriov computes${NC}"
5751 md5sum=$(ansible Sriov -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5752 if [[ $md5sum == "1" ]]
5753 then
5754 echo -e "${GREEN}no differences are found between the current installed rpms of the sriov computes${NC}"
5755 else
5756 echo -e "${RED}one or more computes have a different md5 checksum. compare the latest_installed_rpms.txt file of each compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5757 md5sum=$(ansible Sriov -b -m shell -a "md5sum latest_installed_rpms.txt")
5758 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5759 fi
5760 fi
5761 if [[ $ansible_ovs_hosts_count != "0" && $ansible_ovs_hosts_count != "1" ]]
5762 then
5763 echo -e "${CYAN}checking for rpms delta between all the ovs computes${NC}"
5764 md5sum=$(ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5765 if [[ $md5sum == "1" ]]
5766 then
5767 echo -e "${GREEN}no differences are found between the current installed rpms of the ovs computes${NC}"
5768 else
5769 echo -e "${RED}one or more computes have a different md5 checksum. compare the latest_installed_rpms.txt file of each compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5770 md5sum=$(ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum latest_installed_rpms.txt")
5771 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5772 fi
5773 fi
5774 if [[ $ansible_avrs_hosts_count != "0" && $ansible_avrs_hosts_count != "1" && $nuage == "true" ]]
5775 then
5776 echo -e "${CYAN}checking for rpms delta between all the avrs computes${NC}"
5777 md5sum=$(ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5778 if [[ $md5sum == "1" ]]
5779 then
5780 echo -e "${GREEN}no differences are found between the current installed rpms of the avrs computes${NC}"
5781 else
5782 echo -e "${RED}one or more computes have a different md5 checksum. compare the latest_installed_rpms.txt file of each compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5783 md5sum=$(ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum latest_installed_rpms.txt")
5784 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5785 fi
5786 fi
5787 if [[ $ansible_storage_hosts_count != "0" ]]
5788 then
5789 echo -e "${CYAN}checking for rpms delta between all the > storage nodes < (to cover a possible rpms mismatch after scale-out)${NC}"
5790 md5sum=$(ansible *overcloud-[Ss]torage* -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5791 if [[ $md5sum == "1" ]]
5792 then
5793 echo -e "${GREEN}no differences are found between the current installed rpms of the storage nodes${NC}"
5794 else
5795 echo -e "${RED}one or more storage nodes have a different md5 checksum. compare the latest_installed_rpms.txt file of each storage node${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5796 md5sum=$(ansible *overcloud-[Ss]torage* -b -m shell -a "md5sum latest_installed_rpms.txt")
5797 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5798 fi
5799 fi
5800 elapsed_time_seconds=$(expr $(date +%s) - $start)
5801
5802
5803 ####################################################################################################
5804
5805
5806 start=$(date +%s)
5807 STEPS_COUNTER=$((STEPS_COUNTER+1))
5808 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT /root/cbis-installer/user_config.yaml EXISTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5809 user_config_hv=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "test -f /root/cbis-installer/user_config.yaml && echo '/root/cbis-installer/user_config.yaml exists in the hypervisor'" | grep exists)
5810 if [[ $user_config_hv ]]
5811 then
5812 echo -e "${GREEN}$user_config_hv${NC}"
5813 else
5814 echo -e "${RED}/root/cbis-installer/user_config.yaml can't be found in the hypervisor${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5815 fi
5816 elapsed_time_seconds=$(expr $(date +%s) - $start)
5817
5818
5819 ####################################################################################################
5820
5821
5822 start=$(date +%s)
5823 STEPS_COUNTER=$((STEPS_COUNTER+1))
5824 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE TIME IT TAKES FOR nova list TO FINISH PROCESSING FROM BOTH stackrc AND overcloudrc AND REPORT FAILURE IF IT IS GREATER THAN 10 SECONDS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5825 # when using self signed certificate the following error may show: Certificate for 10.55.220.115 has no `subjectAltName`, falling back to check for a `commonName` for now. This feature is being removed by major browsers and deprecated by RFC 2818.
5826 # this certificate warning is expected and not a bug according to Yves Brissette.
5827 source ~/stackrc
5828 for i in {1..3}
5829 do
5830 start_time=$(date +%s)
5831 nova list > /dev/null
5832 end_time=$(date +%s)
5833 result_in_seconds=$(expr $end_time - $start_time)
5834 if [ $result_in_seconds -gt 10 ]
5835 then
5836 echo -e "${RED}try $i/3 - nova list (stackrc) finished processing after $result_in_seconds seconds (fail criteria is > 10 seconds)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5837 else
5838 echo -e "${GREEN}try $i/3 - nova list (stackrc) finished processing after $result_in_seconds seconds${NC}"
5839 fi
5840 done
5841 source ~/overcloudrc
5842 for i in {1..3}
5843 do
5844 start_time=$(date +%s)
5845 nova list > /dev/null
5846 end_time=$(date +%s)
5847 result_in_seconds=$(expr $end_time - $start_time)
5848 if [ $result_in_seconds -gt 10 ]
5849 then
5850 echo -e "${RED}try $i/3 - nova list (overcloudrc) finished processing after $result_in_seconds seconds (fail criteria is > 10 seconds)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5851 else
5852 echo -e "${GREEN}try $i/3 - nova list (overcloudrc) finished processing after $result_in_seconds seconds${NC}"
5853 fi
5854 done
5855 elapsed_time_seconds=$(expr $(date +%s) - $start)
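# note (illustrative): each 'nova list' above is timed with plain epoch arithmetic; the same measurement
# can be reproduced manually (assumes the relevant rc file is sourced):
# source ~/overcloudrc && time nova list > /dev/null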
5856
5857
5858 ####################################################################################################
5859
5860
5861 start=$(date +%s)
5862 STEPS_COUNTER=$((STEPS_COUNTER+1))
5863 echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY admin CORE RESOURCES QUOTA IS -1 (UNLIMITED) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5864 quota=$(source ~/overcloudrc && openstack quota show admin | grep -E -w 'backups|cores|instances|networks|ram|volumes' | awk '{print $4}' | sort -u)
5865 if [[ $quota != "-1" ]]
5866 then
5867 quota=$(source ~/overcloudrc && openstack quota show admin)
5868 echo -e "${RED}$quota${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5869 else
5870 echo -e "${GREEN}core resources (backups, cores, instances, networks, ram and volumes) are all set to -1 (unlimited) quota${NC}"
5871 fi
5872 elapsed_time_seconds=$(expr $(date +%s) - $start)
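# illustrative remediation (not performed by this script): a core resource that is not unlimited can
# usually be reset with the openstack client, e.g.:
# source ~/overcloudrc && openstack quota set --cores -1 --instances -1 --ram -1 admin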
5873
5874
5875 ####################################################################################################
5876
5877
5878 start=$(date +%s)
5879 STEPS_COUNTER=$((STEPS_COUNTER+1))
5880 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT CORE OPENSTACK CLIENTS USE THE EXPECTED VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
5881 current=( $(ansible all --limit '!hypervisor' -b -m shell -a "openstack module list | tr -d '\|\+\-' | grep -E -w 'aodhclient|barbicanclient|cinderclient|glanceclient|heatclient|ironicclient|keystoneclient|mistralclient|novaclient|openstack|openstackclient|swiftclient|vitrageclient'" | grep -v SUCCESS | awk '{print $1,$2}' | sort -u | tr -s ' ' '-' | paste -sd' ') )
5882 if [[ $cbis_version == "19.0.0.1" ]]
5883 then
5884 expected=(aodhclient-1.0.0 barbicanclient-4.6.0 cinderclient-3.5.0 glanceclient-2.10.0 heatclient-1.14.0 ironicclient-2.2.1 keystoneclient-3.15.0 mistralclient-3.3.0 novaclient-10.1.0 openstack-0.11.3 openstackclient-3.14.2 swiftclient-3.5.0 vitrageclient-0.0.1)
5885 expected_current_diff=(`echo ${current[@]} ${expected[@]} | tr ' ' '\n' | sort | uniq -u | paste -sd'|'`)
5886 if [[ $expected_current_diff ]]
5887 then
5888 echo -e "${RED}$expected_current_diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5889 else
5890 echo -e "${GREEN}all diagnosed openstack clients are at the expected version${NC}"
5891 fi
5892 fi
5893 if [[ $cbis_version == "19.100.1" ]]
5894 then
5895 expected=(aodhclient-1.1.0 barbicanclient-4.7.2 cinderclient-4.0.2 glanceclient-2.13.1 heatclient-1.16.2 ironicclient-2.5.1 keystoneclient-3.17.0 mistralclient-3.7.0 novaclient-11.0.0 openstack-0.17.2 openstackclient-3.16.2 swiftclient-3.6.0 vitrageclient-2.3.0)
5896 expected_current_diff=(`echo ${current[@]} ${expected[@]} | tr ' ' '\n' | sort | uniq -u | paste -sd'|'`)
5897 if [[ $expected_current_diff ]]
5898 then
5899 echo -e "${RED}$expected_current_diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5900 else
5901 echo -e "${GREEN}all diagnosed openstack clients are at the expected version${NC}"
5902 fi
5903 fi
5904 if [[ $cbis_version == "20.100.1" ]]
5905 then
5906 expected=(aodhclient-1.1.0 barbicanclient-4.7.2 cinderclient-4.0.2 glanceclient-2.13.1 heatclient-1.16.2 ironicclient-2.5.1 keystoneclient-3.17.0 mistralclient-3.7.0 novaclient-11.0.0 openstack-0.17.2 openstackclient-3.16.2 swiftclient-3.6.0)
5907 expected_current_diff=(`echo ${current[@]} ${expected[@]} | tr ' ' '\n' | sort | uniq -u | paste -sd'|'`)
5908 if [[ $expected_current_diff ]]
5909 then
5910 echo -e "${RED}$expected_current_diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5911 else
5912 echo -e "${GREEN}all diagnosed openstack clients are at the expected version${NC}"
5913 fi
5914 fi
5915 elapsed_time_seconds=$(expr $(date +%s) - $start)
5916
5917
5918 ####################################################################################################
5919
5920
5921 start=$(date +%s)
5922 STEPS_COUNTER=$((STEPS_COUNTER+1))
5923 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT THE CBIS REPO IS ENABLED ON THE OVERCLOUD HOSTS (ICE-575) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5924 cbis_repo_enabled=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "yum repolist all warn=False" | grep -E '^CBIS |^!CBIS ' | grep -v enabled)
5925 if [[ -z $cbis_repo_enabled ]]
5926 then
5927 echo -e "${GREEN}CBIS repo is enabled on all the overcloud hosts${NC}"
5928 else
5929 cbis_repo_enabled=$(ansible all --limit '!hypervisor' -b -m shell -a "yum repolist all | grep -E '^CBIS |^!CBIS ' warn=False")
5930 echo -e "${RED}$cbis_repo_enabled${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5931 fi
5932 elapsed_time_seconds=$(expr $(date +%s) - $start)
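# illustrative remediation (not performed by this script): a disabled repo can typically be re-enabled
# with yum-config-manager; the repo id below is only an example and may differ per CBIS release:
# yum-config-manager --enable CBIS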
5933
5934
5935 ####################################################################################################
5936
5937
5938 start=$(date +%s)
5939 STEPS_COUNTER=$((STEPS_COUNTER+1))
5940 echo -e "${BLUE}\n\n$STEPS_COUNTER) SHOW THE LATEST REBOOT + salt CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
5941 LAST_REBOOT=$(/home/stack/venv/salt-ssh/bin/python /home/stack/venv/salt-ssh/bin/salt-ssh -c /home/stack/salt/etc/salt/ --log-file /home/stack/salt/var/log/salt/ssh --no-host-keys "*" cmd.run "last reboot | grep reboot | head -n 1")
5942 echo -e "${GREEN}$LAST_REBOOT${NC}"
5943 elapsed_time_seconds=$(expr $(date +%s) - $start)
5944
5945
5946 ####################################################################################################
5947
5948 # start=$(date +%s)
5949 # STEPS_COUNTER=$((STEPS_COUNTER+1))
5950 # echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE ATTACHED USB DEVICES ON ALL HOSTS(+$elapsed_time_seconds `date '+%T'`)${NC}"
5951 # usb=$(ansible all -b -m shell -a "lsblk --all -S | grep usb" | grep usb -B 1)
5952 # if [[ $usb ]]
5953 # then
5954 # echo -e "${RED}$usb${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5955 # else
5956 # echo -e "${GREEN}couldn't find attached usb devices${NC}"
5957 # fi
5958 # elapsed_time_seconds=$(expr $(date +%s) - $start)
5959
5960
5961 ####################################################################################################
5962
5963
5964 # start=$(date +%s)
5965 # STEPS_COUNTER=$((STEPS_COUNTER+1))
5966 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT PER-USER OPEN FILES LIMIT IS IDENTICAL FOR ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5967 # open_files_limit_per_user=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /etc/security/limits.conf" | grep -E 'soft memlock|hard memlock' | awk '{print $NF}' | sort --uniq)
5968 # if [[ $open_files_limit_per_user != "unlimited" ]]
5969 # then
5970 # open_files_limit_per_user=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /etc/security/limits.conf | grep -E 'soft memlock|hard memlock' | awk '{print $NF}' | sort --uniq")
5971 # echo -e "${RED}$open_files_limit_per_user${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5972 # else
5973 # echo -e "${GREEN}all the hosts returned unlimited for both soft memlock and hard memlock under /etc/security/limits.conf${NC}"
5974 # fi
5975 # elapsed_time_seconds=$(expr $(date +%s) - $start)
5976
5977
5978 ####################################################################################################
5979
5980
5981 # start=$(date +%s)
5982 # STEPS_COUNTER=$((STEPS_COUNTER+1))
5983 # echo -e "${BLUE}\n\n$STEPS_COUNTER) ZABBIX LOGIN CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
5984 # http_availability=$(nc -v -i1 -w1 135.248.16.107 80 2>&1 | grep 'Connected to')
5985 # if [[ $http_availability ]]
5986 # then
5987 # ansible controller -b -m shell -a "wget -N http://135.248.16.107/testmanager/cbis/scripts/python_scripts/zabbix_connectivity.py warn=False" > /dev/null
5988 # zabbix=$(ansible controller -b -m shell -a "python zabbix_connectivity.py" | grep -c Running)
5989 # if [[ $zabbix == "3" || $zabbix == "1" ]]
5990 # then
5991 # echo -e "${GREEN}zabbix login page is accessible (via API) from all the controllers${NC}"
5992 # else
5993 # zabbix=$(ansible controller -b -m shell -a "python zabbix_connectivity.py")
5994 # echo -e "${RED}$zabbix${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5995 # fi
5996 # else
5997 # echo -e "${RED}can't download the zabbix script from 135.248.16.107 - check for zabbix alarms manually${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5998 # fi
5999 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6000
6001
6002 ####################################################################################################
6003
6004
6005 # start=$(date +%s)
6006 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6007 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT sshd_migration_fix AND sshd_migration_fix_2 RESIDE WITHIN THE SELINUX POLICY MODULES (+$elapsed_time_seconds `date '+%T'`)${NC}"
6008 # selinux_migrate_fix=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "semodule -l" | grep -E 'sshd_migration_fix_2|sshd_migration_fix' | sort -u | wc -l)
6009 # if [[ $selinux_migrate_fix != "2" ]]
6010 # then
6011 # selinux_migrate_fix=$(ansible compute -b -m shell -a "semodule -l | grep -E 'sshd_migration_fix_2|sshd_migration_fix'")
6012 # echo -e "${RED}$selinux_migrate_fix${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6013 # else
6014 # echo -e "${GREEN}sshd_migration_fix and sshd_migration_fix_2 reside within the SELinux policy modules of the computes${NC}"
6015 # fi
6016 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6017
6018
6019 ####################################################################################################
6020
6021
6022 # start=$(date +%s)
6023 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6024 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK EACH HOST FREE MEMORY AND REPORT FAILURE IF A HOST HAD LESS THEN 2G FREE MEMORY (free) (+$elapsed_time_seconds `date '+%T'`)${NC}"
6025 # free_memory=$(ansible all --limit '!hypervisor' -b -m shell -a "free -g | grep Mem: | awk '{ if ( \$4 < 1 ) print \$4 }'" | grep ^[0-9] -B1)
6026 # if [[ $free_memory ]]
6027 # then
6028 # echo -e "${RED}$free_memory${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6029 # else
6030 # echo -e "${GREEN}all the servers has equal or more then 2G free memory${NC}"
6031 # fi
6032 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6033
6034
6035 ####################################################################################################
6036
6037
6038 # start=$(date +%s)
6039 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6040 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT SYSTEM FILE DESCRIPTOR LIMIT AND USAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
6041 # used_file_descriptor=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /proc/sys/fs/file-nr | awk '{print \$1}'")
6042 # total_file_descriptor=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /proc/sys/fs/file-nr | awk '{print \$3}'")
6043 # if [[ $used_file_descriptor -ge $total_file_descriptor ]]
6044 # then
6045 # file_descriptor=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /proc/sys/fs/file-nr")
6046 # echo -e "${RED}$file_descriptor${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6047 # else
6048 # echo -e "${GREEN}the file descriptor limit is not reached${NC}"
6049 # fi
6050 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6051
6052
6053 ####################################################################################################
6054
6055
6056 # start=$(date +%s)
6057 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6058 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT RABBITMQ FILE DESCRIPTOR/SOCKETS/PROCESSES LIMIT AND USAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
6059 # ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl report | grep file_descriptors -A5"
6060 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6061
6062
6063 ####################################################################################################
6064
6065
6066 # start=$(date +%s)
6067 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6068 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT MYSQL DATABASES SIZE (+$elapsed_time_seconds `date '+%T'`)${NC}"
6069 # ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT table_schema 'DATABASE', sum(data_length + index_length)/1024/1024 'SIZE_IN_MB' FROM information_schema.TABLES GROUP BY table_schema;\"" | column -t
6070 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6071
6072
6073 ####################################################################################################
6074
6075
6076 # start=$(date +%s)
6077 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6078 # if [[ $ansible_storage_hosts_count != "0" ]]
6079 # then
6080 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK IF THE CephStorage GROUP WITHIN THE /etc/ansible/hosts FILE IS WITHOUT COMPUTE HOSTS (BASED ON CBIS-13164) (+$elapsed_time_seconds `date '+%T'`)${NC}"
6081 # $ansible_hosts=$(sed -n -e '/[[Cc]eph[Ss]torage\]/,/\[/ p' /etc/ansible/hosts | grep overcloud | grep -v [Ss]torage | sort -u)
6082 # if [[ $ansible_hosts ]]
6083 # then
6084 # echo -e "${RED}the following servers should not appear under the CephStorage group in /etc/ansible/hosts${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6085 # echo -e "${RED}$ansible_hosts${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6086 # else
6087 # echo -e "${GREEN}no servers other then storage-nodes found under the cephstorage group within /etc/ansible/hosts ${NC}"
6088 # fi
6089 # fi
6090 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6091
6092
6093 ####################################################################################################
6094
6095
6096 # start=$(date +%s)
6097 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6098 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECKING THE OVERCLOUDRC FILE VALIDITY (+$elapsed_time_seconds `date '+%T'`)${NC}"
6099 # expected_lines=$(cat ~/overcloudrc | grep -c -E 'OS_USERNAME=admin|OS_CLOUDNAME=overcloud|OS_PROJECT_NAME=admin|OS_TENANT_NAME=admin|OS_CACERT=/home/stack/ca.crt.pem')
6100 # if [[ $expected_lines == "5" ]]
6101 # then
6102 # echo -e "${GREEN}the expected configuration in the overcloudrc file was found${NC}"
6103 # else
6104 # echo -e "${RED}one or more from the following lines missing from the overcloudrc file\nOS_USERNAME=admin\nOS_CLOUDNAME=overcloud\nOS_PROJECT_NAME=admin\nOS_TENANT_NAME=admin\nOS_CACERT=/home/stack/ca.crt.pem${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6105 # cat ~/overcloudrc
6106 # fi
6107 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6108
6109
6110 ####################################################################################################
6111
6112
6113 # start=$(date +%s)
6114 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6115 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT VITRAGE ALARMS HISTORY FROM THE DB FROM THE PAST 1 HOUR (+$elapsed_time_seconds `date '+%T'`)${NC}"
6116 # from_date=$(date -d "-1 hour" +%Y"-"%m"-"%d" "%T)
6117 # to_date=$(date -d "-0 hour" +%Y"-"%m"-"%d" "%T)
6118 # vitrage_history=$(ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT created_at,name FROM alarms WHERE start_timestamp BETWEEN '$from_date' AND '$to_date'\G;\" vitrage | grep -v '\*' | grep -v 'rc=0'" | grep created_at -A1)
6119 # if [[ -z $vitrage_history ]]
6120 # then
6121 # echo -e "${GREEN}no alarms were intiated in the past 1 hour${NC}"
6122 # else
6123 # echo -e "${RED}$vitrage_history${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6124 # fi
6125 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6126
6127
6128 ####################################################################################################
6129
6130
6131 # start=$(date +%s)
6132 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6133 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT /etc/sysconfig/iptables IS IDENTICAL BETWEEN ALL THE HOSTS UNDER EACH HOST-GROUP (+$elapsed_time_seconds `date '+%T'`)${NC}"
6134 # echo -e "\n${CYAN}now checking DpdkPerformanceCompute${NC}"
6135 # if [[ $ansible_dpdk_hosts_count != "0" ]]
6136 # then
6137 # iptables=$(ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u | wc -l)
6138 # if [[ $iptables != "1" ]]
6139 # then
6140 # echo -e "${RED}one or more dpdk computes has different content inside /etc/sysconfig/iptables${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6141 # iptables=$(ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables")
6142 # echo -e "${RED}$iptables${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6143 # if [[ ! -f "dpdk_initial_iptables_config" ]]
6144 # then
6145 # ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > dpdk_initial_iptables_config
6146 # else
6147 # ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > dpdk_latest_iptables_config
6148 # diff=$(sudo diff -s dpdk_initial_iptables_config dpdk_latest_iptables_config | grep -c 'Files dpdk_initial_iptables_config and dpdk_latest_iptables_config are identical')
6149 # if [[ $diff != "1" ]]
6150 # then
6151 # echo -e "\n${RED}differences were found between dpdk_initial_iptables_config and dpdk_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6152 # diff=$(diff dpdk_initial_iptables_config dpdk_latest_iptables_config)
6153 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6154 # else
6155 # echo -e "\n${GREEN}no differences were found between dpdk_initial_iptables_config and dpdk_latest_iptables_config${NC}"
6156 # fi
6157 # fi
6158 # else
6159 # echo -e "${GREEN}/etc/sysconfig/iptable is idetical on all the dpdk servers${NC}"
6160 # if [[ ! -f "dpdk_initial_iptables_config" ]]
6161 # then
6162 # ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > dpdk_initial_iptables_config
6163 # else
6164 # ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > dpdk_latest_iptables_config
6165 # diff=$(sudo diff -s dpdk_initial_iptables_config dpdk_latest_iptables_config | grep -c 'Files dpdk_initial_iptables_config and dpdk_latest_iptables_config are identical')
6166 # if [[ $diff != "1" ]]
6167 # then
6168 # echo -e "\n${RED}differences were found between dpdk_initial_iptables_config and dpdk_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6169 # diff=$(diff dpdk_initial_iptables_config dpdk_latest_iptables_config)
6170 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6171 # else
6172 # echo -e "${GREEN}no differences were found between dpdk_initial_iptables_config and dpdk_latest_iptables_config${NC}"
6173 # fi
6174 # fi
6175 # fi
6176 # else
6177 # echo -e "${ORANGE}no dpdk computes found${NC}"
6178 # fi
6179 # echo -e "\n${CYAN}now checking OvsCompute${NC}"
6180 # if [[ $ansible_ovs_hosts_count != "0" ]]
6181 # then
6182 # iptables=$(ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u | wc -l)
6183 # if [[ $iptables != "1" ]]
6184 # then
6185 #                 echo -e "${RED}one or more OVS computes have different content inside /etc/sysconfig/iptables${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6186 # iptables=$(ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables")
6187 # echo -e "${RED}$iptables${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6188 # if [[ ! -f "ovs_initial_iptables_config" ]]
6189 # then
6190 # ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > ovs_initial_iptables_config
6191 # else
6192 # ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > ovs_latest_iptables_config
6193 # diff=$(sudo diff -s ovs_initial_iptables_config ovs_latest_iptables_config | grep -c 'Files ovs_initial_iptables_config and ovs_latest_iptables_config are identical')
6194 # if [[ $diff != "1" ]]
6195 # then
6196 # echo -e "\n${RED}differences were found between ovs_initial_iptables_config and ovs_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6197 # diff=$(diff ovs_initial_iptables_config ovs_latest_iptables_config)
6198 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6199 # else
6200 # echo -e "\n${GREEN}no differences were found between ovs_initial_iptables_config and ovs_latest_iptables_config${NC}"
6201 # fi
6202 # fi
6203 # else
6204 #                 echo -e "${GREEN}/etc/sysconfig/iptables is identical on all the OVS servers${NC}"
6205 # if [[ ! -f "ovs_initial_iptables_config" ]]
6206 # then
6207 # ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > ovs_initial_iptables_config
6208 # else
6209 # ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > ovs_latest_iptables_config
6210 # diff=$(sudo diff -s ovs_initial_iptables_config ovs_latest_iptables_config | grep -c 'Files ovs_initial_iptables_config and ovs_latest_iptables_config are identical')
6211 # if [[ $diff != "1" ]]
6212 # then
6213 # echo -e "\n${RED}differences were found between ovs_initial_iptables_config and ovs_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6214 # diff=$(diff ovs_initial_iptables_config ovs_latest_iptables_config)
6215 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6216 # else
6217 # echo -e "${GREEN}no differences were found between ovs_initial_iptables_config and ovs_latest_iptables_config${NC}"
6218 # fi
6219 # fi
6220 # fi
6221 # else
6222 # echo -e "${ORANGE}no OVS computes found${NC}"
6223 # fi
6224 # echo -e "\n${CYAN}now checking SriovPerformanceCompute${NC}"
6225 # if [[ $ansible_sriov_hosts_count != "0" ]]
6226 # then
6227 # iptables=$(ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u | wc -l)
6228 # if [[ $iptables != "1" ]]
6229 # then
6230 #                 echo -e "${RED}one or more sriov computes have different content inside /etc/sysconfig/iptables${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6231 # iptables=$(ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables")
6232 # echo -e "${RED}$iptables${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6233 # if [[ ! -f "sriov_initial_iptables_config" ]]
6234 # then
6235 # ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > sriov_initial_iptables_config
6236 # else
6237 # ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > sriov_latest_iptables_config
6238 # diff=$(sudo diff -s sriov_initial_iptables_config sriov_latest_iptables_config | grep -c 'Files sriov_initial_iptables_config and sriov_latest_iptables_config are identical')
6239 # if [[ $diff != "1" ]]
6240 # then
6241 # echo -e "\n${RED}differences were found between sriov_initial_iptables_config and sriov_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6242 # diff=$(diff sriov_initial_iptables_config sriov_latest_iptables_config)
6243 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6244 # else
6245 # echo -e "\n${GREEN}no differences were found between sriov_initial_iptables_config and sriov_latest_iptables_config${NC}"
6246 # fi
6247 # fi
6248 # else
6249 #                 echo -e "${GREEN}/etc/sysconfig/iptables is identical on all the sriov servers${NC}"
6250 # if [[ ! -f "sriov_initial_iptables_config" ]]
6251 # then
6252 # ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > sriov_initial_iptables_config
6253 # else
6254 # ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > sriov_latest_iptables_config
6255 # diff=$(sudo diff -s sriov_initial_iptables_config sriov_latest_iptables_config | grep -c 'Files sriov_initial_iptables_config and sriov_latest_iptables_config are identical')
6256 # if [[ $diff != "1" ]]
6257 # then
6258 # echo -e "\n${RED}differences were found between sriov_initial_iptables_config and sriov_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6259 # diff=$(diff sriov_initial_iptables_config sriov_latest_iptables_config)
6260 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6261 # else
6262 # echo -e "${GREEN}no differences were found between sriov_initial_iptables_config and sriov_latest_iptables_config${NC}"
6263 # fi
6264 # fi
6265 # fi
6266 # else
6267 # echo -e "${ORANGE}no sriov computes found${NC}"
6268 # fi
6269 # echo -e "\n${CYAN}now checking AvrsCompute${NC}"
6270 # if [[ $ansible_avrs_hosts_count != "0" && $nuage == "true" ]]
6271 # then
6272 # iptables=$(ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u | wc -l)
6273 # if [[ $iptables != "1" ]]
6274 # then
6275 #                 echo -e "${RED}one or more avrs computes have different content inside /etc/sysconfig/iptables${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6276 # iptables=$(ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables")
6277 # echo -e "${RED}$iptables${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6278 # if [[ ! -f "avrs_initial_iptables_config" ]]
6279 # then
6280 # ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > avrs_initial_iptables_config
6281 # else
6282 # ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > avrs_latest_iptables_config
6283 # diff=$(sudo diff -s avrs_initial_iptables_config avrs_latest_iptables_config | grep -c 'Files avrs_initial_iptables_config and avrs_latest_iptables_config are identical')
6284 # if [[ $diff != "1" ]]
6285 # then
6286 # echo -e "\n${RED}differences were found between avrs_initial_iptables_config and avrs_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6287 # diff=$(diff avrs_initial_iptables_config avrs_latest_iptables_config)
6288 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6289 # else
6290 # echo -e "\n${GREEN}no differences were found between avrs_initial_iptables_config and avrs_latest_iptables_config${NC}"
6291 # fi
6292 # fi
6293 # else
6294 #                 echo -e "${GREEN}/etc/sysconfig/iptables is identical on all the avrs servers${NC}"
6295 # if [[ ! -f "avrs_initial_iptables_config" ]]
6296 # then
6297 # ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > avrs_initial_iptables_config
6298 # else
6299 # ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > avrs_latest_iptables_config
6300 # diff=$(sudo diff -s avrs_initial_iptables_config avrs_latest_iptables_config | grep -c 'Files avrs_initial_iptables_config and avrs_latest_iptables_config are identical')
6301 # if [[ $diff != "1" ]]
6302 # then
6303 # echo -e "\n${RED}differences were found between avrs_initial_iptables_config and avrs_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6304 # diff=$(diff avrs_initial_iptables_config avrs_latest_iptables_config)
6305 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6306 # else
6307 # echo -e "${GREEN}no differences were found between avrs_initial_iptables_config and avrs_latest_iptables_config${NC}"
6308 # fi
6309 # fi
6310 # fi
6311 # else
6312 # echo -e "${ORANGE}no avrs computes found${NC}"
6313 # fi
6314 # elapsed_time_seconds=$(expr $(date +%s) - $start)
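## the four host-group checks above repeat the same md5 comparison; a minimal sketch of a single loop covering them (reuses the ansible patterns above; the initial/latest snapshot handling is left out):
# for pattern in '*overcloud-[Dd]pdk*' '*overcloud-[Oo]vs*' '*overcloud-[Ss]riov*' '*overcloud-[Aa]vrs*'
# do
#     # count the distinct md5sums of /etc/sysconfig/iptables across the hosts matching the pattern
#     unique_checksums=$(ansible "$pattern" -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep '^[a-f0-9]' | awk '{print $1}' | sort -u | wc -l)
#     if [[ $unique_checksums == "0" ]]
#     then
#         echo -e "${ORANGE}no hosts matched $pattern${NC}"
#     elif [[ $unique_checksums != "1" ]]
#     then
#         echo -e "${RED}hosts matching $pattern have different /etc/sysconfig/iptables content${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
#     else
#         echo -e "${GREEN}/etc/sysconfig/iptables is identical on all hosts matching $pattern${NC}"
#     fi
# done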
6315
6316
6317 ####################################################################################################
6318
6319
6320 # start=$(date +%s)
6321 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6322 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT ZABBIX EVENTS FROM THE DB FROM THE PAST 1 HOUR (+$elapsed_time_seconds `date '+%T'`)${NC}"
6323 # if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
6324 # then
6325 # epoch_time_now=$(date +%s)
6326 # epoch_time_before=$(date +%s -d "-1 hour")
6327 # zabbix_events=$(ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT name FROM events WHERE clock BETWEEN $epoch_time_before AND $epoch_time_now;\G;\" zabbixdb" | grep -E -v '^name|SUCCESS' | sort -u )
6328 # if [[ -z $zabbix_events ]]
6329 # then
6330 #             echo -e "${GREEN}no events were initiated in the past hour${NC}"
6331 # else
6332 #             echo -e "${RED}$zabbix_events\n\n${ORANGE}Please log in to the zabbix portal and acknowledge the problems history under Monitoring > Problems > History, set the filter timestamps as required and Apply${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6333 # fi
6334 # else
6335 # echo -e "${ORANGE}this zabbix events check is only valid from CBIS 19.100.1${NC}"
6336 # fi
6337 # elapsed_time_seconds=$(expr $(date +%s) - $start)
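## a minimal sketch of the same query restricted to unacknowledged events, assuming the zabbixdb events table exposes the acknowledged column (not verified against every zabbix version):
# zabbix_unacked_events=$(ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT name FROM events WHERE acknowledged=0 AND clock BETWEEN $epoch_time_before AND $epoch_time_now;\" zabbixdb" | grep -E -v '^name|SUCCESS' | sort -u)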
6338
6339
6340 ####################################################################################################
6341
6342
6343 # start=$(date +%s)
6344 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6345 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR CPU VULNERABILITIES (MELTDOWN/SPECTRE) (+$elapsed_time_seconds `date '+%T'`)${NC}"
6346 # cpus_vulnerablities=$(ansible all -b -m shell -a "awk '{print FILENAME\":\"\$0}' /sys/devices/system/cpu/vulnerabilities/* | grep Vulnerable" | grep Vulnerable -B 1)
6347 # if [[ $cpus_vulnerablities ]]
6348 # then
6349 # echo -e "${RED}$cpus_vulnerablities${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6350 # fi
6351 # if [[ -z $kernel_vulnerablities && -z $cpus_vulnerablities ]]
6352 # then
6353 # echo -e "${GREEN}all hosts are MELTDOWN/SPECTRE hardened${NC}"
6354 # fi
6355 # elapsed_time_seconds=$(expr $(date +%s) - $start)
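## a minimal sketch for dumping the full per-host mitigation status (not only the vulnerable entries), useful when a host is flagged above:
# ansible all -b -m shell -a "grep . /sys/devices/system/cpu/vulnerabilities/*"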
6356
6357
6358 ####################################################################################################
6359
6360
6361 # start=$(date +%s)
6362 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6363 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT USER stack CAN MODIFY FILES UNDER /mnt/backup/ TO UNDERSTAND IF THE /usr/bin/find /mnt/backup/... CRONJOB WORKS (CBIS-13758) (+$elapsed_time_seconds `date '+%T'`)${NC}"
6364 # sudo_check_in_backup_cronjob=$(crontab -l | grep -c 'sudo /usr/bin/find /mnt/backup/')
6365 # if [[ $sudo_check_in_backup_cronjob == "0" ]]
6366 # then
6367 # check_if_backup_files_exist=$(sudo /usr/bin/find /mnt/backup/* 2> /dev/null | grep db_backup.enc)
6368 # if [[ $check_if_backup_files_exist ]]
6369 # then
6370 # permissions_check=$(/usr/bin/find /mnt/backup/* | grep db_backup.enc | grep -v orig | awk NR==1 | xargs -i cp {} {}.orig 2>&1 | grep -c 'Permission denied')
6371 # if [[ $permissions_check == "1" ]]
6372 # then
6373 # cronjobs=$(crontab -l)
6374 #                     echo -e "${RED}unable to modify files under /mnt/backup/ as user stack: permission denied${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6375 #                     echo -e "${RED}sudo is required for the /usr/bin/find /mnt/backup/*... cronjob${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6376 # echo -e "${RED}$cronjobs${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6377 # else
6378 # echo -e "${GREEN}user stack can modify files under /mnt/backup/${NC}"
6379 # fi
6380 # else
6381 # echo -e "${ORANGE}couldn't find any db_backup.enc file under /mnt/backup/${NC}"
6382 # fi
6383 # else
6384 #     echo -e "${GREEN}the 'sudo /usr/bin/find /mnt/backup/...' cronjob contains sudo${NC}"
6385 # fi
6386 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6387
6388
6389 ####################################################################################################
6390
6391
6392 # start=$(date +%s)
6393 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6394 # echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE UNEXPECTED CONFIGURATION IN /etc/fstab ON THE OVERCLOUD HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
6395 # fstab=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /etc/fstab" | grep -E -v 'SUCCESS|LABEL=img-rootfs / xfs defaults 0 1|tmpfs /dev/shm tmpfs defaults,nodev,nosuid,noexec 0 0|elk|^#')
6396 # if [[ $fstab ]]
6397 # then
6398 # fstab=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /etc/fstab | grep -v ^#")
6399 # echo -e "${RED}$fstab${NC} " ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6400 # else
6401 # echo -e "${GREEN}/etc/fstab is configured as expected on all the overcloud hosts${NC} "
6402 # fi
6403 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6404
6405
6406 ####################################################################################################
6407
6408
6409 # start=$(date +%s)
6410 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6411 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT SYSTEM-WIDE OPEN FILES LIMIT IS IDENTICAL FOR ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
6412 ## I currently don't know why, but it seems the controllers always have a slightly different limit value; the last two digits of the value are stripped below (sed 's/..$//') so that minor differences don't trigger a failure
6413 # open_files_limit_system_wide=$(ansible all --limit '!hypervisor,!localhost,!controller' -b -m shell -a "cat /proc/sys/fs/file-max" | grep ^[0-9] | sort --uniq | sed 's/..$//' | wc -l)
6414 # if [[ $open_files_limit_system_wide != "1" ]]
6415 # then
6416 # open_files_limit_system_wide=$(ansible all --limit '!hypervisor,!localhost,!controller' -b -m shell -a "cat /proc/sys/fs/file-max")
6417 # echo -e "${RED}$open_files_limit_system_wide${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6418 # else
6419 # echo -e "${GREEN}all the hosts returned the same system-wide open files limit value${NC}"
6420 # fi
6421 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6422
6423
6424 ####################################################################################################
6425
6426
6427 # start=$(date +%s)
6428 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6429 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR INTERFACES DRIVER/VERSION/FIRMWARE INCONSISTENCIES BETWEEN THE OVERCLOUD HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
6430 # if [[ $nuage != "true" ]]
6431 # then
6432 # checks="firmware-version driver version"
6433 # for check in $checks
6434 # do
6435 # echo -e "${CYAN}now checking the interfaces $check${NC}"
6436 # item=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl bond/list | grep tenant-bond | awk '{print \$NF}' | sort -u | xargs -i ethtool -i {}" | grep ^$check | sort -u | wc -l)
6437 # if [[ $item != "1" ]]
6438 # then
6439 # item=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl bond/list | grep tenant-bond | awk '{print \$NF}' | sort -u | xargs -i ethtool -i {} | grep ^$check")
6440 # echo -e "${RED}$item${NC} " ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6441 # else
6442 # echo -e "${GREEN}the $check is the same on all the overcloud hosts${NC} "
6443 # fi
6444 # done
6445 # else
6446 # echo -e "${ORANGE}this check is not applicable for Nuage${NC} "
6447 # fi
6448 # elapsed_time_seconds=$(expr $(date +%s) - $start)
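## a minimal sketch for comparing a single named interface across the overcloud; the interface name eno1 is a placeholder, not taken from the environment:
# nic=eno1
# ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ethtool -i $nic" | grep -E '^(driver|version|firmware-version)' | sort | uniq -c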
6449
6450
6451 ####################################################################################################
6452
6453
6454 # start=$(date +%s)
6455 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6456 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT IPSEC IS CONFIGURED AND WORKING (+$elapsed_time_seconds `date '+%T'`)${NC}"
6457 # if [[ $cbis_version == "19.0.0.1" ]]
6458 # then
6459 # ipsec_execution_time=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "grep -R 'Finished security hardening: Secured Communication Deployment' /var/log/cbis/ | grep -v api.log | awk -F: '{print \$1}' | head -n1 | xargs -i ls -l {} | awk '{print \$6,\$7,\$8}' | xargs -i date -d "{}" +%s" | grep -v rc=0 | grep ^[0-9])
6460 # if [[ -z $ipsec_execution_time ]]
6461 # then
6462 # echo -e "${MAGENTA}IPsec is not enabled - enable IPsec (secured communications) from the security section in CBIS manager${NC}"
6463 # else
6464 # deployment_execution_time=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "ls -l /var/log/cbis/deployment.log | awk '{print \$6,\$7,\$8}' | xargs -i date -d "{}" +%s" | grep -v rc=0 | grep ^[0-9])
6465 # if [ $deployment_execution_time -gt $ipsec_execution_time ]
6466 # then
6467 # echo -e "${ORANGE}IPsec is not enabled - enable IPsec (secured communications) from the security section in CBIS manager${NC}"
6468 # else
6469 # echo -e "${CYAN}tripleo-ipsec pacemaker resources check${NC}"
6470 # ipsec=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep tripleo-ipsec | grep -E -w -c 'internalapi|redis|storage|storagemgmt|ctlplane'" | grep ^[0-9])
6471 # if [[ $ipsec != "5" ]]
6472 # then
6473 # echo -e "${RED}expected 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane) and got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6474 # else
6475 # echo -e "${GREEN}found all 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane)${NC}"
6476 # fi
6477 # echo -e "${CYAN}check for leaked ipsec packets${NC}"
6478 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:" | grep -v rc=0 | sort -u)
6479 # if [[ $ipsec != "enumcheck: leak detective found no leaks" ]]
6480 # then
6481 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:")
6482 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6483 # else
6484 # echo -e "${GREEN}no leaks were detected${NC}"
6485 # fi
6486 # echo -e "${CYAN}capture ESP packets using tcpdump${NC}"
6487 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "timeout 10 tcpdump -nnepi any proto ESP -c 100 2>&1 | grep 'packets captured' | awk '{ if ( \$2 < 100 ) print \$1 }'" | grep ^[0-9] -B 1)
6488 # if [[ $ipsec ]]
6489 # then
6490 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6491 # else
6492 #                     echo -e "${GREEN}successfully captured 100 ESP packets from all the hosts${NC}"
6493 # fi
6494 # echo -e "${CYAN}check iptables IPSEC chain rules${NC}"
6495 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "iptables -L IPSEC -n | grep -E -vc 'Vitrage|elasticsearch'" | grep ^[0-9] | sort -u)
6496 # if [[ $ipsec != "17" ]]
6497 # then
6498 #                     echo -e "${RED}expected (17) iptables rules in the IPSEC chain but got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6499 # else
6500 # echo -e "${GREEN}all the iptables IPSEC chain rules are found${NC}"
6501 # fi
6502 ## Chain IPSEC (4 references)
6503 ## target prot opt source destination
6504 ## ACCEPT all -- 172.31.0.1 0.0.0.0/0 /* Allow ctlplane traffic from UC */
6505 ## ACCEPT all -- 169.254.169.254 0.0.0.0/0 /* Allow ironic metadata traffic from UC */
6506 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6507 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6508 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport sports 16514,49152:49215,5900:5999 /* open ports for nova */
6509 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport dports 16514,49152:49215,5900:5999 /* open ports for nova */
6510 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp spt:22
6511 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:22
6512 ## ACCEPT ah -- 0.0.0.0/0 0.0.0.0/0
6513 ## ACCEPT esp -- 0.0.0.0/0 0.0.0.0/0
6514 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:4500 dpt:4500
6515 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:500 dpt:500
6516 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 policy match dir in pol ipsec
6517 ## LOG all -- 0.0.0.0/0 0.0.0.0/0 limit: avg 2/min burst 5 LOG flags 0 level 4 prefix "IPTables-Dropped:"
6518 ## DROP all -- 0.0.0.0/0 0.0.0.0/0
6519 # echo -e "${CYAN}check that ESP tunnels were created${NC}"
6520 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec whack --trafficstatus | wc -l | awk '{ if ( \$1 < 10 ) print \$1 }'" | grep ^[0-9] -B 1)
6521 # if [[ $ipsec ]]
6522 # then
6523 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6524 # else
6525 # echo -e "${GREEN}each host has at least 10 active ESP tunnels${NC}"
6526 # fi
6527 # fi
6528 # fi
6529 # fi
6530 # if [[ $cbis_version == "19.100.1" ]]
6531 # then
6532 # ipsec=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /usr/share/cbis/seccom_state" | grep -v rc=0)
6533 # if [[ $ipsec != "0" ]]
6534 # then
6535 # echo -e "${MAGENTA}IPsec is not enabled - enable IPsec (secured communications) from the security section in CBIS manager${NC}"
6536 # else
6537 # echo -e "${CYAN}tripleo-ipsec pacemaker resources check${NC}"
6538 # ipsec=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep tripleo-ipsec | grep -E -w -c 'internalapi|redis|storage|storagemgmt|ctlplane'" | grep ^[0-9])
6539 # if [[ $ipsec != "5" ]]
6540 # then
6541 # echo -e "${RED}expected 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane) and got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6542 # else
6543 # echo -e "${GREEN}found all 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane)${NC}"
6544 # fi
6545 # echo -e "${CYAN}check for leaked ipsec packets${NC}"
6546 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:" | grep -v rc=0 | sort -u)
6547 # if [[ $ipsec != "enumcheck: leak detective found no leaks" ]]
6548 # then
6549 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:")
6550 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6551 # else
6552 # echo -e "${GREEN}no leaks were detected${NC}"
6553 # fi
6554 # echo -e "${CYAN}capture ESP packets using tcpdump${NC}"
6555 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "timeout 10 tcpdump -nnepi any proto ESP -c 100 2>&1 | grep 'packets captured' | awk '{ if ( \$2 < 100 ) print \$1 }'" | grep ^[0-9] -B 1)
6556 # if [[ $ipsec ]]
6557 # then
6558 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6559 # else
6560 #             echo -e "${GREEN}successfully captured 100 ESP packets from all the hosts${NC}"
6561 # fi
6562 # echo -e "${CYAN}check iptables IPSEC chain rules${NC}"
6563 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "iptables -L IPSEC -n | grep -E -vc 'Vitrage|elasticsearch'" | grep ^[0-9] | sort -u)
6564 # if [[ $ipsec != "17" ]]
6565 # then
6566 #             echo -e "${RED}expected (17) iptables rules in the IPSEC chain but got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6567 # else
6568 # echo -e "${GREEN}all the iptables IPSEC chain rules are found${NC}"
6569 # fi
6570 ## Chain IPSEC (4 references)
6571 ## target prot opt source destination
6572 ## ACCEPT all -- 172.31.0.1 0.0.0.0/0 /* Allow ctlplane traffic from UC */
6573 ## ACCEPT all -- 169.254.169.254 0.0.0.0/0 /* Allow ironic metadata traffic from UC */
6574 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6575 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6576 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport sports 16514,49152:49215,5900:5999 /* open ports for nova */
6577 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport dports 16514,49152:49215,5900:5999 /* open ports for nova */
6578 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp spt:22
6579 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:22
6580 ## ACCEPT ah -- 0.0.0.0/0 0.0.0.0/0
6581 ## ACCEPT esp -- 0.0.0.0/0 0.0.0.0/0
6582 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:4500 dpt:4500
6583 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:500 dpt:500
6584 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 policy match dir in pol ipsec
6585 ## LOG all -- 0.0.0.0/0 0.0.0.0/0 limit: avg 2/min burst 5 LOG flags 0 level 4 prefix "IPTables-Dropped:"
6586 ## DROP all -- 0.0.0.0/0 0.0.0.0/0
6587 # echo -e "${CYAN}check that ESP tunnels were created${NC}"
6588 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec whack --trafficstatus | wc -l | awk '{ if ( \$1 < 10 ) print \$1 }'" | grep ^[0-9] -B 1)
6589 # if [[ $ipsec ]]
6590 # then
6591 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6592 # else
6593 # echo -e "${GREEN}each host has at least 10 active ESP tunnels${NC}"
6594 # fi
6595 # fi
6596 # fi
6597 # if [[ $cbis_version == "20.100.1" ]]
6598 # then
6599 # ipsec=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /opt/install/data/states/seccom_state" | grep -v rc=0 | xargs)
6600 # if [[ $ipsec != "0" ]]
6601 # then
6602 # echo -e "${MAGENTA}IPsec is not enabled - enable IPsec (secured communications) from the security section in CBIS manager${NC}"
6603 # else
6604 # echo -e "${CYAN}tripleo-ipsec pacemaker resources check${NC}"
6605 # ipsec=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep tripleo-ipsec | grep -E -w -c 'internalapi|redis|storage|storagemgmt|ctlplane'" | grep ^[0-9])
6606 # if [[ $ipsec != "5" ]]
6607 # then
6608 # echo -e "${RED}expected 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane) and got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6609 # else
6610 # echo -e "${GREEN}found all 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane)${NC}"
6611 # fi
6612 # echo -e "${CYAN}check for leaked ipsec packets${NC}"
6613 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:" | grep -v rc=0 | sort -u)
6614 # if [[ $ipsec != "enumcheck: leak detective found no leaks" ]]
6615 # then
6616 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:")
6617 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6618 # else
6619 # echo -e "${GREEN}no leaks were detected${NC}"
6620 # fi
6621 # echo -e "${CYAN}capture ESP packets using tcpdump${NC}"
6622 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "timeout 10 tcpdump -nnepi any proto ESP -c 100 2>&1 | grep 'packets captured' | awk '{ if ( \$2 < 100 ) print \$1 }'" | grep ^[0-9] -B 1)
6623 # if [[ $ipsec ]]
6624 # then
6625 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6626 # else
6627 #             echo -e "${GREEN}successfully captured 100 ESP packets from all the hosts${NC}"
6628 # fi
6629 # echo -e "${CYAN}check iptables IPSEC chain rules${NC}"
6630 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "iptables -L IPSEC -n | grep -E -vc 'Vitrage|elasticsearch'" | grep ^[0-9] | sort -u)
6631 # if [[ $ipsec != "17" ]]
6632 # then
6633 #             echo -e "${RED}expected (17) iptables rules in the IPSEC chain but got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6634 # else
6635 # echo -e "${GREEN}all the iptables IPSEC chain rules are found${NC}"
6636 # fi
6637 ## Chain IPSEC (4 references)
6638 ## target prot opt source destination
6639 ## ACCEPT all -- 172.31.0.1 0.0.0.0/0 /* Allow ctlplane traffic from UC */
6640 ## ACCEPT all -- 169.254.169.254 0.0.0.0/0 /* Allow ironic metadata traffic from UC */
6641 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6642 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6643 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport sports 16514,49152:49215,5900:5999 /* open ports for nova */
6644 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport dports 16514,49152:49215,5900:5999 /* open ports for nova */
6645 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp spt:22
6646 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:22
6647 ## ACCEPT ah -- 0.0.0.0/0 0.0.0.0/0
6648 ## ACCEPT esp -- 0.0.0.0/0 0.0.0.0/0
6649 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:4500 dpt:4500
6650 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:500 dpt:500
6651 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 policy match dir in pol ipsec
6652 ## LOG all -- 0.0.0.0/0 0.0.0.0/0 limit: avg 2/min burst 5 LOG flags 0 level 4 prefix "IPTables-Dropped:"
6653 ## DROP all -- 0.0.0.0/0 0.0.0.0/0
6654 # echo -e "${CYAN}check that ESP tunnels were created${NC}"
6655 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec whack --trafficstatus | wc -l | awk '{ if ( \$1 < 10 ) print \$1 }'" | grep ^[0-9] -B 1)
6656 # if [[ $ipsec ]]
6657 # then
6658 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6659 # else
6660 # echo -e "${GREEN}each host has at least 10 active ESP tunnels${NC}"
6661 # fi
6662 # fi
6663 # fi
6664 # elapsed_time_seconds=$(expr $(date +%s) - $start)
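## the 19.100.1 and 20.100.1 branches above differ only in where the secured-communication state file lives; a minimal sketch of resolving that path once before running the shared checks (paths taken from the branches above):
# case $cbis_version in
#     19.100.1) seccom_state_file=/usr/share/cbis/seccom_state ;;
#     20.100.1) seccom_state_file=/opt/install/data/states/seccom_state ;;
#     *)        seccom_state_file="" ;;
# esac
# if [[ $seccom_state_file ]]
# then
#     ipsec=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat $seccom_state_file" | grep -v rc=0 | xargs)
# fi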
6665
6666
6667 ####################################################################################################
6668
6669
6670 # start=$(date +%s)
6671 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6672 # echo -e "${BLUE}\n\n$STEPS_COUNTER) OBTAIN THE ZABBIX TEMPLATES OF EACH HOST AND COMPARE THEM WITH THE EXPECTED TEMPLATES TAKEN FROM configure_zabbix_server_monitoring.py (+$elapsed_time_seconds `date '+%T'`)${NC}"
6673 # all_hosts_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/Templates for all hosts/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6674 # all_hosts_zabbix_templates_fixed=$(echo -e "$all_hosts_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6675
6676 # active_controller_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/ACTIVE_CONTROLLER_TEMPLATE=/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6677 # active_controller_zabbix_templates_fixed=$(echo -e "$active_controller_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6678
6679 # computes_hci_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/Templates for compute hosts/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6680 # computes_hci_zabbix_templates_fixed=$(echo -e "$computes_hci_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6681
6682 # computes_non_hci_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/Templates for compute hosts/,/]/' | grep -v \# | grep -v TemplateAppOpenStackCeph | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6683 # computes_non_hci_zabbix_templates_fixed=$(echo -e "$computes_non_hci_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6684
6685 # controllers_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/Templates for controller hosts/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6686 # controllers_zabbix_templates_fixed=$(echo -e "$controllers_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6687
6688 # undercloud_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/UNDERCLOUD_TEMPLATE_NAMES = BASIC_TEMPLATE_NAMES/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6689 # undercloud_zabbix_templates_fixed=$(echo -e "$undercloud_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6690
6691 # echo -e "${CYAN}checking the shared templates on all the hosts${NC}"
6692 # for host in $ansible_overcloud_hosts
6693 # do
6694 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6695 # -H 'Content-Type: application/json-rpc' \
6696 # -H 'Cookie: SERVERID='$last_index_controller'' \
6697 # --data '{
6698 # "jsonrpc": "2.0",
6699 # "method": "host.get",
6700 # "params": {
6701 # "output": ["host"],
6702 # "selectParentTemplates": [
6703 # "templateid",
6704 # "name"
6705 # ],
6706 # "filter": {
6707 # "host": "'$host'"
6708 # }
6709 # },
6710 # "id": 1,
6711 # "auth": '$zabbix_auth'
6712 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort)
6713 # diff=$(diff <(echo -e "$all_hosts_zabbix_templates") <(echo -e "$templates") | grep '^<')
6714 # if [[ -z $diff ]]
6715 # then
6716 # echo -e "${GREEN}the templates: ($all_hosts_zabbix_templates_fixed) are found under $host${NC}"
6717 # else
6718 # echo -e "${RED}missing templates on $host:\n$diff${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6719 # fi
6720 # done
6721
6722 # echo -e "${CYAN}checking the dedicated controller templates on the controller hosts${NC}"
6723 # for host in $ansible_controllers_hosts
6724 # do
6725 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6726 # -H 'Content-Type: application/json-rpc' \
6727 # -H 'Cookie: SERVERID='$last_index_controller'' \
6728 # --data '{
6729 # "jsonrpc": "2.0",
6730 # "method": "host.get",
6731 # "params": {
6732 # "output": ["host"],
6733 # "selectParentTemplates": [
6734 # "templateid",
6735 # "name"
6736 # ],
6737 # "filter": {
6738 # "host": "'$host'"
6739 # }
6740 # },
6741 # "id": 1,
6742 # "auth": '$zabbix_auth'
6743 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort)
6744 # diff=$(diff <(echo -e "$controllers_zabbix_templates") <(echo -e "$templates") | grep '^<')
6745 # if [[ -z $diff ]]
6746 # then
6747 # echo -e "${GREEN}the templates: ($controllers_zabbix_templates_fixed) are found under $host${NC}"
6748 # else
6749 # echo -e "${RED}missing templates on $host:\n$diff${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6750 # fi
6751 # done
6752
6753 # echo -e "${CYAN}checking the dedicated compute templates on the compute hosts${NC}"
6754 # for host in $ansible_computes_hosts
6755 # do
6756 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6757 # -H 'Content-Type: application/json-rpc' \
6758 # -H 'Cookie: SERVERID='$last_index_controller'' \
6759 # --data '{
6760 # "jsonrpc": "2.0",
6761 # "method": "host.get",
6762 # "params": {
6763 # "output": ["host"],
6764 # "selectParentTemplates": [
6765 # "templateid",
6766 # "name"
6767 # ],
6768 # "filter": {
6769 # "host": "'$host'"
6770 # }
6771 # },
6772 # "id": 1,
6773 # "auth": '$zabbix_auth'
6774 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort)
6775 # if [[ $hci == "false" ]]
6776 # then
6777 # if [[ " ${computes_non_hci_zabbix_templates[@]} " =~ "$templates" ]]
6778 # then
6779 # echo -e "${GREEN}$host has all the expected templates${NC}"
6780 # else
6781 # echo -e "${RED}> Expecting Templates ($host):\n${computes_non_hci_zabbix_templates[@]}\n> Received Templates ($host):\n$templates${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6782 # fi
6783 # elif [[ $hci == "true" ]]
6784 # then
6785 # if [[ " ${computes_hci_zabbix_templates[@]} " =~ "$templates" ]]
6786 # then
6787 # echo -e "${GREEN}$host has all the expected templates${NC}"
6788 # else
6789 # echo -e "${RED}> Expecting Templates ($host):\n${computes_hci_zabbix_templates[@]}\n> Received Templates ($host):\n$templates${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6790 # fi
6791
6792 # fi
6793 # done
6794
6795 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6796 # -H 'Content-Type: application/json-rpc' \
6797 # -H 'Cookie: SERVERID='$last_index_controller'' \
6798 # --data '{
6799 # "jsonrpc": "2.0",
6800 # "method": "host.get",
6801 # "params": {
6802 # "output": ["host"],
6803 # "selectParentTemplates": [
6804 # "templateid",
6805 # "name"
6806 # ],
6807 # "filter": {
6808 # "host": "active-controller"
6809 # }
6810 # },
6811 # "id": 1,
6812 # "auth": '$zabbix_auth'
6813 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort | paste -sd " ")
6814 # if [[ " ${active_controller_zabbix_templates[@]} " =~ "$templates" ]]
6815 # then
6816 # echo -e "${GREEN}active-controller has all the expected templates${NC}"
6817 # else
6818 # echo -e "${RED}> Expecting Templates (active-controller):\n${active_controller_zabbix_templates[@]}\n> Received Templates (active-controller):\n$templates${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6819 # fi
6820
6821 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6822 # -H 'Content-Type: application/json-rpc' \
6823 # -H 'Cookie: SERVERID='$last_index_controller'' \
6824 # --data '{
6825 # "jsonrpc": "2.0",
6826 # "method": "host.get",
6827 # "params": {
6828 # "output": ["host"],
6829 # "selectParentTemplates": [
6830 # "templateid",
6831 # "name"
6832 # ],
6833 # "filter": {
6834 # "host": "undercloud.localdomain"
6835 # }
6836 # },
6837 # "id": 1,
6838 # "auth": '$zabbix_auth'
6839 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort | paste -sd " ")
6840 # if [[ " ${undercloud_zabbix_templates[@]} " =~ "$templates" ]]
6841 # then
6842 # echo -e "${GREEN}undercloud.localdomain has all the expected templates${NC}"
6843 # else
6844 # echo -e "${RED}> Expecting Templates (undercloud.localdomain):\n${undercloud_zabbix_templates[@]}\n> Received Templates (undercloud.localdomain):\n$templates${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6845 # fi
6846 # elapsed_time_seconds=$(expr $(date +%s) - $start)
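## the repeated host.get calls above could be wrapped in one helper; a minimal sketch (reuses $PublicURL, $last_index_controller and $zabbix_auth from above, and assumes the standard zabbix host.get response shape for the jq filter):
# get_host_templates() {
#     local host=$1
#     curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
#         -H 'Content-Type: application/json-rpc' \
#         -H 'Cookie: SERVERID='$last_index_controller'' \
#         --data '{"jsonrpc": "2.0", "method": "host.get", "params": {"output": ["host"], "selectParentTemplates": ["templateid", "name"], "filter": {"host": "'$host'"}}, "id": 1, "auth": '$zabbix_auth'}' \
#         | jq -r '.result[].parentTemplates[].name' | sort
# }
# templates=$(get_host_templates "$host")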
6847
6848
6849 ####################################################################################################
6850
6851
6852 # start=$(date +%s)
6853 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6854 # echo -e "${BLUE}\n\n$STEPS_COUNTER) MEASURE THE TIME OF CORE MAINTENANCE OPERATIONS (+$elapsed_time_seconds `date '+%T'`)${NC}"
6855 # echo -e "${CYAN}measuring the time it took for the overcloud deployment to finish${NC}"
6856 # fail_criteria=5.0
6857 # if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
6858 # then
6859 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/deployment.log | awk '/undercloud installation finished/,0'" | grep -v '| SUCCESS |')
6860 # else
6861 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/deployment.log" | grep -v '| SUCCESS |')
6862 # fi
6863 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6864 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6865 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6866 # result=$(echo $hours'>'$fail_criteria | bc -l)
6867 # if [[ $result == "1" ]]
6868 # then
6869 # echo -e "${RED}the overcloud deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6870 #     echo -e "\n${ORANGE}the overcloud deployment may take more time than usual if software raid is enabled on one or more host-groups. check if this is the case${NC}"
6871 # elif [[ $result == "0" ]]
6872 # then
6873 # echo -e "${GREEN}the overcloud deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6874 # fi
6875
6876 # echo -e "${CYAN}measuring the time it took for the undercloud deployment to finish${NC}"
6877 # fail_criteria=2.0
6878 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/install_undercloud.log | awk '/Time zone adjusted/,0'" | grep -v '| SUCCESS |')
6879 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6880 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6881 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6882 # result=$(echo $hours'>'$fail_criteria | bc -l)
6883 # if [[ $result == "1" ]]
6884 # then
6885 # echo -e "${RED}the undercloud deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6886 # elif [[ $result == "0" ]]
6887 # then
6888 # echo -e "${GREEN}the undercloud deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6889 # fi
6890
6891 # fail_criteria=5.0
6892 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/add_node.log" | grep -v '| SUCCESS |')
6893 # if [[ $log ]]
6894 # then
6895 # echo -e "${CYAN}measuring the time it took for the latest scale-out to finish${NC}"
6896 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6897 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6898 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6899 # result=$(echo $hours'>'$fail_criteria | bc -l)
6900 # if [[ $result == "1" ]]
6901 # then
6902 # echo -e "${RED}the scale-out operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6903 # elif [[ $result == "0" ]]
6904 # then
6905 # echo -e "${GREEN}the scale-out operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6906 # fi
6907 # fi
6908
6909 # fail_criteria=2.0
6910 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/remove_node.log" | grep -v '| SUCCESS |')
6911 # if [[ $log ]]
6912 # then
6913 # echo -e "${CYAN}measuring the time it took for the latest scale-in to finish${NC}"
6914 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6915 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6916 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6917 # result=$(echo $hours'>'$fail_criteria | bc -l)
6918 # if [[ $result == "1" ]]
6919 # then
6920 # echo -e "${RED}the scale-in operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6921 # elif [[ $result == "0" ]]
6922 # then
6923 # echo -e "${GREEN}the scale-in operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6924 # fi
6925 # fi
6926
6927 # if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
6928 # then
6929 # fail_criteria=5.0
6930 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/replace_controller.log" | grep -v '| SUCCESS |')
6931 # if [[ $log ]]
6932 # then
6933 # echo -e "${CYAN}measuring the time it took for the latest replace controller to finish${NC}"
6934 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6935 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6936 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6937 # result=$(echo $hours'>'$fail_criteria | bc -l)
6938 # if [[ $result == "1" ]]
6939 # then
6940 # echo -e "${RED}the replace controller operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6941 # elif [[ $result == "0" ]]
6942 # then
6943 # echo -e "${GREEN}the replace controller operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6944 # fi
6945 # fi
6946
6947 # fail_criteria=2.0
6948 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/security_hardening.log" | grep -v '| SUCCESS |')
6949 # if [[ $log ]]
6950 # then
6951 # echo -e "${CYAN}measuring the time it took for the latest security hardening deployment to finish${NC}"
6952 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6953 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6954 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6955 # result=$(echo $hours'>'$fail_criteria | bc -l)
6956 # if [[ $result == "1" ]]
6957 # then
6958 # echo -e "${RED}the security hardening deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6959 # elif [[ $result == "0" ]]
6960 # then
6961 # echo -e "${GREEN}the security hardening deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6962 # fi
6963 # fi
6964
6965 # fail_criteria=0.5
6966 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/security_secured_communication.log" | grep -v '| SUCCESS |')
6967 # if [[ $log ]]
6968 # then
6969 # echo -e "${CYAN}measuring the time it took for the latest ipsec deployment to finish${NC}"
6970 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6971 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6972 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6973 # result=$(echo $hours'>'$fail_criteria | bc -l)
6974 # if [[ $result == "1" ]]
6975 # then
6976 # echo -e "${RED}the ipsec deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6977 # elif [[ $result == "0" ]]
6978 # then
6979 # echo -e "${GREEN}the ipsec deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6980 # fi
6981 # fi
6982
6983 # fail_criteria=0.05
6984 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/add_host_group.log" | grep -v '| SUCCESS |')
6985 # if [[ $log ]]
6986 # then
6987 # echo -e "${CYAN}measuring the time it took for the latest custom host-group creation to finish${NC}"
6988 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6989 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6990 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6991 # result=$(echo $hours'>'$fail_criteria | bc -l)
6992 # if [[ $result == "1" ]]
6993 # then
6994 # echo -e "${RED}the custom host-group creation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6995 # elif [[ $result == "0" ]]
6996 # then
6997 # echo -e "${GREEN}the custom host-group creation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6998 # fi
6999 # fi
7000 # fi
7001 # elapsed_time_seconds=$(expr $(date +%s) - $start)
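## the per-log measurements above repeat the same head/tail timestamp arithmetic and also reuse $start, which clobbers the step timer read on the line above; a minimal sketch of a helper that avoids both (assumes the same leading "YYYY-MM-DD HH:MM:SS" log timestamps):
# log_duration_hours() {
#     local log=$1 log_start log_end
#     log_start=$(echo -e "$log" | awk '{print $1,$2}' | grep '^202[0-9]-' | head -n1 | xargs -i date -d "{}" +%s)
#     log_end=$(echo -e "$log" | awk '{print $1,$2}' | grep '^202[0-9]-' | tail -n1 | xargs -i date -d "{}" +%s)
#     echo "scale = 2; ($log_end - $log_start) / 60 / 60" | bc
# }
# hours=$(log_duration_hours "$log")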
7002
7003
7004 ####################################################################################################
7005
7006
7007 # start=$(date +%s)
7008 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7009 # echo -e "${BLUE}\n\n$STEPS_COUNTER) BACKUP /cbis-installer/ AND /opt/install/data/ IN THE UNDERCLOUD PHYSICAL SERVER (skip if backup already exists)${NC}"
7010 # echo -e "${CYAN}sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a \"cp -anuR ~/cbis-installer/ ~/cbis-installer-backup/\"${NC}"
7011 # sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cp -anuR ~/cbis-installer/ ~/cbis-installer-backup/"
7012 # echo -e "${CYAN}sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a \"cp -anuR /opt/install/data/ ~/opt-install-data-backup/\"${NC}"
7013 # sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cp -anuR /opt/install/data/ ~/opt-install-data-backup/"
7014 # elapsed_time_seconds=$(expr $(date +%s) - $start)
7015
7016
7017 ####################################################################################################
7018
7019
7020 # start=$(date +%s)
7021 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7022 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR INSTANCES CONNECTIVITY LOSS HISTORY (+$elapsed_time_seconds `date '+%T'`)${NC}"
7023 # if [[ $instances != "0" ]]
7024 # then
7025 # if [ -f "$logs_dir/vms_namespace_connectivity_check.log" ]
7026 # then
7027 # file_content=$(cat $logs_dir/vms_namespace_connectivity_check.log)
7028 # if [[ $file_content ]]
7029 # then
7030 # unreachable_addresses=$(cat $logs_dir/vms_namespace_connectivity_check.log | grep -v '(0.00%)' | grep % -B 1 | grep Statistics | awk '{print $NF}' | sort -u | sed 's/.$/ /' | paste -sd' ')
7031 # echo -e "${RED}unreachable addresses: $unreachable_addresses${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
7032 # echo -e "\n${ORANGE}after you resolved the connectivity issue, stop and start the vms_namespace_connectivity_check.sh script to truncate the previous errors${NC}"
7033 # else
7034 #             echo -e "${GREEN}the vms_namespace_connectivity_check.log file is empty, thus no connection failures were found${NC}"
7035 # fi
7036 # else
7037 # script_running_check=$(ps aux | grep vms_namespace_connectivity_check.sh | grep -v grep)
7038 # if [[ -z $script_running_check ]]
7039 # then
7040 # echo -e "${MAGENTA}the vms_namespace_connectivity_check.sh script is not running!${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
7041 # fi
7042 # fi
7043 # else
7044 # echo -e "${MAGENTA}no instances are found in the system!${NC}"
7045 # fi
7046 # elapsed_time_seconds=$(expr $(date +%s) - $start)
7047
7048
7049 ####################################################################################################
7050
7051
7052 # start=$(date +%s)
7053 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7054 # echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPRESS AND SEND THE SCRIPT PRODUCT TO REMOTE HOST (+$elapsed_time_seconds `date '+%T'`)${NC}"
7055 # if [[ $hotfix_name_build ]]
7056 # then
7057 # if [[ $nuage == "true" ]]
7058 # then
7059 # tar -cvzf "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz -P $logs_dir/* > /dev/null
7060 # sshpass -p 'airframe' scp $ssh_params "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs
7061 # else
7062 # tar -cvzf "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz -P $logs_dir/* > /dev/null
7063 # sshpass -p 'airframe' scp $ssh_params "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs
7064 # fi
7065 # else
7066 # if [[ $nuage == "true" ]]
7067 # then
7068 # tar -cvzf "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz -P $logs_dir/* > /dev/null
7069 # sshpass -p 'airframe' scp $ssh_params "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs
7070 # else
7071 # tar -cvzf "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz -P $logs_dir/* > /dev/null
7072 # sshpass -p 'airframe' scp $ssh_params "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs
7073 # fi
7074 # fi
7075 # if [[ $hotfix_name_build ]]
7076 # then
7077 # validate_file_on_remote_server=$(curl -g -s -L -k http://10.104.211.33:88/CBIS/scripts/system_validation_logs | grep "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz)
7078 # if [[ $validate_file_on_remote_server ]]
7079 # then
7080 # echo -e "${GREEN}"$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz is found on the remote server${NC}"
7081 # else
7082 # echo -e "${MAGENTA}unable to find "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz on the remote server${NC}"
7083 # fi
7084
7085 # else
7086 # validate_file_on_remote_server=$(curl -g -s -L -k http://10.104.211.33:88/CBIS/scripts/system_validation_logs | grep "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz)
7087 # if [[ $validate_file_on_remote_server ]]
7088 # then
7089 # echo -e "${GREEN}"$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz is found on the remote server${NC}"
7090 # else
7091 # echo -e "${MAGENTA}unable to find "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz on the remote server${NC}"
7092 # fi
7093 # fi
7094 # elapsed_time_seconds=$(expr $(date +%s) - $start)
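## the archive name above is rebuilt in four tar/scp pairs and again for the curl verification; a minimal sketch of composing it once (same variables as above):
# archive_label=${hotfix_name_build:-$cbis_version}
# if [[ $nuage == "true" ]]
# then
#     archive_name="$undercloud_vm_ip"__"$hw_model"__"$archive_label"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz
# else
#     archive_name="$undercloud_vm_ip"__"$hw_model"__"$archive_label"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz
# fi
# tar -cvzf "$archive_name" -P $logs_dir/* > /dev/null
# sshpass -p 'airframe' scp $ssh_params "$archive_name" airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs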
7095
7096
7097 ####################################################################################################
7098
7099
7100 # start=$(date +%s)
7101 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7102 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE LATEST AVAILABLE HOTFIX IS USED (+$elapsed_time_seconds `date '+%T'`)${NC}"
7103 # if [[ $hotfix_install_success ]]
7104 # then
7105 # cbis_short_version=$(echo -e "$cbis_version" | awk -F\. '{print $1}')
7106 # cbis_hotfix_name=$(/var/lib/cbis/cbis_hotfix list -c 'Hotfix name' -f json | jq .[].hotfix_name | head -n1 | tr -d \")
7107 # if [[ $cbis_hotfix_name != "CBIS-19.0-SP3" ]]
7108 # then
7109 # cbis_hotfix_build=$(/var/lib/cbis/cbis_hotfix list -c 'Hotfix name' -f json | jq .[].build_number | head -n1 | tr -d \")
7110 # latest_hotfix_build=$(curl -g -sk https://repo3.cci.nokia.net/cbis-generic-candidates-local/cbis_local_repo/hotfix/CBIS-$cbis_short_version.x/$cbis_hotfix_name/ | awk -F\" '{print $2}' | grep ^[0-9] | tr -d / | sort -n | tail -n1)
7111 # if [[ -z $latest_hotfix_build ]]
7112 # then
7113 # echo -e "${MAGENTA}something went wrong, couldn't get the builds information from https://repo3.cci.nokia.net/cbis-generic-candidates-local/cbis_local_repo/hotfix/CBIS-$cbis_short_version.x/$cbis_hotfix_name/${NC}"
7114 # else
7115 # if [[ $latest_hotfix_build != $cbis_hotfix_build ]]
7116 # then
7117 # echo -e "${MAGENTA}$cbis_hotfix_name build $latest_hotfix_build is available while the system is installed with build $cbis_hotfix_build - check with RnD if build $latest_hotfix_build is valid and if so, install it${NC}"
7118 # else
7119 # echo -e "${GREEN}using the latest $cbis_hotfix_name build $latest_hotfix_build${NC}"
7120 # fi
7121 # fi
7122 # else
7123 # echo -e "${GREEN}CBIS-19.0-SP3 hotfix is already published and is a scratch install${NC}"
7124 # fi
7125 # else
7126 # echo -e "${ORANGE}no hotfix is deployed${NC}"
7127 # fi
7128
7129
7130 ####################################################################################################
7131
7132
7133 # start=$(date +%s)
7134 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7135 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT SECURITY HARDENING FINISHED VIA THE SECURITY HARDENING LOG IN CBIS MANAGER (+$elapsed_time_seconds `date '+%T'`)${NC}"
7136
7137 # if [[ $cbis_version == "19.0.0.1" ]]
7138 # then
7139 # hardening_finished_line=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/log/ansible/ansible.log' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' | tail -n1 | grep 'Finished security hardening')
7140 # else
7141 # hardening_finished_line=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/log/security_hardening.log' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' | grep '/opt/install/data/states/hardening_state: 0')
7142 # fi
7143 # if [[ $hardening_finished_line ]]
7144 # then
7145 #     echo -e "${GREEN}security hardening finished successfully${NC}"
7146 # else
7147 #     echo -e "${MAGENTA}according to the security hardening log in cbis manager, security hardening was either never deployed or did not finish as expected\nreview the security hardening logs in the cbis manager security tab${NC}"
7148 # fi
7149 # elapsed_time_seconds=$(expr $(date +%s) - $start)
7150
7151
7152 ####################################################################################################
7153
7154
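# disabled check (left commented out): lists the base images under /var/lib/nova/instances/_base/ on every compute and verifies each one still exists in the overcloud image list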
 # start=$(date +%s)
 # STEPS_COUNTER=$((STEPS_COUNTER+1))
 # echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE LEFTOVER INSTANCE IMAGES IN /var/lib/nova/instances/ ON THE COMPUTES (+$elapsed_time_seconds `date '+%T'`)${NC}"
 # compute_images=$(ansible compute -b -m shell -a "ls -l /var/lib/nova/instances/_base/ | grep -v total" | grep qemu -B 1)
 # compute_images_sorted=$(echo -e "$compute_images" | awk '{print $NF}' | grep ^[0-9a-f] | sort | uniq)
 # for compute_image in $compute_images_sorted
 # do
 # check_if_image_in_glance=$(echo -e "$overcloud_images_list" | grep $compute_image)
 # if [[ -z $check_if_image_in_glance ]]
 # then
 # show_image_compute=$(echo -e "$compute_images" | grep overcloud- | awk '{print $1}')
 # echo -e "${RED}/var/lib/nova/instances/_base/$compute_image is not found in openstack image list\n$show_image_compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 # else
 # echo -e "${GREEN}/var/lib/nova/instances/_base/$compute_image is found in openstack image list${NC}"
 # fi
 # done
 # elapsed_time_seconds=$(expr $(date +%s) - $start)


 ####################################################################################################


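# disabled step (left commented out): backs up /var/lib/config-data/puppet-generated from all overcloud nodes, archives /home/stack/templates and user_config.yaml, and collects the bash histories into $logs_dir/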
 # start=$(date +%s)
 # STEPS_COUNTER=$((STEPS_COUNTER+1))
 # echo -e "\n\n${BLUE}$STEPS_COUNTER) BACKUP CONFIG AND CRITICAL FILES FROM ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
 # ansible all --limit '!hypervisor,!localhost' -b -m shell -a "yes | cp -R /var/lib/config-data/puppet-generated/. /home/cbis-admin/conf_backup" > /dev/null 2>&1
 # ansible all --limit '!hypervisor,!localhost' -b -m shell -a "tar -czf /home/cbis-admin/conf_backup_\`hostname\`.tar.gz -P /home/cbis-admin/conf_backup/*" > /dev/null 2>&1
 # ansible all --limit '!hypervisor,!localhost' -b -m shell -a "chmod 777 /home/cbis-admin/conf_backup_*.tar.gz" > /dev/null 2>&1
 # for host in $ansible_overcloud_hosts
 # do
 # scp $ssh_params cbis-admin@$host:/home/cbis-admin/conf_backup_*.tar.gz .
 # done
 # mv conf_backup_*.tar.gz $logs_dir/
 # tar -czf $logs_dir/conf_backup.tar.gz -P $logs_dir/conf_backup_*.tar.gz > /dev/null 2>&1
 # chmod 777 $logs_dir/conf_backup.tar.gz
 # rm -rf $logs_dir/conf_backup_*.tar.gz
 # if [[ -f "$logs_dir/conf_backup.tar.gz" ]]
 # then
 # echo -e "${GREEN}$logs_dir/conf_backup.tar.gz is found${NC}"
 # else
 # echo -e "${RED}$logs_dir/conf_backup.tar.gz can't be found${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 # fi
 # if [[ ! -f $logs_dir/templates_backup.tar.gz ]]
 # then
 # tar -zcf templates_backup.tar.gz -P /home/stack/templates/ user_config.yaml
 # mv templates_backup.tar.gz $logs_dir/
 # fi
 # sshpass -p $hv_cbis_admin_password ansible -k all -b -m shell -a "cat /home/stack/.bash_history /home/cbis-admin/.bash_history /root/.bash_history" > bash_history
 # cat bash_history | grep -E '^overcloud|^172.31.7.254' | xargs -i sed -i 's/{}/\n\n\n\n\n&/g' bash_history
 # mv bash_history $logs_dir/
 # elapsed_time_seconds=$(expr $(date +%s) - $start)


 ####################################################################################################


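# disabled check (left commented out): verifies that no non-admin project has a -1 (unlimited) quota for instances, backups, networks, ram, volumes, cores or snapshots, creating a temporary project when none exist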
 # start=$(date +%s)
 # STEPS_COUNTER=$((STEPS_COUNTER+1))
 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY non-admin CORE RESOURCES (instances, backups, networks, ram, volumes, cores and snapshots) QUOTA IS NOT -1 (UNLIMITED) (+$elapsed_time_seconds `date '+%T'`)${NC}"
 # projects=$(source ~/overcloudrc && openstack project list -f value | grep -Fvw -e 'service' -e 'admin' -e 'opnfv_bench' | awk '{print $NF}')
 # if [[ $projects ]]
 # then
 # for project in $projects
 # do
 # echo -e "${CYAN}checking quota for project $project${NC}"
 # quota=$(source ~/overcloudrc && openstack quota show $project -f json | jq '{instances,backups,networks,ram,volumes,cores,snapshots}' | grep '\-1')
 # if [[ $quota ]]
 # then
 # echo -e "${RED}$quota${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 # else
 # echo -e "${GREEN}project $project has no -1 (unlimited) quota value for resources instances, backups, networks, ram, volumes, cores and snapshots${NC}"
 # fi
 # done
 # else
 # source ~/overcloudrc && openstack project create system_validation_project > /dev/null
 # echo -e "${CYAN}checking quota for project system_validation_project${NC}"
 # validate_project_creation=$(source ~/overcloudrc && openstack project list | grep system_validation_project)
 # while [[ -z $validate_project_creation ]]
 # do
 # sleep 2
 # validate_project_creation=$(source ~/overcloudrc && openstack project list | grep system_validation_project)
 # done
 # echo -e "${GREEN}project system_validation_project is created successfully${NC}"
 # quota=$(source ~/overcloudrc && openstack quota show system_validation_project -f json | jq '{instances,backups,networks,ram,volumes,cores,snapshots}' | grep '\-1')
 # if [[ $quota ]]
 # then
 # echo -e "${RED}$quota${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 # else
 # echo -e "${GREEN}project system_validation_project has no -1 (unlimited) quota value for resources instances, backups, networks, ram, volumes, cores and snapshots${NC}"
 # fi
 # source ~/overcloudrc && openstack project delete system_validation_project
 # validate_project_deletion=$(source ~/overcloudrc && openstack project list | grep system_validation_project)
 # while [[ $validate_project_deletion ]]
 # do
 # sleep 2
 # validate_project_deletion=$(source ~/overcloudrc && openstack project list | grep system_validation_project)
 # done
 # echo -e "${GREEN}project system_validation_project is deleted successfully${NC}"
 # fi


 ####################################################################################################


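# disabled step (left commented out): saves the output of rabbitmqctl report from the rabbitmq container on one controller into $logs_dir/rabbitmqctl_report.log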
 # start=$(date +%s)
 # STEPS_COUNTER=$((STEPS_COUNTER+1))
 # echo -e "${BLUE}\n\n$STEPS_COUNTER) SAVE rabbitmqctl report IN $logs_dir/rabbitmqctl_report.log (+$elapsed_time_seconds `date '+%T'`)${NC}"
 # ansible $last_index_controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl report" > $logs_dir/rabbitmqctl_report.log
 # if [[ -f "$logs_dir/rabbitmqctl_report.log" ]]
 # then
 # echo -e "${GREEN}$logs_dir/rabbitmqctl_report.log is found${NC}"
 # else
 # echo -e "${RED}$logs_dir/rabbitmqctl_report.log is not found${NC}"
 # fi
 # elapsed_time_seconds=$(expr $(date +%s) - $start)


 ####################################################################################################
fi
 ####################################################################################################


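# convert the total runtime from seconds to minutes (two decimals, via bc) and warn when the run is noticeably slower than the usual ~15 minutes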
elapsed_time_in_minutes=$(expr $(date +%s) - $global_start | xargs -i echo 'scale=2; '{}'/60' | bc)
if (( $(echo "$elapsed_time_in_minutes > 20" | bc -l) && $(echo "$elapsed_time_in_minutes <= 25" | bc -l) ))
then
 echo -e "\n\n${ORANGE}it took $elapsed_time_in_minutes minutes for the script to finish. on average the script finishes in ~15 minutes.${NC}"
elif (( $(echo "$elapsed_time_in_minutes > 25" | bc -l) ))
then
 echo -e "\n\n${MAGENTA}it took $elapsed_time_in_minutes minutes for the script to finish. on average the script finishes in ~15 minutes.${NC}"
fi
date=$(date +"%x %X %Z %Y")


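# print a blinking red summary when any validation step incremented FAILURE_COUNTER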
if [ $FAILURE_COUNTER -gt 0 ]
then
 echo -e "\n\n${RED}${BLINK}==================================================="
 echo -e "Found $FAILURE_COUNTER failures - Please review the script output"
 echo -e "===================================================${NC}"
fi


echo -e "${BLUE}\n\nTHE SYSTEM HEALTH VALIDATION SCRIPT FINISHED AFTER $elapsed_time_in_minutes MINUTES AT ($date)\nTHE SCRIPT OUTPUT IS SAVED UNDER: $logs_dir/system_health_report_"$logs_count"_"$DESCRIPTION"_"$MODE".log\nFOR QUESTIONS, IMPROVEMENTS, SUGGESTIONS, COMPLAINTS AND BUGS, PLEASE CONTACT ${UL}arik.rozenman@nokia.com${NC}"

}
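# run main_function and duplicate its stdout and stderr to the console and to the per-run report file via tee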
main_function 2>&1 | tee $logs_dir/system_health_report_"$logs_count"_"$DESCRIPTION"_"$MODE".log
