1#!/bin/bash
2### Author: Arik Rozenman (arik.rozenman@nokia.com)
3### Currently supported CBIS versions: 19A and 20
4### All Rights Reserved
5
6
7### colors
8NC='\033[0m'
9RED='\033[0;31m'
10GREEN='\033[32m'
11ORANGE='\033[33m'
12BLUE='\033[34m'
13BOLD='\e[1m'
14MAGENTA='\e[35m'
15CYAN='\e[36m'
16UL='\e[4m'
17LRB='\e[101m'
18BB='\e[44m'
19BLINK='\e[5m'
20
21
22### variables
23hv_cbis_admin_password=password
24ssh_params="-q -o LogLevel=error -o GlobalKnownHostsFile=/dev/null -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
25FAILURE_COUNTER=0
26SCRIPT_VERSION="8.9.5"
27
28
29logs_dir="/home/stack/system_health_validation_logs"
30if [ -d "$logs_dir/" ]
31then
32 logs_count=$(ls -l $logs_dir/ | grep -c system_health_report | xargs -i expr {} + 1)
33else
34 echo -e "${CYAN}creating the directory $logs_dir/${NC}"
35 mkdir -p $logs_dir/
36fi
37
38
39date=$(date +"%x %X %Z %Y")
40echo -e "${BLUE}\nWELCOME TO THE SYSTEM HEALTH VALIDATION VERSION $SCRIPT_VERSION SCRIPT ($date)${NC}"
41echo -e "${BLUE}THE SCRIPT OUTPUT WILL RESIDE AT: $logs_dir/ DIRECTORY${NC}\n\n"
42echo -e "${ORANGE}for script usage, run the script with -h (system_health_validation.sh -h)${NC}\n\n"
43echo -e "${MAGENTA}changes in version 8.9.5:\n1. added availability zones status validation${NC}\n"
44echo -e "${MAGENTA}changes in version 8.9.4:\n1. minor bug fixes\n2. revamp the docker CpusetCpus validation${NC}\n"
45echo -e "${MAGENTA}changes in version 8.9.3:\n1. minor bug fixes\n2. added zabbix global macros validation\n3. added computes total memory size validation${NC}\n\n"
46
47
48trap ctrl_c INT
49function ctrl_c() {
50 if [[ $FAILURE_COUNTER = "0" ]]
51 then
52 echo -e "${GREEN}failures found: $FAILURE_COUNTER${NC}"
53 exit
54 else
55 echo -e "${RED}failures found: $FAILURE_COUNTER${NC}"
56 exit
57 fi
58}
59
60
61OPTIND=1
62while getopts ":h" opt
63do
64 case ${opt}
65 in
66 h) echo -e "Usage:\t-h\t\t\t\t- SHOW SCRIPT USAGE\n\t-d <'description'>\t\t- FILE DESCRIPTION (default is no description)\n\t-u <username>\t\t\t- CBIS MANAGER USERNAME (default is 'cbis-admin')\n\t-p <password>\t\t\t- CBIS MANAGER PASSWORD (default is 'password')\n\t-e <yes/no>\t\t\t- ESSENTIAL CHECKS ONLY (default is no)\n\t-c <yes/no>\t\t\t- CONNECTIVITY CHECK ONLY (default is no)\n\t-x <stack user password>\t- SET USER stack PASSWORD"
67 exit
68 esac
69done
70
71
72# OPTIND=1 is set because without it the second getopts call is ignored: every time getopts parses an option it increments OPTIND, and getopts starts reading arguments from the position OPTIND points to.
73# therefore, the OPTIND variable needs to be reset before every additional getopts loop.
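# illustrative sketch (comments only, not executed): without the reset a second getopts loop
# would start where the first one stopped, e.g.:
#   while getopts ":h" opt; do :; done            # leaves OPTIND pointing past the parsed args
#   OPTIND=1                                      # rewind so the next loop re-reads "$@" from the start
#   while getopts d:u:p:e:c:x: option; do :; done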
74OPTIND=1
75while getopts d:u:p:e:c:x: option
76do
77 case "${option}"
78 in
79 d) DESCRIPTION=${OPTARG};;
80 u) CBIS_MANAGER_USER=${OPTARG};;
81 p) CBIS_MANAGER_PASSWORD=${OPTARG};;
82 e) ESSENTIAL=${OPTARG};;
83 c) CONNECTIVITY=${OPTARG};;
84 x) STACK_PASSWORD=${OPTARG};;
85 esac
86done
87
88
89if [[ $DESCRIPTION == "" ]]
90then
91 echo -e "${RED}the script must run with -d <DESCRIPTION> (e.g, system_health_validation.sh -d 'after_clean_deploy_before_hardening'). exiting..${NC}"
92 exit
93else
94 if [[ "$DESCRIPTION" =~ \ |\' ]]
95 then
96 echo -e "${RED}the description can't have white-spaces in it. exiting..${NC}"
97 exit
98 fi
99fi
100
101
102if [[ $CBIS_MANAGER_USER == "" ]]
103then
104 CBIS_MANAGER_USER=cbis-admin
105 echo -e "${ORANGE}> no cbis manager username input (-u) was entered, using default username <cbis-admin>${NC}"
106fi
107if [[ $CBIS_MANAGER_PASSWORD == "" ]]
108then
109 CBIS_MANAGER_PASSWORD=password
110 echo -e "${ORANGE}> no cbis manager password input (-p) was entered, using default password <password>${NC}"
111fi
112cbis_manager_token=$(echo -n "$CBIS_MANAGER_USER:$CBIS_MANAGER_PASSWORD" | base64)
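# note: the token is a plain HTTP Basic auth credential (base64 of "user:password"); it is consumed
# later by the curl calls as: -H 'Authorization: Basic '$cbis_manager_token''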
113
114
115
116if [[ $ESSENTIAL == "yes" ]]
117then
118 echo -e "${ORANGE}> running only the script essential checks - to run all the checks, execute the script with [-e no] or simply don't use [-e]${NC}"
119 MODE=ESSENTIAL
120fi
121if [[ $ESSENTIAL == "no" ]]
122then
123 echo -e "${ORANGE}> running all the script checks - to run just the essential checks, execute the script with [-e yes]${NC}"
124 ESSENTIAL=no
125 MODE=FULL
126fi
127if [[ $ESSENTIAL == "" ]]
128then
129 echo -e "${ORANGE}> running all the script checks - to run just the essential checks, execute the script with [-e yes]${NC}"
130 ESSENTIAL=no
131 MODE=FULL
132fi
133if [[ $ESSENTIAL != "yes" && $ESSENTIAL != "no" ]]
134then
135 echo -e "${RED}-e accepts only [yes/no] input. exiting..${NC}"
136 exit
137fi
138
139
140if [[ $CONNECTIVITY == "" ]]
141then
142 CONNECTIVITY=no
143fi
144if [[ $CONNECTIVITY == "yes" ]]
145then
146 echo -e "> running the script with -c yes (connectivity check only)"
147fi
148if [[ $CONNECTIVITY != "yes" && $CONNECTIVITY != "no" ]]
149then
150 echo -e "${RED}-c accepts only [yes/no] input. exiting..${NC}"
151 exit
152fi
153
154
155###########################################################################################################
156main_function() {
157global_start=$(date +%s)
158start=$(date +%s)
159###########################################################################################################
160
161
162echo -e "\n\n${BLUE}system health validation script version: $SCRIPT_VERSION${NC}"
163echo -e "\n\n${UL}${BOLD}Colors Legend${NC}"
164echo -e "${GREEN}Success"
165echo -e "${RED}Failure (ideally, each failure equals bug)"
166echo -e "${MAGENTA}Action item for the user"
167echo -e "${ORANGE}Warning that should be read and acknowledged"
168echo -e "${NC}Information (for future debug purposes)"
169echo -e "${BLUE}Presentation (to make the script more eye appealing and organized)"
170echo -e "${CYAN}Presentation (to make the script more eye appealing and organized)\n"
171
172UNDERCLOUD_LOCAL_IP_ADDRESS=$(ip address show | grep 172.31.0.1/21)
173if [[ -z $UNDERCLOUD_LOCAL_IP_ADDRESS ]]
174then
175 echo -e "\n${RED}the system_health_validation.sh script must be executed only from the undercloud virtual machine. exiting..${NC}"
176 exit
177fi
178
179
180if [[ $CONNECTIVITY == "yes" ]]
181then
182 echo -e "${BLUE}\nINSTANCES CONNECTIVITY VALIDATION (VIA NETWORK NAMESPACE)${NC}"
183 nuage=$(grep nuage: user_config.yaml | awk '{print $2}' | column -t)
184 if [[ $nuage != "true" ]]
185 then
186 source ~/overcloudrc
187 # the reason behind working with the last index controller is to find issues related to controller replacement, e.g., an issue we faced where connectivity via the namespace to a VM failed only from the replaced controller.
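# the reverse sort below picks the highest-indexed controller hostname, e.g. overcloud-controller-2
# on a default three-controller deployment (hostname is illustrative; the prefix varies with cloud_name)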
188 last_index_controller=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort -r --uniq | awk NR==1)
189 cbis_version=$(openstack cbis version -f value | grep build | awk -F- '{print $2}')
190 instances=$(openstack server list --all -f value | wc -l)
191 if [[ $instances != "0" ]]
192 then
193 if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
194 then
195 networks=$(openstack server list --all --long -c Networks -f value | grep -E -i -v ':|avrs' | awk -F= '{print $1}' | sort -u)
196 if [[ $networks ]]
197 then
198 for network in $networks
199 do
200 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
201 if [[ -z $addresses ]]
202 then
203 echo -e "${RED}addresses variable didn't return any value${NC}"
204 fi
205 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
206 if [[ -z $network_id ]]
207 then
208 echo -e "${RED}network_id variable didn't return any value${NC}"
209 fi
210 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id nping -4 --tcp-connect -p 22 -c 3 $addresses\"${NC}"
211 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id nping -4 --tcp-connect -p 22 -c 3 $addresses" | grep 'Failed: [1-9]')
212 if [[ -z $result ]]
213 then
214 echo -e "${GREEN}network $network addresses replied successfully${NC}"
215 else
216 echo -e "${RED}$result${NC}"
217 fi
218 done
219 fi
220 networks=$(openstack server list --all --long -c Networks -f value | grep -i -v avrs | grep : | awk -F= '{print $1}' | sort -u)
221 if [[ $networks ]]
222 then
223 for network in $networks
224 do
225 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
226 if [[ -z $addresses ]]
227 then
228 echo -e "${RED}addresses variable didn't return any value${NC}"
229 fi
230 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
231 if [[ -z $network_id ]]
232 then
233 echo -e "${RED}network_id variable didn't return any value${NC}"
234 fi
235 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id nping -6 --tcp-connect -p 22 -c 3 $addresses\"${NC}"
236 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id nping -6 --tcp-connect -p 22 -c 3 $addresses" | grep 'Failed: [1-9]')
237 if [[ -z $result ]]
238 then
239 echo -e "${GREEN}network $network addresses replied successfully${NC}"
240 else
241 echo -e "${RED}$result${NC}"
242 fi
243 done
244 fi
245 else
246 networks=$(openstack server list --all --long -c Networks -f value | grep -v -i avrs)
247 ipv4_networks=$(echo "$networks" | grep -v : | awk -F= '{print $1}' | sort -u)
248 ipv6_networks=$(echo "$networks" | grep : | awk -F= '{print $1}' | sort -u)
249 if [[ $ipv4_networks ]]
250 then
251 for network in $ipv4_networks
252 do
253 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
254 if [[ -z $addresses ]]
255 then
256 echo -e "${RED}addresses variable didn't return any value${NC}"
257 fi
258 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
259 if [[ -z $network_id ]]
260 then
261 echo -e "${RED}network_id variable didn't return any value${NC}"
262 fi
263 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id fping $addresses'\"${NC}"
264 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id fping $addresses" | grep ^[0-9] | grep -v 'is alive')
265 if [[ $result ]]
266 then
267 echo -e "\n${RED}\n$result${NC}\n"
268 else
269 echo -e "${GREEN}all the addresses of network $network replied successfully${NC}"
270 fi
271 done
272 fi
273 if [[ $ipv6_networks ]]
274 then
275 for network in $ipv6_networks
276 do
277 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
278 if [[ -z $addresses ]]
279 then
280 echo -e "${RED}addresses variable didn't return any value${NC}"
281 fi
282 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
283 if [[ -z $network_id ]]
284 then
285 echo -e "${RED}network_id variable didn't return any value${NC}"
286 fi
287 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id fping6 $addresses$\"${NC}"
288 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id fping6 $addresses" | grep ^[0-9] | grep -v 'is alive')
289 if [[ $result ]]
290 then
291 echo -e "\n${RED}\n$result${NC}\n"
292 else
293 echo -e "${GREEN}all the addresses of network $network replied successfully${NC}"
294 fi
295 done
296 fi
297 fi
298 else
299 echo -e "${LRB}${BLINK}no instances are found on the system${NC}"
300 fi
301 elif [[ $nuage == "true" ]]
302 then
303 echo -e "${ORANGE}nuage/avrs instance aren't using the neutron dhcp namespace and therefore this check is irrelevant for nuage deployment${NC}"
304 fi
305 exit
306fi
307
308
309###########################################################################################################
310
311
312echo -e "\n${UL}${BLUE}GATHERING REQUIRED SYSTEM INFORMATION${NC}"
313
314echo -e "${CYAN}retrieve the cbis manager/hypervisor ip address${NC}"
315HypervisorURL=$(cat ~/user_config.yaml | grep -w hypervisor_cidr6)
316if [[ $HypervisorURL ]]
317then
318 HypervisorURL=$(cat ~/user_config.yaml | grep -w hypervisor_cidr | awk '{print $2}' | awk -F/ '{print $1}' | grep ^[0-9])
319 echo -e "${CYAN}retrieve the deployment ip stack${NC}"
320 ip_stack="IPv6/IPv4 Dual Stack"
321else
322 HypervisorURL=$(cat ~/user_config.yaml | grep -w hypervisor_cidr | awk '{print $2}' | awk -F/ '{print $1}' | grep ^[0-9])
323 echo -e "${CYAN}retrieve the deployment ip stack${NC}"
324 ip_stack="IPv4"
325fi
326
327echo -e "${CYAN}validate cbis manager credentials authenticity${NC}"
328cbis_manager_credentials_check=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X GET 'https://'$HypervisorURL'/api/pages' -H 'Authorization: Basic '$cbis_manager_token'' | grep RESP_CODE | awk -F: '{print $2}')
329if [[ $cbis_manager_credentials_check == "401" || $cbis_manager_credentials_check == "402" ]]
330then
331 echo -e "${RED}cbis manager authorization failure (http error $cbis_manager_credentials_check). make sure you entered the correct cbis manager user/password. exiting..${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
332 exit
333fi
334echo -e "${CYAN}retrieve cbis software version${NC}"
335cbis_version_full=$(openstack cbis version -f value)
336cbis_version_base=$(echo -e "$cbis_version_full" | grep build | awk '{print $NF}')
337cbis_version=$(echo -e "$cbis_version_full" | grep build | awk -F- '{print $2}')
338
339
340echo -e "${CYAN}retrieve cbis hotfix version and build${NC}"
341hotfix_install_success=$(/var/lib/cbis/cbis_hotfix list -f json 2>&1 | jq .[] 2>&1 | jq 'select(.state == "post-install-success")' 2>&1 | grep post-install-success)
342if [[ -z $hotfix_install_success ]]
343then
344 : # no successfully installed hotfix found; nothing to collect here
345else
346 hotfix_json=$(/var/lib/cbis/cbis_hotfix list -f json | jq .[] | jq 'select(.state == "post-install-success")')
347 hotfix_name=$(echo $hotfix_json | jq .hotfix_name | head -n1 | tr -d \")
348 build_number=$(echo $hotfix_json | jq .build_number | head -n1 | tr -d \")
349 hotfix_name_build="${hotfix_name}-${build_number}"
350fi
351
352echo -e "${CYAN}retrieve the setup hardware model${NC}"
353hw_model=$(grep hw_model_type user_config.yaml | awk '{print $2}')
354
355echo -e "${CYAN}retrieve the cloud name${NC}"
356cloud_name=$(cat user_config.yaml | grep cloud_name | awk '{print $2}')
357if [[ $cloud_name == "''" ]]
358then
359 cloud_name=""
360fi
361
362echo -e "${CYAN}retrieve the nfs backup mountpoint${NC}"
363backup_nfs_mountpoint=$(grep backup_nfs_mountpoint: user_config.yaml | awk '{print $NF}')
364
365echo -e "${CYAN}retrieve the undercloud vm external ip address${NC}"
366undercloud_vm_ip=$(sudo grep undercloud_cidr: user_config.yaml | awk '{print $2}' | awk -F/ '{print $1}')
367
368echo -e "${CYAN}retrieve the existing host-groups${NC}"
369aggregate_hosts=$(source ~/overcloudrc && openstack aggregate list --long -f value -c Name | grep [a-zA-Z]*Compute)
370
371echo -e "${CYAN}retrieve the setup platform (model) type${NC}"
372platform=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/installation/initial_page' -H 'Authorization: Basic '$cbis_manager_token'' | jq . | grep default | awk '{print $2}' | tr -d ,)
373
374echo -e "${CYAN}retrieve the entire cbis manager installation page${NC}"
375installation_page=$(curl -g -s -L -k -X POST 'https://'$HypervisorURL'/api/installation/status' -H 'Authorization: Basic '$cbis_manager_token'' -H 'content-type: application/json' --data '{"hardware":'$platform'}' | jq .)
376
377echo -e "${CYAN}retrieve openstack version${NC}"
378openstack_version_numerical=$(openstack --version 2>&1 | awk '{print $NF}')
379openstack_version_name=$(docker image list | grep '\-latest' | awk '{print $2}' | sort -u | awk -F- '{print $1}' | grep -v zabbix)
380
381echo -e "${CYAN}check if ceph backend is enabled or disabled${NC}"
382ceph_backend=$(grep ceph_backend_enabled: ~/user_config.yaml | awk '{print $2}')
383
384echo -e "${CYAN}check if the setup is deployed with nuage${NC}"
385nuage=$(grep nuage: user_config.yaml | awk '{print $2}' | column -t)
386
387echo -e "${CYAN}check if the setup is deployed external storage system${NC}"
388external_storage_system=$(cat user_config.yaml | grep external_storage_system | awk '{print $2}' | head -n1)
389
390echo -e "${CYAN}check if the setup is hci or non-hci${NC}"
391hci=$(grep ceph_hci: ~/user_config.yaml | awk '{print $2}')
392
393echo -e "${CYAN}check if the setup is deployed with/without ceph fast-pools${NC}"
394fast_pools=$(cat user_config.yaml | grep enable_fast_pool | awk '{print $2}')
395
396echo -e "${CYAN}retrieve the servers count (overcloud + undercloud vm)${NC}"
397ansible_all_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | awk '{print $1}' | sort --uniq | wc -l | xargs -i expr {} + 1)
398
399echo -e "${CYAN}retrieve controllers hostname${NC}"
400last_index_controller=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort --uniq | sort -n | tail -n1)
401first_index_controller=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort --uniq | awk NR==1)
402current_controllers=$(source ~/stackrc && openstack server list -c Name --no-name-lookup --name Controller -f value)
403current_controllers_piped=$(echo -e "$current_controllers" | paste -sd'|')
404
405echo -e "${CYAN}retrieve stackrc/overcloudrc openstack server list information${NC}"
406nova_overcloud_hosts_list=$(source ~/stackrc && openstack server list -f json | jq ".[]" | jq .Name | tr -d \" | sort | tr '[:upper:]' '[:lower:]')
407nova_overcloud_and_undercloud_hosts_count=$(echo -e "$nova_overcloud_hosts_list" | wc -l | xargs -i expr {} + 1)
408nova_instances=$(source ~/overcloudrc && openstack server list --long --all -f value -c ID -c Name -c Host -c Status -c "Power State" | column -t)
409nova_instances_count=$(echo -e "$nova_instances" | grep -c -E 'ACTIVE\s+Running')
410
411echo -e "${CYAN}retrieve overcloud images list${NC}"
412overcloud_images_list=$(source ~/overcloudrc && openstack image list -f value | column -t)
413
414echo -e "${CYAN}check if local storage is enabled/disabled per host-group${NC}"
415ovs_local_storage=$(awk '/OvsCompute:/,0' user_config.yaml | grep enable_local_storage: | head -n1 | awk '{print $2}')
416sriov_local_storage=$(awk '/SriovPerformanceCompute:/,0' user_config.yaml | grep enable_local_storage: | head -n1 | awk '{print $2}')
417dpdk_local_storage=$(awk '/DpdkPerformanceCompute:/,0' user_config.yaml | grep enable_local_storage: | head -n1 | awk '{print $2}')
418avrs_local_storage=$(awk '/AvrsCompute/,0' user_config.yaml | grep enable_local_storage: | head -n1 | awk '{print $2}')
419
420echo -e "${CYAN}retrieve the hostname of each host under each host-group from /etc/ansible/hosts${NC}"
421ansible_computes_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i compute | awk '{print $1}' | sort --uniq)
422ansible_controllers_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort --uniq)
423ansible_monitoring_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i monitoring | awk '{print $1}' | sort --uniq)
424ansible_storage_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i storage | awk '{print $1}' | sort --uniq)
425ansible_ovs_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i ovs | awk '{print $1}' | sort --uniq)
426ansible_sriov_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i sriov | awk '{print $1}' | sort --uniq)
427ansible_dpdk_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i dpdk | awk '{print $1}' | sort --uniq)
428ansible_avrs_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i avrs | awk '{print $1}' | sort --uniq)
429ansible_overcloud_hosts=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | awk '{print $1}' | sort --uniq)
430random_storage_hostname=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i storage | awk '{print $1}' | sort --uniq | shuf -n 1)
431random_compute_hostname=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i compute | awk '{print $1}' | sort --uniq | shuf -n 1)
432
433echo -e "${CYAN}retrieve the hosts count of each host-group${NC}"
434ansible_storage_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i storage | awk '{print $1}' | sort --uniq | wc -l)
435ansible_monitoring_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i monitoring | awk '{print $1}' | sort --uniq | wc -l)
436ansible_dpdk_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i dpdk | awk '{print $1}' | sort --uniq | wc -l)
437ansible_ovs_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i ovs | awk '{print $1}' | sort --uniq | wc -l)
438ansible_sriov_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i sriov | awk '{print $1}' | sort --uniq | wc -l)
439ansible_avrs_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i avrs | awk '{print $1}' | sort --uniq | wc -l)
440ansible_controllers_hosts_count=$(cat /etc/ansible/hosts | grep overcloud-[a-zA-Z] | grep -i controller | awk '{print $1}' | sort --uniq | wc -l)
441
442echo -e "${CYAN}retrieve $last_index_controller InternalApi, Tenant, Storage, StorageMgmt, ControlPlane (Provisioning) addresses${NC}"
443ip_addres_show=$(ansible $last_index_controller -b -m shell -a "ip addres show")
444internal_api_cidr=$(cat /home/stack/templates/network-environment.j2.yaml | grep InternalApiNetCidr | awk '{print $2}' | cut -d . -f 1-3)
445internal_api_controller_address=$(echo -e "$ip_addres_show" | grep $internal_api_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
446tenant_cidr=$(cat /home/stack/templates/network-environment.j2.yaml | grep TenantNetCidr | awk '{print $2}' | cut -d . -f 1-3)
447tenant_controller_address=$(echo -e "$ip_addres_show" | grep $tenant_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
448storage_cidr=$(cat /home/stack/templates/network-environment.j2.yaml | grep StorageNetCidr | awk '{print $2}' | cut -d . -f 1-3)
449storage_controller_address=$(echo -e "$ip_addres_show" | grep $storage_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
450storage_mgmt_cidr=$(cat /home/stack/templates/network-environment.j2.yaml | grep StorageMgmtNetCidr | awk '{print $2}' | cut -d . -f 1-3)
451storage_mgmt_controller_address=$(echo -e "$ip_addres_show" | grep $storage_mgmt_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
452provisioning_cidr=$(cat /home/stack/templates/network-environment.yaml | grep ControlPlaneDefaultRoute | awk '{print $2}' | cut -d . -f 1-3)
453provisioning_controller_address=$(echo -e "$ip_addres_show" | grep $provisioning_cidr | grep inet | awk NR==1 | awk '{print $2}' | awk -F/ '{print $1}')
454
455echo -e "${CYAN}retrieve the hypervisor_dedicated_cpus value from the computes${NC}"
456ovs_hypervisor_dedicated_cpus=$(grep -w OvsCompute: user_config.yaml -A19 | grep hypervisor_dedicated_cpus: | awk '{print $2}')
457sriov_hypervisor_dedicated_cpus=$(grep -w SriovPerformanceCompute: user_config.yaml -A19 | grep hypervisor_dedicated_cpus: | awk '{print $2}')
458dpdk_hypervisor_dedicated_cpus=$(grep -w DpdkPerformanceCompute: user_config.yaml -A19 | grep hypervisor_dedicated_cpus: | awk '{print $2}')
459avrs_hypervisor_dedicated_cpus=$(grep -w AvrsCompute: user_config.yaml -A19 | grep hypervisor_dedicated_cpus: | awk '{print $2}')
460
461echo -e "${CYAN}check if the setup is deployed with ELK and which deployment type (remote or local)${NC}"
462elk=$(cat user_config.yaml | grep deploy_elk | awk '{print $2}')
463elk_deployment_type=$(cat user_config.yaml | grep elk_deployment_type | awk '{print $2}')
464
465echo -e "${CYAN}retrieve OS_AUTH_URL (public virtual IP address)${NC}"
466PublicURL=$(cat user_config.yaml | grep ip_range_start: | awk '{print $2}' | grep -v :)
467if [[ -z $PublicURL ]]
468then
469 PublicURL=$(cat user_config.yaml | grep ip_range_start: | awk '{print $2}' | sed 's/$/]/g' | sed 's/^/[/g')
470else
471 PublicURL=$(cat user_config.yaml | grep ip_range_start: | awk '{print $2}')
472fi
473
474echo -e "${CYAN}retrieve the horizon admin password${NC}"
475ADMIN_PASSWORD=$(grep -w admin_password: user_config.yaml | awk '{print $NF}')
476
477echo -e "${CYAN}retrieve zabbix username and password${NC}"
478zabbix_username=$(cat user_config.yaml | grep zabbix_username: | awk '{print $2}')
479zabbix_password=$(cat user_config.yaml | grep zabbix_password: | awk '{print $2}')
480
481echo -e "${CYAN}retrieve kibana user, password and basic authentication token${NC}"
482kibana_user=$(cat user_config.yaml | grep kibana_username | awk '{print $2}')
483kibana_password=$(cat user_config.yaml | grep kibana_password | awk '{print $2}')
484kibana_basic_auth=$(echo -n "$kibana_user:$kibana_password" | base64)
485
486echo -e "${CYAN}retrieve zabbix authentication token${NC}"
487zabbix_auth=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php?result=$result' \
488-H 'Content-Type: application/json' \
489-H 'Cookie: SERVERID='$last_index_controller'' \
490--data '{
491 "jsonrpc": "2.0",
492 "method": "user.login",
493 "params": {
494 "user": "'$zabbix_username'",
495 "password": "'$zabbix_password'"
496 },
497 "id": 1,
498 "auth": null
499}' | jq .result)
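# the call above is a standard Zabbix JSON-RPC "user.login" request; a successful reply typically
# looks like {"jsonrpc":"2.0","result":"0424bd59b807674191e7d77572075f33","id":1} (example token only),
# and jq .result keeps just the session token that the host.get call below sends in its "auth" field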
500
501echo -e "${CYAN}retrieve zabbix overcloud hosts list${NC}"
502zabbix_hosts_raw=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
503-H 'Content-Type: application/json-rpc' \
504-H 'Cookie: SERVERID='$last_index_controller'' \
505--data '{
506 "jsonrpc": "2.0",
507 "method": "host.get",
508 "params": {
509 "output": [
510 "hostid",
511 "host"
512 ],
513 "selectInterfaces": [
514 "interfaceid",
515 "ip"
516 ]
517 },
518 "id": 2,
519 "auth": '$zabbix_auth'
520}')
521zabbix_hosts=$(echo -e "$zabbix_hosts_raw" | jq -r .result | jq ".[].host" | tr -d \" | grep -E -v -i 'undercloud|Zabbix server|active-controller|switch' | sort | tr '[:upper:]' '[:lower:]')
522zabbix_hosts_and_ids=$(echo -e "$zabbix_hosts_raw" | jq .result[] | jq -r "[.host,.hostid]")
523
524if [[ $cbis_version != "18.0.0.1" && $cbis_version != "19.0.0.1" ]]
525then
526 echo -e "${CYAN}check if the setup is upgraded${NC}"
527 upgraded=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/cbis_upgrade/state' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' | jq .state | tr -d \")
528fi
529
530echo -e "${CYAN}retrieve the entire system installed rpms${NC}"
531sshpass -p $hv_cbis_admin_password ansible -k all -b -m shell -a "rpm -qa warn=False" > installed_rpms_$(date +%Y-%m-%d_%H-%M-%S)
532
533echo -e "${CYAN}retrieve the entire system bios version${NC}"
534bios=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "dmidecode -s bios-version" | tee BIOS_version)
535bios=$(echo -e "$bios" | grep -v SUCCESS | sort -u)
536
537echo -e "${CYAN}retrieve the entire system firmware version${NC}"
538firmware=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | tee firmware_version)
539firmware=$(echo -e "$firmware" | grep -v SUCCESS | sort -u)
540
541if [[ $nuage == "true" ]]
542then
543 echo -e "${CYAN}retrieve the vsd ip address${NC}"
544 vsd_ip=$(cat user_config.yaml | grep NeutronNuageVSDIp | awk '{print $NF}' | awk -F: '{print $1}')
545fi
546
547echo -e "${CYAN}retrieve the amount of time (in hours) from when the system was successfully deployed till now${NC}"
548deployment_ended_epoch_date=$(cat /var/log/cbis/overcloud_installation.log | grep ^20[2-9][1-9] | tail -n1 | awk -F, '{print $1}' | xargs -i date -d "{}" +%s)
549current_date=$(date +%s)
550uptime_hours=$(echo "scale=1;((($current_date-$deployment_ended_epoch_date) / 60 / 60))" | bc)
551uptime_days=$(echo -e "$uptime_hours" | awk -F. '{print $1}' | xargs -i echo "scale=1;(({} / 24))" | bc)
552
553echo -e "\n\n===================================================================================================="
554echo -e " SYSTEM SUMMARY"
555echo -e "===================================================================================================="
556fixed_platform=$(echo -e "$platform" | tr -d \")
557if [[ $fixed_platform ]]
558then
559 echo -e "PLATFORM \t\t\t\t\t\t= $fixed_platform"
560else
561 echo -e "${RED}WARNING!${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
562 echo -e "${RED}The platform value of cbis manager installation page returned empty value.\nIn most cases it happens after uninstalling CBIS manager.\nSome checks might be negativly affected by it. Review the results with discretion.${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
563 platform=$(cat user_config.yaml | grep hw_model_type: | awk '{print $2}' | sed 's/$/"/g' | sed 's/^/"/g')
564 fixed_platform=$(echo -e "$platform" | tr -d \")
565 echo -e "===================================================================================================="
566 echo -e "PLATFORM \t\t\t\t\t\t= $fixed_platform"
567fi
568echo -e "BASE CBIS VERSION \t\t\t\t\t= $cbis_version_base"
569if [[ $cbis_version != "18.0.0.1" && $cbis_version != "19.0.0.1" ]]
570then
571 if [[ $upgraded == "NEW" ]]
572 then
573 echo -e "CBIS UPGRADE STATE \t\t\t\t\t= $upgraded (UPGRADE NEVER STARTED)"
574 elif [[ $upgraded == "UNKNOWN" ]]
575 then
576 echo -e "CBIS UPGRADE STATE \t\t\t\t\t= $upgraded (UPGRADE NEVER STARTED OR UPGRADE FAILED)"
577 else
578 echo -e "CBIS UPGRADE STATE \t\t\t\t\t= $upgraded"
579 fi
580fi
581echo -e "OPENSTACK VERSION \t\t\t\t\t= $openstack_version_numerical ($openstack_version_name)"
582if [[ $fixed_platform != 'airframe' && $fixed_platform != 'dell-730' && $fixed_platform != 'hp-slg7_OVS' && $fixed_platform != 'hp-slg7_OVS_SSD_single_nic' && $fixed_platform != 'hp-c7kg8' && $fixed_platform != 'hp-c7kg9' ]]
583then
584 mlx_ofed_version=$(ansible compute -b -m shell -a "/usr/bin/ofed_info | head -n1" | awk '{print $1}' | grep -v overcloud | sort -u | grep -v /bin/sh)
585 if [[ $mlx_ofed_version == "/bin/sh:" ]]
586 then
587 echo -e "${RED}unable to find installed mellanox ofed on this $fixed_platform system (/usr/bin/ofed_info)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
588 nics_type="Intel"
589 else
590 nics_firmware=$(ansible compute -b -m shell -a "ovs-appctl bond/list | grep tenant-bond | awk '{print \$NF}' | sort -u | xargs -i ethtool -i {}" | grep firmware-version: | head -n1 | awk '{print $2,$3}')
591 if [[ -z $nics_firmware ]]
592 then
593 nics_firmware=$(ansible compute -b -m shell -a "cat /proc/net/bonding/tenant-bond | grep 'Slave Interface:' | awk '{print \$NF}' | sort -u | xargs -i ethtool -i {}" | grep firmware-version: | head -n1 | awk '{print $2,$3}')
594 fi
595 echo -e "NICS TYPE \t\t\t\t\t\t= $mlx_ofed_version"
596 echo -e "NICS FIRMWARE \t\t\t\t\t\t= $nics_firmware"
597 nics_firmware=$(echo -e "$nics_firmware" | awk '{print $1}')
598 nics_type="$mlx_ofed_version"
599 nics_type+="-$nics_firmware"
600 fi
601else
602 nics_type="Intel"
603 echo -e "NICS TYPE \t\t\t\t\t\t= $nics_type"
604fi
605echo -e "eSW \t\t\t\t\t\t\t= $bios (BIOS), $firmware (firmware)"
606if [[ $hotfix_install_success ]]
607then
608 echo -e "SUCCESSFULLY DEPLOYED PATCH(S) \t\t\t\t= $hotfix_name_build"
609 hotfix_deployment_type=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/deployment.log" | grep 'Hooks found')
610 if [[ $hotfix_deployment_type ]]
611 then
612 echo -e "PATCH DEPLOYMENT TYPE \t\t\t\t\t= Scratch Deployment"
613 else
614 echo -e "PATCH DEPLOYMENT TYPE \t\t\t\t\t= Patch Management"
615 fi
616else
617 echo -e "SUCCESSFULLY DEPLOYED PATCH(S) \t\t\t\t= None"
618fi
619echo -e "IP STACK \t\t\t\t\t\t= $ip_stack"
620if [[ $nuage == "true" ]]
621then
622 nuage_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /usr/share/cbis/nuage-version" | grep ^[0-9] | sort --uniq)
623 echo -e "NUAGE \t\t\t\t\t\t\t= $nuage_version"
624
625else
626 echo -e "NUAGE \t\t\t\t\t\t\t= false"
627fi
628if [[ $external_storage_system != "null" ]]
629then
630 echo -e "EXTERNAL STORAGE SYSTEM \t\t\t\t= $external_storage_system"
631 storage_type="$external_storage_system"
632fi
633echo -e "ELK (ElasticSearch, Logstash & Kibana) \t\t\t= $elk ($elk_deployment_type)"
634if [[ $ceph_backend == "true" ]]
635then
636 echo -e "CEPH BACKEND \t\t\t\t\t\t= true"
637 if [[ $hci == "true" ]]
638 then
639 echo -e "HCI \t\t\t\t\t\t\t= true"
640 storage_type="HCI"
641 elif [[ $hci == "false" && $fast_pools == "false" ]]
642 then
643 echo -e "MULTI POOLS \t\t\t\t\t\t= true"
644 storage_type="Multi_Pools"
645 echo -e "STORAGE HOSTS \t\t\t\t\t\t= $ansible_storage_hosts_count"
646 else
647 echo -e "FAST POOLS \t\t\t\t\t\t= true"
648 storage_type="Fast_Pools"
649 echo -e "STORAGE HOSTS \t\t\t\t\t\t= $ansible_storage_hosts_count"
650 fi
651else
652 echo -e "CEPH BACKEND \t\t\t\t\t\t= false"
653fi
654echo -e "CONTROLLER HOSTS \t\t\t\t\t= $ansible_controllers_hosts_count"
655if [[ $ansible_monitoring_hosts_count != "0" ]]
656then
657 echo -e "MONITORING HOSTS \t\t\t\t\t= $ansible_monitoring_hosts_count"
658fi
659if [[ $ansible_dpdk_hosts_count != "0" ]]
660then
661 echo -e "DPDK HOSTS \t\t\t\t\t\t= $ansible_dpdk_hosts_count"
662fi
663if [[ $ansible_ovs_hosts_count != "0" ]]
664then
665 echo -e "OVS HOSTS \t\t\t\t\t\t= $ansible_ovs_hosts_count"
666fi
667if [[ $ansible_sriov_hosts_count != "0" ]]
668then
669 echo -e "SRIOV HOSTS \t\t\t\t\t\t= $ansible_sriov_hosts_count"
670fi
671if [[ $ansible_avrs_hosts_count != "0" ]]
672then
673 echo -e "AVRS HOSTS \t\t\t\t\t\t= $ansible_avrs_hosts_count"
674fi
675if [[ $ansible_dpdk_hosts_count != "0" && $dpdk_local_storage != "false" ]]
676then
677 echo -e "DPDK LOCAL STORAGE \t\t\t\t\t\t= $dpdk_local_storage"
678fi
679if [[ $ansible_ovs_hosts_count != "0" && $ovs_local_storage != "false" ]]
680then
681 echo -e "OVS LOCAL STORAGE \t\t\t\t\t= $ovs_local_storage"
682fi
683if [[ $ansible_sriov_hosts_count != "0" && $sriov_local_storage != "false" ]]
684then
685 echo -e "SRIOV LOCAL STORAGE \t\t\t\t\t\t= $sriov_local_storage"
686fi
687if [[ $ansible_avrs_hosts_count != "0" && $avrs_local_storage != "false" ]]
688then
689 echo -e "AVRS LOCAL STORAGE \t\t\t\t\t\t= $avrs_local_storage"
690fi
691echo -e "DEPLOYMENT UPTIME \t\t\t\t\t= $uptime_hours hours ($uptime_days days)"
692echo -e "===================================================================================================="
693elapsed_time_seconds=$(expr $(date +%s) - $global_start)
694
695
696####################################################################################################
697
698
699start=$(date +%s)
700STEPS_COUNTER=$((STEPS_COUNTER+1))
701echo -e "${BLUE}\n\n$STEPS_COUNTER) SET USER stack PASSWORD (`date '+%T'`)${NC}"
702if [[ $STACK_PASSWORD != "" ]]
703then
704 echo -e "$STACK_PASSWORD\n$STACK_PASSWORD" | sudo passwd stack
705 echo -e "${GREEN}updated user stack password to <$STACK_PASSWORD>${NC}"
706else
707 echo -e "${ORANGE}skipped user stack password modification${NC}"
708fi
709elapsed_time_seconds=$(expr $(date +%s) - $global_start)
710
711
712####################################################################################################
713
714
715if [[ $ESSENTIAL == "yes" || $ESSENTIAL == "no" ]]
716then
717 start=$(date +%s)
718 STEPS_COUNTER=$((STEPS_COUNTER+1))
719 echo -e "${BLUE}\n\n$STEPS_COUNTER) SHOW HOTFIXES HISTORY (+$elapsed_time_seconds `date '+%T'`)${NC}"
720 hotfix_existance_check=$(/var/lib/cbis/cbis_hotfix list-all -f json 2>/dev/null)
721 if [[ $hotfix_existance_check ]]
722 then
723 hotfix_history_state=$(/var/lib/cbis/cbis_hotfix list-all -f json 2>/dev/null | jq .[].state | grep -v post-install-success)
724 hotfix_history=$(/var/lib/cbis/cbis_hotfix list-all -f json 2>/dev/null | jq .[] | jq '{hotfix_name,state}')
725 if [[ $hotfix_history_state ]]
726 then
727 echo -e "${ORANGE}$hotfix_history${NC}"
728 else
729 echo -e "${GREEN}$hotfix_history${NC}"
730 fi
731 else
732 echo -e "${ORANGE}the system was never deployed with any hotfix${NC}"
733 fi
734 elapsed_time_seconds=$(expr $(date +%s) - $start)
735
736
737 ####################################################################################################
738
739
740 start=$(date +%s)
741 STEPS_COUNTER=$((STEPS_COUNTER+1))
742 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR ISSUES IN THE SRE ICE HEALTH CHECK REPORT (+$elapsed_time_seconds `date '+%T'`)${NC}"
743 if [ -d "/home/stack/HealthCheckFiles/" ]
744 then
745 ISSUES=$(ls -lrt /home/stack/HealthCheckFiles/ | grep \.log | awk '{print $NF}' | tail -n 1 | xargs -i cat /home/stack/HealthCheckFiles/{} | grep -E '\s+No\s+' | sort | uniq -c | tr -s '[:space:]' | awk -F\| '{print $1,$NF}')
746 if [[ $ISSUES ]]
747 then
748 echo -e "${RED}$ISSUES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
749 else
750 echo -e "${GREEN}no issues found in the ice health check report (excluding warnings)${NC}"
751 fi
752 INSTALLED_ICE_VERSION=$(cat ~/ice/ice_version 2>&1)
753 LATEST_ISP=$(curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/ | grep -E 'ice-support-package-[0-9]' | tail -n 1 | awk -F\" '{print $2}')
754 LATEST_ISP_VALIDATION=$(echo -e "$LATEST_ISP" | grep -w "$INSTALLED_ICE_VERSION")
755 if [[ -z $LATEST_ISP_VALIDATION ]]
756 then
757 echo -e "\n${MAGENTA}it was found that the latest ice service package build is not used. current version: $INSTALLED_ICE_VERSION, latest version: $LATEST_ISP\nto obtain and execute the latest build perform the following:${NC}\n"
758 echo -e "${MAGENTA}${UL}from the undercloud physical server:${NC}"
759 echo -e "${MAGENTA}cd /root/${NC}"
760 echo -e "${MAGENTA}rm -rf /root/sre/${NC}"
761 echo -e "${MAGENTA}mkdir /root/sre/${NC}"
762 echo -e "${MAGENTA}cd /root/sre/${NC}"
763 echo -e "${MAGENTA}ISP=\$(curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/ | grep -E 'ice-support-package-[0-9]' | tail -n 1 | awk -F'\"' '{print \$2}')${NC}"
764 echo -e "${MAGENTA}curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/\$ISP -o \$ISP${NC}"
765 echo -e "${MAGENTA}chmod +x \$ISP${NC}"
766 echo -e "${MAGENTA}./\$ISP${NC}"
767 echo -e "${MAGENTA}cd /root/sre/ice-support-package/Installer/${NC}"
768 echo -e "${MAGENTA}python IceInstaller.py${NC}"
769 echo -e "\n${MAGENTA}${UL}from the undercloud vm:${NC}"
770 echo -e "${MAGENTA}cd ~/${NC}"
771 echo -e "${MAGENTA}. icerc${NC}"
772 echo -e "${MAGENTA}ice healthcheck${NC}"
773 echo -e "\n${MAGENTA}re-run the system health validation script to see the results of the ice healthcheck execution${NC}"
774 fi
775 if [[ $ISSUES ]]
776 then
777 echo -e "\n\n${ORANGE}note: once you fixed any of the above failures, re-run the ice health check\nthe system validation script always reads from the last health check report and will keep presenting the previous failures until a new report is created${NC}"
778 fi
779 else
780 echo -e "${MAGENTA}/home/stack/HealthCheckFiles/ is not found which means that the ice health check was never executed${NC}"
781 echo -e "${MAGENTA}to obtain and execute the latest ice service package build perform the following:${NC}\n"
782 echo -e "${MAGENTA}${UL}from the undercloud physical server:${NC}"
783 echo -e "${MAGENTA}cd /root/${NC}"
784 echo -e "${MAGENTA}rm -rf /root/sre/${NC}"
785 echo -e "${MAGENTA}mkdir /root/sre/${NC}"
786 echo -e "${MAGENTA}cd /root/sre/${NC}"
787 echo -e "${MAGENTA}ISP=\$(curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/ | grep -E 'ice-support-package-[0-9]' | tail -n 1 | awk -F'\"' '{print \$2}')${NC}"
788 echo -e "${MAGENTA}curl -s https://repo.lab.pl.alcatel-lucent.com/ice-generic-candidates/\$ISP -o \$ISP${NC}"
789 echo -e "${MAGENTA}chmod +x \$ISP${NC}"
790 echo -e "${MAGENTA}./\$ISP${NC}"
791 echo -e "${MAGENTA}cd /root/sre/ice-support-package/Installer/${NC}"
792 echo -e "${MAGENTA}python IceInstaller.py${NC}"
793 echo -e "\n${MAGENTA}${UL}from the undercloud vm:${NC}"
794 echo -e "${MAGENTA}cd ~/${NC}"
795 echo -e "${MAGENTA}. icerc${NC}"
796 echo -e "${MAGENTA}ice healthcheck${NC}"
797 echo -e "\n${MAGENTA}re-run the system health validation script to see the results of the ice healthcheck execution${NC}"
798 fi
799 elapsed_time_seconds=$(expr $(date +%s) - $start)
800
801
802 ####################################################################################################
803
804
805 start=$(date +%s)
806 STEPS_COUNTER=$((STEPS_COUNTER+1))
807 echo -e "${BLUE}\n\n$STEPS_COUNTER) SEARCH FOR IPMI HOSTS LEFTOVERS IN hosts.yaml and hosts_config.yaml (+$elapsed_time_seconds `date '+%T'`)${NC}"
808 cat hosts.yaml | grep pm_addr | awk '{print $2}' | sort -n > hosts_yaml_ipmi_addresses.txt
809 cat hosts_config.yaml | grep -E ^'\s+\-\s+[0-9]' | awk '{print $NF}' | sort -n > hosts_config_yaml_ipmi_addresses.txt
810 source ~/stackrc && openstack baremetal node list --long | grep ipmi_address | awk -F"ipmi_address" '{print $2}' | awk '{print $2}' | tr -d "'u," | sort -n > ironic_ipmi_addresses.txt
811 LEFTOVER_IPMI_HOSTS=$(cat hosts_yaml_ipmi_addresses.txt ironic_ipmi_addresses.txt hosts_config_yaml_ipmi_addresses.txt | sort -n | uniq -c | column -t | grep -v -E '^3\s+[0-9]' | awk '{print $NF}' | paste -sd'|')
812 if [[ $LEFTOVER_IPMI_HOSTS ]]
813 then
814 LEFTOVER_IPMI_HOSTS_DETAILED=$(strings -f hosts_yaml_ipmi_addresses.txt hosts_config_yaml_ipmi_addresses.txt | grep -E $LEFTOVER_IPMI_HOSTS)
815 echo -e "${RED}$LEFTOVER_IPMI_HOSTS_DETAILED${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
816 else
817 echo -e "${GREEN}no diff found between ironic, hosts.yaml and hosts_config.yaml${NC}"
818 fi
819 elapsed_time_seconds=$(expr $(date +%s) - $start)
820
821
822 ####################################################################################################
823
824
825 start=$(date +%s)
826 STEPS_COUNTER=$((STEPS_COUNTER+1))
827 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR HIGH DENTRY VALUES (sar) (+$elapsed_time_seconds `date '+%T'`)${NC}"
828 ### each iteration compares two consecutive dentry samples (sar collects a sample every 10 minutes)
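### illustrative 'sar -v' sample (column layout may vary between sysstat versions):
###   12:00:01 AM  dentunusd   file-nr  inode-nr  pty-nr
###   12:10:01 AM     183424      8096    215936       2
### assuming the default 12-hour locale, the AM/PM timestamp occupies two fields, so awk '{print $3}'
### below extracts the dentunusd column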
829 for controller in $current_controllers
830 do
831 echo -e "${CYAN}$controller${NC}"
832 TAIL=10
833 FIRST=1
834 SECOND=2
835 EXIT_LOOP=0
836 FAILURE=0
837 SAR=$(ansible $controller -b -m shell -a "sar -v" | grep -v -E -i 'dentunusd|Average|Linux|overcloud-controller' | sed '/^$/d' | tail -n $TAIL)
838 while true
839 do
840 LAST_DENTRY=$(echo -e "$SAR" | tail -n $FIRST | awk NR==1 | awk '{print $3}')
841 SECOND_LAST_DENTRY=$(echo -e "$SAR" | tail -n $SECOND | awk NR==1 | awk '{print $3}')
842 RESULT=$(expr $LAST_DENTRY - $SECOND_LAST_DENTRY)
843 FIRST=$((FIRST+1))
844 SECOND=$((SECOND+1))
845 EXIT_LOOP=$((EXIT_LOOP+1))
846 if [ $RESULT -gt 100000 ]
847 then
848 echo -e "${RED}$RESULT${NC}"
849 FAILURE=$((FAILURE+1))
850 fi
851 if [ $EXIT_LOOP -gt $TAIL ]
852 then
853 break
854 fi
855 done
856 if [ $FAILURE -gt 0 ]
857 then
858 echo -e "\n\n${RED}$SAR${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
859 echo -e "${ORANGE}the above is fixed in CBIS 19A SP4-PP4, 20 PP4 and 22 - expect failures if you are using an older CBIS version (CBIS-16043, CBIS-16051)${NC}\n\n"
860 else
861 echo -e "${GREEN}no high dentry values are found${NC}\n"
862 fi
863 done
864 elapsed_time_seconds=$(expr $(date +%s) - $start)
865
866
867 ####################################################################################################
868
869
870 start=$(date +%s)
871 STEPS_COUNTER=$((STEPS_COUNTER+1))
872 CACHE_SIZE=100000
873 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE KERNEL SLAB TABLES WITH MORE THEN $CACHE_SIZE MB (+$elapsed_time_seconds `date '+%T'`)${NC}"
874 slabtop=$(ansible controller -b -m shell -a "slabtop -o -s c 2>&1 | grep ^[1-9] -B 1 | column -t | head -n 2")
875 high_memory_cache_table=$(ansible controller -b -m shell -a "slabtop -o -s c 2>&1 | grep ^[1-9] | tr -d K | awk NR==1 | awk 'NF{NF-=1};1' | awk '{print \$NF}' | xargs -i expr {} \/ 1024" | grep ^[0-9] | sort -n | tail -n 1)
876 if [ $high_memory_cache_table -gt $CACHE_SIZE ]
877 then
878 echo -e "${RED}$slabtop${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
879 else
880 echo -e "${GREEN}no kernel cache table that is using more then $CACHE_SIZE MB is found\n\n$slabtop${NC}"
881 fi
882 elapsed_time_seconds=$(expr $(date +%s) - $start)
883
884
885 ####################################################################################################
886
887
888 start=$(date +%s)
889 STEPS_COUNTER=$((STEPS_COUNTER+1))
890 echo -e "\n\n${BLUE}$STEPS_COUNTER) CHECK FOR CBIS MANAGER PAGES IN IN-PROGRESS STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
891 pages=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/pages' -H 'Authorization: Basic '$cbis_manager_token'' | jq . | grep name | grep -v \, | awk '{print $NF}' | tr -d \")
892 COUNTER=0
893 for page in $pages
894 do
895 state=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/'$page'/state' -H 'Authorization: Basic '$cbis_manager_token'' | jq .state | tr -d \")
896 if [[ $state == "IN_PROGRESS" ]]
897 then
898 echo -e "${RED}$page is in $state state${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
899 COUNTER=$((COUNTER+1))
900 fi
901 done
902 if [[ $COUNTER -eq 0 ]]
903 then
904 echo -e "${GREEN}no pages in CBIS manager with IN_PROGRESS state${NC}"
905 fi
906 elapsed_time_seconds=$(expr $(date +%s) - $global_start)
907
908
909 ####################################################################################################
910
911
912 start=$(date +%s)
913 STEPS_COUNTER=$((STEPS_COUNTER+1))
914 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR CBIS MANAGER PAGES IN PARTIAL STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
915 pages=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/pages' -H 'Authorization: Basic '$cbis_manager_token'' | jq . | grep name | grep -v \, | awk '{print $NF}' | tr -d \")
916 COUNTER=0
917 for page in $pages
918 do
919 state=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/api/'$page'/state' -H 'Authorization: Basic '$cbis_manager_token'' | jq .state | tr -d \")
920 if [[ $state == "PARTIAL" ]]
921 then
922 echo -e "${ORANGE}$page is in $state state${NC}"
923 echo -e "\n${ORANGE}note: currently, the only known PARTIAL state scenario is when the deployment smoke test failed or the deployment was executed with smoke test disabled${NC}"
924 COUNTER=$((COUNTER+1))
925 fi
926 done
927 if [[ $COUNTER -eq 0 ]]
928 then
929 echo -e "${GREEN}no pages in CBIS manager with PARTIAL state${NC}"
930 fi
931 elapsed_time_seconds=$(expr $(date +%s) - $global_start)
932
933
934 ####################################################################################################
935
936
937 start=$(date +%s)
938 STEPS_COUNTER=$((STEPS_COUNTER+1))
939 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE SYSTEMD SERVICES STATUS INCONCITIENCIES BETWEEN THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
940 unique_enabled_services=$(ansible controller -b -m shell -a "systemctl list-unit-files --state=enabled" | grep enabled | grep -v cbis_update_ceph_pgs.service |sort | uniq -c | grep -v '3 ' | awk '{print $2}' | paste -sd"|")
941 if [[ $unique_enabled_services ]]
942 then
943 services_mismatch=$(ansible controller -b -m shell -a "systemctl list-unit-files | grep -E '$unique_enabled_services'")
944 echo -e "${RED}$services_mismatch${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
945 echo -e "\n\n${ORANGE}CBIS-16369 (19A) - mount_cephfs_share is not configured on replaced controllers${NC}"
946 else
947 echo -e "${GREEN}no inconsistencies found between the systemd services of the controllers${NC}"
948 fi
949 elapsed_time_seconds=$(expr $(date +%s) - $start)
950
951
952 ####################################################################################################
953
954
955 start=$(date +%s)
956 STEPS_COUNTER=$((STEPS_COUNTER+1))
957 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THE OVERCLOUD BACKUP db_backup.enc IS CREATED FROM ALL THE CONTROLLERS UNDER THE CONFIGURED BACKUP NFS DIRECTORY (+$elapsed_time_seconds `date '+%T'`)${NC}"
958 deployment_date=$(sudo cat /var/log/cbis/overcloud_installation.log | grep -E ^202[1-9]-[0-9] | tail -n1 | awk '{print $1}')
959 current_date_for_skip_check=$(date +%Y-%m-%d)
960 current_date_for_backup_directories=$(date +%Y.%m.%d)
961 db_backup_directory_check=$(sudo du -ha /mnt/backup/overcloud-controller-*/$current_date_for_backup_directories* 2>&1)
962 db_backup=$(echo -e "$db_backup_directory_check" | grep db_backup.enc -c)
963 no_file_or_directory_error=$(echo -e "$db_backup_directory_check" | grep 'No such file or directory')
964 # if the deployment date and the current date are the same, the test expects no overcloud backup. the overcloud backup is created by default each night at 02:00 AM, so if the setup was deployed on the same day this check runs there will not be any overcloud backup yet, and the check is skipped by design.
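# expected layout when the nightly backup has already run (illustrative paths; dates and times vary):
#   /mnt/backup/overcloud-controller-0/2021.08.18-02.00/db_backup.enc
#   /mnt/backup/overcloud-controller-1/2021.08.18-02.00/db_backup.enc
#   /mnt/backup/overcloud-controller-2/2021.08.18-02.00/db_backup.enc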
965 if [[ $no_file_or_directory_error && $deployment_date == $current_date_for_skip_check ]]
966 then
967 echo -e "${ORANGE}since only today the setup was deployed, no overcloud backups are expected under /mnt/backup/${NC}"
968 else
969 if [[ $db_backup != "3" ]]
970 then
971 echo -e "${RED}unable to find 3 db_backup.enc backup files (one per controller) under /mnt/backup/overcloud-controller-*/$current_date_for_backup_directories*${NC}\n"
972 echo -e "${RED}$db_backup_directory_check${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
973 else
974 echo -e "${GREEN}found 3 db_backup.enc files under the /mnt/backup/overcloud-controller-*/$current_date_for_backup_directories directories${NC}"
975 echo -e "\n${GREEN}$db_backup_directory_check${NC}"
976 fi
977 fi
978 elapsed_time_seconds=$(expr $(date +%s) - $start)
979
980
981 ####################################################################################################
982
983
984 start=$(date +%s)
985 STEPS_COUNTER=$((STEPS_COUNTER+1))
986 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE REMOVED HOSTS FINGERPRINTS LEFTOVERS IN /home/stack/.ssh/known_hosts (+$elapsed_time_seconds `date '+%T'`)${NC}"
987 LEFTOVER_KNOWN_HOSTS=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "ssh-keyscan -t ecdsa localhost" | grep ^localhost | awk '{print $3}' | sed 's/.$//' | awk -F'/' '{print $NF}' | awk -F'+' '{print $NF}' | paste -sd'|' | xargs -i sudo grep -E -v '{}' /home/stack/.ssh/known_hosts)
988 if [[ $LEFTOVER_KNOWN_HOSTS ]]
989 then
990 echo -e "${RED}$LEFTOVER_KNOWN_HOSTS${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
991 echo -e "\n\n${ORANGE}CBIS-15479 (20) / CBIS-15712 (19A) - hanging known_hosts entries${NC}"
992 else
993 echo -e "${GREEN}no leftover hosts fingerprints are found under /home/stack/.ssh/known_hosts${NC}"
994 fi
995 elapsed_time_seconds=$(expr $(date +%s) - $start)
996
997
998 ####################################################################################################
999
1000
1001 start=$(date +%s)
1002 STEPS_COUNTER=$((STEPS_COUNTER+1))
1003 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE REMOVED CONTROLLERS RESIDUE IN FILES UNDER /var/lib/config-data/puppet-generated/ AND /etc/ (+$elapsed_time_seconds `date '+%T'`)${NC}"
1004 source ~/stackrc
1005 cloud_name=$(cat user_config.yaml | grep cloud_name | awk '{print $2}' | tr -d \')
1006 controllers=$(openstack server list --flavor Controller -f value -c Name | sort)
1007 controllers_index=$(echo -e "$controllers" | awk -F- '{print $NF}' | sort)
1008 echo -e "${CYAN}${UL}controllers found:${NC}\n${CYAN}$controllers${NC}\n"
1009 missing_index=$(echo -e "$controllers_index" | awk '{ for (i = prev + 0; i < $1; i++) {print i} } { prev = $1 + 1 }')
1010 if [[ $missing_index ]]
1011 then
1012 controller_full_name=$(openstack server list --flavor Controller -f value -c Name | tr -d [0-9] | uniq)
1013 for index in $missing_index
1014 do
1015 removed_controller_leftovers=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "grep -R -w $controller_full_name$index /var/lib/config-data/puppet-generated/ /etc/ | grep -E -v '.conf.[0-9]|.conf.bck|.cfg.[0-9]'" | grep -v -E 'No such file or directory|rc=[1-9]' | grep ^/ -B 1)
1016 if [[ $removed_controller_leftovers ]]
1017 then
1018 echo -e "\n${RED}found $controller_full_name$index entries within one or more files under /var/lib/config-data/puppet-generated/ or /etc/${NC}"
1019 echo -e "\n${RED}$removed_controller_leftovers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1020 echo -e "\n\n${ORANGE}CBIS-16370 (19A) - the old replaced controllers are still presented in /etc/ssh/ssh_known_hosts on the existing controllers${NC}"
1021 else
1022 echo -e "${GREEN}$controller_full_name$index residue is not found under /var/lib/config-data/puppet-generated/*${NC}"
1023 fi
1024 done
1025 else
1026 echo -e "${GREEN}couldn't find removed controller in the system${NC}"
1027 fi
1028 elapsed_time_seconds=$(expr $(date +%s) - $start)
1029
1030
1031 ####################################################################################################
1032
1033
1034 start=$(date +%s)
1035 STEPS_COUNTER=$((STEPS_COUNTER+1))
1036 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE REMOVED CONTROLLER RESIDUE IN /var/log/* IN THE PAST 10 MINUTES (+$elapsed_time_seconds `date '+%T'`)${NC}"
1037 source ~/stackrc
1038 cloud_name=$(cat user_config.yaml | grep cloud_name | awk '{print $2}' | tr -d \')
1039 hour1=$(date +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
1040 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
1041 controllers=$(openstack server list --flavor Controller -f value -c Name | sort)
1042 controllers_index=$(echo -e "$controllers" | awk -F- '{print $NF}' | sort)
1043 echo -e "${CYAN}${UL}controllers found:${NC}\n${CYAN}$controllers${NC}\n"
1044 missing_index=$(echo -e "$controllers_index" | awk '{ for (i = prev + 0; i < $1; i++) {print i} } { prev = $1 + 1 }')
1045 if [[ $missing_index ]]
1046 then
1047 controller_full_name=$(openstack server list --flavor Controller -f value -c Name | tr -d [0-9] | uniq)
1048 for index in $missing_index
1049 do
1050 removed_controller_leftovers=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -E -v '/var/log/filebeat/filebeat|Invoked with warn=True|/var/log/cbis/ansible/ansible.log' | grep -w $controller_full_name$index | awk -F: '{print \$1}' | uniq -c | column -t" | grep ^[1-9] -B 1)
1051 if [[ $removed_controller_leftovers ]]
1052 then
1053 echo -e "${RED}found $controller_full_name$index entries within one or more log files in the past 10 minutes${NC}"
1054 echo -e "\n\n${RED}$removed_controller_leftovers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1055 else
1056 echo -e "${GREEN}$controller_full_name$index residue is not found under /var/log/*${NC}"
1057 fi
1058 done
1059 else
1060 echo -e "${GREEN}couldn't find removed controller in the system${NC}"
1061 fi
1062 elapsed_time_seconds=$(expr $(date +%s) - $start)
1063
1064
1065 ####################################################################################################
1066
1067
1068 start=$(date +%s)
1069 STEPS_COUNTER=$((STEPS_COUNTER+1))
1070 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK INSTANCES STATE (OPENSTACK) (+$elapsed_time_seconds `date '+%T'`)${NC}"
1071 vms=$(source ~/overcloudrc && openstack server list --all --long -f value | grep -w -v 'ACTIVE None Running')
1072 if [[ $vms ]]
1073 then
1074 echo -e "${RED}$vms${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1075 else
1076 echo -e "${GREEN}all instances are in active/running state${NC}"
1077 fi
1078 elapsed_time_seconds=$(expr $(date +%s) - $start)
1079
1080
1081 ####################################################################################################
1082
1083
1084 start=$(date +%s)
1085 STEPS_COUNTER=$((STEPS_COUNTER+1))
1086 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK INSTANCES STATE (VIRSH) (+$elapsed_time_seconds `date '+%T'`)${NC}"
1087 virsh_instances=$(ansible compute -b -m shell -a "virsh list --all")
1088 inactive_virsh_instances=$(echo -e "$virsh_instances" | grep -v running | grep instance- -B 3)
1089 running_virsh_instances=$(echo -e "$virsh_instances" | grep running)
1090 if [[ $inactive_virsh_instances ]]
1091 then
1092 echo -e "${RED}$inactive_virsh_instances${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1093 elif [[ -z $inactive_virsh_instances && -z $running_virsh_instances ]]
1094 then
1095 echo -e "${ORANGE}no instances are found on the system!${NC}"
1096 else
1097 echo -e "${GREEN}all instances are in running state${NC}"
1098 fi
1099 elapsed_time_seconds=$(expr $(date +%s) - $start)
1100
1101
1102 ####################################################################################################
1103
1104
1105 start=$(date +%s)
1106 STEPS_COUNTER=$((STEPS_COUNTER+1))
1107 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE RUNNING OPENSTACK INSTANCES AND RUNNING VIRSH INSTANCES COUNT IS IDENTICAL (+$elapsed_time_seconds `date '+%T'`)${NC}"
1108 running_virsh_instances=$(echo -e "$virsh_instances" | grep -c running)
1109 if [[ $nova_instances_count != $running_virsh_instances ]]
1110 then
1111 echo -e "${RED}$running openstack instances ($nova_instances_count), virsh running instances ($running_virsh_instances)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1112 else
1113 echo -e "${GREEN}running openstack instances ($nova_instances_count), virsh running instances ($running_virsh_instances)${NC}"
1114 fi
1115 elapsed_time_seconds=$(expr $(date +%s) - $start)
1116
1117
1118 ####################################################################################################
1119
1120
1121 start=$(date +%s)
1122 STEPS_COUNTER=$((STEPS_COUNTER+1))
1123 echo -e "${BLUE}\n\n$STEPS_COUNTER) INSTANCES CONNECTIVITY VALIDATION (VIA NETWORK NAMESPACE) (+$elapsed_time_seconds `date '+%T'`)${NC}"
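### each tenant network is probed from inside its qdhcp-<network_id> namespace on the controller:
### nping --tcp-connect to port 22 (IPv4/IPv6) on current versions, fping/fping6 on 18.0.0.1 and
### 19.0.0.1; avrs networks are skipped since they are not served by the neutron dhcp namespace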
1124 if [[ $nuage != "true" ]]
1125 then
1126 source ~/overcloudrc
1127 instances=$(openstack server list --all -f value | wc -l)
1128 if [[ $instances != "0" ]]
1129 then
1130 if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
1131 then
1132 networks=$(openstack server list --all --long -c Networks -f value | grep -E -i -v ':|avrs' | awk -F= '{print $1}' | sort -u)
1133 if [[ $networks ]]
1134 then
1135 for network in $networks
1136 do
1137 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
1138 if [[ -z $addresses ]]
1139 then
1140 echo -e "${RED}addresses variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1141 fi
1142 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
1143 if [[ -z $network_id ]]
1144 then
1145 echo -e "${RED}network_id variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1146 fi
1147 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id nping -4 --tcp-connect -p 22 -c 3 $addresses\"${NC}"
1148 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id nping -4 --tcp-connect -p 22 -c 3 $addresses" | grep 'Failed: [1-9]')
1149 if [[ -z $result ]]
1150 then
1151 echo -e "${GREEN}network $network addresses replied successfully${NC}"
1152 else
1153 echo -e "${RED}$result${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1154 fi
1155 done
1156 fi
1157 networks=$(openstack server list --all --long -c Networks -f value | grep -i -v avrs | grep : | awk -F= '{print $1}' | sort -u)
1158 if [[ $networks ]]
1159 then
1160 for network in $networks
1161 do
1162 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
1163 if [[ -z $addresses ]]
1164 then
1165 echo -e "${RED}addresses variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1166 fi
1167 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
1168 if [[ -z $network_id ]]
1169 then
1170 echo -e "${RED}network_id variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1171 fi
1172 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id nping -6 --tcp-connect -p 22 -c 3 $addresses\"${NC}"
1173 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id nping -6 --tcp-connect -p 22 -c 3 $addresses" | grep 'Failed: [1-9]')
1174 if [[ -z $result ]]
1175 then
1176 echo -e "${GREEN}network $network addresses replied successfully${NC}"
1177 else
1178 echo -e "${RED}$result${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1179 fi
1180 done
1181 fi
1182 else
1183 networks=$(openstack server list --all --long -c Networks -f value | grep -v -i avrs)
1184 ipv4_networks=$(echo "$networks" | grep -v : | awk -F= '{print $1}' | sort -u)
1185 ipv6_networks=$(echo "$networks" | grep : | awk -F= '{print $1}' | sort -u)
1186 if [[ $ipv4_networks ]]
1187 then
1188 for network in $ipv4_networks
1189 do
1190 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
1191 if [[ -z $addresses ]]
1192 then
1193 echo -e "${RED}addresses variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1194 fi
1195 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
1196 if [[ -z $network_id ]]
1197 then
1198 echo -e "${RED}network_id variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1199 fi
1200 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id fping $addresses'\"${NC}"
1201 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id fping $addresses" | grep ^[0-9] | grep -v 'is alive')
1202 if [[ $result ]]
1203 then
1204 echo -e "\n${RED}\n$result${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1205 else
1206 echo -e "${GREEN}all the addresses of network $network replied successfully${NC}"
1207 fi
1208 done
1209 fi
1210 if [[ $ipv6_networks ]]
1211 then
1212 for network in $ipv6_networks
1213 do
1214 addresses=$(openstack server list --all --long -c Networks -f value | awk '{OFS=RS;$1=$1}1' | grep -w $network | awk -F= '{print $2}' | tr -d ',;' | paste -sd " ")
1215 if [[ -z $addresses ]]
1216 then
1217 echo -e "${RED}addresses variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1218 fi
1219 network_id=$(openstack network list -f value | grep -w $network | awk '{print $1}')
1220 if [[ -z $network_id ]]
1221 then
1222 echo -e "${RED}network_id variable didn't return any value${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1223 fi
1224 echo -e "${CYAN}ansible $last_index_controller -b -m shell -a \"ip netns exec qdhcp-$network_id fping6 $addresses$\"${NC}"
1225 result=$(ansible $last_index_controller -b -m shell -a "ip netns exec qdhcp-$network_id fping6 $addresses" | grep ^[0-9] | grep -v 'is alive')
1226 if [[ $result ]]
1227 then
1228 echo -e "\n${RED}\n$result${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1229 else
1230 echo -e "${GREEN}all the addresses of network $network replied successfully${NC}"
1231 fi
1232 done
1233 fi
1234 fi
1235 else
1236 echo -e "${ORANGE}no instances are found on the system!${NC}"
1237 fi
1238 elif [[ $nuage == "true" ]]
1239 then
1240 echo -e "${ORANGE}nuage/avrs instance aren't using the neutron dhcp namespace and therefore this check is irrelevant for nuage deployment${NC}"
1241 fi
1242 elapsed_time_seconds=$(expr $(date +%s) - $start)
1243
1244
1245 ####################################################################################################
1246
1247 start=$(date +%s)
1248 STEPS_COUNTER=$((STEPS_COUNTER+1))
1249 echo -e "${BLUE}\n\n$STEPS_COUNTER) SEARCH FOR BIG GAPS BETWEEN THE SYSTEM CLOCK AND THE HARDWARE CLOCK (+$elapsed_time_seconds `date '+%T'`)${NC}"
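### hwclock and the system clock are sampled back-to-back and truncated to HH:MM; any host where
### the two values differ (a gap of a minute or more between the RTC and the kernel clock)
### produces two distinct single-count lines in the uniq -c output and is reported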
1250 clock_gap=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "hwclock | awk '{print \$5}' | sed 's/...$//' && date +%I:%M:%S | sed 's/...$//'" | uniq -c | column -t | grep -E '^1\s+[0-9][0-9]:[0-9][0-9]:[0-9]' -B 1)
1251 if [[ $clock_gap ]]
1252 then
1253 clock_gap=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "hwclock | awk '{print \$5}' && date +%I:%M:%S" | uniq -c | column -t | grep -E '^1\s+[0-9][0-9]:[0-9][0-9]:[0-9]' -B 1)
1254 echo -e "${RED}$clock_gap${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1255 else
1256 echo -e "${GREEN}no substantial gap is found between the hardware clock (real time clock - A.K.A the RTC, CMOS clock) to the system clock (A.K.A the kernel clock or software clock)${NC}"
1257 fi
1258 elapsed_time_seconds=$(expr $(date +%s) - $start)
1259
1260
1261 ####################################################################################################
1262
1263
1264 start=$(date +%s)
1265 STEPS_COUNTER=$((STEPS_COUNTER+1))
1266 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT overcloud-full.qcow2 IS NOT CORRUPTED (+$elapsed_time_seconds `date '+%T'`)${NC}"
1267 ### based on ICE-2453
1268 overcloud_image_validation=$(sudo qemu-img check /home/stack/images/overcloud-full.qcow2)
1269 overcloud_image_validation_result=$(echo -e "$overcloud_image_validation" | grep 'No errors were found on the image')
1270 if [[ $overcloud_image_validation_result ]]
1271 then
1272 echo -e "${GREEN}$overcloud_image_validation_result${NC}"
1273 else
1274 echo -e "${RED}$overcloud_image_validation${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1275 fi
1276 elapsed_time_seconds=$(expr $(date +%s) - $start)
1277
1278
1279 ####################################################################################################
1280
1281
1282 start=$(date +%s)
1283 STEPS_COUNTER=$((STEPS_COUNTER+1))
1284 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE LEFTOVERS OF REMOVED COMPUTES IN THE UNDERCLOUD AND CONTROLLERS CONF FILES (+$elapsed_time_seconds `date '+%T'`)${NC}"
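### the name of the last scaled-in compute is parsed from /var/log/cbis/remove_node.log on the
### hypervisor; if one is found, the puppet-generated config files on the undercloud and the
### controllers are grepped for references to it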
1285 scaled_in_compute=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/remove_node.log | grep 'node_names with value' | tr \" \" \"\n\"" | grep overcloud- | tr -d "u',[]" | head -n1)
1286 if [[ $scaled_in_compute ]]
1287 then
1288 compute_leftover=$(ansible localhost,controller -b -m shell -a "grep -R -i $scaled_in_compute /var/lib/config-data/puppet-generated/ 2>&1 | grep -v 'No such file or directory'" | grep -i $scaled_in_compute -B 1)
1289 if [[ $compute_leftover ]]
1290 then
1291 echo -e "${RED}$compute_leftover${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1292 else
1293 echo -e "${GREEN}couldn't find left-overs in /var/lib/config-data/puppet-generated for the scaled-in compute $scaled_in_compute${NC}"
1294 fi
1295 else
1296 echo -e "${GREEN}according to /var/log/cbis/remove_node.log on the undercloud physical server, scale-in was never performed on this system${NC}"
1297 fi
1298 elapsed_time_seconds=$(expr $(date +%s) - $start)
1299
1300
1301 ####################################################################################################
1302
1303
1304 start=$(date +%s)
1305 STEPS_COUNTER=$((STEPS_COUNTER+1))
1306 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE SEGMENTATIONS FAULT IN /var/log/messages (+$elapsed_time_seconds `date '+%T'`)${NC}"
1307 fault=$(sshpass -p $hv_cbis_admin_password ansible -k all -b -m shell -a "grep -R -i fault /var/log/message*" | grep -i signal -B 1)
1308 if [[ $fault ]]
1309 then
1310 echo -e "${RED}$fault${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1311 echo -e "\n\n${ORANGE}CBIS-16119 (20) - periodic ceph dashboard segmentation faults on the controllers ${NC}"
1312 else
1313 echo -e "${GREEN}no segmentation faults found in /var/log/messages${NC}"
1314 fi
1315 elapsed_time_seconds=$(expr $(date +%s) - $start)
1316
1317
1318 ####################################################################################################
1319
1320
1321 start=$(date +%s)
1322 STEPS_COUNTER=$((STEPS_COUNTER+1))
1323 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE SOFT/HARD LOCKUPS IN /var/log/messages (+$elapsed_time_seconds `date '+%T'`)${NC}"
1324 soft_lockup=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "grep -i -E 'soft lockup|hard lockup' /var/log/messages | grep -v ansible-command" | grep -E -i 'soft lockup|hard lockup' -B 1)
1325 if [[ $soft_lockup ]]
1326 then
1327 echo -e "${RED}$soft_lockup${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1328 else
1329 echo -e "${GREEN}no soft/hard lockups found in /var/log/messages${NC}"
1330 fi
1331 elapsed_time_seconds=$(expr $(date +%s) - $start)
1332
1333
1334 ####################################################################################################
1335
1336
1337 start=$(date +%s)
1338 STEPS_COUNTER=$((STEPS_COUNTER+1))
1339 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE Traceback LOG LINES (case-insensitive) WITHIN /var/log/ UP TO 10 MINUTES EARLIER (+$elapsed_time_seconds `date '+%T'`)${NC}"
1340 hour1=$(date +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
1341 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
1342 traceback=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -i 'Traceback' | grep -E -v 'ansible-command: Invoked with warn|ansible.log|filebeat' | awk -F: '{print \$1}' | sort | uniq -c | column -t" | grep ^[1-9] -B 1)
1343 if [[ $traceback ]]
1344 then
1345 echo -e "${RED}$traceback${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1346 echo -e "\n\n${ORANGE}CBIS-13698 (19) / CBIS-16371 (19A) - every 10 minutes aodh exception in /var/log/vitrage/collector.log${NC}"
1347 else
1348 echo -e "${GREEN}no traceback logs lines found under /var/log/${NC}"
1349 fi
1350 elapsed_time_seconds=$(expr $(date +%s) - $start)
1351
1352
1353 ####################################################################################################
1354
1355
1356 start=$(date +%s)
1357 STEPS_COUNTER=$((STEPS_COUNTER+1))
1358 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE error, err_code, failure or fatal LOG LINES (case-insensitive) WITHIN /var/log/ UP TO 10 MINUTES EARLIER (+$elapsed_time_seconds `date '+%T'`)${NC}"
1359 hour1=$(date -d "-0 hour" +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
1360 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
1361 errors=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -E -i 'failure|error|err_code|fatal' | grep -E -v 'ansible-command: Invoked with warn|ansible.log|filebeat|, 0 errors,' | awk -F: '{print \$1}' | sort | uniq -c | column -t" | grep ^[1-9] -B 1)
1362 if [[ $errors ]]
1363 then
1364 echo -e "${RED}$errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1365 echo -e "\n\n${ORANGE}CBIS-16403 (19A) - barbican_wsgi_main_error.log continually reports 'access to /var/www/cgi-bin/barbican/main failed'${NC}"
1366 echo -e "${ORANGE}CBIS-16655 (20) - /var/log/rhsm/rhsm.log in the UC throws Certificate update using daemon failed error${NC}"
1367 else
1368 echo -e "${GREEN}no failure, error, err_code or fatal log lines were found under /var/log/${NC}"
1369 fi
1370 elapsed_time_seconds=$(expr $(date +%s) - $start)
1371
1372
1373 ####################################################################################################
1374
1375
1376 start=$(date +%s)
1377 STEPS_COUNTER=$((STEPS_COUNTER+1))
1378 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE Permission denied LOG LINES (case-insensitive) WITHIN /var/log/ UP TO 10 MINUTES EARLIER (+$elapsed_time_seconds `date '+%T'`)${NC}"
1379 hour1=$(date -d "-0 hour" +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
1380 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
1381 errors=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -E -i 'Permission denied' | grep -E -v 'ansible-command: Invoked with warn|ansible.log|filebeat' | awk -F: '{print \$1}' | sort | uniq -c | column -t" | grep ^[1-9] -B 1)
1382 if [[ $errors ]]
1383 then
1384 echo -e "${RED}$errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1385 else
1386 echo -e "${GREEN}no permission denied log lines were found under /var/log/${NC}"
1387 fi
1388 elapsed_time_seconds=$(expr $(date +%s) - $start)
1389
1390
1391 ####################################################################################################
1392
1393
1394 start=$(date +%s)
1395 STEPS_COUNTER=$((STEPS_COUNTER+1))
1396 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR ERRORS IN THE NOVA LOCAL STORAGE SERVICE cbis_local_storage_filesystem_remount${NC}"
1397 # local_enable_check=$(cat user_config.yaml | grep 'enable_local_storage: true')
1398 for host in $aggregate_hosts
1399 do
1400 user_config_json=$(cat /home/stack/templates/user_config.json | jq .CBIS[].$host)
1401 enable_local_storage=$(echo -e "$user_config_json" | grep '"enable_local_storage": true,')
1402 if [[ $enable_local_storage ]]
1403 then
1404 local_storage_devices=$(echo -e "$user_config_json" | grep local_storage_devices -A 1 | awk NR==2 | tr -d \" | column -t)
1405 echo -e "${CYAN}local storage is enabled on the $host host-group on disk $local_storage_devices${NC}"
1406 local_storage_enabled=true
1407 fi
1408 done
1409 if [[ $local_storage_enabled = "true" ]]
1410 then
1411 local_storage_nodes=$(ansible compute -b -m shell -a "systemctl --all | grep cbis_local_storage_filesystem_remount.service" | grep SUCCESS | awk '{print $1}' | paste -sd ,)
1412 if [[ $local_storage_nodes ]]
1413 then
1414 local_storage=$(ansible $local_storage_nodes -b -m shell -a "journalctl -axu cbis_local_storage_filesystem_remount.service | grep -E -i 'Error|Failed|cannot open|cannot stat'" | grep -E -v 'non-zero return code|FAILED')
1415 if [[ $local_storage ]]
1416 then
1417 echo -e "${RED}$local_storage${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1418 else
1419 echo -e "${GREEN}no errors found in journalctl -u cbis_local_storage_filesystem_remount.service${NC}"
1420 fi
1421 else
1422 echo -e "${RED}couldn't find the systemd service cbis_local_storage_filesystem_remount on the computes which has local storage enabled\nconnect them computes and check /var/log/cbis/cbis.pre_deploy for local storage configuration exception(s)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1423 fi
1424 else
1425 echo -e "${GREEN}nova local storage is not enabled for any host-group (according to the user_config.yaml)${NC}"
1426 fi
1427
1428
1429 ####################################################################################################
1430
1431
1432 start=$(date +%s)
1433 STEPS_COUNTER=$((STEPS_COUNTER+1))
1434 echo -e "${BLUE}\n\n$STEPS_COUNTER) SEARCH FOR DUPLICATED LINES IN haproxy.cfg WITHIN THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1435 DUPLICATED_HAPROXY_LINES=$(ansible controller -m shell -b -a "cat /var/lib/config-data/puppet-generated/haproxy/etc/haproxy/haproxy.cfg | grep -E '^\s+server' | sort | uniq -c" | grep -E '^\s+[2-9]' -B 1)
1436 if [[ $DUPLICATED_HAPROXY_LINES ]]
1437 then
1438 echo -e "${RED}$DUPLICATED_HAPROXY_LINES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1439 else
1440 echo -e "${GREEN}no duplicated lines found in haproxy.cfg within the controllers${NC}"
1441 fi
1442 elapsed_time_seconds=$(expr $(date +%s) - $start)
1443
1444
1445 ####################################################################################################
1446
1447
1448 start=$(date +%s)
1449 STEPS_COUNTER=$((STEPS_COUNTER+1))
1450 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE OVERCLOUD HEAT STACK(S) STATUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1451 stack_status=$(source ~/overcloudrc && openstack stack list)
1452 if [[ $stack_status ]]
1453 then
1454 stack_status=$(source ~/overcloudrc && openstack stack list -f value | grep -v COMPLETE)
1455 if [[ -z $stack_status ]]
1456 then
1457 echo -e "${GREEN}all the heat stacks are in COMPLETED status${NC}"
1458 else
1459 stack_status=$(source ~/overcloudrc && openstack stack list)
1460 echo -e "${RED}$stack_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1461 fi
1462 else
1463 echo -e "${ORANGE}no heat stacks found in the overcloud${NC}"
1464 fi
1465 elapsed_time_seconds=$(expr $(date +%s) - $start)
1466
1467
1468 ####################################################################################################
1469
1470
1471 start=$(date +%s)
1472 STEPS_COUNTER=$((STEPS_COUNTER+1))
1473 echo -e "${BLUE}\n\n$STEPS_COUNTER) PACEMAKER STATUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1474 pcs_status=$(ansible $last_index_controller -b -m shell -a "pcs resource" | grep -E -i 'DISABLED|Stopped|Stopping|unmanaged|FAILED|blocked|OFFLINE|promote|Recover|Starting|error|Monitoring' | grep -v $last_index_controller)
1475 if [[ -z $pcs_status ]]
1476 then
1477 echo -e "${GREEN}no DISABLED, Stopped, Stopping, unmanaged, FAILED, blocked, OFFLINE, promote, Recover, Starting, error or Monitoring keywords found in pcs status${NC}"
1478 else
1479 echo -e "${RED}$pcs_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1480 fi
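### on top of the keyword scan above, the galera and redis bundles are verified explicitly:
### galera is expected to report 3 masters (one per controller) and redis exactly 1 master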
1481 if [[ $cbis_version != "18.0.0.1" ]]
1482 then
1483 galera_masters_count=$(ansible $last_index_controller -b -m shell -a "pcs resource show galera-bundle | grep masters= | awk '{print \$3}' | awk -F= '{print \$2}'" | grep ^[0-9])
1484 if [[ $galera_masters_count == "3" ]]
1485 then
1486 echo -e "${GREEN}all the 3 galera members are showing master state as expected${NC}"
1487 else
1488 echo -e "${RED}one or more galera members are not in master state${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1489 fi
1490 redis_masters_count=$(ansible $last_index_controller -b -m shell -a "pcs resource show redis-bundle | grep masters= | awk '{print \$3}' | awk -F= '{print \$2}'" | grep ^[0-9])
1491 if [[ $redis_masters_count == "1" ]]
1492 then
1493 echo -e "${GREEN}1 redis member is showing master state as expected${NC}"
1494 else
1495 echo -e "${RED}expecting 1 redis master but got $redis_masters_count masters${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1496 fi
1497 fi
1498 if [[ $cbis_version == "18.0.0.1" ]]
1499 then
1500 galera_masters_count=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep 'galera-master' -A1 | grep Masters: | awk '{OFS=RS;\$1=\$1}1'" | grep -c overcloud-controller)
1501 if [[ $galera_masters_count == "3" ]]
1502 then
1503 echo -e "${GREEN}all the 3 galera members are showing master state as expected${NC}"
1504 else
1505 echo -e "${RED}one or more galera members are not in master state${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1506 fi
1507 redis_masters_count=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep 'redis-master' -A1 | grep Masters: | awk '{OFS=RS;\$1=\$1}1'" | grep -c overcloud-controller)
1508 if [[ $redis_masters_count == "1" ]]
1509 then
1510 echo -e "${GREEN}1 redis member is showing master state as expected${NC}"
1511 else
1512 echo -e "${RED}expecting 1 redis master but got $redis_masters_count masters${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1513 fi
1514 fi
1515 elapsed_time_seconds=$(expr $(date +%s) - $start)
1516
1517
1518 ####################################################################################################
1519
1520
1521 start=$(date +%s)
1522 STEPS_COUNTER=$((STEPS_COUNTER+1))
1523 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DISABLED RESOURCES IN PACEMAKER CONSTRAINTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1524 pcs_disabled_constraints=$(ansible $last_index_controller -b -m shell -a "pcs constraint" | grep -i Disabled -B 1)
1525 if [[ $pcs_disabled_constraints ]]
1526 then
1527 echo -e "${RED}$pcs_disabled_constraints${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1528 else
1529 echo -e "${GREEN}no disabled pacemaker resources found${NC}"
1530 fi
1531
1532
1533 ####################################################################################################
1534
1535
1536 start=$(date +%s)
1537 STEPS_COUNTER=$((STEPS_COUNTER+1))
1538 echo -e "${BLUE}\n\n$STEPS_COUNTER) PACEMAKER RESOURCES FAILED ACTIONS HISTORY (+$elapsed_time_seconds `date '+%T'`)${NC}"
1539 pcs_failed_actions=$(ansible $last_index_controller -b -m shell -a "pcs status | awk '/Failed Actions:/,/Daemon Status:/' | grep -v 'Daemon Status:'" | grep -v -E 'FAILED|non-zero return code|SUCCESS')
1540 if [[ $pcs_failed_actions ]]
1541 then
1542 echo -e "${RED}$pcs_failed_actions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1543 else
1544 echo -e "${GREEN}couldn't find failed actions in pcs status${NC}"
1545 fi
1546
1547
1548 ####################################################################################################
1549
1550
1551 start=$(date +%s)
1552 STEPS_COUNTER=$((STEPS_COUNTER+1))
1553 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT SELINUX IS ENABLED AND ENFORCING (ALSO CHECKS IF SECURITY HARDENING IS ENABLED) (+$elapsed_time_seconds `date '+%T'`)${NC}"
1554 selinux=$(ansible all --limit '!hypervisor' -b -m shell -a "sestatus | grep -E 'SELinux status:|Current mode:'" | grep -Ev 'SELinux status:\s+enabled|Current mode:\s+enforcing|SUCCESS')
1555 if [[ $selinux ]]
1556 then
1557 selinux=$(ansible all --limit '!hypervisor' -b -m shell -a "sestatus | grep -E 'SELinux status:\s+disabled|Current mode:\s+permissive'" | grep -E -v 'FAILED|non-zero return code')
1558 echo -e "${RED}selinux permissive mode is usually a case of not applying security hardening on the host(s)${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1559 echo -e "${RED}$selinux${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1560 else
1561 echo -e "${GREEN}selinux is enabled and enforcing on all the hosts${NC}"
1562 fi
1563 elapsed_time_seconds=$(expr $(date +%s) - $start)
1564
1565
1566 ####################################################################################################
1567
1568
1569 start=$(date +%s)
1570 STEPS_COUNTER=$((STEPS_COUNTER+1))
1571 date=$(date '+%x %T' | cut -d: -f1-2 | sed 's/.$//')
1572 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE SELINUX Permission denied ERRORS IN /var/log/audit/audit.log WITH THE TIMESTAMPS ["$date"] WITHIN THE UNDERCLOUD VM (+$elapsed_time_seconds `date '+%T'`)${NC}"
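### ausearch is limited to AVC/SELinux error record types and to entries whose interpreted
### timestamp falls inside the current 10-minute window ($date), grouping the offending
### processes (proctitle) with a count per process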
1573 ausearch=$(ausearch -m AVC,USER_AVC,SELINUX_ERR,USER_SELINUX_ERR -i 2>&1 | grep 'Permission denied' -B 1 | grep "$date" | awk -F'proctitle=' '{print $2}' | column | sort | uniq -c)
1574 if [[ $ausearch ]]
1575 then
1576 echo -e "${RED}$ausearch${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1577 else
1578 echo -e "${GREEN}no Permission denied errors found in the audit logs${NC}"
1579 fi
1580 elapsed_time_seconds=$(expr $(date +%s) - $start)
1581
1582
1583 ####################################################################################################
1584
1585
1586 start=$(date +%s)
1587 STEPS_COUNTER=$((STEPS_COUNTER+1))
1588 echo -e "${BLUE}\n\n$STEPS_COUNTER) DNS RESOLUTION VALIDATION (+$elapsed_time_seconds `date '+%T'`)${NC}"
1589 dns=$(python /usr/share/cbis/undercloud/tools/dns_validation.py)
1590 if [[ -z $dns ]]
1591 then
1592 echo -e "${GREEN}DNS resolution succeeded${NC}"
1593 else
1594 echo -e "${RED}DNS resolution failed${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1595 echo -e "${RED}$dns${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1596 fi
1597 elapsed_time_seconds=$(expr $(date +%s) - $start)
1598
1599
1600 ####################################################################################################
1601
1602
1603 start=$(date +%s)
1604 STEPS_COUNTER=$((STEPS_COUNTER+1))
1605 echo -e "${BLUE}\n\n$STEPS_COUNTER) NTP (timedatectl) SYNCHRONIZATION CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
1606 timedatectl=$(ansible all --limit '!hypervisor' -b -m shell -a "timedatectl" | grep 'NTP synchronized: no' -B 6 | grep SUCCESS | awk '{print $1}')
1607 if [[ $timedatectl ]]
1608 then
1609 echo -e "${RED}timedatectl returned \"NTP synchronized: no\" for the following hosts:\n$timedatectl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1610 else
1611 echo -e "${GREEN}all hosts are synchronized (timedatectl)${NC}"
1612 fi
1613 elapsed_time_seconds=$(expr $(date +%s) - $start)
1614
1615
1616 ####################################################################################################
1617
1618
1619 start=$(date +%s)
1620 STEPS_COUNTER=$((STEPS_COUNTER+1))
1621 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK BOND INTERFACES STATE ON EACH OVERCLOUD HOST (+$elapsed_time_seconds `date '+%T'`)${NC}"
1622 echo -e "${CYAN}validating all the openvswitch bonds (ovs-appctl bond/list)${NC}"
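### every ovs bond is listed and its members' states are collected; any member state other than
### "enabled" is treated as a failure (linux bonds are checked separately below via
### /proc/net/bonding, where every member must report "MII Status: up")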
1623 if [[ $nuage != "true" ]]
1624 then
1625 bond_status=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl bond/list | grep - | awk '{print \$1}' | xargs -i ovs-appctl bond/show {}" | grep ^slave | awk '{print $3}' | sort -u)
1626 if [[ $bond_status != "enabled" ]]
1627 then
1628 bond_status=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl bond/list | grep - | awk '{print \$1}' | xargs -i ovs-appctl bond/show {} | grep ^slave")
1629 echo -e "${RED}$bond_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1630 else
1631 echo -e "${GREEN}all openvswitch bond interfaces returned enabled${NC}"
1632 fi
1633 else
1634 echo -e "${ORANGE}openvswitch bond interfaces are invalid when the setup is deployed with nuage${NC}"
1635 fi
1636 echo -e "${CYAN}validating all the linux bonds (cat /proc/net/bonding/..)${NC}"
1637 bond_status=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ls /proc/net/bonding/ | grep -v bond0 | xargs -i cat /proc/net/bonding/{}" | grep 'MII Status:' | awk '{print $3}' | sort -u)
1638 if [[ $bond_status != "up" ]]
1639 then
1640 bond_status=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ls /proc/net/bonding/ | grep -v bond0 | xargs -i cat /proc/net/bonding/{}" | grep 'MII Status:')
1641 echo -e "${RED}$bond_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1642 else
1643 echo -e "${GREEN}all openvswitch bond interfaces returned enabled${NC}"
1644 fi
1645 elapsed_time_seconds=$(expr $(date +%s) - $start)
1646
1647
1648 ####################################################################################################
1649
1650
1651 start=$(date +%s)
1652 STEPS_COUNTER=$((STEPS_COUNTER+1))
1653 echo -e "${BLUE}\n\n$STEPS_COUNTER) CEPH HEALTH CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
1654 if [[ $ceph_backend == "true" ]]
1655 then
1656 ceph_health_detail=$(ansible $last_index_controller -b -m shell -a "ceph health detail" | grep -v $last_index_controller)
1657 if [[ $ceph_health_detail == "HEALTH_OK" ]]
1658 then
1659 echo -e "${GREEN}ceph health is ok${NC}"
1660 else
1661 ceph_status=$(ansible $last_index_controller -b -m shell -a "ceph -s" | grep -v $last_index_controller)
1662 echo -e "${RED}$ceph_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1663 fi
1664 else
1665 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
1666 fi
1667 elapsed_time_seconds=$(expr $(date +%s) - $start)
1668
1669
1670 ####################################################################################################
1671
1672
1673 start=$(date +%s)
1674 STEPS_COUNTER=$((STEPS_COUNTER+1))
1675 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK CONNECTVITY TO ALL THE INFRA ADDRESSES (+$elapsed_time_seconds `date '+%T'`)${NC}"
1676 if [[ $internal_api_controller_address ]]
1677 then
1678 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $internal_api_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1679 if [[ -z $ping ]]
1680 then
1681 echo -e "${GREEN}$internal_api_controller_address is reachable from all the overcloud servers${NC}"
1682 else
1683 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $internal_api_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1684 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1685 fi
1686 else
1687 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1688 fi
1689 if [[ $tenant_controller_address ]]
1690 then
1691 if [[ $ceph_backend == "true" ]]
1692 then
1693 ping=$(ansible all --limit '!hypervisor,!localhost,!CephStorage' -b -m shell -a "ping -c 2 $tenant_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1694 if [[ -z $ping ]]
1695 then
1696 echo -e "${GREEN}$tenant_controller_address is reachable from all the overcloud servers${NC}"
1697 else
1698 ping=$(ansible all --limit '!hypervisor,!localhost,!CephStorage' -b -m shell -a "ping -c 2 $tenant_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1699 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1700 fi
1701 else
1702 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $tenant_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1703 if [[ -z $ping ]]
1704 then
1705 echo -e "${GREEN}$tenant_controller_address is reachable from all the overcloud servers${NC}"
1706 else
1707 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $tenant_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1708 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1709 fi
1710 fi
1711 else
1712 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1713 fi
1714 if [[ $storage_controller_address ]]
1715 then
1716 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $storage_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1717 if [[ -z $ping ]]
1718 then
1719 echo -e "${GREEN}$storage_controller_address is reachable from all the overcloud servers${NC}"
1720 else
1721 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $storage_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1722 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1723 fi
1724 else
1725 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1726 fi
1727 if [[ $storage_mgmt_controller_address ]]
1728 then
1729 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $storage_mgmt_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1730 if [[ -z $ping ]]
1731 then
1732 echo -e "${GREEN}$storage_mgmt_controller_address is reachable from all the overcloud servers${NC}"
1733 else
1734 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $storage_mgmt_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1735 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1736 fi
1737 else
1738 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1739 fi
1740 if [[ $provisioning_controller_address ]]
1741 then
1742 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $provisioning_controller_address" | grep 'ping statistics' -A1 | tail -n1 | grep -v '2 packets transmitted, 2 received')
1743 if [[ -z $ping ]]
1744 then
1745 echo -e "${GREEN}$provisioning_controller_address is reachable from all the overcloud servers${NC}"
1746 else
1747 ping=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ping -c 2 $provisioning_controller_address | grep 'ping statistics' -A1 | tail -n1" | grep -v '2 packets transmitted, 2 received' | grep ^[0-9] -B 1)
1748 echo -e "${RED}$ping${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1749 fi
1750 else
1751 echo -e "${RED}the \"address\" variable returned empty output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1752 fi
1753 elapsed_time_seconds=$(expr $(date +%s) - $start)
1754
1755
1756 ####################################################################################################
1757
1758
1759 start=$(date +%s)
1760 STEPS_COUNTER=$((STEPS_COUNTER+1))
1761 echo -e "${BLUE}\n\n$STEPS_COUNTER) KIBANA DASHBOARDS CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
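### the kibana saved_objects API is queried through the public VIP; on current versions every
### returned dashboard title must match the fixed_dashboards whitelist below, while on
### 18.0.0.1/19.0.0.1 only basic accessibility of the dashboards endpoint is verified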
1762 if [[ $elk == "true" && $elk_deployment_type == "local" ]]
1763 then
1764 if [[ $cbis_version != "18.0.0.1" && $cbis_version != "19.0.0.1" ]]
1765 then
1766 fixed_dashboards=(Cloud-ErrorsDashboard CephOverview Instance-Spawn-Fail-All-Clouds-Table-Cumulative-Sum [MetricbeatSystem]Hostoverview Openstack-Overview Requests-Dashboard Instance-Spawning-Failure Instance-Vtop Cloud-Usage ipmitoolforcloud [MetricbeatSystem]Overview Openstack-InstanceView [MetricbeatHAProxy]Backend [MetricbeatHAProxy]Frontend [MetricbeatHAProxy]HTTPbackend [MetricbeatHAProxy]HTTPfrontend [MetricbeatHAProxy]HTTPserver [MetricbeatHAProxy]Overview)
1767 dashboards=$(curl -g -s -L -X GET 'https://'$PublicURL'/api/saved_objects/?type=dashboard&' -H 'Content-Type: application/json, text/plain, */*' -H 'Authorization: Basic '$kibana_basic_auth'' -H 'Cookie: SERVERID='$last_index_controller'.internalapi.localdomain' --data '' | jq . | grep \"title\": | awk -F: '{print $2}' | tr -d '," ' | paste -sd " ")
1768 for dashboard in $dashboards
1769 do
1770 if [[ " ${fixed_dashboards[@]} " =~ "$dashboard" ]]
1771 then
1772 echo -e "${GREEN}$dashboard${NC}"
1773 else
1774 echo -e "${RED}$dashboard${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1775 fi
1776 done
1777 else
1778 dashboards=$(curl -g -s -L -X GET 'https://'$PublicURL'/api/saved_objects/?type=dashboard&' -H 'Content-Type: application/json, text/plain, */*' -H 'Authorization: Basic '$kibana_basic_auth'' -H 'Cookie: SERVERID='$last_index_controller'.internalapi.localdomain' --data '' | jq . | grep \"page\": | awk '{print $2}' | tr -d ,)
1779 if [[ $dashboards == "1" ]]
1780 then
1781 echo -e "${GREEN}kibana dashboard is accessible${NC}"
1782 else
1783 dashboards=$(curl -g -s -L -X GET 'https://'$PublicURL'/api/saved_objects/?type=dashboard&' -H 'Content-Type: application/json, text/plain, */*' -H 'Authorization: Basic '$kibana_basic_auth'' -H 'Cookie: SERVERID='$last_index_controller'.internalapi.localdomain' --data '' | jq .)
1784 echo -e "${RED}$dashboard${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1785 fi
1786 fi
1787 else
1788 echo -e "${ORANGE}CBIS is deployed without ELK or ELK type is remote${NC}"
1789 fi
1790 elapsed_time_seconds=$(expr $(date +%s) - $start)
1791
1792
1793 ####################################################################################################
1794
1795
1796 start=$(date +%s)
1797 STEPS_COUNTER=$((STEPS_COUNTER+1))
1798 echo -e "${BLUE}\n\n$STEPS_COUNTER) ZABBIX ALARMS (+$elapsed_time_seconds `date '+%T'`)${NC}"
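### trigger.get is called over the zabbix JSON-RPC API for triggers currently in problem state
### (value: 1); known benign triggers (/etc/passwd changed, host information changed, host
### restarted) and the dummy hosts are filtered out with jq before anything is reported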
1799 zabbix_problem_triggers=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
1800 -H 'Content-Type: application/json-rpc' \
1801 -H 'Cookie: SERVERID='$last_index_controller'' \
1802 --data '{
1803 "jsonrpc": "2.0",
1804 "method": "trigger.get",
1805 "params": {
1806 "output": [
1807 "description"
1808 ],
1809 "filter": {
1810 "value": 1
1811 },
1812 "sortfield": "hostname",
1813 "sortorder": "DESC"
1814 },
1815 "auth": '$zabbix_auth',
1816 "id": 1
1817 }' | jq .[] | grep -v '^[0-9]' | grep -v '^"'| jq .[] | jq 'select(.description != "/etc/passwd has been changed on {HOST.NAME}")' | jq 'select(.description != "Host information was changed on {HOST.NAME}")' | jq 'select(.description != "{HOST.NAME} has just been restarted")' | jq 'select(.hostname != "dummy")' | jq 'select(.hostname != "dummy_switch")')
1818 if [[ $zabbix_problem_triggers ]]
1819 then
1820 echo -e "${RED}$zabbix_problem_triggers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1821 echo -e "\n\n${ORANGE}CBIS-15886 - zabbix alarms still existing after scale-in/replace-controller operations (19.0)${NC}"
1822 else
1823 echo -e "${GREEN}no active alarms found in zabbix${NC}"
1824 fi
1825 zabbix_problem_triggers_count=$(echo -e "$zabbix_problem_triggers" | grep -c description)
1826 elapsed_time_seconds=$(expr $(date +%s) - $start)
1827
1828
1829 ####################################################################################################
1830
1831
1832 start=$(date +%s)
1833 STEPS_COUNTER=$((STEPS_COUNTER+1))
1834 hours="48"
1835 echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT ZABBIX EVENT PROBLEMS FROM THE PAST $hours HOURS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1836 epoch_time_from=$(date +%s -d "-$hours hour")
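### problem.get returns the zabbix problem events since $epoch_time_from (i.e. the past $hours
### hours); the event names are de-duplicated and the recurring /etc/passwd notification is dropped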
1837 zabbix_history=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
1838 -H 'Content-Type: application/json-rpc' \
1839 -H 'Cookie: SERVERID='$last_index_controller'.internalapi.localdomain' \
1840 --data '{
1841 "jsonrpc": "2.0",
1842 "method": "problem.get",
1843 "params": {
1844 "output": "extend",
1845 "selectAcknowledges": "extend",
1846 "selectTags": "extend",
1847 "time_from": "'$epoch_time_from'",
1848 "selectSuppressionData": "extend"
1849 },
1850 "auth": '$zabbix_auth',
1851 "id": 1
1852 }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '",' | sed 's/^[[:space:]]\+//' | sort -u | grep -v '/etc/passwd has been changed')
1853 if [[ $zabbix_history ]]
1854 then
1855 echo -e "${RED}$zabbix_history\n\n\n${ORANGE}Please log-in to the zabbix portal and acknowledge the problems history under Monitoring > Problems > History, set the filter timestamps as required and Apply" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1856 else
1857 echo -e "${GREEN}no problem events were found in the past $hours hours${NC}"
1858 fi
1859 elapsed_time_seconds=$(expr $(date +%s) - $start)
1860
1861
1862 ####################################################################################################
1863
1864
1865 start=$(date +%s)
1866 STEPS_COUNTER=$((STEPS_COUNTER+1))
1867 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT /var/log/zabbix/metrics/last_run.status AND /var/log/zabbix/metrics/last_KPIs_run.status ARE EMPTY (+$elapsed_time_seconds `date '+%T'`)${NC}"
1868 metrics_status=$(ansible controller -b -m shell -a "du -b /var/log/zabbix/metrics/last_KPIs_run.status /var/log/zabbix/metrics/last_run.status | awk '{ if ( \$1 != 0 ) print }'" | grep -E ^[1-9] -B 1)
1869 if [[ $metrics_status ]]
1870 then
1871 echo -e "${RED}$metrics_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1872 else
1873 echo -e "${GREEN}/var/log/zabbix/metrics/last_run.status and /var/log/zabbix/metrics/last_KPIs_run.status are empty (no erros) on all the controllers${NC}"
1874 fi
1875 elapsed_time_seconds=$(expr $(date +%s) - $start)
1876
1877
1878 ####################################################################################################
1879
1880
1881 start=$(date +%s)
1882 STEPS_COUNTER=$((STEPS_COUNTER+1))
1883 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT THE ZABBIX zbx_metrics AND THE ZABBIX KPI FILES ARE CREATED ON TIME AND DOESN'T CONTAIN ERRORS (+$elapsed_time_seconds `date '+%T'`)${NC}"
1884 zabbix_server_controller=$(ansible $last_index_controller -b -m shell -a "pcs resource" | grep zabbix-server | awk '{print $NF}')
1885 echo -e "${CYAN}checking the zbx_metrics files on $zabbix_server_controller${NC}"
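### the newest zbx_metrics file (excluding *inprogress*) is located on the zabbix-server
### controller; its mtime must be no older than 15 minutes and its content must be free of
### error keywords, otherwise the exporter is considered broken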
1886 latest_zabbix_metrics_exporter_file=$(ansible $zabbix_server_controller -b -m shell -a "ls -lrt /var/log/zabbix/metrics/ | grep zbx_metrics | grep -v inprogress | tail -n1" | grep zbx_metrics)
1887 if [[ -z $latest_zabbix_metrics_exporter_file ]]
1888 then
1889 echo -e "${RED}can't find any zbx_metrics.xml files under /var/log/zabbix/metrics/ in $zabbix_server_controller${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1890 else
1891 latest_zabbix_metrics_exporter_file_name=$(echo -e "$latest_zabbix_metrics_exporter_file" | awk '{print $NF}')
1892 latest_zabbix_metrics_exporter_file_date=$(echo -e "$latest_zabbix_metrics_exporter_file" | awk '{print $6,$7,$8}' | xargs -i date -d '{}' +%s)
1893 current_epoch_date=$(date +%s)
1894 delta_minutes=$(expr $current_epoch_date - $latest_zabbix_metrics_exporter_file_date | xargs -i expr {} / 60)
1895 if [ $delta_minutes -gt 15 ]
1896 then
1897 echo -e "${RED}$latest_zabbix_metrics_exporter_file_name is from before $delta_minutes minutes${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1898 else
1899 errors=$(ansible $zabbix_server_controller -b -m shell -a "cat /var/log/zabbix/metrics/$latest_zabbix_metrics_exporter_file_name" | grep -E -iw 'unable|error|fatal|fail|exception|traceback|denied|warning')
1900 if [[ $errors ]]
1901 then
1902 echo -e "${RED}found errors in $latest_zabbix_metrics_exporter_file_name\n\n$errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1903 echo -e "\n\n${ORANGE}CBIS-15330 (19.0) - OSError: [Errno 2] No such file or directory: /var/lib/cbis/inventory${NC}"
1904 echo -e "${ORANGE}CBIS-16385 (19A) - zbx_metrics.csv error: /var/log/zabbix/services_names.txt: Permission denied${NC}"
1905
1906 else
1907 echo -e "${GREEN}$latest_zabbix_metrics_exporter_file_name was created in the expected time and doesn't contain errors${NC}"
1908 fi
1909 fi
1910 fi
1911 if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
1912 then
1913 echo -e "${CYAN}checking the KPIs files on $zabbix_server_controller${NC}"
1914 latest_zabbix_kpi_file=$(ansible $zabbix_server_controller -b -m shell -a "ls -lrt /var/log/zabbix/metrics/ | grep KPIs | grep -v inprogress | tail -n1" | grep KPIs)
1915 if [[ -z $latest_zabbix_kpi_file ]]
1916 then
1917 echo -e "${RED}can't find any KPIs files under /var/log/zabbix/metrics/ in $zabbix_server_controller${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1918 else
1919 latest_zabbix_kpi_file_name=$(echo -e "$latest_zabbix_kpi_file" | awk '{print $NF}')
1920 latest_zabbix_kpi_file_date=$(echo -e "$latest_zabbix_kpi_file" | awk '{print $6,$7,$8}' | xargs -i date -d '{}' +%s)
1921 current_epoch_date=$(date +%s)
1922 delta_hours=$(expr $current_epoch_date - $latest_zabbix_kpi_file_date | xargs -i expr {} / 60 / 60)
1923 if [ $delta_hours -gt 24 ]
1924 then
1925 echo -e "${RED}$latest_zabbix_kpi_file_name is from before $delta_hours hours${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1926 else
1927 errors=$(ansible $zabbix_server_controller -b -m shell -a "cat /var/log/zabbix/metrics/$latest_zabbix_kpi_file_name" | grep -E -iw 'unable|error|fatal|fail|exception|traceback|denied|warning')
1928 if [[ $errors ]]
1929 then
1930 echo -e "${RED}found errors in $latest_zabbix_kpi_file_name\n\n$errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1931 else
1932 echo -e "${GREEN}$latest_zabbix_kpi_file_name was created in the expected time and doesn't contain errors${NC}"
1933 fi
1934 fi
1935 fi
1936 fi
1937 elapsed_time_seconds=$(expr $(date +%s) - $start)
1938
1939
1940 ####################################################################################################
1941
1942
1943 start=$(date +%s)
1944 STEPS_COUNTER=$((STEPS_COUNTER+1))
1945 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE GLOBAL MACRO PRESENTED IN ZABBIX ARE AS CONFIGURED IN THE SYSTEM${NC}"
1946 ZABBIX_GLOBAL_MACROS_MYSQL=$(ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT * FROM zabbixdb.globalmacro\"" | column -t)
1947 ALL_CONTROLLERS_IP_ADDRESSES=$(ansible controller -b -m shell -a "ip address show" | grep -o '[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}' | sort | uniq)
1948 CONTROLLER_DNS_ADDRESSES=$(ansible $last_index_controller -b -m shell -a "cat /etc/resolv.conf" | grep -E '^nameserver\s+[0-2]' | awk '{print $NF}')
1949 ZABBIX_IP_MANAGEMENT_MACRO_ADDRESSES=$(echo -e "$ZABBIX_GLOBAL_MACROS_MYSQL" | grep IP_MANAGEMENT | awk '{print $NF}' | sort -n | uniq)
1950 ZABBIX_DNS_MACRO_ADDRESSES=$(echo -e "$ZABBIX_GLOBAL_MACROS_MYSQL" | grep DNS | awk '{print $NF}' | sort -n | uniq)
1951 for DNS in $ZABBIX_DNS_MACRO_ADDRESSES
1952 do
1953 IP_CHECK=$(echo -e "$CONTROLLER_DNS_ADDRESSES" | grep $DNS)
1954 if [[ -z $IP_CHECK ]]
1955 then
1956 echo -e "${RED}the global DNS macro $DNS is not found under /etc/resolv.conf of $last_index_controller${NC}"
1957 else
1958 echo -e "${GREEN}the global DNS macro $DNS is found under /etc/resolv.conf of $last_index_controller${NC}"
1959 fi
1960 done
1961 for IP_MANAGEMENT in $ZABBIX_IP_MANAGEMENT_MACRO_ADDRESSES
1962 do
1963 IP_CHECK=$(echo -e "$ALL_CONTROLLERS_IP_ADDRESSES" | grep $IP_MANAGEMENT)
1964 if [[ -z $IP_CHECK ]]
1965 then
1966 echo -e "${RED}the global IP_MANAGEMENT macro $IP_MANAGEMENT is not found in any of the controllers (ip a)${NC}"
1967 else
1968 echo -e "${GREEN}the global IP_MANAGEMENT macro $IP_MANAGEMENT is found in the controllers (ip a)${NC}"
1969 fi
1970 done
1971 elapsed_time_seconds=$(expr $(date +%s) - $start)
1972
1973
1974 ####################################################################################################
1975
1976
1977 start=$(date +%s)
1978 STEPS_COUNTER=$((STEPS_COUNTER+1))
1979 echo -e "${BLUE}\n\n$STEPS_COUNTER) SYSTEMD SERVICES CHECK ON THE OVERCLOUD EXCLUDING 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd|openstack-ironic|cloud-final' (+$elapsed_time_seconds `date '+%T'`)${NC}"
1980 if [[ ! -f "initial_overcloud_servers_systemctl_output.txt" ]]
1981 then
1982 ansible all --limit '!hypervisor,!localhost' -b -m shell -a "systemctl list-units --all --no-pager" > initial_overcloud_servers_systemctl_output.txt
1983 ansible all --limit '!hypervisor,!localhost' -b -m shell -a "systemctl list-units --all --no-pager > initial_systemctl_output.txt" > /dev/null
1984 fi
1985 systemd_overcloud=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "systemctl list-units --all --no-pager | grep failed | grep -E -v 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd|openstack-ironic|cloud-final'" | grep -E -v 'FAILED|non-zero return code|overcloud|localhost')
1986 if [[ -z $systemd_overcloud ]]
1987 then
1988 echo -e "${GREEN}no failed systemd services found${NC}"
1989 else
1990 systemd_overcloud=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "systemctl list-units --all --no-pager | grep failed | grep -E -v 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd|openstack-ironic|cloud-final'" | grep -E -v 'FAILED|non-zero return code')
1991 echo -e "${RED}$systemd_overcloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
1992 echo -e "\n\n${ORANGE}CBIS-14217 - httpd service on the controllers after security hardening (19.0)${NC}"
1993 fi
1994 elapsed_time_seconds=$(expr $(date +%s) - $start)
1995
1996
1997 ####################################################################################################
1998
1999
2000 start=$(date +%s)
2001 STEPS_COUNTER=$((STEPS_COUNTER+1))
2002 echo -e "${BLUE}\n\n$STEPS_COUNTER) SYSTEMD SERVICES CHECK ON THE UNDERCLOUD EXCLUDING 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd' (+$elapsed_time_seconds `date '+%T'`)${NC}"
2003 if [[ ! -f "initial_undercloud_systemctl_output.txt" ]]
2004 then
2005 sudo systemctl list-units --all --no-pager > initial_undercloud_systemctl_output.txt
2006 fi
2007 systemd_undercloud=$(sudo systemctl list-units --all --no-pager | grep failed | grep -E -v 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd')
2008 if [[ -z $systemd_undercloud ]]
2009 then
2010 echo -e "${GREEN}no failed systemd services found${NC}"
2011 else
2012 systemd_undercloud=$(sudo systemctl list-units --all --no-pager | grep failed | grep -E -v 'DHCP|NetworkManager|sysstat|epmd|kdump|ceph-disk@|sysroot.mount|driverctl|srpd')
2013 echo -e "${RED}$systemd_undercloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2014 fi
2015 elapsed_time_seconds=$(expr $(date +%s) - $start)
2016
2017
2018 ####################################################################################################
2019
2020
2021 start=$(date +%s)
2022 STEPS_COUNTER=$((STEPS_COUNTER+1))
2023 echo -e "${BLUE}\n\n$STEPS_COUNTER) DOCKER CONTAINERS STATUS CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2024 if [[ ! -f "initial_docker_containers_output.txt" ]]
2025 then
2026 ansible all --limit '!hypervisor' -b -m shell -a "docker ps" > initial_docker_containers_output.txt
2027 fi
2028 if [[ $ceph_backend == "false" ]]
2029 then
2030 failed_docker_containers=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps -a | grep -E -i 'Failed|unhealthy|restarting|starting|Exited' | grep -Fv -e 'ceilometer' -e 'gnocchi' -e 'aodh' -e 'Exited (0)' -e 'elk-curator' -e 'manila' -e 'Exited (255)'" | grep ^[0-9,a-f] -B 1)
2031 if [[ $failed_docker_containers ]]
2032 then
2033 echo -e "${RED}$failed_docker_containers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2034 else
2035 echo -e "${GREEN}no unhealthy, Exited, restarting, starting docker containers are found${NC}"
2036 fi
2037 elif [[ $ceph_backend == "true" ]]
2038 then
2039 failed_docker_containers=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps -a | grep -E -i 'Failed|unhealthy|restarting|starting|Exited' | grep -Fv -e 'Exited (0)' -e 'ceilometer' -e 'gnocchi' -e 'aodh' -e 'elk-curator' -e 'Exited (255)'" | grep ^[0-9,a-f] -B 1)
2040 if [[ $failed_docker_containers ]]
2041 then
2042 echo -e "${RED}$failed_docker_containers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2043 else
2044 echo -e "${GREEN}no unhealthy, Exited, restarting, starting docker containers are found${NC}"
2045 fi
2046 fi
2047 elapsed_time_seconds=$(expr $(date +%s) - $start)
2048
2049
2050 ####################################################################################################
2051
2052
2053 start=$(date +%s)
2054 STEPS_COUNTER=$((STEPS_COUNTER+1))
2055 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE FLAPPING (CONTAINERS THAT GOES UP AND DOWN RAPIDLY) DOCKER CONTAINERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2056 # note: this check was added after seeing, on one of the storage nodes, an OSD container that went up and down every 15 seconds without any in-between status such as restarting.
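### docker ps is sampled ~300 times at 0.1s intervals on every host and the container counts are
### de-duplicated; more than one distinct count means containers appeared/disappeared during the
### sampling window, i.e. a container is flapping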
2057 containers_changes=$(ansible all --limit '!hypervisor' -b -m shell -a "for i in {0..300}; do sleep 0.1 ; docker ps | wc -l; done > docker_containers_real_time_changes.log && cat docker_containers_real_time_changes.log | sort | uniq | wc -l" | grep ^[2-9] -B 1)
2058 if [[ $containers_changes ]]
2059 then
2060 echo -e "${RED}$containers_changes\n\nSSH to the failed server(s) and run the following to try and spot the elusive failed containers:\nwatch -d -n 0.1 \"sudo docker ps | grep -v -E 'Up [1-9] days|[1-9] hours'\"${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2061 else
2062 echo -e "${GREEN}no elusive (flapping) docker containers were found${NC}"
2063 fi
2064 elapsed_time_seconds=$(expr $(date +%s) - $start)
2065
2066
2067 ####################################################################################################
2068
2069
2070 start=$(date +%s)
2071 STEPS_COUNTER=$((STEPS_COUNTER+1))
2072 HIGH_CPU_COUNTER=0
2073 PERCENTAGE=100.0
2074 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CONSTANT $PERCENTAGE%+ CPU USAGE DOCKER CONTAINER (+$elapsed_time_seconds `date '+%T'`)${NC}"
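### containers above $PERCENTAGE% cpu are re-sampled every ~3 seconds; only containers that stay
### above the threshold for more than 5 consecutive samples are counted as failures, so short
### cpu bursts are tolerated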
2075 high_cpu_containers_ids=$(ansible all --limit '!hypervisor' -b -m shell -a "docker stats --no-stream | tr -d % | awk '{ if ( \$2 > $PERCENTAGE ) print \$0 }'" | grep ^[a-f,0-9] | awk '{print $1}' | paste -sd'|')
2076 if [[ $high_cpu_containers_ids ]]
2077 then
2078 while true
2079 do
2080 sleep 3
2081 high_cpu_containers_ids_recheck=$(ansible all --limit '!hypervisor' -b -m shell -a "docker stats --no-stream | tr -d % | awk '{ if ( \$2 > $PERCENTAGE ) print \$0 }' | grep -E '$high_cpu_containers_ids'" | grep ^[a-f,0-9] | awk '{print $1}' | paste -sd'|')
2082 if [[ $high_cpu_containers_ids_recheck ]]
2083 then
2084 HIGH_CPU_COUNTER=$((HIGH_CPU_COUNTER+1))
2085 high_cpu_containers_names=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps | grep -E '$high_cpu_containers_ids_recheck'" | grep ^[a-f,0-9] -B 1)
2086 echo -e "${ORANGE}$high_cpu_containers_names${NC}\n"
2087 else
2088 echo -e "${GREEN}couldn't find constant $PERCENTAGE%+ cpu usage docker containers${NC}"
2089 break
2090 fi
2091 if [ $HIGH_CPU_COUNTER -gt 5 ]
2092 then
2093 echo -e "${RED}$high_cpu_containers_names${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2094 break
2095 fi
2096 done
2097 else
2098 echo -e "${GREEN}couldn't find constant $PERCENTAGE%+ cpu usage docker containers${NC}"
2099 fi
2100 elapsed_time_seconds=$(expr $(date +%s) - $start)
2101
2102
2103 ####################################################################################################
2104
2105
2106 start=$(date +%s)
2107 STEPS_COUNTER=$((STEPS_COUNTER+1))
2108 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DOCKER CONTAINER WITH 90.0%+ MEMORY USAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2109 high_mem_containers_ids=$(ansible all --limit '!hypervisor' -b -m shell -a "docker stats --no-stream | tr -d % | awk '{ if ( \$8 > 90.0 ) print \$0 }'" | grep ^[a-f,0-9] | awk '{print $1}' | paste -sd'|')
2110 if [[ $high_mem_containers_ids ]]
2111 then
2112 high_mem_containers_names=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps | grep -E '$high_mem_containers_ids'" | grep ^[a-f,0-9] -B 1)
2113 echo -e "${RED}$high_mem_containers_names${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2114 else
2115 echo -e "${GREEN}didn't find docker containers with more than 90.0% memory usage${NC}"
2116 fi
2117 elapsed_time_seconds=$(expr $(date +%s) - $start)
2118
2119
2120 ####################################################################################################
2121
2122
2123 start=$(date +%s)
2124 STEPS_COUNTER=$((STEPS_COUNTER+1))
2125 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CONTAINERS WITH NON-DEFAULT MEMORY HARD-LIMIT ON THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2126 TOTAL_CONTAINERS_MEMORY=$(ansible $last_index_controller -b -m shell -a "docker info" | grep 'Total Memory:' | awk '{print $3}')
2127 CEPH_CONTAINERS=$(ansible $last_index_controller -b -m shell -a "docker ps" | grep -E 'ceph-mon|ceph-rgw|ceph-mds|ceph-mgr' | awk '{print $1}' | paste -sd'|')
2128 PROBLEMATIC_CONTAINERS=$(ansible $last_index_controller -b -m shell -a "docker stats --no-stream" | grep -v $TOTAL_CONTAINERS_MEMORY | grep -v -E $CEPH_CONTAINERS | grep ^[0-9a-f])
2129 PROBLEMATIC_CONTAINERS_ID=$(echo -e "$PROBLEMATIC_CONTAINERS" | grep ^[0-9a-f] | awk '{print $1}' | paste -sd'|')
2130 PROBLEMATIC_CONTAINERS_NAME=$(ansible $last_index_controller -b -m shell -a "docker ps | grep -E \"$PROBLEMATIC_CONTAINERS_ID\"" | grep ^[0-9a-f])
2131 if [[ $PROBLEMATIC_CONTAINERS ]]
2132 then
2133 echo -e "${RED}$PROBLEMATIC_CONTAINERS\n\n$PROBLEMATIC_CONTAINERS_NAME${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2134 echo -e "\n\n${ORANGE}CBIS-16351 (19A) - horizon container is limited to 2 gigabytes until a stack update is executed (fixed in SP4 PP4)${NC}"
2135 else
2136 echo -e "${GREEN}couldn't find containers with unexpected memory limit ("$TOTAL_CONTAINERS_MEMORY" GB)${NC}"
2137 fi
2138 elapsed_time_seconds=$(expr $(date +%s) - $start)
2139
2140
2141 ####################################################################################################
2142
2143
2144 start=$(date +%s)
2145 STEPS_COUNTER=$((STEPS_COUNTER+1))
2146 echo -e "${BLUE}\n\n$STEPS_COUNTER) OOM-KILLER CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2147 oom_killer=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -i 'Killed process|Out of memory' /var/log/dmesg /var/log/messages* | grep -v 'Invoked with warn'" | grep -E -v 'FAILED|non-zero return code')
2148 if [[ -z $oom_killer ]]
2149 then
2150 echo -e "${GREEN}no out-of-memory (OOM killer) log lines were found in /var/log/messages or /var/log/dmesg on any of the hosts${NC}"
2151 else
2152 echo -e "${RED}$oom_killer${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2153 fi
2154 elapsed_time_seconds=$(expr $(date +%s) - $start)
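### a quick manual equivalent on a single host (dmesg -T prints human-readable timestamps), assuming the
### kernel ring buffer still holds the event:
#   sudo dmesg -T | grep -i -E 'killed process|out of memory'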
2155
2156
2157 ####################################################################################################
2158
2159
2160 start=$(date +%s)
2161 STEPS_COUNTER=$((STEPS_COUNTER+1))
2162 echo -e "${BLUE}\n\n$STEPS_COUNTER) RABBITMQ CLUSTER HEALTH CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2163 if [[ $cbis_version == "18.0.0.1" ]]
2164 then
2165 rabbitmqctl=$(ansible controller -b -m shell -a "rabbitmqctl node_health_check" | grep -c 'Health check passed')
2166 if [[ $rabbitmqctl == "3" ]]
2167 then
2168 echo -e "${GREEN}rabbitmq node health check passed${NC}"
2169 else
2170 rabbitmqctl=$(ansible controller -b -m shell -a "rabbitmqctl node_health_check")
2171 echo -e "${RED}$rabbitmqctl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2172 fi
2173 elif [[ $cbis_version != "18.0.0.1" ]]
2174 then
2175 rabbitmqctl=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl node_health_check" | grep -c 'Health check passed')
2176 if [[ $rabbitmqctl == "3" ]]
2177 then
2178 echo -e "${GREEN}rabbitmq node health check passed${NC}"
2179 else
2180 rabbitmqctl=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl node_health_check")
2181 echo -e "${RED}$rabbitmqctl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2182 fi
2183 fi
2184 elapsed_time_seconds=$(expr $(date +%s) - $start)
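### to inspect rabbitmq manually on one controller, the same containerized rabbitmqctl used above can print the
### full cluster view (containerized deployments only):
#   sudo docker exec $(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl cluster_status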
2185
2186
2187 ####################################################################################################
2188
2189
2190 start=$(date +%s)
2191 STEPS_COUNTER=$((STEPS_COUNTER+1))
2192 echo -e "${BLUE}\n\n$STEPS_COUNTER) RABBITMQ CLUSTER STATUS CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2193 if [[ $cbis_version == "18.0.0.1" ]]
2194 then
2195 rabbitmqctl=$(ansible controller -b -m shell -a "rabbitmqctl cluster_status | grep running_nodes -A2 | wc -l" | grep ^[0-9] | sort -u)
2196 if [[ $rabbitmqctl == "3" ]]
2197 then
2198 echo -e "${GREEN}all 3 rabbitmq members are running${NC}"
2199 else
2200 rabbitmqctl=$(ansible controller -b -m shell -a "rabbitmqctl cluster_status")
2201 echo -e "${RED}$rabbitmqctl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2202 fi
2203 elif [[ $cbis_version != "18.0.0.1" ]]
2204 then
2205 rabbitmqctl=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl cluster_status | grep running_nodes -A2 | wc -l" | grep ^[0-9] | sort | uniq -c | column -t)
2206 if [[ $rabbitmqctl == "3 3" || $rabbitmqctl = "1 3" ]]
2207 then
2208
2209 echo -e "${GREEN}all rabbitmq member(s) are running${NC}"
2210 else
2211 rabbitmqctl=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl node_health_check")
2212 echo -e "${RED}$rabbitmqctl${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2213 fi
2214 fi
2215 elapsed_time_seconds=$(expr $(date +%s) - $start)
2216
2217
2218 ####################################################################################################
2219
2220
2221 start=$(date +%s)
2222 STEPS_COUNTER=$((STEPS_COUNTER+1))
2223 echo -e "\n\n${BLUE}$STEPS_COUNTER) VALIDATE THAT ALL THE RABBITMQ CONFIGURED USERS HAVE A RUNNING CONNECTION AGAINST RABBITMQ (+$elapsed_time_seconds `date '+%T'`)${NC}"
2224bash <<"%EOF%"
2225 NC='\033[0m'
2226 RED='\033[0;31m'
2227 GREEN='\033[32m'
2228 rabbitmq_users_raw=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_users")
2229 rabbitmq_users_sorted=$(echo -e "$rabbitmq_users_raw" | grep -v -E 'keystone|glance' | awk '{print $1}' | grep -v -E 'Listing|overcloud-controller' | sort)
2230 rabbitmq_users=$(echo -e "$rabbitmq_users_sorted" | uniq)
2231 rabbitmq_users_per_controller=$(echo -e "$rabbitmq_users_sorted" | uniq -c | awk '{print $1}' | uniq)
2232 # keystone and glance are removed from the check as they always return a false negative. keystone and glance do not use AMQP since they are single-service projects with only an API, and therefore they don't need to interact with anything by themselves (approved by Smigielski, Radoslaw)
2233 rabbitmq_connections_by_user=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_connections" | awk '{print $1}' | grep -v -E 'Listing|overcloud-controller' | sort | uniq)
2234 if [[ $rabbitmq_users == $rabbitmq_connections_by_user ]]
2235 then
2236 echo -e "${GREEN}all the rabbitmq configured users have a running connection in the rabbitmq connections list${NC}"
2237 else
2238 rabbitmq_users_to_connections_comparison=$(comm -23 <(echo $rabbitmq_users | tr ' ' '\n' | sort) <(echo $rabbitmq_connections_by_user | tr ' ' '\n' | sort))
2239 echo -e "${RED}the following rabbitmq configured users do not appear in the rabbitmq running connections list:\n${NC}"
2240 echo -e "${RED}$rabbitmq_users_to_connections_comparison${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2241 fi
2242 if [[ $rabbitmq_users_per_controller != "3" ]]
2243 then
2244 echo -e "\n\n${RED}one or more controllers are missing one or more rabbitmq users:\n${NC}"
2245 echo -e "${RED}$rabbitmq_users_raw${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2246 fi
2247%EOF%
2248 elapsed_time_seconds=$(expr $(date +%s) - $start)
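### note on the comparison above: comm -23 prints lines that appear only in the first (sorted) input,
### i.e. configured users that have no matching connection. a tiny illustration with placeholder user names:
#   comm -23 <(printf 'cinder\nneutron\nnova\n') <(printf 'cinder\nnova\n')   # prints: neutron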
2249
2250
2251 ####################################################################################################
2252
2253
2254 start=$(date +%s)
2255 STEPS_COUNTER=$((STEPS_COUNTER+1))
2256 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK RABBITMQ USERS PERMISSIONS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2257 rabbitmq_permissions_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_permissions | awk -F\. '{print \$2,\$3,\$4,\$5}' | column -t" | grep -v -w -E '* * *|rc=0')
2258 if [[ $rabbitmq_permissions_check ]]
2259 then
2260 rabbitmq_permissions_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_permissions")
2261 echo -e "${RED}$rabbitmq_permissions_check${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2262 else
2263 echo -e "${GREEN}all the rabbitmq users have full access (.* .* .*)${NC}"
2264 fi
2265 elapsed_time_seconds=$(expr $(date +%s) - $start)
2266
2267
2268 ####################################################################################################
2269
2270
2271 start=$(date +%s)
2272 STEPS_COUNTER=$((STEPS_COUNTER+1))
2273 echo -e "${BLUE}\n\n$STEPS_COUNTER) GALERA CLUSTER CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2274 if [[ $cbis_version == "18.0.0.1" ]]
2275 then
2276 galera_cluster_check=$(ansible controller -b -m shell -a "clustercheck" | grep -c 'Galera cluster node is synced')
2277 if [[ $galera_cluster_check == "3" ]]
2278 then
2279 echo -e "${GREEN}galera cluster is synced${NC}"
2280 else
2281 galera_cluster_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=galera-bundle -q) clustercheck")
2282 echo -e "${RED}$galera_cluster_check${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2283 fi
2284 elif [[ $cbis_version != "18.0.0.1" ]]
2285 then
2286 galera_cluster_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=galera-bundle -q) clustercheck" | grep -c 'Galera cluster node is synced')
2287 if [[ $galera_cluster_check == "3" ]]
2288 then
2289 echo -e "${GREEN}galera cluster is synced${NC}"
2290 else
2291 galera_cluster_check=$(ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=galera-bundle -q) clustercheck")
2292 echo -e "${RED}$galera_cluster_check${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2293 fi
2294 fi
2295 elapsed_time_seconds=$(expr $(date +%s) - $start)
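### for a manual look at the galera quorum (assumption: the mysql client inside the galera-bundle container
### with socket authentication; wsrep_cluster_size should equal the number of controllers):
#   sudo docker exec $(sudo docker ps -f name=galera-bundle -q) mysql -e "SHOW GLOBAL STATUS LIKE 'wsrep_cluster_size'"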
2296
2297
2298 ####################################################################################################
2299
2300
2301 start=$(date +%s)
2302 STEPS_COUNTER=$((STEPS_COUNTER+1))
2303 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT resume_guests_state_on_host_boot IS NOT SET TO false IN nova.conf (+$elapsed_time_seconds `date '+%T'`)${NC}"
2304 resume_vms_on_boot=$(ansible compute --limit '!localhost' -m shell -b -a "grep -E ^resume_guests_state_on_host_boot=[fF]alse /var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.conf" | grep -v -E 'non-zero return code|rc=[1-9]')
2305 if [[ $resume_vms_on_boot ]]
2306 then
2307 echo -e "${RED}$resume_vms_on_boot${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2308 else
2309 echo -e "${GREEN}resume_guests_state_on_host_boot is not set to false on any of the computes${NC}"
2310 fi
2311 elapsed_time_seconds=$(expr $(date +%s) - $start)
2312
2313
2314 ####################################################################################################
2315
2316
2317 start=$(date +%s)
2318 STEPS_COUNTER=$((STEPS_COUNTER+1))
2319 echo -e "${BLUE}\n\n$STEPS_COUNTER) NTP (ntpstat) SYNCHRONIZATION CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2320 ntpstat=$(ansible all --limit '!hypervisor' -b -m shell -a "ntpstat" | grep unsynchronised -B 1 | grep overcloud- | awk '{print $1}')
2321 if [[ $ntpstat ]]
2322 then
2323 echo -e "${RED}ntpstat returned unsynchronised for the following hosts:\n$ntpstat${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2324 else
2325 echo -e "${GREEN}all hosts are synchronized (ntpstat)${NC}"
2326 fi
2327 elapsed_time_seconds=$(expr $(date +%s) - $start)
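### ntpq gives more detail than ntpstat when a host reports unsynchronised (a '*' in the first column marks
### the peer the host is actually synced to):
#   ntpq -pn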
2328
2329
2330 ####################################################################################################
2331
2332
2333 start=$(date +%s)
2334 STEPS_COUNTER=$((STEPS_COUNTER+1))
2335 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT THE OVERCLOUD GLANCE IMAGES ARE IN active STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2336 overcloud_images=$(source ~/overcloudrc && openstack image list -f value | grep -v active | column -t)
2337 if [[ $overcloud_images ]]
2338 then
2339 echo -e "${RED}$overcloud_images${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2340 else
2341 overcloud_images=$(source ~/overcloudrc && openstack image list -f value | wc -l)
2342 if [[ $overcloud_images == "0" ]]
2343 then
2344 echo -e "${GREEN}no images found${NC}"
2345 else
2346 echo -e "${GREEN}all images are in active state${NC}"
2347 fi
2348 fi
2349 elapsed_time_seconds=$(expr $(date +%s) - $start)
2350
2351
2352 ####################################################################################################
2353
2354
2355 start=$(date +%s)
2356 STEPS_COUNTER=$((STEPS_COUNTER+1))
2357 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT THE OVERCLOUD CINDER VOLUMES ARE IN available/in-use STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2358 overcloud_volumes=$(source ~/overcloudrc && openstack volume list -f value | grep -E -v 'available|in-use' | column -t)
2359 if [[ $overcloud_volumes ]]
2360 then
2361 echo -e "${RED}$overcloud_volumes${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2362 else
2363 overcloud_volumes=$(source ~/overcloudrc && openstack volume list -f value | wc -l)
2364 if [[ $overcloud_volumes == "0" ]]
2365 then
2366 echo -e "${GREEN}no volumes found${NC}"
2367 else
2368 echo -e "${GREEN}all volumes are in available/in-use state${NC}"
2369 fi
2370 fi
2371 elapsed_time_seconds=$(expr $(date +%s) - $start)
2372
2373
2374 ####################################################################################################
2375
2376
2377 start=$(date +%s)
2378 STEPS_COUNTER=$((STEPS_COUNTER+1))
2379 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK VOLUME SERVICES CHECK ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2380 volume_services=$(source ~/overcloudrc && openstack volume service list --long -f value | grep -E -i ' XXX | DOWN ' | column -t)
2381 if [[ -z $volume_services ]]
2382 then
2383 echo -e "${GREEN}all volume services are enabled and up${NC}"
2384 else
2385 echo -e "${RED}$volume_services${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2386 echo -e "\n\n${ORANGE}CBIS-14283 (19A) - cinder services for host tripleo_ceph_volumes-fast is down (19.100.1)${NC}"
2387 echo -e "${ORANGE}CBIS-9424 (19A) - removed controllers still showing under openstack volume service list (19.100.1)${NC}"
2388 fi
2389 elapsed_time_seconds=$(expr $(date +%s) - $start)
2390
2391
2392 ####################################################################################################
2393
2394
2395 start=$(date +%s)
2396 STEPS_COUNTER=$((STEPS_COUNTER+1))
2397 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK NETWORK AGENTS LIST ON THE UNDERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2398 network_agents_undercloud=$(source ~/stackrc && openstack network agent list -f value | grep -E -i 'XXX|DOWN')
2399 if [[ -z $network_agents_undercloud ]]
2400 then
2401 echo -e "${GREEN}all network agents are alive and up${NC}"
2402 else
2403 echo -e "${RED}$network_agents_undercloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2404 echo -e "\n\n${ORANGE}CBIS-14294 - after upgrading from 19.x to 19A, there are old/unused/down undercloud services while the new services running on undercloud.localdomain${NC}"
2405 echo -e "${ORANGE}CBIS-16114,CBIS-15655,CBIS-13670 - after replace controller the network agent of the old controller still shows (19.0, 19A, 20)${NC}"
2406 fi
2407 elapsed_time_seconds=$(expr $(date +%s) - $start)
2408
2409
2410 ####################################################################################################
2411
2412
2413 start=$(date +%s)
2414 STEPS_COUNTER=$((STEPS_COUNTER+1))
2415 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK NETWORK AGENTS LIST ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2416 network_agents_overcloud=$(source ~/overcloudrc && openstack network agent list -f value | grep -E -i 'XXX|DOWN')
2417 if [[ -z $network_agents_overcloud ]]
2418 then
2419 echo -e "${GREEN}all network agents are alive and up${NC}"
2420 else
2421 echo -e "${RED}$network_agents_overcloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2422 fi
2423 elapsed_time_seconds=$(expr $(date +%s) - $start)
2424
2425
2426 ####################################################################################################
2427
2428
2429 start=$(date +%s)
2430 STEPS_COUNTER=$((STEPS_COUNTER+1))
2431 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK COMPUTE SERVICES ON THE UNDERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2432 compute_services_undercloud=$(source ~/stackrc && openstack compute service list -f value | grep -E -i 'XXX|DOWN|disabled')
2433 if [[ -z $compute_services_undercloud ]]
2434 then
2435 echo -e "${GREEN}all compute services are enabled and up${NC}"
2436 else
2437 echo -e "${RED}$compute_services_undercloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2438 fi
2439 elapsed_time_seconds=$(expr $(date +%s) - $start)
2440
2441
2442 ####################################################################################################
2443
2444
2445 start=$(date +%s)
2446 STEPS_COUNTER=$((STEPS_COUNTER+1))
2447 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK COMPUTE SERVICES ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2448 compute_services_overcloud=$(source ~/overcloudrc && openstack compute service list -f value | grep -E -i 'XXX|DOWN|disabled')
2449 if [[ -z $compute_services_overcloud ]]
2450 then
2451 echo -e "${GREEN}all compute services are enabled and up${NC}"
2452 else
2453 echo -e "${RED}$compute_services_overcloud${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2454 fi
2455 elapsed_time_seconds=$(expr $(date +%s) - $start)
2456
2457
2458 ####################################################################################################
2459
2460 start=$(date +%s)
2461 STEPS_COUNTER=$((STEPS_COUNTER+1))
2462 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK IF ANY AVAILABILITY-ZONE IS NOT IN available STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2463 AVAILABILITY_ZONES=$(source ~/overcloudrc && openstack availability zone list --long -f value)
2464 AVAILABILITY_ZONES_FAILURES=$(echo -e "$AVAILABILITY_ZONES" | grep -v available)
2465
2466 if [[ $AVAILABILITY_ZONES_FAILURES ]]
2467 then
2468 echo -e "${RED}$AVAILABILITY_ZONES_FAILURES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2469 else
2470 echo -e "${GREEN}all the availability zones are available${NC}"
2471 fi
2472 elapsed_time_seconds=$(expr $(date +%s) - $start)
2473
2474
2475 ####################################################################################################
2476
2477
2478 start=$(date +%s)
2479 STEPS_COUNTER=$((STEPS_COUNTER+1))
2480 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE LEFTOVER CONTROLLERS IN OPENSTACK ORCHESTRATION SERVICES (heat) ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2481 leftover_controllers=$(source ~/overcloudrc && openstack orchestration service list -c Hostname -c Binary -c Topic -c Status -f value | grep -w -E -v "$current_controllers_piped" | sort | uniq -c | column -t)
2482 if [[ $leftover_controllers ]]
2483 then
2484 echo -e "${RED}$leftover_controllers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2485 echo -e "\n\n${ORANGE}CBIS-16372 (19A) - leftover controllers in heat services list${NC}"
2486 else
2487 echo -e "${GREEN}no leftover controllers found${NC}"
2488 fi
2489 elapsed_time_seconds=$(expr $(date +%s) - $start)
2490
2491
2492 ####################################################################################################
2493
2494
2495 start=$(date +%s)
2496 STEPS_COUNTER=$((STEPS_COUNTER+1))
2497 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK ORCHESTRATION SERVICES (heat) ON THE OVERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2498 non_up_heat_engines=$(source ~/overcloudrc && openstack orchestration service list -c Hostname -c Binary -c Topic -c Status -f value | grep -w -E "$current_controllers_piped" | grep -v 'engine up' | sort | uniq -c)
2499 if [[ $non_up_heat_engines ]]
2500 then
2501 echo -e "${RED}$non_up_heat_engines${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2502 echo -e "\n\n${ORANGE}CBIS-16373 (19A) / CBIS-16374 (20) - several heat engine services shows down${NC}"
2503 else
2504 echo -e "${GREEN}no heat engines in down status found${NC}"
2505 fi
2506 elapsed_time_seconds=$(expr $(date +%s) - $start)
2507
2508
2509 ####################################################################################################
2510
2511
2512 start=$(date +%s)
2513 STEPS_COUNTER=$((STEPS_COUNTER+1))
2514 echo -e "${BLUE}\n\n$STEPS_COUNTER) OPENSTACK ORCHESTRATION SERVICES (heat) ON THE UNDERCLOUD (+$elapsed_time_seconds `date '+%T'`)${NC}"
2515 non_up_heat_engines=$(source ~/stackrc && openstack orchestration service list -c Hostname -c Binary -c Topic -c Status -f value | grep -v 'engine up' | sort | uniq -c)
2516 if [[ $non_up_heat_engines ]]
2517 then
2518 echo -e "${RED}$non_up_heat_engines${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2519 else
2520 echo -e "${GREEN}no heat engines in down status found${NC}"
2521 fi
2522 elapsed_time_seconds=$(expr $(date +%s) - $start)
2523
2524
2525 ####################################################################################################
2526
2527
2528 start=$(date +%s)
2529 STEPS_COUNTER=$((STEPS_COUNTER+1))
2530 echo -e "${BLUE}\n\n$STEPS_COUNTER) NEUTRON PORTS STATUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2531 ports_status=$(source ~/overcloudrc && openstack port list -c ID -c Status -f value | grep -v ACTIVE)
2532 if [[ $ports_status ]]
2533 then
2534 echo -e "${RED}$ports_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2535 else
2536 echo -e "${GREEN}the status of all the ports returned active${NC}"
2537 fi
2538 elapsed_time_seconds=$(expr $(date +%s) - $start)
2539
2540
2541
2542 ####################################################################################################
2543
2544
2545 start=$(date +%s)
2546 STEPS_COUNTER=$((STEPS_COUNTER+1))
2547 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DUPLICATED NEUTRON PORTS IN MYSQL (+$elapsed_time_seconds `date '+%T'`)${NC}"
2548 duplicated_neutron_ports=$(ansible controller -b -m shell -a "mysql -e \"select port_id from ovs_neutron.ml2_port_bindings\"" | sort | uniq -c | column -t | grep ^[4-9] | awk '{print $NF}')
2549 if [[ $duplicated_neutron_ports ]]
2550 then
2551 echo -e "${RED}the following neutron ports have duplicated entries in the ml2_port_bindings table under the ovs_neutron database in mysql:\n\n$duplicated_neutron_ports${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2552 echo -e "\n\n${ORANGE}CBIS-15400 (19A) - duplicate inactive port is created in mysql after live-migrate failure${NC}"
2553 else
2554 echo -e "${GREEN}no duplicated ports found in the ml2_port_bindings table under the ovs_neutron database in mysql${NC}"
2555 fi
2556 elapsed_time_seconds=$(expr $(date +%s) - $start)
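### the same duplication can be confirmed directly in mysql on any controller; this groups the bindings
### table by port (a sketch, assuming socket access to the local mysql instance):
#   mysql -e "SELECT port_id, COUNT(*) AS bindings FROM ovs_neutron.ml2_port_bindings GROUP BY port_id HAVING COUNT(*) > 1;"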
2557
2558
2559 ####################################################################################################
2560
2561
2562 start=$(date +%s)
2563 STEPS_COUNTER=$((STEPS_COUNTER+1))
2564 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK IF ANY COMPUTE IS MARKED AS FORCED DOWN (+$elapsed_time_seconds `date '+%T'`)${NC}"
2565 force_down=$(nova service-list | awk '{print $18}' | tr -d '\|' | column -t | sort | uniq)
2566 if [[ $force_down == "False" ]]
2567 then
2568 echo -e "${GREEN}none of the computes are marked as nova force down${NC}"
2569 else
2570 force_down=$(nova service-list)
2571 echo -e "${RED}$force_down${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2572 echo -e "\n\n${ORANGE}a compute marked as forced down = True is normally caused by the auto-evacuate process and requires human intervention to set it back to False${NC}"
2573 fi
2574 elapsed_time_seconds=$(expr $(date +%s) - $start)
2575
2576
2577 ####################################################################################################
2578
2579
2580 start=$(date +%s)
2581 STEPS_COUNTER=$((STEPS_COUNTER+1))
2582 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT /elk IS MOUNTED (+$elapsed_time_seconds `date '+%T'`)${NC}"
2583 if [[ $elk == "true" && $elk_deployment_type == "local" ]]
2584 then
2585 elk_disk=$(cat user_config.yaml | grep elk_disk | awk '{print $2}' | grep -v sda)
2586 if [[ $elk_disk ]]
2587 then
2588 elk_partition_validation=$(ansible controller -b -m shell -a "df -h | grep '/elk'")
2589 elk_results=$(echo -e "$elk_partition_validation" | grep $elk_disk | grep -c '/elk')
2590 if [[ $elk_results != "3" ]]
2591 then
2592 echo -e "${RED}the /elk partition is not found under any $elk_disk partition on one or more controllers:\n\n$elk_partition_validation${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2593 else
2594 echo -e "${GREEN}/elk partition is found on all the controllers on the expected disk $elk_disk${NC}"
2595 fi
2596 else
2597 echo -e "${GREEN}/elk is deployed on sda and therefore uses the root partition${NC}"
2598 fi
2599 else
2600 echo -e "${ORANGE}CBIS is deployed without ELK or ELK type is remote${NC}"
2601 fi
2602 elapsed_time_seconds=$(expr $(date +%s) - $start)
2603
2604
2605 ####################################################################################################
2606
2607
2608 start=$(date +%s)
2609 STEPS_COUNTER=$((STEPS_COUNTER+1))
2610 RETENTION=$(cat user_config.yaml | grep elk_keep_data | awk '{print $NF}')
2611 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THERE ARE NO ELK INDICES OLDER THAN $RETENTION DAYS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2612 RESIDUE_INDICES=$(ansible controller -m shell -b -a "find /elk/esdata/nodes/0/indices/ -maxdepth 1 -type d -daystart -mtime +$RETENTION" | grep ^/elk/esdata/nodes/0/indices/ -B 1)
2613 if [[ $RESIDUE_INDICES ]]
2614 then
2615 echo -e "${RED}$RESIDUE_INDICES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2616 else
2617 echo -e "${GREEN}there are no ELK indices under /elk (of the controllers) older than $RETENTION days${NC}"
2618 fi
2619 elapsed_time_seconds=$(expr $(date +%s) - $start)
2620
2621
2622 ####################################################################################################
2623
2624
2625 start=$(date +%s)
2626 STEPS_COUNTER=$((STEPS_COUNTER+1))
2627 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE FAULTY DISKS ATTRIBUTES USING SMARTCTL (+$elapsed_time_seconds `date '+%T'`)${NC}"
2628 disks_attributes=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep -E -w 'Erase_Fail_Count|CRC_Error_Count|Uncorrectable_Error_Cnt' | awk '{ if ( \$NF > 0 ) print \$0 }'" | grep ^[0-9] -B 1)
2629 if [[ $disks_attributes ]]
2630 then
2631 echo -e "${RED}$disks_attributes${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2632 echo -e "\n\n${ORANGE}Erase_Fail_Count = the number of times an erase operation on the flash memory failed\nUncorrectable_Error_Cnt = the number of times a data transfer within the drive had an error that could not be corrected by the ECC (error checking) algorithm\nCRC_Error_Count = the number of errors detected by the cyclic redundancy check (CRC), an error-detecting code commonly used in digital networks and storage devices to detect accidental changes to raw data${NC}"
2633
2634
2635 else
2636 echo -e "${GREEN}all tested disks attributes returned 0${NC}"
2637 fi
2638 elapsed_time_seconds=$(expr $(date +%s) - $start)
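### to re-check a single disk by hand (the device name is a placeholder; -A prints the vendor attribute table):
#   sudo smartctl -A /dev/sda | grep -E -w 'Erase_Fail_Count|CRC_Error_Count|Uncorrectable_Error_Cnt'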
2639
2640
2641 ####################################################################################################
2642
2643
2644 start=$(date +%s)
2645 STEPS_COUNTER=$((STEPS_COUNTER+1))
2646 echo -e "${BLUE}\n\n$STEPS_COUNTER) PERFORM SMARTCTL HEALTH CHECK ON THE DISKS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2647 disk_health_test=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -H /dev/{} | grep -E 'SMART overall-health self-assessment test result|SMART Health Status'")
2648 disk_health_test_filtered=$(echo -e "$disk_health_test" | grep -E -v 'OK|PASSED|rc=|non-zero return code')
2649 if [[ $disk_health_test_filtered ]]
2650 then
2651 echo -e "${RED}$disk_health_test${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2652 else
2653 echo -e "${GREEN}all tested disks health check passed${NC}"
2654 fi
2655 elapsed_time_seconds=$(expr $(date +%s) - $start)
2656
2657
2658 ####################################################################################################
2659
2660
2661 start=$(date +%s)
2662 STEPS_COUNTER=$((STEPS_COUNTER+1))
2663 echo -e "${BLUE}\n\n$STEPS_COUNTER) PERFORM SMARTCTL SHORT OFFLINE TEST ON ALL THE DISKS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2664 UNTESTED_HOSTS=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep 'Short offline'" | grep -E 'FAILED|rc=[1-9]' | awk '{print $1}' | paste -sd',')
2665 if [[ $UNTESTED_HOSTS ]]
2666 then
2667 sshpass -p $hv_cbis_admin_password ansible -k $UNTESTED_HOSTS -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -t short /dev/{}" > /dev/null
2668 IN_PROGRESS_CHECK=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep 'Short offline' | head -n 1" | grep 'in progress' -B 1)
2669 while [[ $IN_PROGRESS_CHECK ]]
2670 do
2671 sleep 10
2672 IN_PROGRESS_CHECK=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep 'Short offline' | head -n 1" | grep 'in progress' -B 1)
2673 done
2674 FAILURE_CHECK=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "lsblk -dSn | grep -E -v 'CDROM|usb' | awk '{print \$1}' | xargs -i smartctl -a /dev/{} | grep 'Short offline' | head -n 1 | grep -v -E 'Completed without error\s+00%'" | grep ^\# -B 1)
2675 if [[ $FAILURE_CHECK ]]
2676 then
2677 echo -e "${RED}$FAILURE_CHECK${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2678 else
2679 echo -e "${GREEN}the latest smartctl self test on all the physical disks returned: 'Completed without error'${NC}"
2680 fi
2681 else
2682 echo -e "${GREEN}the latest smartctl self test on all the physical disks returned: 'Completed without error'${NC}"
2683 fi
2684 elapsed_time_seconds=$(expr $(date +%s) - $start)
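### to run and review the short self-test on one disk manually (device name is a placeholder; the short
### test normally completes within a couple of minutes):
#   sudo smartctl -t short /dev/sda
#   sudo smartctl -l selftest /dev/sda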
2685
2686
2687 ####################################################################################################
2688
2689
2690 start=$(date +%s)
2691 STEPS_COUNTER=$((STEPS_COUNTER+1))
2692 SIZE=25
2693 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE PROCESSES WITH $SIZE%+ MEMORY CONSUMPTION (ps) (+$elapsed_time_seconds `date '+%T'`)${NC}"
2694 processes_memory_consumption=$(ansible all --limit '!hypervisor' -b -m shell -a "ps -eo %mem,comm,%cpu,pid --sort=-%mem | head -n 2 | column -t | grep ^[0-9] | awk '{ if ( \$1 > $SIZE ) print \$0 }'" | grep ^[0-9] -B 1)
2695 if [[ -z $processes_memory_consumption ]]
2696 then
2697 echo -e "${GREEN}no process with $SIZE%+ memory consumption was found${NC}"
2698 else
2699 echo -e "${RED}$processes_memory_consumption${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2700 fi
2701 elapsed_time_seconds=$(expr $(date +%s) - $start)
2702
2703
2704 ####################################################################################################
2705
2706
2707 start=$(date +%s)
2708 STEPS_COUNTER=$((STEPS_COUNTER+1))
2709 SIZE=90
2710 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE PROCESSES WITH $SIZE%+ CPU CONSUMPTION (top) (+$elapsed_time_seconds `date '+%T'`)${NC}"
2711 processes_cpu_consumption=$(ansible all --limit '!hypervisor' -b -m shell -a "top -b -n 1 -o %CPU | head -n 10 | grep -E '^[0-9]|^\s+[0-9]' | awk '{ if ( \$9 > $SIZE ) print \$1,\$2,\$9,\$12 }' | grep -E -v 'top|fp-rte|ovs-vswi'" | grep ^[0-9] -B 1)
2712 if [[ -z $processes_cpu_consumption ]]
2713 then
2714 echo -e "${GREEN}no process with $SIZE%+ cpu consumption found${NC}"
2715 else
2716 echo -e "${RED}$processes_cpu_consumption${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2717 fi
2718 elapsed_time_seconds=$(expr $(date +%s) - $start)
2719
2720
2721 ####################################################################################################
2722
2723
2724 start=$(date +%s)
2725 STEPS_COUNTER=$((STEPS_COUNTER+1))
2726 load_average_treshold="30.0"
2727 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE HOSTS LOAD AVERAGE AND REPORT FAILURE IF ABOVE $load_average_treshold (+$elapsed_time_seconds `date '+%T'`)${NC}"
2728 load_average=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /proc/loadavg | awk '{ if ( \$1 > $load_average_treshold ) print \$1 }'" | grep ^[0-9] -B 1)
2729 if [[ $load_average ]]
2730 then
2731 echo -e "${RED}$load_average${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2732 else
2733 echo -e "${GREEN}no host has load average greater than $load_average_treshold${NC}"
2734 fi
2735 elapsed_time_seconds=$(expr $(date +%s) - $start)
2736
2737
2738 ####################################################################################################
2739
2740
2741 start=$(date +%s)
2742 STEPS_COUNTER=$((STEPS_COUNTER+1))
2743 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK EACH HOST SWAP MEMORY USAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
2744 swap_memory=$(ansible all --limit '!hypervisor' -b -m shell -a "free | grep Swap: | awk '{ if ( \$3 > 0 ) print \$3 }'" | grep ^[0-9])
2745 if [[ $swap_memory ]]
2746 then
2747 swap_memory=$(ansible all --limit '!hypervisor' -b -m shell -a "free | grep Swap: | awk '{ if ( \$3 > 0 ) print \$3 }'" | grep ^[0-9] -B 1)
2748 echo -e "${RED}$swap_memory${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2749 else
2750 echo -e "${GREEN}none of the servers are using swap memory${NC}"
2751 fi
2752 elapsed_time_seconds=$(expr $(date +%s) - $start)
2753
2754
2755 ####################################################################################################
2756
2757
2758 start=$(date +%s)
2759 STEPS_COUNTER=$((STEPS_COUNTER+1))
2760 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT EACH SRIOV HOST HAS THE EXPECTED NUMBER OF VIRTUAL FUNCTIONS (+$elapsed_time_seconds `date '+%T'`)${NC}"
2761 if [[ $ansible_sriov_hosts ]]
2762 then
2763 expected_vfs_num=$(grep -w SriovPerformanceComputeExtraConfig /home/stack/templates/sriov-info.yaml -A 6 | grep tripleo::host::sriov::number_of_vfs: | awk '{OFS=RS;$1=$1}1' | grep -E [0-9]+ |awk -F: '{print $2}' | tr -d "\'\]\," | paste -sd+ | bc)
2764 for host in $ansible_sriov_hosts
2765 do
2766 current_vfs_num=$(ansible $host -b -m shell -a "ip link show | grep -c 'vf '" | grep ^[0-9])
2767 if [[ $expected_vfs_num == $current_vfs_num ]]
2768 then
2769 echo -e "${GREEN}expected vfs number on $host ($expected_vfs_num), current vfs number on $host ($current_vfs_num)${NC}"
2770 else
2771 echo -e "${RED}expected vfs number on $host ($expected_vfs_num), current vfs number on $host ($current_vfs_num)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2772 fi
2773 done
2774 else
2775 echo -e "${ORANGE}no sriov computes were found${NC}"
2776 fi
2777 elapsed_time_seconds=$(expr $(date +%s) - $start)
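### per-interface follow-up on an sriov compute (the interface name is a placeholder): the configured VF count
### is exposed in sysfs and should match what 'ip link show' reports for that PF:
#   cat /sys/class/net/<interface>/device/sriov_numvfs
#   ip link show <interface> | grep -c 'vf '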
2778
2779
2780 ####################################################################################################
2781
2782
2783 start=$(date +%s)
2784 STEPS_COUNTER=$((STEPS_COUNTER+1))
2785 echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY THAT THE NEGOTIATED DUPLEX VALUES OF THE PHYSICAL INTERFACES ARE AS EXPECTED (+$elapsed_time_seconds `date '+%T'`)${NC}"
2786 duplex=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {}" | grep Duplex: | awk '{print $2}' | uniq)
2787 if [[ $duplex != "Full" ]]
2788 then
2789 duplex=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {} | grep -E 'Settings for |Duplex:'")
2790 echo -e "${RED}$duplex${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2791 else
2792 echo -e "${GREEN}all physical ports negotiated duplex set to full${NC}"
2793 fi
2794 elapsed_time_seconds=$(expr $(date +%s) - $start)
2795
2796
2797 ####################################################################################################
2798
2799
2800 start=$(date +%s)
2801 STEPS_COUNTER=$((STEPS_COUNTER+1))
2802 echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY THAT ALL THE PHYSICAL INTERFACES ARE UP (+$elapsed_time_seconds `date '+%T'`)${NC}"
2803 physical_interfaces=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | wc -l" | grep ^[0-9] | sort | md5sum | awk '{print $1}')
2804 up_physical_interfaces=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'state UP mode' -c" | grep ^[0-9] | sort | md5sum | awk '{print $1}')
2805 if [[ $physical_interfaces != $up_physical_interfaces ]]
2806 then
2807 down_interfaces=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep DOWN")
2808 echo -e "${RED}$down_interfaces${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2809 else
2810 echo -e "${GREEN}no physical interfaces are down${NC}"
2811 fi
2812 elapsed_time_seconds=$(expr $(date +%s) - $start)
2813
2814
2815 ####################################################################################################
2816
2817
2818 start=$(date +%s)
2819 STEPS_COUNTER=$((STEPS_COUNTER+1))
2820 echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY THAT THE NEGOTIATED LINK SPEED VALUES OF THE PHYSICAL INTERFACES ARE AS EXPECTED (+$elapsed_time_seconds `date '+%T'`)${NC}"
2821 link_speed=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {}" | grep Speed: | awk '{print $2}' | uniq)
2822 if [[ $fixed_platform == "airframe" ]]
2823 then
2824 if [[ $link_speed != "10000Mb/s" ]]
2825 then
2826 link_speed=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {} | grep -E 'Settings for |Speed:'")
2827 echo -e "${RED}$link_speed${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2828 else
2829 echo -e "${GREEN}all physical ports negotiated speed set to 10000Mb/s${NC}"
2830 fi
2831 else
2832 if [[ $link_speed != "25000Mb/s" ]]
2833 then
2834 link_speed=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "ls -l /sys/class/net/ | grep -E \"[0-9a-z][0-9a-z]:00\.0/net|[0-9a-z][0-9a-z]:00\.1/net\" | awk -F/ '{print \$NF}' | xargs -i ip link show {} | grep 'UP mode' | awk '{print \$2}' | tr -d : | xargs -i ethtool {} | grep -E 'Settings for |Speed:'")
2835 echo -e "${RED}$link_speed${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2836 else
2837 echo -e "${GREEN}all physical ports negotiated speed set to 25000Mb/s${NC}"
2838 fi
2839 fi
2840 elapsed_time_seconds=$(expr $(date +%s) - $start)
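### to inspect a single NIC by hand (applies to both the duplex and the speed checks above; the interface
### name is a placeholder):
#   sudo ethtool <interface> | grep -E 'Speed:|Duplex:'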
2841
2842
2843 ####################################################################################################
2844
2845
2846 start=$(date +%s)
2847 STEPS_COUNTER=$((STEPS_COUNTER+1))
2848 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE warning LOG LINES (case-insensitive) WITHIN /var/log/ FROM THE CURRENT 10-MINUTE WINDOW (+$elapsed_time_seconds `date '+%T'`)${NC}"
2849 hour1=$(date -d "-0 hour" +%Y"-"%m"-"%d" "%T | cut -d: -f1-2 | sed 's/.$//')
2850 hour2=$(date | awk '{print $2" "$3 ,$4}' | cut -d: -f1-2 | sed 's/.$//')
2851 warning=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -E -R '$hour1|$hour2' /var/log/ | grep -i warning | grep -E -v 'ansible-command: Invoked with warn|ansible.log|filebeat|DeprecationWarning|level=warning|deprecated' | awk -F: '{print \$1}' | sort | uniq -c | column -t" | grep ^[1-9] -B 1)
2852 if [[ $warning ]]
2853 then
2854 echo -e "${ORANGE}$warning${NC}"
2855 else
2856 echo -e "${GREEN}no warning log lines were found under /var/log/${NC}"
2857 fi
2858 elapsed_time_seconds=$(expr $(date +%s) - $start)
2859
2860
2861 ####################################################################################################
2862
2863
2864 start=$(date +%s)
2865 STEPS_COUNTER=$((STEPS_COUNTER+1))
2866 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE CBIS MANAGER PAGES RETURN HTTP 200 OK (+$elapsed_time_seconds `date '+%T'`)${NC}"
2867 echo -e "${CYAN}get zabbix, ceph, kibana and horizon components status${NC}"
2868 if [[ $ceph_backend == "false" && $elk == "false" ]]
2869 then
2870 installed_status=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/components/getComponents' -H 'Authorization: Basic '$cbis_manager_token'' | jq .[][] | jq 'select(.display != "Ceph")' | jq 'select(.display != "Kibana")' | jq 'select(.status == "notInstalled")')
2871 elif [[ $ceph_backend == "false" && $elk == "true" ]]
2872 then
2873 installed_status=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/components/getComponents' -H 'Authorization: Basic '$cbis_manager_token'' | jq .[][] | jq 'select(.display != "Ceph")' | jq 'select(.status == "notInstalled")')
2874 elif [[ $ceph_backend == "true" && $elk == "false" ]]
2875 then
2876 installed_status=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/components/getComponents' -H 'Authorization: Basic '$cbis_manager_token'' | jq .[][] | jq 'select(.display != "Kibana")' | jq 'select(.status == "notInstalled")')
elif [[ $ceph_backend == "true" && $elk == "true" ]]
then
# cover the case where both ceph and elk are deployed, so the component status is still validated
installed_status=$(curl -g -s -L -k -X GET 'https://'$HypervisorURL'/api/components/getComponents' -H 'Authorization: Basic '$cbis_manager_token'' | jq .[][] | jq 'select(.status == "notInstalled")')
2877 fi
2878 if [[ $installed_status ]]
2879 then
2880 echo -e "${RED}$installed_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2881 else
2882 echo -e "${GREEN}all external components are installed${NC}"
2883 fi
2884 echo -e "${CYAN}access plugins page${NC}"
2885 http_status=$(curl -g -s -L -k -w 'RESP_CODE:%{response_code}' -X GET 'https://'$HypervisorURL'/api/plugins' -H 'Authorization: Basic '$cbis_manager_token'' | grep RESP_CODE | awk -F: '{print $2}')
2886 if [[ $http_status != "200" ]]
2887 then
2888 echo -e "${RED}expected http code 200 and got $http_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2889 else
2890 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2891 fi
2892 echo -e "${CYAN}access installation page${NC}"
2893 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/installation/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"hardware":'$platform'}' | grep RESP_CODE | awk -F: '{print $2}')
2894 if [[ $http_status != "200" ]]
2895 then
2896 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2897 else
2898 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2899 fi
2900 echo -e "${CYAN}access custom templates page${NC}"
2901 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/add_host_group/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"hardware":'$platform'}' | grep RESP_CODE | awk -F: '{print $2}')
2902 if [[ $http_status != "200" ]]
2903 then
2904 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2905 else
2906 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2907 fi
2908 if [[ $cbis_version != "19.0.0.1" ]]
2909 then
2910 echo -e "${CYAN}access upgrade (upgrade) page${NC}"
2911 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/cbis_upgrade/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_upgrade":"Upgrade"}' | grep RESP_CODE | awk -F: '{print $2}')
2912 if [[ $http_status != "200" ]]
2913 then
2914 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2915 else
2916 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2917 fi
2918 echo -e "${CYAN}access upgrade (resume) page${NC}"
2919 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/cbis_upgrade/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_upgrade":"Resume"}' | grep RESP_CODE | awk -F: '{print $2}')
2920 if [[ $http_status != "200" ]]
2921 then
2922 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2923 else
2924 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2925 fi
2926 echo -e "${CYAN}access upgrade (rollback) page${NC}"
2927 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/cbis_upgrade/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_upgrade":"Rollback"}' | grep RESP_CODE | awk -F: '{print $2}')
2928 if [[ $http_status != "200" ]]
2929 then
2930 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2931 else
2932 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2933 fi
2934 fi
2935 if [[ $cbis_version != "19.0.0.1" ]]
2936 then
2937 echo -e "${CYAN}access novl page${NC}"
2938 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/novl/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"test":"new test"}' | grep RESP_CODE | awk -F: '{print $2}')
2939 if [[ $http_status != "200" ]]
2940 then
2941 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2942 else
2943 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2944 fi
2945 fi
2946 echo -e "${CYAN}access scale-out page${NC}"
2947 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
2948 then
2949 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/add_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"current_hardware":'$platform',"hardware":'$platform'}' | grep RESP_CODE | awk -F: '{print $2}')
2950 else
2951 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/add_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"current_hardware":'$platform',"new_hardware":'$platform'}' | grep RESP_CODE | awk -F: '{print $2}')
2952 fi
2953 if [[ $http_status != "200" ]]
2954 then
2955 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2956 else
2957 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2958 fi
2959 echo -e "${CYAN}access scale-in page${NC}"
2960 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/remove_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
2961 if [[ $http_status != "200" ]]
2962 then
2963 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2964 else
2965 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2966 fi
2967 if [[ $cbis_version != "19.0.0.1" ]]
2968 then
2969 echo -e "${CYAN}access reboot servers page${NC}"
2970 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/reboot_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
2971 if [[ $http_status != "200" ]]
2972 then
2973 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2974 else
2975 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2976 fi
2977 fi
2978 if [[ $cbis_version != "19.0.0.1" ]]
2979 then
2980 echo -e "${CYAN}access maintenance mode (set) page${NC}"
2981 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
2982 then
2983 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_maintenance":"Set in Maintenance"}' | grep RESP_CODE | awk -F: '{print $2}')
2984 else
2985 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_to_maintenance":"Set to Maintenance"}' | grep RESP_CODE | awk -F: '{print $2}')
2986 fi
2987 if [[ $http_status != "200" ]]
2988 then
2989 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
2990 else
2991 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
2992 fi
2993 echo -e "${CYAN}access maintenance mode (unset) page${NC}"
2994 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
2995 then
2996 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_maintenance":"Unset from Maintenance"}' | grep RESP_CODE | awk -F: '{print $2}')
2997 else
2998 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_to_maintenance":"Unset from Maintenance"}' | grep RESP_CODE | awk -F: '{print $2}')
2999 fi
3000 if [[ $http_status != "200" ]]
3001 then
3002 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
3003 then
3004 error=$(curl -g -s -k -L -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_in_maintenance":"Unset from Maintenance"}' | jq .Error | tr -d '\"\.')
3005 else
3006 error=$(curl -g -s -k -L -X POST 'https://'$HypervisorURL'/api/maintenance_node/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"set_to_maintenance":"Unset from Maintenance"}' | jq .error_message | tr -d '\"\.')
3007 fi
3008 if [[ $error == "There are currently no computes to unset from maintenance mode" ]]
3009 then
3010 echo -e "${GREEN}There are currently no computes to unset from maintenance mode${NC}"
3011 else
3012 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3013 fi
3014 elif [[ $http_status == "200" ]]
3015 then
3016 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3017 fi
3018 fi
3019 echo -e "${CYAN}access undercloud vm backup/restore (backup) page${NC}"
3020 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/undercloud_backup/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"operation":"Backup","backup_directory":"'$backup_nfs_mountpoint'"}' | grep RESP_CODE | awk -F: '{print $2}')
3021 if [[ $http_status != "200" ]]
3022 then
3023 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3024 else
3025 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3026 fi
3027 echo -e "${CYAN}access undercloud vm backup/restore (restore) page${NC}"
3028 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/undercloud_backup/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{"operation":"Restore","backup_directory":"'$backup_nfs_mountpoint'"}' | grep RESP_CODE | awk -F: '{print $2}')
3029 if [[ $http_status != "200" ]]
3030 then
3031 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3032 else
3033 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3034 fi
3035 echo -e "${CYAN}access patch management page${NC}"
3036 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/patch_management/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3037 if [[ $http_status != "200" ]]
3038 then
3039 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3040 else
3041 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3042 fi
3043 if [[ $cbis_version != "19.0.0.1" ]]
3044 then
3045 echo -e "${CYAN}access controller replacement page${NC}"
3046 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/replace_controller/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3047 if [[ $http_status != "200" ]]
3048 then
3049 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3050 else
3051 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3052 fi
3053 fi
3054 if [[ $cbis_version != "19.0.0.1" ]]
3055 then
3056 echo -e "${CYAN}access overcloud database restore page${NC}"
3057 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/overcloud_db_restore/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3058 if [[ $http_status != "200" ]]
3059 then
3060 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3061 else
3062 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3063 fi
3064 fi
3065 if [[ $cbis_version != "19.0.0.1" ]]
3066 then
3067 echo -e "${CYAN}access security hardening (install) page${NC}"
3068 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_hardening/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3069 if [[ $http_status != "200" ]]
3070 then
3071 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3072 else
3073 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3074 fi
3075 echo -e "${CYAN}access security hardening (rollback) page${NC}"
3076 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_hardening_rollback/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3077 if [[ $http_status != "200" ]]
3078 then
3079 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3080 else
3081 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3082 fi
3083 echo -e "${CYAN}access secured communication (ipsec) page${NC}"
3084 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_secured_communication/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3085 if [[ $http_status != "200" ]]
3086 then
3087 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3088 else
3089 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3090 fi
3091 echo -e "${CYAN}access key management page${NC}"
3092 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_key_management/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3093 if [[ $http_status != "200" ]]
3094 then
3095 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3096 else
3097 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3098 fi
3099 echo -e "${CYAN}access platform secret update page${NC}"
3100 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_platform_secrets_update/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3101 if [[ $http_status != "200" ]]
3102 then
3103 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3104 else
3105 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3106 fi
3107 echo -e "${CYAN}access ldap deployment parameters page${NC}"
3108 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/security_ldap_deployment/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3109 if [[ $http_status != "200" ]]
3110 then
3111 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3112 else
3113 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3114 fi
3115 fi
3116 echo -e "${CYAN}access multi cbis management page${NC}"
3117 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/manage_multi/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3118 if [[ $http_status != "200" ]]
3119 then
3120 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3121 else
3122 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3123 fi
3124 echo -e "${CYAN}access remote patch management page${NC}"
3125 http_status=$(curl -g -s -k -L -w 'RESP_CODE:%{response_code}' -X POST 'https://'$HypervisorURL'/api/multi_vim/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep RESP_CODE | awk -F: '{print $2}')
3126 if [[ $http_status != "200" ]]
3127 then
3128 echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3129 else
3130 echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
3131 fi
3132 echo -e "${CYAN}validate that the deployment log returns 'CBIS Installation Finished Successfully or 'Post install Sanity tests completed'${NC}"
3133 deployment_log_success_check=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/log/deployment.log' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}' | grep -E 'CBIS Installation Finished Successfully|Post install Sanity tests completed')
3134 if [[ $deployment_log_success_check ]]
3135 then
3136 echo -e "${GREEN}returned response 'CBIS Installation Finished Successfully' or 'Post install Sanity tests completed'${NC}"
3137 else
3138 echo -e "${RED}didn't get 'CBIS Installation Finished Successfully' or 'Post install Sanity tests completed' as expected. log-in into cbis manager UI and verify that the CBIS installation log is showing${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3139 fi
3140 elapsed_time_seconds=$(expr $(date +%s) - $start)
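 ### NOTE: illustrative sketch only, never called by this script. The repeated "POST to a CBIS
 ### manager /api/<page>/main route and expect HTTP 200" pattern above could be wrapped in one
 ### small helper; the page name and api path are passed in by the caller:
 check_manager_page() {
  local page_name="$1" api_path="$2" http_status
  echo -e "${CYAN}access $page_name page${NC}"
  http_status=$(curl -g -s -k -L -o /dev/null -w '%{response_code}' -X POST 'https://'$HypervisorURL'/api/'$api_path'/main' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' --data '{}')
  if [[ $http_status != "200" ]]
  then
   echo -e "${RED}expected http code 200 and got $http_status. log-in into cbis manager UI to verify${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
  else
   echo -e "${GREEN}returned response code HTTP 200 OK${NC}"
  fi
 }
 ### example usage (commented out): check_manager_page "patch management" "patch_management"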
3141
3142
3143 ####################################################################################################
3144
3145
3146 start=$(date +%s)
3147 STEPS_COUNTER=$((STEPS_COUNTER+1))
3148 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE HORIZON LOGO PAGE AUTHENTICITY (+$elapsed_time_seconds `date '+%T'`)${NC}"
3149 ### this check was added after seeing permission issues (403 forbidden errors) on most of the horizon pages on the newyork setup (19A SP4 PP3), which corrupted the horizon view completely.
3150 horizon_logo_error_code=$(curl -g -s -k -L -w '\nRESP_CODE:%{response_code}\n' -X GET https://$PublicURL/dashboard/static/themes/nokia/img/logo.svg | grep RESP_CODE: | awk -F: '{print $2}')
3151 if [[ $horizon_logo_error_code != "200" ]]
3152 then
3153 echo -e "${RED}horizon logo page (logo.svg) returned error code $horizon_logo_error_code${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3154 else
3155 echo -e "${GREEN}horizon logo page (logo.svg) returned error code $horizon_logo_error_code${NC}"
3156 fi
3157 elapsed_time_seconds=$(expr $(date +%s) - $start)
3158
3159
3160 ####################################################################################################
3161
3162
3163 start=$(date +%s)
3164 STEPS_COUNTER=$((STEPS_COUNTER+1))
3165 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK ALL HORIZON PAGES VALIDITY (+$elapsed_time_seconds `date '+%T'`)${NC}"
3166 curl -g -k -s -L -c cookies.txt -b cookies.txt -o output.1.html 'https://'$PublicURL'/dashboard/auth/login/'
3167 CSRFTOKEN=$(cat cookies.txt | grep -w csrftoken | awk '{print $NF}')
3168 if [[ -z $CSRFTOKEN ]]
3169 then
3170 echo -e "${RED}cookies.txt doesn't contain the csrftoken output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3171 fi
3172 csrfmiddlewaretoken=$(curl -g -s -k -L 'https://'$PublicURL'/dashboard/auth/login/' | grep csrfmiddlewaretoken | awk -F'value=' '{print $2}' | awk -F\' '{print $2}')
3173 if [[ -z $csrfmiddlewaretoken ]]
3174 then
3175 echo -e "${RED}didn't receive the csrfmiddlewaretoken output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3176 fi
3177 HORIZON_REGION=$(curl -g -s -k -L 'https://'$PublicURL'/dashboard/auth/login/' | grep region | awk -F'value="' '{print $2}' | awk -F\" '{print $1}')
3178 if [[ -z $HORIZON_REGION ]]
3179 then
3180 echo -e "${RED}didn't receive the region output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3181 fi
3182 DATA="username=admin&password=$ADMIN_PASSWORD®ion=$HORIZON_REGION&csrfmiddlewaretoken=$CSRFTOKEN"
3183 if [[ -z $DATA ]]
3184 then
3185 echo -e "${RED}didn't receive the data output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3186 fi
3187 curl -g -k -s -L -c cookies.txt -b cookies.txt --output /dev/null -s -d "$DATA" --referer 'https://'$PublicURL'/dashboard/' 'https://'$PublicURL'/dashboard/auth/login/'
3188 SESSIONID=$(cat cookies.txt | grep sessionid | sed 's/^.*sessionid\s*//')
3189 if [[ -z $SESSIONID ]]
3190 then
3191 echo -e "${RED}didn't receive the sessionid output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3192 fi
3193 pages="identity identity/users identity/groups identity/groups identity/roles admin admin/hypervisors admin/aggregates admin/instances admin/flavors admin/images admin/volumes admin/snapshots admin/networks admin/routers admin/floating_ips admin/trunks admin/defaults admin/metadata_defs admin/info admin/vitrageadmindashboard admin/vitrageadminalarms admin/vitrageadminentities admin/vitrageadmintemplates admin/shares admin/share_snapshots admin/share_types admin/share_networks admin/security_services admin/share_servers admin/share_instances admin/share_groups admin/share_group_snapshots admin/share_group_types project/api_access project project/instances project/images project/key_pairs project/server_groups project/volumes project/snapshots project/volume_groups project/vg_snapshots project/network_topology project/networks project/routers project/security_groups project/floating_ips project/trunks project/stacks project/resource_types project/template_versions project/vitragedashboard project/vitragealarms project/vitrageentities project/vitragetemplates project/shares project/share_snapshots project/share_networks project/security_services project/share_groups project/share_group_snapshots settings"
3194 for page in $pages
3195 do
3196 echo -e "${CYAN}checking the $page page${NC}"
3197 OUTPUT=$(curl -g -k -L -s 'https://'$PublicURL'/dashboard/'$page'/' -H 'Cookie: csrftoken='$CSRFTOKEN'; sessionid='$SESSIONID'')
3198 if [[ $OUTPUT ]]
3199 then
3200 ERROR=$(echo -e "$OUTPUT" | grep -E 'Error:|unexpected error|Server error')
3201 if [[ $ERROR ]]
3202 then
3203 echo -e "${RED}$ERROR${NC}" | awk '{$1=$1};1' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3204 else
3205 echo -e "${GREEN}the $page page returned with no errors${NC}"
3206 fi
3207 else
3208 echo -e "${RED}OUTPUT variable returned no ouput${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3209 fi
3210 done
3211 rm -rf cookies.txt output.1.html
3212 elapsed_time_seconds=$(expr $(date +%s) - $start)
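 ### NOTE: illustrative sketch only, not called by this script. The loop above greps each page body
 ### for error strings; a complementary check (re-using the same $CSRFTOKEN/$SESSIONID cookies) would
 ### be to also assert that every dashboard page returns HTTP 200, e.g.:
 # for page in $pages
 # do
 #  page_code=$(curl -g -k -L -s -o /dev/null -w '%{response_code}' 'https://'$PublicURL'/dashboard/'$page'/' -H 'Cookie: csrftoken='$CSRFTOKEN'; sessionid='$SESSIONID'')
 #  [[ $page_code == "200" ]] || echo -e "${RED}$page returned HTTP $page_code${NC}"
 # done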
3213
3214
3215 ####################################################################################################
3216fi
3217 ####################################################################################################
3218
3219
3220if [[ $ESSENTIAL == "no" ]]
3221then
3222 start=$(date +%s)
3223 STEPS_COUNTER=$((STEPS_COUNTER+1))
3224 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE COMPUTES HAS THE SAME TOTAL MEMORY SIZE (+$elapsed_time_seconds `date '+%T'`)${NC}"
3225 HYPERVISOR_LIST=$(source ~/overcloudrc && openstack hypervisor list --long -c "Hypervisor Hostname" -c "Memory MB")
3226 TOTAL_RAM=$(echo -e "$HYPERVISOR_LIST" | grep overcloud | awk '{print $4}' | sort | uniq | wc -l)
3227 if [ $TOTAL_RAM -gt 1 ]
3228 then
3229 echo -e "${RED}found one or more computes with different total memory size\n\n$HYPERVISOR_LIST${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3230 else
3231 echo -e "${GREEN}all the computes have the same total memory size${NC}"
3232 fi
3233 elapsed_time_seconds=$(expr $(date +%s) - $start)
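 ### NOTE: illustrative sketch only, not called by this script. When the memory check above fails,
 ### grouping the hypervisor list by the "Memory MB" column makes the odd host easy to spot:
 # echo -e "$HYPERVISOR_LIST" | grep overcloud | awk '{print $4}' | sort | uniq -c | sort -n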
3234
3235
3236 ####################################################################################################
3237
3238
3239 start=$(date +%s)
3240 STEPS_COUNTER=$((STEPS_COUNTER+1))
3241 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CPU MODEL INCONSISTENCIES INSIDE EACH SERVER INDIVIDUALLY (+$elapsed_time_seconds `date '+%T'`)${NC}"
3242 inconsistence_servers=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sed 's/^ *//g' | sort | uniq | wc -l" | grep ^[2-9] -B 1 | grep overcloud- | awk '{print $1}' | paste -sd',')
3243 if [[ $inconsistence_servers ]]
3244 then
3245 inconsistence_servers=$(sshpass -p $hv_cbis_admin_password ansible -k $inconsistence_servers --limit '!localhost' -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sed 's/^ *//g' | sort | uniq -c")
3246 echo -e "${RED}$inconsistence_servers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3247 else
3248 echo -e "${GREEN}no inconsistencies found between the many CPUs of each server${NC}"
3249 fi
3250 elapsed_time_seconds=$(expr $(date +%s) - $start)
3251
3252
3253 ####################################################################################################
3254
3255
3256 start=$(date +%s)
3257 STEPS_COUNTER=$((STEPS_COUNTER+1))
3258 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CPU MODEL INCONSISTENCIES BETWEEN THE OVERCLOUD SERVERS + HYPERVISOR (+$elapsed_time_seconds `date '+%T'`)${NC}"
3259 echo -e "${CYAN}checking the CPU model of the computes and controllers${NC}"
3260 inconsistence_servers=$(sshpass -p $hv_cbis_admin_password ansible -k controller,compute,hypervisor -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sed 's/^ *//g' | sort | uniq" | grep CPU | sort | uniq | wc -l)
3261 if [[ $inconsistence_servers != "1" ]]
3262 then
3263 inconsistence_servers=$(sshpass -p $hv_cbis_admin_password ansible -k controller,compute,hypervisor -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sort | uniq")
3264 echo -e "${RED}$inconsistence_servers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3265 else
3266 echo -e "${GREEN}no inconsistencies found between the CPUs of the controllers, computes and hypervisor${NC}"
3267 fi
3268 if [[ $ansible_storage_hosts ]]
3269 then
3270 echo -e "\n${CYAN}checking the CPU model of the computes and controllers${NC}"
3271 inconsistence_servers=$(ansible CephStorage -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sed 's/^ *//g' | sort | uniq" | grep CPU | sort | uniq | wc -l)
3272 if [[ $inconsistence_servers != "1" ]]
3273 then
3274 inconsistence_servers=$(ansible CephStorage -b -m shell -a "cat /proc/cpuinfo | grep 'model name' | awk -F: '{print \$2}' | sort | uniq")
3275 echo -e "${RED}$inconsistence_servers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3276 else
3277 echo -e "${GREEN}no inconsistencies found between the CPUs of the storage nodes${NC}"
3278 fi
3279 fi
3280 elapsed_time_seconds=$(expr $(date +%s) - $start)
3281
3282
3283 ####################################################################################################
3284
3285
3286 start=$(date +%s)
3287 STEPS_COUNTER=$((STEPS_COUNTER+1))
3288 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CPU FLAGS INCONCITIENCIES (+$elapsed_time_seconds `date '+%T'`)${NC}"
3289 lscpu=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "lscpu")
3290 uniqe_flags=$(echo -e "$lscpu" | grep Flags: | tr -s ' ' '\n' | grep -v Flags: | sort | uniq -c | column -t | awk '{print $1}' | sort -n | uniq | wc -l)
3291 if [ $uniqe_flags != "1" ]
3292 then
3293 uniqe_missing_flags=$(echo -e "$lscpu" | grep Flags: | tr -s ' ' '\n' | grep -v Flags: | sort | uniq -c | column -t | awk '{print $1}' | sort -n | uniq | head -n -1 | tail -n1)
3294 missing_flags_names=$(echo -e "$lscpu" | grep Flags: | tr -s ' ' '\n' | grep -v Flags: | sort | uniq -c | column -t | grep ^[1-$uniqe_missing_flags] | awk '{print $2}')
3295 missing_flags_paste=$(echo -e "$missing_flags_names" | paste -sd"|")
3296 flags_per_compute=$(ansible all --limit '!localhost,!hypervisor' -b -m shell -a "lscpu | grep Flags: | tr -s ' ' '\n' | grep -E '$missing_flags_paste' | wc -l")
3297 echo -e "${RED}the following flags are missing from one or more computes:\n\n$missing_flags_names\n\n$flags_per_compute${NC}"
3298 else
3299 echo -e "${GREEN}the CPU flags are identical on all the compute hosts${NC}"
3300 fi
3301 elapsed_time_seconds=$(expr $(date +%s) - $start)
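 ### NOTE: illustrative sketch only, not called by this script. A quick way to see which hosts carry a
 ### different CPU flag set is to checksum the sorted flag list per host and count the distinct sums:
 # ansible all --limit '!localhost,!hypervisor' -b -m shell -a "lscpu | grep ^Flags: | tr ' ' '\n' | sort | sha1sum" | grep ^[0-9a-f] | sort | uniq -c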
3302
3303
3304 ####################################################################################################
3305
3306
3307 start=$(date +%s)
3308 STEPS_COUNTER=$((STEPS_COUNTER+1))
3309 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT AUTO EVACUATE IS ENABLED (+$elapsed_time_seconds `date '+%T'`)${NC}"
3310 if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
3311 then
3312 auto_evacuate=$(ansible $last_index_controller -b -m shell -a "cat /etc/vitrage/vitrage.conf" | grep enable_host_evacuate | awk '{print $3}')
3313 if [[ $auto_evacuate == "False" ]]
3314 then
3315 echo -e "${ORANGE}auto-evacuate is disabled${NC}"
3316 else
3317 echo -e "${GREEN}auto-evacuate is enabled${NC}"
3318 fi
3319 else
3320 auto_evacuate=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
3321 -H 'Content-Type: application/json-rpc' \
3322 -H 'Cookie: SERVERID='$last_index_controller'' \
3323 --data '{
3324 "jsonrpc": "2.0",
3325 "method": "action.get",
3326 "params": {
3327 "output": "extend",
3328 "selectOperations": "extend",
3329 "selectRecoveryOperations": "extend",
3330 "selectFilter": "extend",
3331 "filter": {
3332 "eventsource": 0
3333 }
3334 },
3335 "auth": '$zabbix_auth',
3336 "id": 2
3337 }' | jq . | grep auto-evacuate -A 5 | grep status | awk '{print $NF}' | tr -d '\"\,')
3338 if [[ $auto_evacuate == "1" ]]
3339 then
3340 echo -e "${ORANGE}auto-evacuate is disabled${NC}"
3341 elif [[ $auto_evacuate == "0" ]]
3342 then
3343 echo -e "${GREEN}auto-evacuate is enabled${NC}"
3344 fi
3345 fi
3346 elapsed_time_seconds=$(expr $(date +%s) - $start)
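 ### NOTE: illustrative sketch only, not called by this script. Instead of "grep auto-evacuate -A 5",
 ### the same answer can be pulled from the zabbix action.get reply with a jq filter (assuming the
 ### action is really named "auto-evacuate" on the setup):
 # ... | jq -r '.result[] | select(.name == "auto-evacuate") | .status'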
3347
3348
3349 ####################################################################################################
3350
3351
3352 start=$(date +%s)
3353 STEPS_COUNTER=$((STEPS_COUNTER+1))
3354 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CHANGES IN CONF FILES ON THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3355 src_dir="/var/lib/config-data/puppet-generated"
3356 dst_dir="/home/cbis-admin/configuration_files_backup"
3357 dir_check=$(ansible $last_index_controller -b -m shell -a "ls $dst_dir" | grep -E -v -c 'overcloud|cannot access')
3358 if [ $dir_check != "9" ]
3359 then
3360 ansible controller -b -m shell -a "cp -u mkdir -p "$dst_dir" warn=False" > /dev/null
3361 ansible controller -b -m shell -a "cp -u "$src_dir"/cinder/etc/cinder/cinder.conf "$dst_dir"" > /dev/null
3362 ansible controller -b -m shell -a "cp -u "$src_dir"/nova_placement/etc/nova/nova.conf "$dst_dir"" > /dev/null
3363 ansible controller -b -m shell -a "cp -u "$src_dir"/glance_api/etc/glance/glance-api.conf "$dst_dir"" > /dev/null
3364 ansible controller -b -m shell -a "cp -u "$src_dir"/haproxy/etc/haproxy/haproxy.cfg "$dst_dir"" > /dev/null
3365 ansible controller -b -m shell -a "cp -u "$src_dir"/heat/etc/heat/heat.conf "$dst_dir"" > /dev/null
3366 ansible controller -b -m shell -a "cp -u "$src_dir"/keystone/etc/keystone/keystone.conf "$dst_dir"" > /dev/null
3367 ansible controller -b -m shell -a "cp -u "$src_dir"/neutron/etc/neutron/neutron.conf "$dst_dir"" > /dev/null
3368 ansible controller -b -m shell -a "cp -u "$src_dir"/rabbitmq/etc/rabbitmq/rabbitmq.config "$dst_dir"" > /dev/null
3369 ansible controller -b -m shell -a "cp -u "$src_dir"/ceph/etc/ceph/ceph.conf "$dst_dir"" > /dev/null
3370 fi
3371 cinder=$(ansible controller -b -m shell -a "diff "$src_dir"/cinder/etc/cinder/cinder.conf "$dst_dir"/cinder.conf | grep -v transport_url" | grep -E '^>|^<')
3372 nova=$(ansible controller -b -m shell -a "diff "$src_dir"/nova_placement/etc/nova/nova.conf "$dst_dir"/nova.conf | grep -v transport_url" | grep -E '^>|^<')
3373 glance_api=$(ansible controller -b -m shell -a "diff "$src_dir"/glance_api/etc/glance/glance-api.conf "$dst_dir"/glance-api.conf | grep -v transport_url" | grep -E '^>|^<')
3374 haproxy=$(ansible controller -b -m shell -a "diff "$src_dir"/haproxy/etc/haproxy/haproxy.cfg "$dst_dir"/haproxy.cfg | grep -v transport_url" | grep -E '^>|^<')
3375 heat=$(ansible controller -b -m shell -a "diff "$src_dir"/heat/etc/heat/heat.conf "$dst_dir"/heat.conf | grep -v transport_url" | grep -E '^>|^<')
3376 keystone=$(ansible controller -b -m shell -a "diff "$src_dir"/keystone/etc/keystone/keystone.conf "$dst_dir"/keystone.conf | grep -v transport_url" | grep -E '^>|^<')
3377 neutron=$(ansible controller -b -m shell -a "diff "$src_dir"/neutron/etc/neutron/neutron.conf "$dst_dir"/neutron.conf | grep -v transport_url" | grep -E '^>|^<')
3378 rabbitmq=$(ansible controller -b -m shell -a "diff "$src_dir"/rabbitmq/etc/rabbitmq/rabbitmq.config "$dst_dir"/rabbitmq.config | grep -v transport_url" | grep -E '^>|^<')
3379 ceph=$(ansible controller -b -m shell -a "diff "$src_dir"/ceph/etc/ceph/ceph.conf "$dst_dir"/ceph.conf | grep -v transport_url" | grep -E '^>|^<')
3380 if [[ $cinder ]]
3381 then
3382 echo -e "${LRB}cinder${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3383 echo -e "${RED}$cinder${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3384 fi
3385 if [[ $nova ]]
3386 then
3387 echo -e "${LRB}nova${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3388 echo -e "${RED}$nova${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3389 else
3390 echo -e "${GREEN}nova original and current conf files are identical${NC}"
3391 fi
3392 if [[ $glance_api ]]
3393 then
3394 echo -e "${LRB}glance_api${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3395 echo -e "${RED}$glance_api${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3396 else
3397 echo -e "${GREEN}glance_api original and current conf files are identical${NC}"
3398 fi
3399 if [[ $haproxy ]]
3400 then
3401 echo -e "${LRB}haproxy${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3402 echo -e "${RED}$haproxy${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3403 else
3404 echo -e "${GREEN}haproxy original and current conf files are identical${NC}"
3405 fi
3406 if [[ $heat ]]
3407 then
3408 echo -e "${LRB}heat${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3409 echo -e "${RED}$heat${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3410 else
3411 echo -e "${GREEN}heat original and current conf files are identical${NC}"
3412 fi
3413 if [[ $keystone ]]
3414 then
3415 echo -e "${LRB}keystone${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3416 echo -e "${RED}$keystone${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3417 else
3418 echo -e "${GREEN}keystone original and current conf files are identical${NC}"
3419 fi
3420 if [[ $neutron ]]
3421 then
3422 echo -e "${LRB}neutron${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3423 echo -e "${RED}$neutron${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3424 else
3425 echo -e "${GREEN}neutron original and current conf files are identical${NC}"
3426 fi
3427 if [[ $rabbitmq ]]
3428 then
3429 echo -e "${LRB}rabbitmq${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3430 echo -e "${RED}$rabbitmq${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3431 else
3432 echo -e "${GREEN}rabbitmq original and current conf files are identical${NC}"
3433 fi
3434 if [[ $ceph ]]
3435 then
3436 echo -e "${LRB}ceph${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3437 echo -e "${RED}$ceph${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3438 else
3439 echo -e "${GREEN}ceph original and current conf files are identical${NC}"
3440 fi
3441 elapsed_time_seconds=$(expr $(date +%s) - $start)
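 ### NOTE: illustrative sketch only, not called by this script. The per-service diff blocks above all
 ### follow the same pattern; a table-driven loop keeps the logic in one place (the service:path pairs
 ### below are the same ones used by the backup step above):
 # for entry in cinder:cinder/etc/cinder/cinder.conf nova:nova_placement/etc/nova/nova.conf \
 #              glance_api:glance_api/etc/glance/glance-api.conf haproxy:haproxy/etc/haproxy/haproxy.cfg \
 #              heat:heat/etc/heat/heat.conf keystone:keystone/etc/keystone/keystone.conf \
 #              neutron:neutron/etc/neutron/neutron.conf rabbitmq:rabbitmq/etc/rabbitmq/rabbitmq.config \
 #              ceph:ceph/etc/ceph/ceph.conf
 # do
 #  service=${entry%%:*} ; conf_path=${entry#*:}
 #  diff_output=$(ansible controller -b -m shell -a "diff $src_dir/$conf_path $dst_dir/$(basename $conf_path) | grep -v transport_url" | grep -E '^>|^<')
 #  if [[ $diff_output ]]
 #  then
 #   echo -e "${LRB}$service${NC}" ; echo -e "${RED}$diff_output${NC}" | sed 's/^>/ORIGINAL:/g' | sed 's/^</CURRENT:/g' ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 #  else
 #   echo -e "${GREEN}$service original and current conf files are identical${NC}"
 #  fi
 # done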
3442
3443
3444 ####################################################################################################
3445
3446
3447 start=$(date +%s)
3448 STEPS_COUNTER=$((STEPS_COUNTER+1))
3449 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT OPENSTACK CAN ISSUE TOKEN FROM BOTH stackrc AND overcloudrc (+$elapsed_time_seconds `date '+%T'`)${NC}"
3450 stackrc_token=$(source ~/stackrc && openstack token issue 2>&1 > /dev/null)
3451 if [[ $stackrc_token ]]
3452 then
3453 echo -e "${RED}$stackrc_token${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3454 else
3455 echo -e "${GREEN}token was created sucessfully for user admin (stackrc)${NC}"
3456 fi
3457 overcloudrc_token=$(source ~/overcloudrc && openstack token issue 2>&1 > /dev/null)
3458 if [[ $overcloudrc_token ]]
3459 then
3460 echo -e "\n${RED}$overcloudrc_token${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3461 else
3462 echo -e "${GREEN}token was created sucessfully for user admin (overcloudrc)${NC}"
3463 fi
3464 elapsed_time_seconds=$(expr $(date +%s) - $start)
3465
3466
3467 ####################################################################################################
3468
3469
3470 start=$(date +%s)
3471 if [[ $nuage == "true" ]]
3472 then
3473 STEPS_COUNTER=$((STEPS_COUNTER+1))
3474 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE BETWEEN THE NUAGE VERSION OF ALL THE OVERCLOUD HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3475 nuage_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /usr/share/cbis/nuage-version" | grep ^[0-9] | sort --uniq | wc -l)
3476 if [[ $nuage_version != "1" ]]
3477 then
3478 nuage_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /usr/share/cbis/nuage-version")
3479 echo -e "${RED}nuage version mismatch found between one or more hosts${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3480 echo -e "${RED}$nuage_version${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3481 else
3482 echo -e "${GREEN}nuage version is: $nuage_version${NC} "
3483 fi
3484 fi
3485 elapsed_time_seconds=$(expr $(date +%s) - $start)
3486
3487
3488 ####################################################################################################
3489
3490
3491 start=$(date +%s)
3492 if [[ $nuage == "true" ]]
3493 then
3494 STEPS_COUNTER=$((STEPS_COUNTER+1))
3495 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE BETWEEN THE Open vSwitch NUAGE VERSION OF ALL THE OVERCLOUD HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3496 nuage_ovs_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl --version" | grep nuage | awk '{print $ NF}' | cut -d - -f 1-2 | sort --uniq | wc -l)
3497 if [[ $nuage_ovs_version != "1" ]]
3498 then
3499 nuage_ovs_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl --version | grep nuage")
3500 echo -e "${RED}nuage Open vSwitch version mismatch found between one or more hosts${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3501 echo -e "${RED}$nuage_ovs_version${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3502 else
3503 echo -e "${GREEN}nuage Open vSwitch version is: $nuage_ovs_version${NC} "
3504 fi
3505 fi
3506 elapsed_time_seconds=$(expr $(date +%s) - $start)
3507
3508
3509 ####################################################################################################
3510
3511
3512 start=$(date +%s)
3513 STEPS_COUNTER=$((STEPS_COUNTER+1))
3514 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE HOSTS HAS IDENTICAL MELLANOX OFED AND DRIVER VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
3515 if [[ $fixed_platform != 'airframe' && $fixed_platform != 'dell-730' && $fixed_platform != 'hp-slg7_OVS' && $fixed_platform != 'hp-slg7_OVS_SSD_single_nic' && $fixed_platform != 'hp-c7kg8' && $fixed_platform != 'hp-c7kg9' ]]
3516 then
3517 if [[ $mlx_ofed_version != "/bin/sh:" ]]
3518 then
3519 mlx_version_comparison=$(echo -e "$mlx_ofed_version" | wc -l)
3520 if [[ $mlx_version_comparison != "1" ]]
3521 then
3522 echo -e "${RED}found multiple mellanox ofed versions:\n\n$mlx_ofed_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3523 else
3524 echo -e "${GREEN}all the hosts has the same mellanox ofed version ($mlx_ofed_version)${NC}"
3525 echo -e "MELLANOX OFED VERSION = $mlx_ofed_version" > $logs_dir/mellanox_details
3526 mlxfwmanager=$(ansible $last_index_controller -b -m shell -a "mlxfwmanager")
3527 mlx_firmware=$(echo -e "mlxfwmanager" | grep FW | awk '{print $2}' | sort -u)
3528 echo -e "\n\n$MELLANOX INTERFACE FIRMWARE = $mlx_firmware" >> $logs_dir/mellanox_details
3529 mlx_device_type=$(echo -e "mlxfwmanager" | grep 'Device Type:' | awk '{print $2}' | sort -u)
3530 echo -e "\n\n$MELLANOX DEVICE TYPE = $mlx_device_type" >> $logs_dir/mellanox_details
3531 echo -e "\n\n$MELLANOX RPMS INFO: = $mlx_firmware" >> $logs_dir/mellanox_details
3532 ansible $last_index_controller -b -m shell -a "/usr/bin/ofed_rpm_info" >> $logs_dir/mellanox_details
3533 fi
3534 else
3535 echo -e "${RED}$fixed_platform platform should use mellanox interfaces but no mellanox interface are to be found (/usr/bin/ofed_info)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3536 fi
3537 else
3538 echo -e "${ORANGE}$fixed_platform platform blueprint is without mellanox interfaces${NC}"
3539 fi
3540 elapsed_time_seconds=$(expr $(date +%s) - $start)
3541
3542
3543 ####################################################################################################
3544
3545
3546 start=$(date +%s)
3547 STEPS_COUNTER=$((STEPS_COUNTER+1))
3548 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE /etc/ansible/hosts.hfx Compute GROUP CONTAINS ONLY COMPUTES (ICE-3141) (+$elapsed_time_seconds `date '+%T'`)${NC}"
3549 NON_COMPUTE_HOSTS=$(cat /etc/ansible/hosts.hfx | awk '/\[Compute\]/,/\[compute\]/' | grep overcloud- | grep -v compute)
3550 if [[ $NON_COMPUTE_HOSTS ]]
3551 then
3552 echo -e "${RED}the following hosts appear under the [Compute] group in /etc/ansible/hosts.hfx :${NC}\n\n"
3553 echo -e "${RED}$NON_COMPUTE_HOSTS${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3554 else
3555 echo -e "${GREEN}no non-computes are found under the [Compute] group in /etc/ansible/hosts.hfx${NC}"
3556 fi
3557 elapsed_time_seconds=$(expr $(date +%s) - $start)
3558
3559
3560 ####################################################################################################
3561
3562
3563 start=$(date +%s)
3564 STEPS_COUNTER=$((STEPS_COUNTER+1))
3565 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DOCKER CONTAINERS ACTIVLY USING OLD/REMOVED IMAGES (+$elapsed_time_seconds `date '+%T'`)${NC}"
3566 REMOVE_STATE_DOCKER_IMAGES=$(ansible all --limit '!hypervisor' -b -m shell -a "docker image list | grep -w -E 'remove\s+' | awk '{print \$3}'" | grep ^[0-9a-f] | sort | uniq | paste -sd'|')
3567 if [[ $REMOVE_STATE_DOCKER_IMAGES ]]
3568 then
3569 CONTAINERS_USING_REMOVED_IMAGES=$(ansible all --limit '!hypervisor' -b -m shell -a "docker ps -a | grep -E \"$REMOVE_STATE_DOCKER_IMAGES\"" | grep ^[0-9a-f] -B 1)
3570 if [[ $CONTAINERS_USING_REMOVED_IMAGES ]]
3571 then
3572 echo -e "${RED}$CONTAINERS_USING_REMOVED_IMAGES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3573 echo -e "\n\n${ORANGE}CBIS-16623/CBIS-16630 (19A/20) - post deploying the priority pack the zabbix docker container still using the old zabbix image that should have been removed and replaced by a new image${NC}"
3574 else
3575 echo -e "${GREEN}no docker containers that are using images that in remove state are found${NC}"
3576 fi
3577 else
3578 echo -e "${GREEN}no docker containers that are using images that in remove state are found${NC}"
3579 fi
3580 elapsed_time_seconds=$(expr $(date +%s) - $start)
3581
3582
3583 ####################################################################################################
3584
3585
3586 start=$(date +%s)
3587 STEPS_COUNTER=$((STEPS_COUNTER+1))
3588 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DOCKER IMAGES WITH REMOVE STATE (+$elapsed_time_seconds `date '+%T'`)${NC}"
3589 REMOVE_STATE_DOCKER_IMAGES=$(ansible all --limit '!hypervisor' -b -m shell -a "docker image list | grep -w -E 'remove\s+'" | grep -w -E 'remove\s+' -B 1)
3590 if [[ $REMOVE_STATE_DOCKER_IMAGES ]]
3591 then
3592 echo -e "${RED}$REMOVE_STATE_DOCKER_IMAGES${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3593 echo -e "\n\n${ORANGE}CBIS-16623/CBIS-16630 (19A/20) - post deploying the priority pack the zabbix docker container still using the old zabbix image that should have been removed and replaced by a new image${NC}"
3594 else
3595 echo -e "${GREEN}no docker images with remove state are found${NC}"
3596 fi
3597 elapsed_time_seconds=$(expr $(date +%s) - $start)
3598
3599
3600 ####################################################################################################
3601
3602
3603 start=$(date +%s)
3604 STEPS_COUNTER=$((STEPS_COUNTER+1))
3605 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE UNUSED DOCKER IMAGES (+$elapsed_time_seconds `date '+%T'`)${NC}"
3606 if [[ $cbis_version != "18.0.0.1" ]]
3607 then
3608 unused_docker_images=$(ansible all --limit '!hypervisor' -b -m shell -a "docker image ls --all | grep \<none\> | awk '{print \$1}' | sort | uniq -c | grep -E -v '^[[:space:]]+[0-1]' | grep -v \<none\>" | grep -E -v 'FAILED|non-zero return code')
3609 if [[ $unused_docker_images ]]
3610 then
3611 echo -e "${RED}$unused_docker_images${NC}"
3612 else
3613 echo -e "${GREEN}no unused docker images were found${NC}"
3614 fi
3615 else
3616 echo -e "${ORANGE}no docker containers in CBIS 18.0.0.1${NC}"
3617 fi
3618 elapsed_time_seconds=$(expr $(date +%s) - $start)
3619
3620
3621 ####################################################################################################
3622
3623
3624 start=$(date +%s)
3625 STEPS_COUNTER=$((STEPS_COUNTER+1))
3626 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE UNUSED OVERCLOUD GLANCE IMAGES (+$elapsed_time_seconds `date '+%T'`)${NC}"
3627 used_overcloud_image=$(source ~/stackrc && openstack server list --long -c "Image ID" -f value | sort --uniq | paste -sd '|')
3628 unused_overcloud_images=$(source ~/stackrc && openstack image list -f value | grep overcloud-full_[0-9] | grep -E -v "$used_overcloud_image")
3629 unused_overcloud_images_parsed=$(echo -e "$unused_overcloud_images" | awk '{print $2}' | paste -sd' ')
3630 if [[ $unused_overcloud_images ]]
3631 then
3632 echo -e "${RED}$unused_overcloud_images\n\n${MAGENTA}it is safe to delete the above image(s) in order to free required disk space from the / partition in the UC\nfrom the undercloud vm: source ~/stackrc && openstack image delete $unused_overcloud_images_parsed${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3633 else
3634 echo -e "${GREEN}no unused overcloud images${NC}"
3635 fi
3636 elapsed_time_seconds=$(expr $(date +%s) - $start)
3637
3638
3639 ####################################################################################################
3640
3641
3642 start=$(date +%s)
3643 STEPS_COUNTER=$((STEPS_COUNTER+1))
3644 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE NO DOCKER CONTAINER IS USING ALL THE COMPUTE CPUS (SHOULD ONLY USE THE ISOLATED CPUS) (+$elapsed_time_seconds `date '+%T'`)${NC}"
3645 UNRESTRAINED_CONTAINERS=$(ansible compute -b -m shell -a "docker ps | awk '{print \$NF}' | xargs -i docker inspect {} | grep -E '\"Name\": \"/|\"CpusetCpus\": \"\",' | grep '\"CpusetCpus\": \"\",' -B 1" | grep -v 'Error: No such object: NAMES')
3646 if [[ $UNRESTRAINED_CONTAINERS ]]
3647 then
3648 echo -e "${RED}one or more containers are using all the compute cpus while it should only use the isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3649 else
3650 echo -e "${GREEN}couldn't find containers that using all the compute cpus${NC}"
3651 fi
3652 elapsed_time_seconds=$(expr $(date +%s) - $start)
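 ### NOTE: illustrative sketch only, not called by this script. "docker inspect" can print the container
 ### name and its CpusetCpus pinning directly with a Go template, which makes the offending containers
 ### easy to list per compute:
 # ansible compute -b -m shell -a "docker ps -q | xargs -r docker inspect --format '{{.Name}} {{.HostConfig.CpusetCpus}}' | awk '\$2 == \"\"'"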
3653
3654
3655 ####################################################################################################
3656
3657
3658 start=$(date +%s)
3659 STEPS_COUNTER=$((STEPS_COUNTER+1))
3660 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE AMOUNT OF HOSTS PRESENTED BY ANSIBLE EQUALS TO THE AMOUNT OF HOSTS PRESENTED BY NOVA (+$elapsed_time_seconds `date '+%T'`)${NC}"
3661 if [[ $ansible_all_hosts_count != $nova_overcloud_and_undercloud_hosts_count ]]
3662 then
3663 echo -e "${RED}openstack server list overcloud + undercloud vm hosts count is ($nova_overcloud_and_undercloud_hosts_count) while there are ($ansible_all_hosts_count) hosts under /etc/ansible/hosts${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3664 else
3665 echo -e "${GREEN}/etc/ansible/hosts and openstack server list has the same hosts count${NC}"
3666 fi
3667 elapsed_time_seconds=$(expr $(date +%s) - $start)
3668
3669
3670 ####################################################################################################
3671
3672
3673 if [[ $nuage == "true" ]]
3674 then
3675 start=$(date +%s)
3676 STEPS_COUNTER=$((STEPS_COUNTER+1))
3677 echo -e "${BLUE}\n\n$STEPS_COUNTER) MONITOR THE NUAGE SERVICE ON THE VSD $vsd_ip (+$elapsed_time_seconds `date '+%T'`)${NC}"
3678 monit_summary=$(sshpass -p 'Alcateldc' ssh root@$vsd_ip monit summary | grep -E -v 'Running|Accessible|Status ok')
3679 if [[ $monit_summary ]]
3680 then
3681 echo -e "${RED}$monit_summary${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3682 else
3683 echo -e "${GREEN}monit summary returned Running, Accessible and Status ok as expected${NC}"
3684 fi
3685 fi
3686 elapsed_time_seconds=$(expr $(date +%s) - $start)
3687
3688
3689
3690 ####################################################################################################
3691
3692
3693 if [[ $nuage == "true" ]]
3694 then
3695 start=$(date +%s)
3696 STEPS_COUNTER=$((STEPS_COUNTER+1))
3697 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE EJABBERED LICENSE ON THE VSD $vsd_ip (+$elapsed_time_seconds `date '+%T'`)${NC}"
3698 ejabbered_license=$(sshpass -p 'Alcateldc' ssh root@$vsd_ip /opt/ejabberd/bin/ejabberdctl license_info | grep expired)
3699 if [[ $ejabbered_license ]]
3700 then
3701 ejabbered_license=$(sshpass -p 'Alcateldc' ssh root@$vsd_ip /opt/ejabberd/bin/ejabberdctl license_info)
3702 echo -e "${RED}$ejabbered_license${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3703 else
3704 echo -e "${GREEN}the ejabbered license is still valid${NC}"
3705 fi
3706 elapsed_time_seconds=$(expr $(date +%s) - $start)
3707 fi
3708
3709
3710 ####################################################################################################
3711
3712
3713 start=$(date +%s)
3714 STEPS_COUNTER=$((STEPS_COUNTER+1))
3715 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE OVERCLOUD STACK STATUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3716 stack_status=$(source ~/stackrc && openstack stack list -c 'Stack Status' | grep COMPLETE | awk '{print $2}')
3717 if [[ $stack_status ]]
3718 then
3719 echo -e "${GREEN}overcloud stack status is $stack_status${NC}"
3720 else
3721 stack_status=$(source ~/stackrc && openstack stack list)
3722 echo -e "${RED}$stack_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3723 fi
3724 elapsed_time_seconds=$(expr $(date +%s) - $start)
3725
3726
3727 ####################################################################################################
3728
3729
3730 start=$(date +%s)
3731 STEPS_COUNTER=$((STEPS_COUNTER+1))
3732 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE SYSTEM IS CONFIGURED WITH THE EXPECTED NUMBER OF FAST OSDS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3733 if [[ $hci == "false" && $fast_pools == "true" ]]
3734 then
3735 fast_disks=$(cat user_config.yaml | grep fast_pool_device: -A6 | grep -c /dev/)
3736 expected_fast_osds=$(expr $ansible_storage_hosts_count \* $fast_disks)
3737 current_fast_osds=$(ansible $last_index_controller -b -m shell -a "ceph osd tree | sed -n -e '/root fast/,/root common/ p' | grep -c osd\." | grep ^[0-9])
3738 if [[ $expected_fast_osds == $current_fast_osds ]]
3739 then
3740 echo -e "${GREEN}found $expected_fast_osds fast osds as expected${NC}"
3741 else
3742 echo -e "${RED}expected $expected_fast_osds fast osds but received $current_fast_osds osds - execute \"ceph osd tree\" in one of the storage-nodes to check for inconsistencies" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3743 echo -e "\n\n${ORANGE}CBIS-16402 (19A) - OSDs are not configured for sdaa+ disks{NC}"
3744 fi
3745 elif [[ $ceph_backend == "true" && $hci == "false" && $fast_pools == "false" ]]
3746 then
3747 echo -e "${ORANGE}the setup is configured with multi-pools${NC}"
3748 elif [[ $ceph_backend == "true" && $hci == "true" ]]
3749 then
3750 echo -e "${ORANGE}the setup is configured with hci${NC}"
3751 elif [[ $ceph_backend == "false" ]]
3752 then
3753 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3754 fi
3755 elapsed_time_seconds=$(expr $(date +%s) - $start)
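 ### For reference, the expected count above is simply <number of storage nodes> * <fast_pool_device
 ### entries per node from user_config.yaml>, e.g. 3 storage nodes with 4 fast_pool devices each -> 12 fast osds.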
3756
3757
3758 ####################################################################################################
3759
3760
3761 start=$(date +%s)
3762 STEPS_COUNTER=$((STEPS_COUNTER+1))
3763 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THERE IS NO MIX OF BLOCK STORAGE TYPES (SSD, HDD and ETC..) BETWEEN THE CEPH FAST/COMMON POOLS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3764 if [[ $ceph_backend == "true" && $fast_pools == "true" ]]
3765 then
3766 echo -e "${CYAN}fast-pools osds check${NC}"
3767 osds=$(ansible $last_index_controller -b -m shell -a "ceph osd tree -f json | jq .nodes[].name" | awk '/fast/,/common/' | grep osd\. | tr -d '"' | paste -sd "|")
3768 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree | grep -E -w '$osds' | awk '{print \$2}' | sort --uniq | wc -l" | grep ^[0-9])
3769 if [[ $osds_class != "1" ]]
3770 then
3771 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/fast/,/common/')
3772 echo -e "${RED}$osds_class${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3773 else
3774 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/fast/,/common/' | grep osd\. | awk '{print $2}' | sort --uniq)
3775 echo -e "${GREEN}all fast osds using the same block storage device ($osds_class)${NC}"
3776 fi
3777 echo -e "${CYAN}common-pools osds check${NC}"
3778 osds=$(ansible $last_index_controller -b -m shell -a "ceph osd tree -f json | jq .nodes[].name" | awk '/common/,0' | grep osd\. | tr -d '"' | paste -sd "|")
3779 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree | grep -E -w '$osds' | awk '{print \$2}' | sort --uniq | wc -l" | grep ^[0-9])
3780 if [[ $osds_class != "1" ]]
3781 then
3782 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/common/,0')
3783 echo -e "${RED}$osds_class${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3784 else
3785 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/common/,0' | grep osd\. | awk '{print $2}' | sort --uniq)
3786 echo -e "${GREEN}all common osds using the same block storage device ($osds_class)${NC}"
3787 fi
3788 echo -e "${CYAN}common-pools osds check${NC}"
3789 osds=$(ansible $last_index_controller -b -m shell -a "ceph osd tree -f json | jq .nodes[].name" | awk '/common/,0' | grep osd\. | tr -d '"' | paste -sd "|")
3790 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree | grep -E -w '$osds' | awk '{print \$2}' | sort --uniq | wc -l" | grep ^[0-9])
3791 if [[ $osds_class != "1" ]]
3792 then
3793 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/common/,0')
3794 echo -e "${RED}$osds_class${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3795 else
3796 osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/common/,0' | grep osd\. | awk '{print $2}' | sort --uniq)
3797 echo -e "${GREEN}all common osds using the same block storage device ($osds_class)${NC}"
3798 fi
3799 elif [[ $ceph_backend == "true" && $hci == "false" && $fast_pools == "false" ]]
3800 then
3801 echo -e "${ORANGE}the setup is configured with multi-pools${NC}"
3802 elif [[ $ceph_backend == "true" && $hci == "true" ]]
3803 then
3804 echo -e "${ORANGE}the setup is configured with hci${NC}"
3805 elif [[ $ceph_backend == "false" ]]
3806 then
3807 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3808 fi
3809 elapsed_time_seconds=$(expr $(date +%s) - $start)
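 ### NOTE: illustrative sketch only, not called by this script. With a recent ceph release the device
 ### class of every osd is also available as json, so the class spread can be summarised in one go:
 # ansible $last_index_controller -b -m shell -a "ceph osd tree -f json | jq -r '.nodes[] | select(.type == \"osd\") | .device_class' | sort | uniq -c"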
3810
3811
3812 ####################################################################################################
3813
3814
3815 start=$(date +%s)
3816 STEPS_COUNTER=$((STEPS_COUNTER+1))
3817 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE FAST POOL OSD'S BLOCK STORAGE TYPE ARE SSD/NVME (+$elapsed_time_seconds `date '+%T'`)${NC}"
3818 if [[ $ceph_backend == "true" && $fast_pools == "true" ]]
3819 then
3820 fast_osds_class=$(ansible $last_index_controller -b -m shell -a "ceph osd tree" | awk '/fast/,/common/' | grep osd\. | awk '{print $2}' | grep -v ssd | grep -v nvme | sort --uniq)
3821 if [[ -z $fast_osds_class ]]
3822 then
3823 echo -e "${GREEN}all fast osds using the same block storage device ($osds_class)${NC}"
3824 else
3825 echo -e "${RED}one or more osds under the fast-pool are from $fast_osds_class type while expecting the fast-pool osds to be from type nvme/ssd${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3826 fi
3827 elif [[ $ceph_backend == "true" && $hci == "false" && $fast_pools == "false" ]]
3828 then
3829 echo -e "${ORANGE}the setup is configured with multi-pools${NC}"
3830 elif [[ $ceph_backend == "true" && $hci == "true" ]]
3831 then
3832 echo -e "${ORANGE}the setup is configured with hci${NC}"
3833 elif [[ $ceph_backend == "false" ]]
3834 then
3835 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3836 fi
3837 elapsed_time_seconds=$(expr $(date +%s) - $start)
3838
3839
3840 ####################################################################################################
3841
3842
3843 start=$(date +%s)
3844 STEPS_COUNTER=$((STEPS_COUNTER+1))
3845 echo -e "${BLUE}\n\n$STEPS_COUNTER) CEPH OSDS IN/UP STATUS CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
3846 if [[ $cbis_version != "18.0.0.1" ]]
3847 then
3848 if [[ $ceph_backend == "true" ]]
3849 then
3850 osds_total=$(ansible $last_index_controller -b -m shell -a "ceph -s" | grep osd: | awk '{print $2}')
3851 osds_up=$(ansible $last_index_controller -b -m shell -a "ceph -s" | grep osd: | awk '{print $4}')
3852 osds_in=$(ansible $last_index_controller -b -m shell -a "ceph -s" | grep osd: | awk '{print $6}')
3853 if [[ $osds_total == $osds_up && $osds_total == $osds_in ]]
3854 then
3855 echo -e "${GREEN}all $osds_total osds are in and up${NC}"
3856 else
3857 ceph_status=$(ansible $last_index_controller -b -m shell -a "ceph -s")
3858 echo -e "${RED}found osds inconsistencies:${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3859 echo -e "${RED}total osds: $osds_total | osds up: $osds_up | osds in: $osds_in${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3860
3861 echo -e "\n${RED}$ceph_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3862 fi
3863 else
3864 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3865 fi
3866 elif [[ $cbis_version == "18.0.0.1" ]]
3867 then
3868 if [[ $ceph_backend == "true" ]]
3869 then
3870 osds_total=$(ansible $last_index_controller -b -m shell -a "ceph -s | grep osdmap" | grep -v $last_index_controller | awk '{print $3}')
3871 osds_up=$(ansible $last_index_controller -b -m shell -a "ceph -s | grep osdmap" | grep -v $last_index_controller | awk '{print $5}')
3872 osds_in=$(ansible $last_index_controller -b -m shell -a "ceph -s | grep osdmap" | grep -v $last_index_controller | awk '{print $7}')
3873 if [[ $osds_total == $osds_up && $osds_total == $osds_in ]]
3874 then
3875 echo -e "${GREEN}all $osds_total osds are in and up${NC}"
3876 else
3877 echo -e "${RED}found osds inconsistencies${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3878 echo -e "${RED}$osds_total total osds, $osds_up up osds, $osds_in in osds${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3879 ceph_status=$(ansible $last_index_controller -b -m shell -a "ceph -s")
3880 echo -e "${RED}$ceph_status${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3881 fi
3882 else
3883 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3884 fi
3885 fi
3886 elapsed_time_seconds=$(expr $(date +%s) - $start)
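 ### NOTE: illustrative sketch only, not called by this script. Newer ceph releases can also report the
 ### osd totals as json (field names vary slightly between releases), which avoids parsing "ceph -s":
 # ansible $last_index_controller -b -m shell -a "ceph osd stat -f json" | grep num_osds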
3887
3888
3889 ####################################################################################################
3890
3891
3892 start=$(date +%s)
3893 STEPS_COUNTER=$((STEPS_COUNTER+1))
3894 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK EACH HCI COMPUTE/STORAGE NODE HAS THE SAME NUMBER OF OSDS (+$elapsed_time_seconds `date '+%T'`)${NC}"
3895 if [[ $hci == "true" ]]
3896 then
3897 ceph_osds_count_per_server=$(ansible compute -b -m shell -a "docker ps | grep ceph-osd | wc -l" | grep ^[0-9] | sort -u | wc -l)
3898 if [[ $ceph_osds_count_per_server == "1" ]]
3899 then
3900 echo -e "${GREEN}all the servers has the same number of osds docker containers${NC}"
3901 else
3902 ceph_osds_count_per_server=$(ansible compute -b -m shell -a "docker ps | grep ceph-osd | wc -l")
3903 echo -e "${RED}$ceph_osds_count_per_server${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3904 fi
3905 elif [[ $hci == "false" && $ceph_backend == "true" ]]
3906 then
3907 ceph_osds_count_per_server=$(ansible cephstorage -b -m shell -a "docker ps | grep ceph-osd | wc -l" | grep ^[0-9] | sort -u | wc -l)
3908 if [[ $ceph_osds_count_per_server == "1" ]]
3909 then
3910 echo -e "${GREEN}all the servers has the same number of osds docker containers${NC}"
3911 else
3912 ceph_osds_count_per_server=$(ansible cephstorage -b -m shell -a "docker ps | grep ceph-osd | wc -l")
3913 echo -e "${RED}$ceph_osds_count_per_server${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3914 fi
3915 elif [[ $ceph_backend == "false" ]]
3916 then
3917 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3918 fi
3919 elapsed_time_seconds=$(expr $(date +%s) - $start)
3920
3921
3922 ####################################################################################################
3923
3924
3925 start=$(date +%s)
3926 STEPS_COUNTER=$((STEPS_COUNTER+1))
3927 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CEPH SLOW REQUESTS IN /var/log/messages /var/log/ceph/(+$elapsed_time_seconds `date '+%T'`)"
3928 if [[ $ceph_backend == "true" ]]
3929 then
3930 ceph_slow_requests=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "grep -i -R -E 'REQUEST_SLOW|slow requests' /var/log/messages /var/log/ceph/ | grep -v ansible-command" | grep -i -E 'REQUEST_SLOW|slow requests' -B 1)
3931 if [[ $ceph_slow_requests ]]
3932 then
3933 echo -e "${RED}$ceph_slow_requests${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3934 else
3935
3936 echo -e "${GREEN}no ceph slow requests in /var/log/messages and /var/log/ceph/ are found${NC}"
3937 fi
3938 else
3939 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3940 fi
3941 elapsed_time_seconds=$(expr $(date +%s) - $start)
3942
3943
3944 ####################################################################################################
3945
3946
3947 start=$(date +%s)
3948 STEPS_COUNTER=$((STEPS_COUNTER+1))
3949 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT CEPH FSID IS IDENTICAL BETWEEN ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)"
3950 if [[ $ceph_backend == "true" ]]
3951 then
3952 oc_fsid=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ceph fsid" | grep ^[0-9,a-f] | sort -u)
3953 uc_fsid=$(grep CephClusterFSID templates/storage-environment.yaml | awk '{print $2}' | tr -d \')
3954 if [[ $oc_fsid == $uc_fsid ]]
3955 then
3956 echo -e "${GREEN}all the hosts has the same ceph fsid ($uc_fsid)${NC}"
3957 else
3958 echo -e "${RED}ceph fsid presented in templates/storage-environment.yaml (undercloud vm):\n($uc_fsid)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3959 echo -e "\n${RED}ceph fsid result of the ceph fsid command (overcloud):\n($oc_fsid)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
3960 echo -e "\n\n${ORANGE}CBIS-15830 (19A) - Ceph returns failed to bind the UNIX domain socket warning ${NC}"
3961 fi
3962 else
3963 echo -e "${ORANGE}the setup is deployed without ceph backend${NC}"
3964 fi
3965 elapsed_time_seconds=$(expr $(date +%s) - $start)
3966
3967
3968 ####################################################################################################
3969
3970
3971 start=$(date +%s)
3972 STEPS_COUNTER=$((STEPS_COUNTER+1))
3973 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE NOVA LOCAL STORAGE PARTITION IS CONFIGURED (df) (+$elapsed_time_seconds `date '+%T'`)"
3974 if [[ $ovs_local_storage == "true" || $avrs_local_storage == "true" || $sriov_local_storage == "true" || $dpdk_local_storage == "true" ]]
3975 then
3976 if [[ $ovs_local_storage == "true" ]]
3977 then
3978 echo -e "${CYAN}checking OvsCompute partitions${NC}"
3979 nova_local_storage=$(ansible *overcloud-[oO]vs* -b -m shell -a "df | grep /var/lib/nova/instances")
3980 if [[ $nova_local_storage ]]
3981 then
3982 echo -e "${GREEN}$nova_local_storage${NC}"
3983 else
3984 echo -e "${GREEN}while the compute host-group is configured with local storage, the partition /var/lib/nova/instances is missing${NC}"
3985 fi
3986 fi
3987 if [[ $avrs_local_storage == "true" ]]
3988 then
3989 echo -e "${CYAN}checking AvrsCompute partitions${NC}"
3990 nova_local_storage=$(ansible *overcloud-[aA]vrs* -b -m shell -a "df | grep /var/lib/nova/instances")
3991 if [[ $nova_local_storage ]]
3992 then
3993 echo -e "${GREEN}$nova_local_storage${NC}"
3994 else
3995 echo -e "${GREEN}while the compute host-group is configured with local storage, the partition /var/lib/nova/instances is missing${NC}"
3996 fi
3997 fi
3998 if [[ $sriov_local_storage == "true" ]]
3999 then
4000 echo -e "${CYAN}checking SriovPerformanceCompute partitions${NC}"
4001 nova_local_storage=$(ansible *overcloud-[sS]riov* -b -m shell -a "df | grep /var/lib/nova/instances")
4002 if [[ $nova_local_storage ]]
4003 then
4004 echo -e "${GREEN}$nova_local_storage${NC}"
4005 else
4006 echo -e "${GREEN}while the compute host-group is configured with local storage, the partition /var/lib/nova/instances is missing${NC}"
4007 fi
4008 fi
4009 if [[ $dpdk_local_storage == "true" ]]
4010 then
4011 echo -e "${CYAN}checking SriovPerformanceCompute partitions${NC}"
4012 nova_local_storage=$(ansible *overcloud-[sS]riov* -b -m shell -a "df | grep /var/lib/nova/instances")
4013 if [[ $nova_local_storage ]]
4014 then
4015 echo -e "${GREEN}$nova_local_storage${NC}"
4016 else
4017 echo -e "${GREEN}while the compute host-group is configured with local storage, the partition /var/lib/nova/instances is missing${NC}"
4018 fi
4019 fi
4020 else
4021 echo -e "${ORANGE}no host with nova local storage enabled is found${NC}"
4022 fi
4023 elapsed_time_seconds=$(expr $(date +%s) - $start)
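 ### NOTE: illustrative sketch only, not called by this script. The four blocks above repeat the same df
 ### probe; a small loop over "<flag>:<ansible host pattern>" pairs would cover all host-groups at once:
 # for pair in "$ovs_local_storage:*overcloud-[oO]vs*" "$avrs_local_storage:*overcloud-[aA]vrs*" \
 #             "$sriov_local_storage:*overcloud-[sS]riov*" "$dpdk_local_storage:*overcloud-[dD]pdk*"
 # do
 #  [[ ${pair%%:*} == "true" ]] || continue
 #  ansible "${pair#*:}" -b -m shell -a "df | grep /var/lib/nova/instances"
 # done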
4024
4025
4026 ####################################################################################################
4027
4028
4029 start=$(date +%s)
4030 STEPS_COUNTER=$((STEPS_COUNTER+1))
4031 EXCEPTION=0
4032 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE INSTANCES LEFTOVERS IN /var/lib/nova/instances/ OF THE COMPUTES (+$elapsed_time_seconds `date '+%T'`)${NC}"
4033 for host in $ansible_computes_hosts
4034 do
4035 instances_id_dir=$(ansible $host -b -m shell -a "ls /var/lib/nova/instances/ | awk '{print \$NF}'" | grep '^[0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]-')
4036 for instance_id_dir in $instances_id_dir
4037 do
4038 check_if_id_in_nova=$(echo -e "$nova_instances" | grep $instance_id_dir)
4039 if [[ -z $check_if_id_in_nova ]]
4040 then
4041 echo -e "${RED}/var/lib/nova/instances/$instance_id_dir is not found in openstack server list${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4042 EXCEPTION=$((EXCEPTION+1))
4043 else
4044 echo -e "${GREEN}/var/lib/nova/instances/$instance_id_dir is found in openstack server list${NC}"
4045 fi
4046 done
4047 done
4048 if [ $EXCEPTION -gt 0 ]
4049 then
4050 echo -e "\n\n${ORANGE}CBIS-16393 (19A) - /var/lib/nova/instances/<instance> is not deleted after the instance was migrated${NC}"
4051 fi
4052 elapsed_time_seconds=$(expr $(date +%s) - $start)
4053
4054
4055 ####################################################################################################
4056
4057 start=$(date +%s)
4058 STEPS_COUNTER=$((STEPS_COUNTER+1))
4059 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT EACH HOST-GROUP HAS THE SAME PARTITIONS (df) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4060 echo -e "${CYAN}check the controllers${NC}"
4061 partitions=$(ansible controller -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4062 if [[ $partitions == "1" ]]
4063 then
4064 echo -e "${GREEN}all the controllers has the same partitions${NC}"
4065 else
4066 partitions=$(ansible controller -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|md[1][2][6-7]'")
4067 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4068 echo -e "\n\n${ORANGE}CBIS-16369 (19A) - mount_cephfs_share is not configured on replaced controllers${NC}"
4069 fi
4070 if [[ $ansible_sriov_hosts_count != "0" ]]
4071 then
4072 echo -e "${CYAN}check sriov computes${NC}"
4073 partitions=$(ansible *overcloud-[sS]riov* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|/var/lib/nova/instances|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4074 if [[ $partitions == "1" ]]
4075 then
4076 echo -e "${GREEN}all the sriov computes has the same partitions${NC}"
4077 else
4078 partitions=$(ansible *overcloud-[sS]riov* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4079 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4080 fi
4081 fi
4082 if [[ $ansible_ovs_hosts_count != "0" ]]
4083 then
4084 echo -e "${CYAN}check ovs computes${NC}"
4085 partitions=$(ansible *overcloud-[oO]vs* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|/var/lib/nova/instances|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4086 if [[ $partitions == "1" ]]
4087 then
4088 echo -e "${GREEN}all the ovs computes has the same partitions${NC}"
4089 else
4090 partitions=$(ansible *overcloud-[oO]vs* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4091 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4092 fi
4093 fi
4094 if [[ $ansible_dpdk_hosts_count != "0" ]]
4095 then
4096 echo -e "${CYAN}check dpdk computes${NC}"
4097 partitions=$(ansible *overcloud-[dD]pdk* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|/var/lib/nova/instances|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4098 if [[ $partitions == "1" ]]
4099 then
4100 echo -e "${GREEN}all the dpdk computes has the same partitions${NC}"
4101 else
4102 partitions=$(ansible *overcloud-[dD]pdk* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4103 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4104 fi
4105 fi
4106 if [[ $ansible_avrs_hosts_count != "0" ]]
4107 then
4108 echo -e "${CYAN}check avrs computes${NC}"
4109 partitions=$(ansible *overcloud-[aA]vrs* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|/var/lib/nova/instances|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4110 if [[ $partitions == "1" ]]
4111 then
4112 echo -e "${GREEN}all the avrs computes has the same partitions${NC}"
4113 else
4114 partitions=$(ansible *overcloud-[aA]vrs* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4115 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4116 fi
4117 fi
4118 if [[ $ansible_storage_hosts_count != "0" ]]
4119 then
4120 echo -e "${CYAN}check the storage nodes${NC}"
4121 partitions=$(ansible *overcloud-[Ss]torage* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4122 if [[ $partitions == "1" ]]
4123 then
4124 echo -e "${GREEN}all the storage nodes has the same partitions${NC}"
4125 else
4126 partitions=$(ansible *overcloud-[Ss]torage* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|ceph|md[1][2][6-7]'")
4127 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4128 fi
4129 fi
4130 if [[ $ansible_monitoring_hosts_count != "0" ]]
4131 then
4132 echo -e "${CYAN}check the monitoring nodes${NC}"
4133 partitions=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|md[1][2][6-7]' | sha1sum" | grep ^[0-9a-f] | sort -u | wc -l)
4134 if [[ $partitions == "1" ]]
4135 then
4136 echo -e "${GREEN}all the monitoring nodes has the same partitions${NC}"
4137 else
4138 partitions=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "df --sync --output=source,target | grep -E -v 'docker|/run/user/|md[1][2][6-7]'")
4139 echo -e "${RED}$partitions${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4140 fi
4141 fi
4142 elapsed_time_seconds=$(expr $(date +%s) - $start)
4143
4144
4145 ####################################################################################################
4146
4147
4148 start=$(date +%s)
4149 STEPS_COUNTER=$((STEPS_COUNTER+1))
4150 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT EACH HOST-GROUP HAS THE SAME AMOUNT OF PHYSICAL DISKS (lsblk) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4151 echo -e "${CYAN}check the controllers${NC}"
4152 disks_count=$(ansible controller -b -m shell -a "lsblk | grep disk | wc -l" | grep ^[0-9] | sort -u | wc -l)
4153 if [[ $disks_count == "1" ]]
4154 then
4155 echo -e "${GREEN}same number of disks for all the controllers${NC}"
4156 else
4157 disks=$(ansible controller -b -m shell -a "lsblk | grep disk" | grep ^s | awk '{print $1}' | sort | uniq -c | column -t | sort -k1 | grep -v ^$ansible_storage_hosts_count | awk '{print $2}' | tr -d \n | paste -sd ' ' | tr -s ' ' '|')
4158 missing_disks=$(ansible controller -b -m shell -a "lsblk -d | grep -E '$disks'")
4159 echo -e "${RED}$missing_disks${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4160 fi
4161 echo -e "${CYAN}check the computes${NC}"
4162 disks_count=$(ansible compute -b -m shell -a "lsblk | grep disk | wc -l" | grep ^[0-9] | sort -u | wc -l)
4163 if [[ $disks_count == "1" ]]
4164 then
4165 echo -e "${GREEN}same number of disks for all the computes${NC}"
4166 else
4167 disks=$(ansible compute -b -m shell -a "lsblk | grep disk" | grep ^s | awk '{print $1}' | sort | uniq -c | column -t | sort -k1 | grep -v ^$ansible_storage_hosts_count | awk '{print $2}' | tr -d \n | paste -sd ' ' | tr -s ' ' '|')
4168 missing_disks=$(ansible compute -b -m shell -a "lsblk -d | grep -E '$disks'")
4169 echo -e "${RED}$missing_disks${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4170 fi
4171 if [[ $ansible_storage_hosts_count != "0" ]]
4172 then
4173 echo -e "${CYAN}check the storage nodes${NC}"
4174 disks_count=$(ansible *overcloud-[Ss]torage* -b -m shell -a "lsblk | grep disk | wc -l" | grep ^[0-9] | sort -u | wc -l)
4175 if [[ $disks_count == "1" ]]
4176 then
4177 echo -e "${GREEN}same number of disks for all the storage nodes${NC}"
4178 else
4179 disks=$(ansible *overcloud-[Ss]torage* -b -m shell -a "lsblk | grep disk" | grep ^s | awk '{print $1}' | sort | uniq -c | column -t | sort -k1 | grep -v ^$ansible_storage_hosts_count | awk '{print $2}' | tr -d \n | paste -sd ' ' | tr -s ' ' '|')
4180 missing_disks=$(ansible *overcloud-[Ss]torage* -b -m shell -a "lsblk -d | grep -E '$disks'")
4181 echo -e "${RED}$missing_disks${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4182 fi
4183 fi
4184 if [[ $ansible_monitoring_hosts_count != "0" ]]
4185 then
4186 echo -e "${CYAN}check the monitoring nodes${NC}"
4187 disks_count=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "lsblk | grep disk | wc -l" | grep ^[0-9] | sort -u | wc -l)
4188 if [[ $disks_count == "1" ]]
4189 then
4190 echo -e "${GREEN}same number of disks for all the monitoring nodes${NC}"
4191 else
4192 disks=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "lsblk | grep disk" | grep ^s | awk '{print $1}' | sort | uniq -c | column -t | sort -k1 | grep -v ^$ansible_storage_hosts_count | awk '{print $2}' | tr -d \n | paste -sd ' ' | tr -s ' ' '|')
4193 missing_disks=$(ansible *overcloud-[Mm]onitoring* -b -m shell -a "lsblk -d | grep -E '$disks'")
4194 echo -e "${RED}$missing_disks${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4195 fi
4196 fi
4197 elapsed_time_seconds=$(expr $(date +%s) - $start)
4198
4199
4200 ####################################################################################################
4201
4202
4203 start=$(date +%s)
4204 STEPS_COUNTER=$((STEPS_COUNTER+1))
4205 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE DISKS OF THE CONTROLLERS AND COMPUTES ARE ARRANGED/ORDERED THE SAME WAY (+$elapsed_time_seconds `date '+%T'`)${NC}"
4206 disk_order=$(ansible controller,compute -b -m shell -a "lsblk -dn | awk '{print \$1}'| md5sum" | grep ^[0-9a-f] | awk '{print $1}' | uniq | wc -l)
4207 if [[ $disk_order != "1" ]]
4208 then
4209 disk_order=$(ansible controller,compute -b -m shell -a "lsblk -dn")
4210 echo -e "${RED}$disk_order${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4211 else
4212 echo -e "${GREEN}all the disks are ordered the same way${NC}"
4213 fi
4214 elapsed_time_seconds=$(expr $(date +%s) - $start)
4215
4216
4217 ####################################################################################################
4218
4219
4220 start=$(date +%s)
4221 STEPS_COUNTER=$((STEPS_COUNTER+1))
4222 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE MEGARAID CONFIGURATION BETWEEN THE STORAGE NODES (+$elapsed_time_seconds `date '+%T'`)${NC}"
4223 if [[ $ansible_storage_hosts_count != "0" ]]
4224 then
4225 raid_mismatch=$(ansible CephStorage -b -m shell -a "/opt/MegaRAID/storcli/storcli64 /c0/vall show" | grep RAID | awk '{print $1,$2}' | sort | uniq -c | column -t | grep -v ^$ansible_storage_hosts_count)
4226 if [[ $raid_mismatch ]]
4227 then
4228 raid_mismatch=$(ansible CephStorage -b -m shell -a "/opt/MegaRAID/storcli/storcli64 /c0/vall show | grep RAID")
4229 echo -e "${RED}$raid_mismatch${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4230 else
4231 echo -e "${GREEN}the storage nodes megaraid configuration is aligned${NC}"
4232 fi
4233 else
4234 echo -e "${ORANGE}no storage nodes found${NC}"
4235 fi
4236 elapsed_time_seconds=$(expr $(date +%s) - $start)
4237
4238
4239 ####################################################################################################
4240
4241
4242 start=$(date +%s)
4243 STEPS_COUNTER=$((STEPS_COUNTER+1))
4244 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE SOFTWARE RAID STATUS OF ANY SERVER THAT IS CONFIGURED WITH SOFTWARE RAID (+$elapsed_time_seconds `date '+%T'`)${NC}"
4245 swraid1_servers=$(ansible all --limit '!localhost,!hypervisor' -m shell -b -a "test -f /var/log/cbis/raid_setup_inital_setup.log; echo \$?" | grep ^0 -B 1 | grep ^overcloud- | awk '{print $1}' | paste -sd',')
4246 if [[ $swraid1_servers ]]
4247 then
4248 servers=$(echo -e "$swraid1_servers" | tr -s , '\n')
4249 echo -e "${CYAN}servers configured with software raid 1:\n$servers${NC}\n\n"
4250 swraid1_failed_servers=$(ansible $swraid1_servers -m shell -b -a "cat /proc/mdstat | grep -c '\[UU\]'" | grep '^[0-1]\|^[3-9]' -B 1 | grep ^overcloud- | awk '{print $1}' | paste -sd',')
4251 if [[ $swraid1_failed_servers ]]
4252 then
4253 swraid1_failure_output=$(ansible $swraid1_failed_servers -m shell -b -a "cat /proc/mdstat")
4254 echo -e "${RED}$swraid1_failure_output${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4255 else
4256 echo -e "${GREEN}the software raid status is as expected on all the servres that are configured with software raid${NC}"
4257 fi
4258 else
4259 echo -e "${ORANGE}no server is configured with software raid - please investigate if that shouldn't be the case${NC}"
4260 fi
4261 elapsed_time_seconds=$(expr $(date +%s) - $start)
4262
4263
4264 ####################################################################################################
4265
4266
4267 start=$(date +%s)
4268 STEPS_COUNTER=$((STEPS_COUNTER+1))
4269 EXCEPTION=0
4270 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT EACH HOSTS HAS THE EXPECTED HYPERVISOR ISOLATED CPUS (+$elapsed_time_seconds `date '+%T'`)${NC}"
4271 if [[ $ansible_avrs_hosts_count != "0" && $nuage == "true" ]]
4272 then
4273 echo -e "${CYAN}now checking AvrsCompute${NC}"
4274 physcpubind=$(ansible *overcloud-[aA]vrs* -b -m shell -a "numactl --show" | grep physcpubind | tr -s ' ' '\n' | grep ^[0-9] | sort --uniq | wc -l)
4275 if [[ $physcpubind == $avrs_hypervisor_dedicated_cpus ]]
4276 then
4277 echo -e "${GREEN}found the expected number of isolated cpus${NC}"
4278 else
4279 echo -e "${RED}according to the user_config.yaml $avrs_hypervisor_dedicated_cpus isolated cpus are expected while numactl --show returned $physcpubind isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4280 EXCEPTION=$((EXCEPTION+1))
4281 fi
4282 fi
4283 if [[ $ansible_ovs_hosts_count != "0" ]]
4284 then
4285 echo -e "${CYAN}now checking OvsCompute${NC}"
4286 physcpubind=$(ansible *overcloud-[oO]vs* -b -m shell -a "numactl --show" | grep physcpubind | tr -s ' ' '\n' | grep ^[0-9] | sort --uniq | wc -l)
4287 if [[ $physcpubind == $ovs_hypervisor_dedicated_cpus ]]
4288 then
4289 echo -e "${GREEN}found the expected number of isolated cpus${NC}"
4290 else
4291 echo -e "${RED}according to the user_config.yaml $ovs_hypervisor_dedicated_cpus isolated cpus are expected while numactl --show returned $physcpubind isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4292 EXCEPTION=$((EXCEPTION+1))
4293 fi
4294 fi
4295 if [[ $ansible_sriov_hosts_count != "0" ]]
4296 then
4297 echo -e "${CYAN}now checking SriovPerformanceCompute${NC}"
4298 physcpubind=$(ansible *overcloud-[sS]riov* -b -m shell -a "numactl --show" | grep physcpubind | tr -s ' ' '\n' | grep ^[0-9] | sort --uniq | wc -l)
4299 if [[ $physcpubind == $sriov_hypervisor_dedicated_cpus ]]
4300 then
4301 echo -e "${GREEN}found the expected number of isolated cpus${NC}"
4302 else
4303 echo -e "${RED}according to the user_config.yaml $sriov_hypervisor_dedicated_cpus isolated cpus are expected while numactl --show returned $physcpubind isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4304 EXCEPTION=$((EXCEPTION+1))
4305 fi
4306 fi
4307 if [[ $ansible_dpdk_hosts_count != "0" ]]
4308 then
4309 echo -e "${CYAN}now checking DpdkPerformanceCompute${NC}"
4310 physcpubind=$(ansible *overcloud-[dD]pdk* -b -m shell -a "numactl --show" | grep physcpubind | tr -s ' ' '\n' | grep ^[0-9] | sort --uniq | wc -l)
4311 if [[ $physcpubind == $dpdk_hypervisor_dedicated_cpus ]]
4312 then
4313 echo -e "${GREEN}found the expected number of isolated cpus${NC}"
4314 else
4315 echo -e "${RED}according to the user_config.yaml $dpdk_hypervisor_dedicated_cpus isolated cpus are expected while numactl --show returned $physcpubind isolated cpus${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4316 EXCEPTION=$((EXCEPTION+1))
4317 fi
4318 fi
4319 if [ $EXCEPTION -gt 0 ]
4320 then
4321 echo -e "\n\n${ORANGE}validate that the setup is not deployed with odd number in the hypervisor_dedicated_cpus parameter${NC}"
4322 fi
4323 elapsed_time_seconds=$(expr $(date +%s) - $start)
4324
4325
4326 ####################################################################################################
4327
4328
4329 if [[ $cbis_version != "18.0.0.1" ]]
4330 then
4331 start=$(date +%s)
4332 STEPS_COUNTER=$((STEPS_COUNTER+1))
4333 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE BETWEEN current_node_count AND intended_node_count IN /etc/ansible/hosts (+$elapsed_time_seconds `date '+%T'`)${NC}"
4334 current_node_count=$(cat /etc/ansible/hosts | grep current_node_count | awk -F= '{print $2}')
4335 intended_node_count=$(cat /etc/ansible/hosts | grep intended_node_count | awk -F= '{print $2}')
4336 if [[ $current_node_count == $intended_node_count ]]
4337 then
4338 echo -e "${GREEN}current_node_count ("$current_node_count") and intended_node_count ("$intended_node_count") are identical${NC}"
4339 else
4340 echo -e "${RED}current_node_count ("$current_node_count") and intended_node_count ("$intended_node_count") not identical${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4341 fi
4342 fi
4343 elapsed_time_seconds=$(expr $(date +%s) - $start)
4344
4345
4346 ####################################################################################################
4347
4348
4349 start=$(date +%s)
4350 STEPS_COUNTER=$((STEPS_COUNTER+1))
4351 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE BASIC HEALTH OF THE BMC (+$elapsed_time_seconds `date '+%T'`)${NC}"
4352 selftest=$(ansible all --limit '!localhost' -b -m shell -a "ipmitool mc selftest -v | grep -v -w 'Selftest: passed'" | grep Selftest -B 1)
4353 if [[ $selftest ]]
4354 then
4355 echo -e "${RED}$selftest${NC}"
4356 else
4357 echo -e "${GREEN}BMC selftest passed on all the servers${NC}"
4358 fi
4359 elapsed_time_seconds=$(expr $(date +%s) - $start)
4360
4361
4362 ####################################################################################################
4363
4364
4365 if [[ $cbis_version != "19.0.0.1" ]]
4366 then
4367 start=$(date +%s)
4368 STEPS_COUNTER=$((STEPS_COUNTER+1))
4369 retransmits_count=100
4370 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR HIGH PACKETS RETRANSMISIONS FROM RANDOM STORAGE/COMPUTE TO $storage_controller_address ON $last_index_controller (+$elapsed_time_seconds `date '+%T'`)${NC}"
4371 if [[ $cbis_version != "18.0.0.1" ]]
4372 then
4373 ansible $last_index_controller -b -m shell -a "killall iperf3" | grep -v SUCCESS | tr -d '\n' > /dev/null
4374 a=$(ansible $last_index_controller -b -m shell -a "iptables -S" | grep ACCEPT | awk -F'--dports' '{print $2}' | awk '{print $1}' | tr -s ',' '\n' | tr -s ':' ' ' | sort -n | uniq | grep -v ' ')
4375 b=$(ansible $last_index_controller -b -m shell -a "iptables -S" | grep ACCEPT | awk -F'--dports' '{print $2}' | awk '{print $1}' | tr -s ',' '\n' | tr -s ':' ' ' | sort -n | uniq | grep ' ')
4376 if [[ $b ]]
4377 then
4378 b=$(ansible $last_index_controller -b -m shell -a "iptables -S" | grep ACCEPT | awk -F'--dports' '{print $2}' | awk '{print $1}' | tr -s ',' '\n' | tr -s ':' ' ' | sort -n | uniq | grep ' ' | xargs -L 1 seq)
4379 fi
4380 c="${a} ${b}"
4381 printf "$c" | tr -s ' ' '\n' | sort -n | uniq | grep ^[0-9] > allowed_ports_on_'$last_index_controller'.txt
4382 e=$(ansible $last_index_controller -b -m shell -a "nmap -p 1-65535 127.0.0.1" | grep open | awk -F/ '{print $1}')
4383 f=$(ansible $last_index_controller -b -m shell -a "netstat -tuplen" | awk '{print $4}' | awk -F: '{print $2}' | sort -n | uniq | grep ^[0-9])
4384 g="${e} ${f}"
4385 printf "$g" | tr -s ' ' '\n' | sort -n | uniq | grep ^[0-9] > open_ports_on_'$last_index_controller'.txt
4386 random_unused_whitelist_port=$(diff allowed_ports_on_'$last_index_controller'.txt open_ports_on_'$last_index_controller'.txt | grep \< | awk '{print $2}' | shuf -n 1)
4387 ansible $last_index_controller -b -m shell -a "iperf3 -s -p $random_unused_whitelist_port -D" | grep -v SUCCESS | tr -d '\n'
4388 if [[ $hci == "false" && $ansible_storage_hosts ]]
4389 then
4390 retransmits=$(ansible $random_storage_hostname -b -m shell -a "iperf3 -c $storage_controller_address -p $random_unused_whitelist_port" | grep \/sec | grep -E -v 'sender|receiver' | tail -n+2 | awk -v limit="$retransmits_count" '{ if ( $9 > limit ) print $0 }')
4391 elif [[ $hci == "true" ]]
4392 then
4393 retransmits=$(ansible $random_compute_hostname -b -m shell -a "iperf3 -c $storage_controller_address -p $random_unused_whitelist_port" | grep \/sec | grep -E -v 'sender|receiver' | tail -n+2 | awk -v limit="$retransmits_count" '{ if ( $9 > limit ) print $0 }')
4394 fi
4395 if [[ $retransmits ]]
4396 then
4397 echo -e "${RED}$retransmits${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4398 else
4399 echo -e "${GREEN}the number of retransmits per connection is less then 100${NC}"
4400 fi
4401 ansible $last_index_controller -b -m shell -a "killall iperf3" | grep -v SUCCESS | tr -d '\n'
4402 else
4403 echo -e "${ORANGE}iperf3 is not installed on cbis 18.0.0.1${NC}"
4404 fi
4405 fi
4406 elapsed_time_seconds=$(expr $(date +%s) - $start)
4407
4408
4409 ####################################################################################################
4410
4411
4412 start=$(date +%s)
4413 STEPS_COUNTER=$((STEPS_COUNTER+1))
4414 echo -e "${BLUE}\n\n$STEPS_COUNTER) KIBANA LOCALHOST LOGIN CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
4415 if [[ $elk == "true" && $elk_deployment_type == "local" ]]
4416 then
4417 kibana=$(ansible controller -b -m shell -a "curl -g -s localhost:5601/api/status | jq '.status' warn=False" | grep 'state' | sort | uniq -c | grep ^[[:blank:]]*3 | awk '{print $1}')
4418 if [[ $kibana == "3" ]]
4419 then
4420 echo -e "${GREEN}kibana (localhost:5601/api/status) sucessfully replied from all the controllers${NC}"
4421 else
4422 kibana=$(ansible controller -b -m shell -a "curl -g localhost:5601/api/status warn=False")
4423 echo -e "${RED}$kibana${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4424 fi
4425 else
4426 echo -e "${ORANGE}CBIS is deployed without ELK or ELK type is remote${NC}"
4427 fi
4428 elapsed_time_seconds=$(expr $(date +%s) - $start)
4429
4430
4431 ####################################################################################################
4432
4433
4434 start=$(date +%s)
4435 STEPS_COUNTER=$((STEPS_COUNTER+1))
4436 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE BETWEEN THE HOSTS PRESENTED IN ZABBIX AND THE HOSTS PRESENTED BY OPENSTACK (+$elapsed_time_seconds `date '+%T'`)${NC}"
4437 if [[ $zabbix_hosts != $nova_overcloud_hosts_list ]]
4438 then
4439 echo -e "${RED}openstack overcloud hosts:\n$nova_overcloud_hosts_list\n\nzabbix overcloud hosts:\n$zabbix_hosts${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4440 else
4441 echo -e "${GREEN}the configured overcloud hosts in zabbix are identical to the nova overcloud hosts${NC}"
4442 fi
elapsed_time_seconds=$(expr $(date +%s) - $start)
4443
4444
4445 ####################################################################################################
4446
4447
4448 start=$(date +%s)
4449 STEPS_COUNTER=$((STEPS_COUNTER+1))
4450 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE ZABBIX DISCOVERY RULES WITH UNEXPECTED STATE OR STATUS VALUE(+$elapsed_time_seconds `date '+%T'`)${NC}"
4451 discovery_rules=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4452 -H 'Content-Type: application/json-rpc' \
4453 -H 'Cookie: SERVERID='$last_index_controller'' \
4454 --data '{
4455 "jsonrpc": "2.0",
4456 "method": "discoveryrule.get",
4457 "params": {
4458 "output": "extend",
4459 "sortfield": "name"
4460 },
4461 "auth": '$zabbix_auth',
4462 "id": 1
4463 }')
4464 wrong_status=$(echo -e "$discovery_rules" | tr '\r\n' ' ' | jq .result[] | jq 'select(.status != "0")' | jq -r "[.name,.hostid]")
4465 wrong_state=$(echo -e "$discovery_rules" | tr '\r\n' ' ' | jq .result[] | jq 'select(.state != "0")' | jq -r "[.name,.hostid]")
4466 if [[ $wrong_status || $wrong_state ]]
4467 then
4468 echo -e "${RED}$zabbix_hosts_and_ids\n\n\n$wrong_status\n\n\n$wrong_state${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4469 echo -e "\n${ORANGE}CBIS-16053 - Block devices discovery, KVM Network discovery, KVM Pool discovery, MD devices discovery, Network interface discovery, Hotfixes and Services Discovery triggers are disabled (CBIS 20)${NC}"
4470
4471 else
4472 echo -e "${GREEN}no zabbix discovery rules with unexpected status or state values are found${NC}\n"
4473 fi
4474 elapsed_time_seconds=$(expr $(date +%s) - $start)
4475
4476
4477 ####################################################################################################
4478
4479
4480 start=$(date +%s)
4481 STEPS_COUNTER=$((STEPS_COUNTER+1))
4482 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE ZABBIX TEMPLATES BETWEEN EACH HOST WITHIN EACH HOST-GROUP (+$elapsed_time_seconds `date '+%T'`)${NC}"
4483 if [[ $ansible_ovs_hosts ]]
4484 then
4485 echo -e "${CYAN}checking ovs computes${NC}"
4486 templates_total=""
4487 for host in $ansible_ovs_hosts
4488 do
4489 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4490 -H 'Content-Type: application/json-rpc' \
4491 -H 'Cookie: SERVERID='$last_index_controller'' \
4492 --data '{
4493 "jsonrpc": "2.0",
4494 "method": "host.get",
4495 "params": {
4496 "output": ["host"],
4497 "selectParentTemplates": [
4498 "templateid",
4499 "name"
4500 ],
4501 "filter": {
4502 "host": "'$host'"
4503 }
4504 },
4505 "id": 1,
4506 "auth": '$zabbix_auth'
4507 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4508 templates_total+="$templates\n"
4509 done
4510 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4511 if [[ $templates_result != "1" ]]
4512 then
4513 echo -e "${RED}found different number of templates between the ovs computes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4514 else
4515 echo -e "${GREEN}all the ovs computes has identical number of templates${NC}"
4516 fi
4517 fi
4518 if [[ $ansible_sriov_hosts ]]
4519 then
4520 echo -e "${CYAN}checking sriov computes${NC}"
4521 templates_total=""
4522 for host in $ansible_sriov_hosts
4523 do
4524 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4525 -H 'Content-Type: application/json-rpc' \
4526 -H 'Cookie: SERVERID='$last_index_controller'' \
4527 --data '{
4528 "jsonrpc": "2.0",
4529 "method": "host.get",
4530 "params": {
4531 "output": ["host"],
4532 "selectParentTemplates": [
4533 "templateid",
4534 "name"
4535 ],
4536 "filter": {
4537 "host": "'$host'"
4538 }
4539 },
4540 "id": 1,
4541 "auth": '$zabbix_auth'
4542 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4543 templates_total+="$templates\n"
4544 done
4545 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4546 if [[ $templates_result != "1" ]]
4547 then
4548 echo -e "${RED}found different number of templates between the sriov computes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4549 else
4550 echo -e "${GREEN}all the sriov computes has identical number of templates${NC}"
4551 fi
4552 fi
4553 if [[ $ansible_dpdk_hosts ]]
4554 then
4555 echo -e "${CYAN}checking dpdk computes${NC}"
4556 templates_total=""
4557 for host in $ansible_dpdk_hosts
4558 do
4559 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4560 -H 'Content-Type: application/json-rpc' \
4561 -H 'Cookie: SERVERID='$last_index_controller'' \
4562 --data '{
4563 "jsonrpc": "2.0",
4564 "method": "host.get",
4565 "params": {
4566 "output": ["host"],
4567 "selectParentTemplates": [
4568 "templateid",
4569 "name"
4570 ],
4571 "filter": {
4572 "host": "'$host'"
4573 }
4574 },
4575 "id": 1,
4576 "auth": '$zabbix_auth'
4577 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4578 templates_total+="$templates\n"
4579 done
4580 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4581 if [[ $templates_result != "1" ]]
4582 then
4583 echo -e "${RED}found different number of templates between the dpdk computes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4584 else
4585 echo -e "${GREEN}all the dpdk computes has identical number of templates${NC}"
4586 fi
4587 fi
4588 if [[ $ansible_avrs_hosts ]]
4589 then
4590 echo -e "${CYAN}checking avrs computes${NC}"
4591 templates_total=""
4592 for host in $ansible_avrs_hosts
4593 do
4594 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4595 -H 'Content-Type: application/json-rpc' \
4596 -H 'Cookie: SERVERID='$last_index_controller'' \
4597 --data '{
4598 "jsonrpc": "2.0",
4599 "method": "host.get",
4600 "params": {
4601 "output": ["host"],
4602 "selectParentTemplates": [
4603 "templateid",
4604 "name"
4605 ],
4606 "filter": {
4607 "host": "'$host'"
4608 }
4609 },
4610 "id": 1,
4611 "auth": '$zabbix_auth'
4612 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4613 templates_total+="$templates\n"
4614 done
4615 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4616 if [[ $templates_result != "1" ]]
4617 then
4618 echo -e "${RED}found different number of templates between the avrs computes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4619 else
4620 echo -e "${GREEN}all the avrs computes has identical number of templates${NC}"
4621 fi
4622 fi
4623 if [[ $ansible_storage_hosts ]]
4624 then
4625 echo -e "${CYAN}checking storage nodes${NC}"
4626 templates_total=""
4627 for host in $ansible_storage_hosts
4628 do
4629 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4630 -H 'Content-Type: application/json-rpc' \
4631 -H 'Cookie: SERVERID='$last_index_controller'' \
4632 --data '{
4633 "jsonrpc": "2.0",
4634 "method": "host.get",
4635 "params": {
4636 "output": ["host"],
4637 "selectParentTemplates": [
4638 "templateid",
4639 "name"
4640 ],
4641 "filter": {
4642 "host": "'$host'"
4643 }
4644 },
4645 "id": 1,
4646 "auth": '$zabbix_auth'
4647 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4648 templates_total+="$templates\n"
4649 done
4650 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4651 if [[ $templates_result != "1" ]]
4652 then
4653 echo -e "${RED}found different number of templates between the storage nodes. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4654 else
4655 echo -e "${GREEN}all the storage nodes has identical number of templates${NC}"
4656 fi
4657 fi
4658 if [[ $ansible_monitoring_hosts ]]
4659 then
4660 echo -e "${CYAN}checking monitoring hosts${NC}"
4661 templates_total=""
4662 for host in $ansible_monitoring_hosts
4663 do
4664 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4665 -H 'Content-Type: application/json-rpc' \
4666 -H 'Cookie: SERVERID='$last_index_controller'' \
4667 --data '{
4668 "jsonrpc": "2.0",
4669 "method": "host.get",
4670 "params": {
4671 "output": ["host"],
4672 "selectParentTemplates": [
4673 "templateid",
4674 "name"
4675 ],
4676 "filter": {
4677 "host": "'$host'"
4678 }
4679 },
4680 "id": 1,
4681 "auth": '$zabbix_auth'
4682 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4683 templates_total+="$templates\n"
4684 done
4685 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4686 if [[ $templates_result != "1" ]]
4687 then
4688 echo -e "${RED}found different number of templates between the monitoring hosts. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4689 else
4690 echo -e "${GREEN}all the monitoring hosts has identical number of templates${NC}"
4691 fi
4692 fi
4693 if [[ $ansible_controllers_hosts ]]
4694 then
4695 echo -e "${CYAN}checking controllers${NC}"
4696 templates_total=""
4697 for host in $ansible_controllers_hosts
4698 do
4699 templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
4700 -H 'Content-Type: application/json-rpc' \
4701 -H 'Cookie: SERVERID='$last_index_controller'' \
4702 --data '{
4703 "jsonrpc": "2.0",
4704 "method": "host.get",
4705 "params": {
4706 "output": ["host"],
4707 "selectParentTemplates": [
4708 "templateid",
4709 "name"
4710 ],
4711 "filter": {
4712 "host": "'$host'"
4713 }
4714 },
4715 "id": 1,
4716 "auth": '$zabbix_auth'
4717 }' | jq . | grep \"name\" | awk -F: '{print $2}' | tr -d '," ' | column -t)
4718 templates_total+="$templates\n"
4719 done
4720 templates_result=$(printf "$templates_total" | sort | uniq -c | awk '{print $1}' | sort -u | wc -l)
4721 if [[ $templates_result != "1" ]]
4722 then
4723 echo -e "${RED}found different number of templates between the controllers. log-in to zabbix, configuration > hosts and investigate the anomaly${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4724 else
4725 echo -e "${GREEN}all the controllers has identical number of templates${NC}"
4726 fi
4727 fi
4728 elapsed_time_seconds=$(expr $(date +%s) - $start)
4729
4730
4731 ####################################################################################################
4732
4733
4734 start=$(date +%s)
4735 STEPS_COUNTER=$((STEPS_COUNTER+1))
4736 echo -e "${BLUE}\n\n$STEPS_COUNTER) VITRAGE ALARMS (+$elapsed_time_seconds `date '+%T'`)${NC}"
4737 if [[ $cbis_version == *"2"* ]]
4738 then
4739 echo -e "${ORANGE}vitrage is deprecated from cbis-20.100.1 and onwards${NC}"
4740 elif [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
4741 then
4742 vitrage=$(ansible $last_index_controller -m shell -a "source ~/overcloudrc && vitrage alarm list -f value" | grep -E -v 'SUCCESS|/etc/passwd')
4743 if [[ -z $vitrage ]]
4744 then
4745 echo -e "${GREEN}no alarms found${NC}"
4746 else
4747 echo -e "${RED}$vitrage${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4748 fi
4749 fi
4750 elapsed_time_seconds=$(expr $(date +%s) - $start)
4751
4752
4753 ####################################################################################################
4754
4755
4756 start=$(date +%s)
4757 STEPS_COUNTER=$((STEPS_COUNTER+1))
4758 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE THE ALARMS COUNT BETWEEN ZABBIX AND VITRAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
4759 if [[ $cbis_version != *"2"* ]]
4760 then
4761 vitrage_alarms_count=$(ansible $last_index_controller -m shell -a "source ~/overcloudrc && vitrage alarm list -f value | grep -v '/etc/passwd has been changed' | wc -l" | grep ^[0-9])
4762 if [[ $zabbix_problem_triggers_count != $vitrage_alarms_count ]]
4763 then
4764 echo -e "${RED}zabbix alarms: "$zabbix_problem_triggers_count", vitrage alarms: "$vitrage_alarms_count" ${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4765 else
4766 echo -e "${GREEN}same number of zabbix and vitrage alarms${NC}"
4767 fi
4768 else
4769 echo -e "${ORANGE}vitrage is deprecated from cbis-20.100.1 and onwards${NC}"
4770 fi
4771 elapsed_time_seconds=$(expr $(date +%s) - $start)
4772
4773
4774 ####################################################################################################
4775
4776
4777 start=$(date +%s)
4778 STEPS_COUNTER=$((STEPS_COUNTER+1))
4779 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE THE ALARMS BETWEEN ZABBIX AND ALARM MANAGER (mysql) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4780 almadb_alarms=$(ansible $last_index_controller -b -m shell -a "mysql -s -N -e \"select text from alma_db.ALMAALARM\"" | grep -v rc=0 | awk '{$NF=""; print $0}' | sort | uniq)
4781 zabbixdb_alarms=$(ansible $last_index_controller -b -m shell -a "mysql -s -N -e \"SELECT description FROM zabbixdb.triggers WHERE value = 1\"" | grep -v rc=0 | awk '{$NF=""; print $0}' | sort | uniq)
4782 if [[ $almadb_alarms != $zabbixdb_alarms ]]
4783 then
4784 echo -e "${RED}zabbix alarms:\n\n"$zabbixdb_alarms"\n\nalarm manager alarms:\n\n"$almadb_alarms"${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4785 else
4786 echo -e "${GREEN}zabbix alarms and alarm manager alarms are identical${NC}"
4787 fi
4788 elapsed_time_seconds=$(expr $(date +%s) - $start)
4789
4790
4791 ####################################################################################################
4792
4793
4794 start=$(date +%s)
4795 STEPS_COUNTER=$((STEPS_COUNTER+1))
4796 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOG ALL haproxy DOWN LOGS AND COMPARE WITH PREVIOUS CHECK AND CURRENT CHECK TO FIGURE OUT PROBLEMATIC TRENDS (+$elapsed_time_seconds `date '+%T'`)${NC}"
4797 if [[ ! -f "previous_haproxy_down_logs.txt" ]]
4798 then
4799 ansible $first_index_controller -b -m shell -a "cat /var/log/containers/haproxy/haproxy.log" | grep -w DOWN | awk -F, '{print $1}' > previous_haproxy_down_logs.txt
4800 fi
4801
4802 if [[ ! -f "current_haproxy_down_logs.txt" ]]
4803 then
4804 ansible $first_index_controller -b -m shell -a "cat /var/log/containers/haproxy/haproxy.log" | grep -w DOWN | awk -F, '{print $1}' > current_haproxy_down_logs.txt
4805 fi
4806
4807 previous_current_diff=$(diff -s previous_haproxy_down_logs.txt current_haproxy_down_logs.txt | awk '{print $NF}')
4808 if [[ $previous_current_diff == "identical" ]]
4809 then
4810 echo -e "${GREEN}couldn't find new DOWN lines in haproxy.log${NC}"
4811 sudo rm -f current_haproxy_down_logs.txt
4812 else
4813 echo -e "${GREEN}$previous_current_diff${NC}"
4814 sudo cp current_httpd_down_logs.txt previous_httpd_down_logs.txt
4815 sudo rm -f current_httpd_down_logs
4816 fi
4817 elapsed_time_seconds=$(expr $(date +%s) - $start)
4818
4819
4820
4821 ####################################################################################################
4822
4823
4824 start=$(date +%s)
4825 STEPS_COUNTER=$((STEPS_COUNTER+1))
4826 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DOWN AND NOLB (no load-balancer) SERVICES WITHIN THE HAPROXY SOCKET STATS (+$elapsed_time_seconds `date '+%T'`)${NC}"
4827 haproxy_down_services=$(ansible $last_index_controller -m shell -b -a "echo 'show stat' | nc -U /var/lib/haproxy/stats | grep -E -w 'DOWN|NOLB' | awk -F, '{print \$1,\$2}'" | grep -v -E rc=[0-9])
4828 if [[ $haproxy_down_services ]]
4829 then
4830 echo -e "${RED}$haproxy_down_services${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4831 echo -e "\n\n${ORANGE}CBIS-16375 (19A) - aodh, gnocchi and panko services leftovers in haproxy and redis and ceph_dashboard not showing in full 3/3 HA${NC}"
4832 else
4833 echo -e "${GREEN}no services are reported as DOWN or NOLB (no load-balancer)${NC}"
4834 fi
4835 elapsed_time_seconds=$(expr $(date +%s) - $start)
4836
4837
4838 ####################################################################################################
4839
4840
4841 if [[ -f "$logs_dir/initial_undercloud_hostname" ]]
4842 then
4843 start=$(date +%s)
4844 STEPS_COUNTER=$((STEPS_COUNTER+1))
4845 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE THE UNDERCLOUD INITIAL AND CURRENT HOSTNAME (+$elapsed_time_seconds `date '+%T'`)${NC}"
4846 hostname > current_undercloud_hostname
4847 diff=$(diff $logs_dir/initial_undercloud_hostname /home/stack/current_undercloud_hostname)
4848 if [[ $diff ]]
4849 then
4850 echo -e "${RED}$diff${NC}" | sed 's/^>/INITIAL:/g' | sed 's/^</CURRENT:/g'
4851 else
4852 echo -e "${GREEN}no differences between the initial and current hostname are found${NC}"
4853 fi
4854 else
4855 start=$(date +%s)
4856 STEPS_COUNTER=$((STEPS_COUNTER+1))
4857 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE INITIAL AND CURRENT HOSTNAMES (+$elapsed_time_seconds `date '+%T'`)${NC}"
4858 hostname > $logs_dir/initial_undercloud_hostname
4859 echo -e "${GREEN}this is the initial undercloud hostname audit - the comparison will begin from the next script iteration${NC}"
4860 fi
elapsed_time_seconds=$(expr $(date +%s) - $start)
4861
4862
4863 ####################################################################################################
4864
4865
4866 start=$(date +%s)
4867 STEPS_COUNTER=$((STEPS_COUNTER+1))
4868 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK IF USERS root/stack ARE FAIL-LOCKED (+$elapsed_time_seconds `date '+%T'`)${NC}"
4869 # usually a user will become faillocked after several failed login attempts
4870 faillock=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor,localhost -b -m shell -a "faillock" | grep -w V -B 3)
4871 if [[ $faillock ]]
4872 then
4873 echo -e "${RED}$faillock${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4874 else
4875 echo -e "${GREEN}no fail-locked user found${NC}"
4876 fi
4877 elapsed_time_seconds=$(expr $(date +%s) - $start)
4878
4879
4880 ####################################################################################################
4881
4882
4883 # The Automatic Bug Reporting Tool, commonly abbreviated as ABRT, is a set of tools that is designed to help users detect and report application crashes.
4884 start=$(date +%s)
4885 STEPS_COUNTER=$((STEPS_COUNTER+1))
4886 echo -e "${BLUE}\n\n$STEPS_COUNTER) DETECT APPLICATION CRASHES USING RED-HAT ABRT (Automatic Bug Reporting Tool) ON THE HYPERVISOR (+$elapsed_time_seconds `date '+%T'`)${NC}"
4887 abrt=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "abrt-cli status" | grep -v SUCCESS)
4888 if [[ $abrt ]]
4889 then
4890 abrt=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "abrt-cli list 2> /dev/null | grep ^reason: | sort | uniq -c")
4891 echo -e "${ORANGE}$abrt${NC}"
4892 else
4893 echo -e "${GREEN}no application crashes detected${NC}"
4894 fi
4895 elapsed_time_seconds=$(expr $(date +%s) - $start)
4896
4897
4898 ####################################################################################################
4899
4900
4901 # libguestfs-test-tool is a test program shipped with libguestfs to allow you to check basic libguestfs functionality is working. This is needed because libguestfs occasionally breaks for reasons beyond our control: usually because of changes in the underlying qemu or kernel packages, or the host environment.
4902 # libguestfs is a set of tools for accessing and modifying virtual machine (VM) disk images. You can use this for viewing and editing files inside guests, scripting changes to VMs, monitoring disk used/free statistics, creating guests, P2V, V2V, performing backups, cloning VMs, building VMs, formatting disks, resizing disks, and much more.
4903 # Context: CBIS 20 PP3 deployment failed due to https://access.redhat.com/solutions/3416791 which I used libguestfs-test-tool to debug.
4904 start=$(date +%s)
4905 STEPS_COUNTER=$((STEPS_COUNTER+1))
4906 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT BASIC libguestfs FUNCTIONALITY IS WORKING (+$elapsed_time_seconds `date '+%T'`)${NC}"
4907 libguestfs_test_tool=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "libguestfs-test-tool 2>&1")
4908 libguestfs_test_tool_verdict=$(echo -e "$libguestfs_test_tool" | grep 'TEST FINISHED OK')
4909 if [[ $libguestfs_test_tool_verdict ]]
4910 then
4911 echo -e "${GREEN}libguestfs-test-tool returned: TEST FINISHED OK${NC}"
4912 else
4913 echo -e "${RED}$libguestfs_test_tool${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4914 fi
4915 elapsed_time_seconds=$(expr $(date +%s) - $start)
4916
4917
4918 ####################################################################################################
4919
4920
4921 start=$(date +%s)
4922 STEPS_COUNTER=$((STEPS_COUNTER+1))
4923 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT ALL THE IRONIC HOSTS POWER STATE IS power on, PROVISIONING STATE IS active AND MAINTENANCE IS False (+$elapsed_time_seconds `date '+%T'`)${NC}"
4924 maintenance=$(source ~/stackrc && openstack baremetal node list -f value -c UUID -c Name -c 'Power State' -c 'Provisioning State' -c Maintenance | column -t | grep -E -v 'power\s+on\s+active\s+False')
4925 if [[ $maintenance ]]
4926 then
4927 echo -e "${RED}$maintenance${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4928 else
4929 echo -e "${GREEN}no baremetal host(s) with unexpected status is found${NC}"
4930 fi
4931 elapsed_time_seconds=$(expr $(date +%s) - $start)
4932
4933
4934 ####################################################################################################
4935
4936
4937 start=$(date +%s)
4938 if [[ $cbis_version != "19.0.0.1" ]]
4939 then
4940 STEPS_COUNTER=$((STEPS_COUNTER+1))
4941 echo -e "${BLUE}\n\n$STEPS_COUNTER) RABBITMQ QUEUES CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
4942 if [[ $cbis_version == "18.0.0.1" ]]
4943 then
4944 rabbitmqctl_list_queues=$(ansible $last_index_controller -b -m shell -a "rabbitmqctl list_queues" | grep -E "[[:space:]]+[0-9]" | awk '($2!=0) {print $0}')
4945 sleep 60
4946 rabbitmqctl_list_queues_no_2=$(ansible $last_index_controller -b -m shell -a "rabbitmqctl list_queues" | grep -E "[[:space:]]+[0-9]" | awk '($2!=0) {print $0}')
4947 if [[ $rabbitmqctl_list_queues == $rabbitmqctl_list_queues_no_2 ]]
4948 then
4949 echo -e "${GREEN}no change in the queues were found in 30 seconds duration${NC}"
4950 else
4951 echo -e "${RED}found difference in the queues between the first rabbitmqctl list_queues check and the second check which is taken 30 seconds after the first one${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4952 echo -e "${RED}first check:\n"$rabbitmqctl_list_queues"${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4953 echo -e "${RED}second check:\n"$rabbitmqctl_list_queues_no_2"${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4954 fi
4955 elif [[ $cbis_version != "18.0.0.1" ]]
4956 then
4957 rabbitmqctl_list_queues=$(ansible $last_index_controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_queues" | grep -E "[[:space:]]+[0-9]" | awk '($2!=0) {print $0}')
4958 sleep 30
4959 rabbitmqctl_list_queues_no_2=$(ansible $last_index_controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl list_queues" | grep -E "[[:space:]]+[0-9]" | awk '($2!=0) {print $0}')
4960 if [[ $rabbitmqctl_list_queues == $rabbitmqctl_list_queues_no_2 ]]
4961 then
4962 echo -e "${GREEN}no change in the queues were found in 30 seconds duration${NC}"
4963 else
4964 echo -e "${RED}found difference in the queues between the first rabbitmqctl list_queues check and the second check which is taken 30 seconds after the first one${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4965 echo -e "${RED}first check:\n"$rabbitmqctl_list_queues"${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4966 echo -e "${RED}second check:\n"$rabbitmqctl_list_queues_no_2"${ORANGE}\n\n\nnote: a difference between the first and second checks isn't necessary a bug. please examine the output carefully"${NC} ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4967 fi
4968 fi
4969 elapsed_time_seconds=$(expr $(date +%s) - $start)
4970 fi
4971
4972
4973 ####################################################################################################
4974
4975
4976 start=$(date +%s)
4977 STEPS_COUNTER=$((STEPS_COUNTER+1))
4978 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE ROOT PARTITIONS WITH 90%+ DISK USAGE (df) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4979 partitions_size=$(ansible all -m shell -a "df -h | grep -w / | awk '{print \$5,\$6}' | sed 's/\%//g' | grep -E ^[0-9] | awk '{ if ( \$1 > 90 ) print \$1,\$2 }'" | grep ^[9] -B 1)
4980 if [[ -z $partitions_size ]]
4981 then
4982 echo -e "${GREEN}no partition with 90%+ usage found on the undercloud and on the overcloud servers${NC}"
4983 else
4984 echo -e "${RED}$partitions_size${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
4985 fi
4986 elapsed_time_seconds=$(expr $(date +%s) - $start)
4987
4988
4989 ####################################################################################################
4990
4991
4992 start=$(date +%s)
4993 STEPS_COUNTER=$((STEPS_COUNTER+1))
4994 file_size="1"
4995 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE BIG LOG FILES (OVER "$file_size"G) (+$elapsed_time_seconds `date '+%T'`)${NC}"
4996 big_files=$(ansible all --limit '!hypervisor' -b -m shell -a "find /var/log/ -size +"$file_size"G -exec ls -lh {} \+" | grep /var/log/ -B 1)
4997 if [[ -z $big_files ]]
4998 then
4999 echo -e "${GREEN}couldn't find files under /var/log/ which weights more then "$file_size"G${NC}"
5000 else
5001 echo -e "${RED}$big_files${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5002 fi
5003 elapsed_time_seconds=$(expr $(date +%s) - $start)
5004
5005
5006 ####################################################################################################
5007
5008
5009 start=$(date +%s)
5010 STEPS_COUNTER=$((STEPS_COUNTER+1))
5011 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE UNSUCCESSFUL LOG LINES IN /var/log/cbis/patches-applied.log ON ALL THE SERVERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5012 applied_patch_bad_logs=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /var/log/cbis/patches-applied.log | grep -v -E 'PATCH-SUCCESS|PATCH-START'" | grep -v -E 'rc=[1-9]|non-zero return code')
5013 if [[ $applied_patch_bad_logs ]]
5014 then
5015 echo -e "${RED}$applied_patch_bad_logs${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5016 echo -e "\n\n${ORANGE}CBIS-16038 scale out fails - the scaled out server doesn't have SSH iptables ACCEPT rule and thus ansible fails to access it\nthe sypthom was that on the scaled out compute the patch was not deployerd correctly and it this was seen under /var/log/cbis/patches-applied.log of the scaled out server(CBIS 20 PP3)${NC}"
5017 else
5018 echo -e "${GREEN}no unsuccessful log lines found under /var/log/cbis/patches-applied.log${NC}"
5019 fi
5020 elapsed_time_seconds=$(expr $(date +%s) - $start)
5021
5022
5023 ####################################################################################################
5024
5025
5026 start=$(date +%s)
5027 STEPS_COUNTER=$((STEPS_COUNTER+1))
5028 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT 'max memory size|open files|cpu time|virtual memory|file locks' CONFIURATIOSN ARE SAME FOR ALL HOSTS (ulimit -a) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5029 ulimit=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ulimit -a" | grep -E -w 'max memory size|open files|cpu time|virtual memory|file locks' | sort --uniq | awk '{print $NF}' | sort --uniq | paste -sd " ")
5030 if [[ $ulimit != "1024 unlimited" ]]
5031 then
5032 ulimit=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ulimit -a | grep -E -w 'max memory size|open files|cpu time|virtual memory|file locks'")
5033 echo -e "${RED}$ulimit${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5034 else
5035 echo -e "${GREEN}all the hosts returned the expected 'max memory size|open files|cpu time|virtual memory|file locks' values${NC}"
5036 fi
5037 elapsed_time_seconds=$(expr $(date +%s) - $start)
5038
5039
5040 ####################################################################################################
5041
5042
5043 start=$(date +%s)
5044 STEPS_COUNTER=$((STEPS_COUNTER+1))
5045 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE VLOCK PROCESSES (+$elapsed_time_seconds `date '+%T'`)${NC}"
5046 vlock=$(ansible all --limit '!hypervisor' -b -m shell -a "top -b -n 1 | grep vlock" | grep -E -v 'FAILED|non-zero return code')
5047 if [[ -z $vlock ]]
5048 then
5049 echo -e "${GREEN}no vlock processes found${NC}"
5050 else
5051 echo -e "${MAGENTA}$vlock${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5052 echo -e "\n\n${ORANGE}CBIS-7324 (CBIS 19.0)\nCBIS-7324 (CBIS 19A)${NC}"
5053 fi
5054 elapsed_time_seconds=$(expr $(date +%s) - $start)
5055
5056
5057 ####################################################################################################
5058
5059
5060 start=$(date +%s)
5061 STEPS_COUNTER=$((STEPS_COUNTER+1))
5062 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE ZOMBIE (DEFUNCT) PROCESSES (+$elapsed_time_seconds `date '+%T'`)${NC}"
5063 zombie=$(ansible all --limit '!hypervisor' -b -m shell -a "ps aux | grep [d]efunct | grep -v swift-object-au" | grep -E -v 'FAILED|non-zero return code')
5064 servers_with_zombie=$(echo -e "$zombie" | grep SUCCESS | awk '{print $1}' | paste -sd",")
5065 if [[ -z $zombie ]]
5066 then
5067 echo -e "${GREEN}no zombie processes found${NC}"
5068 else
5069 echo -e "${RED}the following PIDs are marked as defunct (zombie process):\n\n$zombie${NC}\n\n"
5070 zombie=$(ansible $servers_with_zombie -b -m shell -a "ps aux | grep [d]efunct | grep -v swift-object-au | awk '{print \$2}' | xargs -i pstree -aps {}")
5071 ### to avoid truncated lines use pstree -laps
5072 echo -e "${RED}$zombie${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5073 echo -e "\n\n${ORANGE}CBIS-11144 (19A) / CBIS-11245 (20) - neutron router causing zombie processes${NC}"
5074 echo -e "${ORANGE}CBIS-16391 (19A) - neutron-dhcp-agent zabbix alarm after deploying running default security hardening${NC}"
5075 fi
5076 elapsed_time_seconds=$(expr $(date +%s) - $start)
5077
5078
5079 ####################################################################################################
5080
5081
5082 start=$(date +%s)
5083 STEPS_COUNTER=$((STEPS_COUNTER+1))
5084 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR CRITICAL ERRORS IN DMESG (KERNEL MESSAGES) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5085 dmesg_alarms=$(ansible all --limit '!hypervisor' -b -m shell -a "dmesg -x --ctime --level crit --level alert --level emerg --nopager --decode --kernel --userspace" | grep : -B 1)
5086 if [[ -z $dmesg_alarms ]]
5087 then
5088 echo -e "${GREEN}no critical alarms found in dmesg${NC}"
5089 else
5090 echo -e "${RED}$dmesg_alarms${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5091 fi
5092 elapsed_time_seconds=$(expr $(date +%s) - $start)
5093
5094
5095 ####################################################################################################
5096
5097
5098 start=$(date +%s)
5099 STEPS_COUNTER=$((STEPS_COUNTER+1))
5100 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE OVERCLOUD HOSTS HAS THE SAME FIRMWARE VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
5101 echo -e "${CYAN}checking the computes${NC}"
5102 firmware_version=$(ansible compute -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u | wc -l)
5103 if [[ $firmware_version == "1" ]]
5104 then
5105 firmware_version=$(ansible compute -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u)
5106 echo -e "${GREEN}all servers using firmware revision $firmware_version${NC}"
5107 touch $logs_dir/firmware_version
5108 echo "$firmware_version" > $logs_dir/firmware_version
5109 else
5110 firmware_version=$(ansible compute -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'")
5111 echo -e "${RED}$firmware_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5112 fi
5113 echo -e "${CYAN}checking the controllers${NC}"
5114 firmware_version=$(ansible controller -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u | wc -l)
5115 if [[ $firmware_version == "1" ]]
5116 then
5117 firmware_version=$(ansible controller -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u)
5118 echo -e "${GREEN}all servers using firmware revision $firmware_version${NC}"
5119 else
5120 firmware_version=$(ansible controller -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'")
5121 echo -e "${RED}$firmware_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5122 fi
5123 if [[ $ansible_storage_hosts_count != "0" ]]
5124 then
5125 echo -e "${CYAN}checking the storage nodes${NC}"
5126 firmware_version=$(ansible cephstorage -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u | wc -l)
5127 if [[ $firmware_version == "1" ]]
5128 then
5129 firmware_version=$(ansible cephstorage -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u)
5130 echo -e "${GREEN}all servers using firmware revision $firmware_version${NC}"
5131 else
5132 firmware_version=$(ansible cephstorage -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'")
5133 echo -e "${RED}$firmware_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5134 fi
5135 fi
5136 echo -e "${CYAN}checking all the overcloud servers at once${NC}"
5137 firmware_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u | wc -l)
5138 if [[ $firmware_version == "1" ]]
5139 then
5140 firmware_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'" | grep -E ^[0-9] | sort -u)
5141 echo -e "${GREEN}all servers using firmware revision $firmware_version${NC}"
5142 else
5143 firmware_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipmitool mc info | grep 'Firmware Revision' | awk '{print \$4}'")
5144 echo -e "${RED}$firmware_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5145 echo "$firmware_version" > $logs_dir/firmware_version
5146 fi
5147 elapsed_time_seconds=$(expr $(date +%s) - $start)
5148
5149
5150 ####################################################################################################
5151
5152
5153 start=$(date +%s)
5154 STEPS_COUNTER=$((STEPS_COUNTER+1))
5155 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE HOSTS HAS THE SAME BIOS VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
5156 echo -e "${CYAN}checking the computes${NC}"
5157 bios_version=$(ansible compute -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u | wc -l)
5158 if [[ $bios_version == "1" ]]
5159 then
5160 bios_version=$(ansible compute -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u)
5161 echo -e "${GREEN}all servers using bios version $bios_version${NC}"
5162 touch $logs_dir/bios_version
5163 echo "$bios_version" > $logs_dir/bios_version
5164 else
5165 bios_version=$(ansible compute -b -m shell -a "dmidecode -s bios-version")
5166 echo -e "${RED}$bios_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5167 fi
5168 echo -e "${CYAN}checking the controllers${NC}"
5169 bios_version=$(ansible controller -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u | wc -l)
5170 if [[ $bios_version == "1" ]]
5171 then
5172 bios_version=$(ansible controller -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u)
5173 echo -e "${GREEN}all servers using bios version $bios_version${NC}"
5174 else
5175 bios_version=$(ansible controller -b -m shell -a "dmidecode -s bios-version")
5176 echo -e "${RED}$bios_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5177 fi
5178 if [[ $ansible_storage_hosts_count != "0" ]]
5179 then
5180 echo -e "${CYAN}checking the storage nodes${NC}"
5181 bios_version=$(ansible cephstorage -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u | wc -l)
5182 if [[ $bios_version == "1" ]]
5183 then
5184 bios_version=$(ansible cephstorage -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u)
5185 echo -e "${GREEN}all servers using bios version $bios_version${NC}"
5186 else
5187 bios_version=$(ansible cephstorage -b -m shell -a "dmidecode -s bios-version")
5188 echo -e "${RED}$bios_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5189 fi
5190 else
5191 echo -e "${CYAN}checking the storage nodes${NC}"
5192 echo -e "${ORANGE}couldn't find storage nodes in the system${NC}"
5193 fi
5194 echo -e "${CYAN}checking all the overcloud servers at once${NC}"
5195 bios_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u | wc -l)
5196 if [[ $bios_version == "1" ]]
5197 then
5198 bios_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "dmidecode -s bios-version" | grep -v SUCCESS | sort -u)
5199 echo -e "${GREEN}all servers using bios version $bios_version${NC}"
5200 else
5201 bios_version=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "dmidecode -s bios-version")
5202 echo -e "${RED}$bios_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5203 fi
5204 elapsed_time_seconds=$(expr $(date +%s) - $start)
5205
5206
5207 ####################################################################################################
5208
5209
5210 start=$(date +%s)
5211 STEPS_COUNTER=$((STEPS_COUNTER+1))
5212 echo -e "${BLUE}\n\n$STEPS_COUNTER) READ THE BMCS SENSORS AND LOCATE FAULTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5213 sensors=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -b -m shell -a "ipmitool sdr elist full | awk '{ if ( \$5 != ok ) print }' | grep -v -E '\| ok \||\| ns \|'" | grep ^[A-Z] -B 1)
5214 if [[ $sensors ]]
5215 then
5216 echo -e "${RED}$sensors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5217 echo -e "\n\n${ORANGE}lnr - Lower Non-Recoverable${NC}"
5218 echo -e "${ORANGE}lcr - Lower Critical${NC}"
5219 echo -e "${ORANGE}lnc - Lower Non-Critical${NC}"
5220 echo -e "${ORANGE}unc - Upper Non-Critical${NC}"
5221 echo -e "${ORANGE}ucr - Upper Critical${NC}"
5222 echo -e "${ORANGE}unr - Upper Non-Recoverable${NC}"
5223 echo -e "${ORANGE}nr - Non Recoverable${NC}"
5224 echo -e "${ORANGE}cr - Critical${NC}"
5225 echo -e "${ORANGE}nc - Non Critical${NC}"
5226 echo -e "${ORANGE}ns - Not Specified${NC}"
5227 echo -e "${ORANGE}na - Not Available${NC}"
5228 else
5229 echo -e "${GREEN}all sensors returned ok${NC}"
5230 fi
5231 elapsed_time_seconds=$(expr $(date +%s) - $start)
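# illustrative manual equivalent for a single host (requires root and a working local BMC interface):
# ipmitool sdr elist full                                    # list threshold ("full") sensors with status
# ipmitool sdr elist full | grep -v -E '\| ok \||\| ns \|'   # keep only rows that are not ok / not-specified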
5232
5233
5234 ####################################################################################################
5235
5236
5237 start=$(date +%s)
5238 STEPS_COUNTER=$((STEPS_COUNTER+1))
5239 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE CRITICAL/NON-RECOVERABLE EVENT LOGS IN SYSTEM BMCS FROM CURRENT MONTH (+$elapsed_time_seconds `date '+%T'`)${NC}"
5240 this_month_dates=$(date +%m/[0-9][0-9]/%Y)
5241 events=$(sshpass -p $hv_cbis_admin_password ansible -k all --limit '!localhost' -m shell -b -a "ipmitool sel list | grep -E -i 'Critical|Non-Recoverable' | grep -E $this_month_dates" | grep -E -v 'FAILED \| rc=[1-9]|non-zero return code')
5242 if [[ $events ]]
5243 then
5244 echo -e "${RED}$events${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5245 else
5246 echo -e "${GREEN}none of the events of the BMCs returned critical or non-recoverable${NC}"
5247 fi
5248 elapsed_time_seconds=$(expr $(date +%s) - $start)
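# illustrative manual equivalent for a single host: list the SEL and keep only critical/non-recoverable
# entries from the current month (example pattern shown for August 2021):
# ipmitool sel list | grep -E -i 'Critical|Non-Recoverable' | grep -E '08/[0-9][0-9]/2021'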
5249
5250
5251 ####################################################################################################
5252
5253
5254 if [[ -f "$logs_dir/kernel" ]]
5255 then
5256 : # already checked in a previous run (':' no-op; 'continue' is only valid inside a loop)
5257 else
5258 start=$(date +%s)
5259 STEPS_COUNTER=$((STEPS_COUNTER+1))
5260 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE OVERCLOUD HOSTS HAVE THE SAME KERNEL VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
5261 kernel_version=$(ansible all --limit '!hypervisor' -b -m shell -a "uname -r" | grep -v SUCCESS | sort -u | wc -l)
5262 if [[ $kernel_version == "1" ]]
5263 then
5264 kernel_version=$(ansible all --limit '!hypervisor' -b -m shell -a "uname -r" | grep -v SUCCESS | sort -u)
5265 echo -e "${GREEN}all servers using kernel version $kernel_version${NC}"
5266 touch $logs_dir/kernel
5267 echo "$kernel_version" > $logs_dir/kernel
5268 else
5269 kernel_version=$(ansible all --limit '!hypervisor' -b -m shell -a "uname -r")
5270 echo -e "${RED}$kernel_version${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5271 fi
5272 fi
5273 elapsed_time_seconds=$(expr $(date +%s) - $start)
5274
5275
5276 ####################################################################################################
5277
5278
5279 if [[ -f "$logs_dir/cpu_model" ]]
5280 then
5281 : # already checked in a previous run
5282 else
5283 start=$(date +%s)
5284 STEPS_COUNTER=$((STEPS_COUNTER+1))
5285 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE OVERCLOUD HOSTS HAVE THE SAME CPU MODEL (+$elapsed_time_seconds `date '+%T'`)${NC}"
5286 echo -e "${CYAN}checking the computes${NC}"
5287 cpu_model=$(ansible compute -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u | wc -l)
5288 if [[ $cpu_model == "1" ]]
5289 then
5290 cpu_model=$(ansible compute -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u)
5291 echo -e "${GREEN}all servers using cpu model $cpu_model${NC}"
5292 touch $logs_dir/cpu_model
5293 echo "$cpu_model" > $logs_dir/cpu_model
5294 else
5295 cpu_model=$(ansible compute -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'")
5296 echo -e "${RED}$cpu_model${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5297 fi
5298 echo -e "${CYAN}checking the controllers${NC}"
5299 cpu_model=$(ansible controller -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u | wc -l)
5300 if [[ $cpu_model == "1" ]]
5301 then
5302 cpu_model=$(ansible controller -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u)
5303 echo -e "${GREEN}all servers using cpu model $cpu_model${NC}"
5304 else
5305 cpu_model=$(ansible controller -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'")
5306 echo -e "${RED}$cpu_model${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5307 fi
5308 if [[ $ansible_storage_hosts_count != "0" ]]
5309 then
5310 echo -e "${CYAN}checking the storage nodes${NC}"
5311 cpu_model=$(ansible cephstorage -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u | wc -l)
5312 if [[ $cpu_model == "1" ]]
5313 then
5314 cpu_model=$(ansible cephstorage -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'" | grep -v SUCCESS | sort -u)
5315 echo -e "${GREEN}all servers using cpu model $cpu_model${NC}"
5316 else
5317 cpu_model=$(ansible cephstorage -b -m shell -a "dmidecode -t processor | grep Version: | sort -u | awk -F: '{print \$2}'")
5318 echo -e "${RED}$cpu_model${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5319 fi
5320 else
5321 echo -e "${CYAN}\nchecking the storage nodes${NC}"
5322 echo -e "${ORANGE}couldn't find storage nodes in the system${NC}"
5323 fi
5324 fi
5325 elapsed_time_seconds=$(expr $(date +%s) - $start)
5326
5327
5328 ####################################################################################################
5329
5330
5331 if [[ -f "$logs_dir/baseboard_product_name" ]]
5332 then
5333 : # already checked in a previous run
5334 else
5335 start=$(date +%s)
5336 STEPS_COUNTER=$((STEPS_COUNTER+1))
5337 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT ALL THE OVERCLOUD HOSTS HAVE THE SAME BASEBOARD PRODUCT NAME (+$elapsed_time_seconds `date '+%T'`)${NC}"
5338 echo -e "${CYAN}checking the computes${NC}"
5339 baseboard_product_name=$(ansible compute -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u | wc -l)
5340 if [[ $baseboard_product_name == "1" ]]
5341 then
5342 baseboard_product_name=$(ansible compute -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u)
5343 echo -e "${GREEN}all servers using baseboard product name $baseboard_product_name${NC}"
5344 touch $logs_dir/baseboard_product_name
5345 echo "$baseboard_product_name" > $logs_dir/baseboard_product_name
5346 else
5347 baseboard_product_name=$(ansible compute -b -m shell -a "dmidecode -s baseboard-product-name")
5348 echo -e "${RED}$baseboard_product_name${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5349 fi
5350 echo -e "${CYAN}checking the controllers${NC}"
5351 baseboard_product_name=$(ansible controller -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u | wc -l)
5352 if [[ $baseboard_product_name == "1" ]]
5353 then
5354 baseboard_product_name=$(ansible controller -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u)
5355 echo -e "${GREEN}all servers using baseboard product name $baseboard_product_name${NC}"
5356 else
5357 baseboard_product_name=$(ansible controller -b -m shell -a "dmidecode -s baseboard-product-name")
5358 echo -e "${RED}$baseboard_product_name${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5359 fi
5360 if [[ $ansible_storage_hosts_count != "0" ]]
5361 then
5362 echo -e "${CYAN}checking the storage nodes${NC}"
5363 baseboard_product_name=$(ansible cephstorage -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u | wc -l)
5364 if [[ $baseboard_product_name == "1" ]]
5365 then
5366 baseboard_product_name=$(ansible cephstorage -b -m shell -a "dmidecode -s baseboard-product-name" | grep -v SUCCESS | sort -u)
5367 echo -e "${GREEN}all servers using baseboard product name $baseboard_product_name${NC}"
5368 else
5369 baseboard_product_name=$(ansible cephstorage -b -m shell -a "dmidecode -s baseboard-product-name")
5370 echo -e "${RED}$baseboard_product_name${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5371 fi
5372 else
5373 echo -e "${CYAN}\nchecking the storage nodes${NC}"
5374 echo -e "${ORANGE}couldn't find storage nodes in the system${NC}"
5375 fi
5376 fi
5377 elapsed_time_seconds=$(expr $(date +%s) - $start)
5378
5379
5380 ####################################################################################################
5381
5382
5383 if [[ $nuage != "true" ]]
5384 then
5385 start=$(date +%s)
5386 STEPS_COUNTER=$((STEPS_COUNTER+1))
5387 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT ALL CONTROLLERS HAVE THE SAME NUMBER OF NAMESPACES (+$elapsed_time_seconds `date '+%T'`)${NC}"
5388 missing_namespaces=$(ansible controller -b -m shell -a "ip netns | wc -l" | grep ^[0-9] | sort --uniq | wc -l)
5389 if [[ $missing_namespaces != "1" ]]
5390 then
5391 missing_namespaces=$(ansible controller -b -m shell -a "ip netns | wc -l")
5392 echo -e "${RED}$missing_namespaces${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5393 else
5394 echo -e "${GREEN}all controllers have an identical number of namespaces${NC}"
5395 fi
5396 fi
5397 elapsed_time_seconds=$(expr $(date +%s) - $start)
5398
5399
5400 ####################################################################################################
5401
5402
5403 if [[ $nuage != "true" ]]
5404 then
5405 STEPS_COUNTER=$((STEPS_COUNTER+1))
5406 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE CONTROLLERS HAVE THE EXPECTED NUMBER OF NAMESPACES (+$elapsed_time_seconds `date '+%T'`)${NC}"
5407 dhcp_true_networks=$(source ~/overcloudrc && openstack subnet list --long -f value | grep True | awk '{print $3}' | sort -u | wc -l)
5408 routers=$(source ~/overcloudrc && openstack router list -f value | wc -l)
5409 expected_namespaces=$(expr $dhcp_true_networks + $routers)
5410 namespaces=$(ansible controller -b -m shell -a "ip netns | wc -l" | grep ^[0-9] | sort -u)
5411 if [[ $namespaces != $expected_namespaces ]]
5412 then
5413 namespaces_diff=$(ansible controller -b -m shell -a "ip netns | wc -l")
5414 echo -e "${RED}all the controllers are expected to have $expected_namespaces namespaces\nexecute: ${MAGENTA}ansible controller -b -m shell -a \"ip netns\"${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5415 echo -e "${RED}$namespaces_diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5416 else
5417 echo -e "${GREEN}all the controllers have the expected number of namespaces${NC}"
5418 fi
5419 fi
5420 elapsed_time_seconds=$(expr $(date +%s) - $start)
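# note (illustrative): the expected count used above is derived as one qdhcp namespace per network that has
# at least one DHCP-enabled subnet, plus one qrouter namespace per router. manual sketch (assumes ~/overcloudrc is sourced):
# dhcp_nets=$(openstack subnet list --long -f value | grep True | awk '{print $3}' | sort -u | wc -l)
# routers=$(openstack router list -f value | wc -l)
# echo "expecting $((dhcp_nets + routers)) namespaces per controller (compare with: ip netns | wc -l)"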
5421
5422
5423 ####################################################################################################
5424
5425
5426 start=$(date +%s)
5427 STEPS_COUNTER=$((STEPS_COUNTER+1))
5428 echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPARE THE NUMBER OF CRONTAB JOBS BETWEEN THE CONTROLLERS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5429 cronjobs_comparison=$(ansible controller -b -m shell -a "crontab -l" | grep -v -E '\#|rabbit-drain-queues.py|AIDE integrity check run|SUCCESS' | sort | uniq -c | column -t | grep -v ^3 | awk '{print $NF}' | paste -sd'|' | xargs -i ansible controller -b -m shell -a "crontab -l | grep -E '{}'")
5430 if [[ $cronjobs_comparison ]]
5431 then
5432 echo -e "${RED}$cronjobs_comparison${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5433 else
5434 echo -e "${GREEN}the crontab jobs count is identical between the controllers${NC}"
5435 fi
5436 elapsed_time_seconds=$(expr $(date +%s) - $start)
5437
5438
5439 ####################################################################################################
5440
5441
5442 # 0 3 * * 6 bash /usr/share/cbis/utils/check_restart_horizon.sh
5443 # */15 * * * * source /home/cbis-admin/overcloudrc && python /usr/lib/zabbix/alertscripts/zbx_metrics_exporter.py > /var/log/zabbix/metrics/last_run.status
5444 # 0 0 * * * /usr/bin/find /var/log/zabbix/metrics/*.xml* -mtime +1 -exec rm -rf {} \; > /dev/null 2>&1
5445 # 0 0 * * * source /home/cbis-admin/overcloudrc && python /usr/lib/zabbix/alertscripts/zbx_KPIs_exporter.py &> /var/log/zabbix/metrics/last_KPIs_run.status
5446 # 1 * * * * sudo python /usr/bin/zabbix_db_partitions_manager.py --history=3 --trend=30
5447 # 0 * * * * sudo python /usr/share/cbis/overcloud/postdeploy/templates/zabbix/tools/clear_outdated_alma_alarms.py
5448 # 0 3 * * * /bin/docker start elk-curator
5449 # 0 1 * * * /usr/bin/find /backup/* -mtime +3 -exec rm -rf {} \; > /dev/null 2>&1
5450 # @daily sh /usr/local/bin/check_passwd_expiry.sh
5451 # @daily /usr/sbin/aide --check | /bin/mail -s "overcloud-controller-dublin-1 - AIDE integrity check run" root
5452 # @daily sh /usr/local/bin/create_cert_exp_alarm_oc.sh
5453 # 0 0 * * 0 python /usr/share/cbis/overcloud/postdeploy/scripts/rabbit-drain-queues.py --user guest --password '95eodeHJBpfbdXYAiaXc635bK' --host 172.17.1.11
5454 # 0 2 * * * python /root/backup/CbisOvercloudDatabaseBackup.py
5455
5456 # note: ignoring rabbit-drain-queues.py and AIDE as they are unique per controller.
5457
5458 start=$(date +%s)
5459 STEPS_COUNTER=$((STEPS_COUNTER+1))
5460 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE EXPECTED CRONTAB JOBS EXIST ON THE CONTROLLERS (BASED ON CBIS 20 PP3) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5461 crontab_list=$(ansible controller -b -m shell -a "crontab -l" | grep -v ^# | tr -d [0-9] | sort | uniq -c | column -t)
5462 declare -a cronjobs=(check_restart_horizon zbx_metrics_exporter /var/log/zabbix/metrics/*.xml zbx_KPIs_exporter zabbix_db_partitions_manager clear_outdated_alma_alarms elk-curator /backup/* check_passwd_expiry CbisOvercloudDatabaseBackup AIDE rabbit-drain-queues)
5463 for cronjob in "${cronjobs[@]}"
5464 do
5465 missing_cronjob=$(echo -e "$crontab_list" | grep $cronjob | grep -v ^3)
5466 if [[ $missing_cronjob ]]
5467 then
5468 controllers_count=$(echo -e "$missing_cronjob" | awk '{print $1}')
5469 echo -e "${RED}$cronjob crontab job is found only on $controllers_count controllers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5470 else
5471 echo -e "${GREEN}$cronjob crontab job is found on all 3 controllers${NC}"
5472 fi
5473 done
5474 elapsed_time_seconds=$(expr $(date +%s) - $start)
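# note (illustrative): the check above relies on 'uniq -c' - after stripping digits (host indexes, passwords)
# and sorting, each distinct cronjob line is prefixed with the number of controllers it appears on, so any
# count other than 3 points at a missing or duplicated entry. a sketch of the same idea, run manually:
# ansible controller -b -m shell -a "crontab -l" | grep -v ^# | tr -d [0-9] | sort | uniq -c | grep -v '^ *3 '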
5475
5476
5477 ####################################################################################################
5478
5479
5480 start=$(date +%s)
5481 STEPS_COUNTER=$((STEPS_COUNTER+1))
5482 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THE PRESENCE OF CRITICAL CRONTAB JOBS ON THE UNDERCLOUD VM (BASED ON CBIS 20 PP3) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5483 crontab_list=$(sudo crontab -l | grep -v ^# && crontab -l | grep -v ^#)
5484 if [[ $cbis_version == "19.100.1" || $cbis_version == "19.0.0.1" ]]
5485 then
5486 declare -a cronjobs=(backup_fetcher check_fsid_mismatch heat-manage purge_deleted AIDE)
5487 else
5488 declare -a cronjobs=(backup_fetcher check_fsid_mismatch heat-manage purge_deleted AIDE create_cert_exp_alarm_uc)
5489 fi
5490 for cronjob in "${cronjobs[@]}"
5491 do
5492 missing_cronjob=$(echo -e "$crontab_list" | grep $cronjob)
5493 if [[ -z $missing_cronjob ]]
5494 then
5495 echo -e "${RED}$cronjob crontab job is missing${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5496 else
5497 echo -e "${GREEN}$cronjob crontab job is found${NC}"
5498 fi
5499 done
5500 elapsed_time_seconds=$(expr $(date +%s) - $start)
5501
5502
5503 ####################################################################################################
5504
5505
5506 start=$(date +%s)
5507 STEPS_COUNTER=$((STEPS_COUNTER+1))
5508 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE DUPLICATE CRONJOBS IN CRONTAB (+$elapsed_time_seconds `date '+%T'`)${NC}"
5509 duplicate_cronjob=$(ansible controller -b -m shell -a "crontab -l | grep -E -v '\#|AIDE' | sort | uniq -c | grep -E '^\s+[2-9] '" | grep -E -v 'non-zero return code|FAILED')
5510 if [[ $duplicate_cronjob ]]
5511 then
5512 echo -e "${RED}$duplicate_cronjob${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5513 echo -e "\n${ORANGE}CBIS-13672 - Duplicated zabbix metrics cronjob in controller (Detected in 19.0 MP4 PP1)${NC}"
5514 else
5515 echo -e "${GREEN}no duplicate cronjobs are found${NC}"
5516 fi
5517 elapsed_time_seconds=$(expr $(date +%s) - $start)
5518
5519
5520 ####################################################################################################
5521
5522
5523 start=$(date +%s)
5524 STEPS_COUNTER=$((STEPS_COUNTER+1))
5525 echo -e "${BLUE}\n\n$STEPS_COUNTER) PERFORM A LOGROTATE DRY RUN TO LOCATE ERRORS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5526 logrotate_linux=$(ansible all --limit '!hypervisor' -b -m shell -a "ls /etc/logrotate.d/ | tr -s ' ' /n | xargs -i logrotate -vdf /etc/logrotate.d/{} 2>&1 | grep -v 'No such file or directory' | grep -i error:" | grep -v -E 'rc=[1-9]|non-zero return code')
5527 logrotate_containers=$(ansible all --limit '!hypervisor' -b -m shell -a "logrotate -vdf /var/lib/config-data/puppet-generated/crond/etc/logrotate-crond.conf 2>&1 | grep -v 'No such file or directory' | grep -i error:" | grep -v -E 'rc=[1-9]|non-zero return code')
5528 if [[ $logrotate_linux || $logrotate_containers ]]
5529 then
5530 echo -e "${RED}$logrotate_linux\n$logrotate_containers${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5531 echo -e "\n${ORANGE}CBIS-16153 - zabbix_ceph_racks, zabbix_cbis and zabbix-agent logrotate conf files have permission issues (20)${NC}"
5532 else
5533 echo -e "${GREEN}no errors were found in the logrotate dry run${NC}"
5534 fi
5535 elapsed_time_seconds=$(expr $(date +%s) - $start)
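# illustrative manual equivalent on a single host (-v verbose, -d debug/dry-run, -f force); the target
# config file is only an example and may differ per host:
# logrotate -vdf /etc/logrotate.d/syslog 2>&1 | grep -i error: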
5536
5537
5538 ####################################################################################################
5539
5540
5541 start=$(date +%s)
5542 STEPS_COUNTER=$((STEPS_COUNTER+1))
5543 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE LOGROTATE CRONJOB IS CONFIGURED WITHIN THE logrotate_crond DOCKER CONTAINER OF EACH SERVER (+$elapsed_time_seconds `date '+%T'`)${NC}"
5544 logrotate_crond_crontab=$(ansible all --limit '!hypervisor' -b -m shell -a "docker exec -i logrotate_crond crontab -l | grep '/usr/sbin/logrotate -s' -c" | grep ^[0-9] | sort | uniq)
5545 if [[ $logrotate_crond_crontab != "1" ]]
5546 then
5547 logrotate_crond_crontab=$(ansible all --limit '!hypervisor' -b -m shell -a "docker exec -i logrotate_crond crontab -l | grep '/usr/sbin/logrotate -s'")
5548 echo -e "${RED}$logrotate_crond_crontab${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5549 else
5550 echo -e "${GREEN}the logrotate cronjob is configured within the logrotate_crond docker container of each server${NC}"
5551 fi
5552 elapsed_time_seconds=$(expr $(date +%s) - $start)
5554
5555
5556 ####################################################################################################
5557
5558
5559 start=$(date +%s)
5560 STEPS_COUNTER=$((STEPS_COUNTER+1))
5561 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT LOG FILES UNDER /var/log/containers/ ARE LOG-ROTATED (+$elapsed_time_seconds `date '+%T'`)${NC}"
5562 servers_without_logrotation=$(ansible all --limit '!hypervisor' -b -m shell -a "find /var/log/containers/ -iname '*.log*' | grep -v -E log$ | xargs -i ls -l {}" | grep -e '^$' -B 1 | awk '{print $1}' | column -t | sort | grep overcloud-)
5563 if [[ $servers_without_logrotation ]]
5564 then
5565 echo -e "${RED}unable to find a single log-rotated log file under /var/log/containers/* for the following hosts:\n\n$servers_without_logrotation${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5566 echo -e "\n\n${ORANGE}CBIS-16404 (19A) - logrotate isn't working on all the overcloud servers - daily cronjob doesn't work${NC}"
5567 else
5568 echo -e "${GREEN}at least one log file is found log-rotated on each host${NC}"
5569 fi
5570 elapsed_time_seconds=$(expr $(date +%s) - $start)
5571
5572
5573 ####################################################################################################
5574
5575
5576 start=$(date +%s)
5577 STEPS_COUNTER=$((STEPS_COUNTER+1))
5578 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THERE ARE NO DIFFERENCES IN THE LOGROTATE CONFIG FILES BETWEEN THE HOSTS OF EACH HOST-GROUP (+$elapsed_time_seconds `date '+%T'`)${NC}"
5579 echo -e "${CYAN}checking controllers${NC}"
5580 logrotate_sha1=$(ansible controller -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -E '^\s+[1-2]')
5581 if [[ $logrotate_sha1 ]]
5582 then
5583 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5584 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5585 else
5586 echo -e "${GREEN}the logrotate configuration files are identical on all the controllers${NC}"
5587 fi
5588 if [ $ansible_avrs_hosts_count -gt 1 ]
5589 then
5590 echo -e "${CYAN}checking avrs computes${NC}"
5591 logrotate_sha1=$(ansible *overcloud-[aA]vrs* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_avrs_hosts_count'')
5592 if [[ $logrotate_sha1 ]]
5593 then
5594 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5595 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5596 else
5597 echo -e "${GREEN}the logrotate configuration files are identical on all the avrs computes${NC}"
5598 fi
5599 fi
5600 if [ $ansible_dpdk_hosts_count -gt 1 ]
5601 then
5602 echo -e "${CYAN}checking dpdk computes${NC}"
5603 logrotate_sha1=$(ansible *overcloud-[dD]pdk* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_dpdk_hosts_count'')
5604 if [[ $logrotate_sha1 ]]
5605 then
5606 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5607 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5608 else
5609 echo -e "${GREEN}the logrotate configuration files are identical on all the dpdk computes${NC}"
5610 fi
5611 fi
5612 if [ $ansible_ovs_hosts_count -gt 1 ]
5613 then
5614 echo -e "${CYAN}checking ovs computes${NC}"
5615 logrotate_sha1=$(ansible *overcloud-[oO]vs* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_ovs_hosts_count'')
5616 if [[ $logrotate_sha1 ]]
5617 then
5618 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5619 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5620 else
5621 echo -e "${GREEN}the logrotate configuration files are identical on all the ovs computes${NC}"
5622 fi
5623 fi
5624 if [ $ansible_sriov_hosts_count -gt 1 ]
5625 then
5626 echo -e "${CYAN}checking sriov computes${NC}"
5627 logrotate_sha1=$(ansible *overcloud-[sS]riov* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_sriov_hosts_count'')
5628 if [[ $logrotate_sha1 ]]
5629 then
5630 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5631 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5632 else
5633 echo -e "${GREEN}the logrotate configuration files are identical on all the sriov computes${NC}"
5634 fi
5635 fi
5636 if [ $ansible_storage_hosts_count -gt 1 ]
5637 then
5638 echo -e "${CYAN}checking storage nodes${NC}"
5639 logrotate_sha1=$(ansible *overcloud-[sS]torage* -b -m shell -a "ls /etc/logrotate.d/ | xargs -i sha1sum /etc/logrotate.d/{}" | grep ^[0-9,a-f] | sort | uniq -c | grep -vE '^\s+'$ansible_storage_hosts_count'')
5640 if [[ $logrotate_sha1 ]]
5641 then
5642 echo -e "${RED}$logrotate_sha1${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5643 echo -e "\n\n${ORANGE}CBIS-16138 - zabbix-sender and zabbix-server missing from the replaced controllers/scaled-out compute/storage nodes (20)${NC}\n"
5644 else
5645 echo -e "${GREEN}the logrotate configuration files are identical on all the storage nodes${NC}"
5646 fi
5647 fi
5648 elapsed_time_seconds=$(expr $(date +%s) - $start)
5649
5650
5651 ####################################################################################################
5652
5653
5654 if [[ -f "$logs_dir/sriov_vf_interfaces" ]]
5655 then
5656 : # already checked in a previous run
5657 else
5658 start=$(date +%s)
5659 STEPS_COUNTER=$((STEPS_COUNTER+1))
5660 echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE 2 SRIOV NICS ARE MAPPED TO THE 2 NUMAS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5661 touch $logs_dir/sriov_vf_interfaces
5662 if [[ $hw_model == "airframe_or" || $hw_model == "hp-c7kg9" ]]
5663 then
5664 echo -e "${ORANGE}not applicable for airframe_or (OR17) and hp-c7kg9.\nwhen performing/designing tests around these hardware models, note that the sriov network interfaces are all mapped to a single numa (numa 0)${NC}"
5665 else
5666 if [[ $ansible_sriov_hosts_count == "0" ]]
5667 then
5668 echo -e "${ORANGE}couldn't find sriov computes${NC}"
5669 else
5670 sriov_numa_mapping=$(ansible Sriov -b -m shell -a "ip link show | grep 'vf 0' -B2 | grep ^[0-9] | awk -F: '{print \$2}' | xargs -i cat /sys/class/net/{}/device/numa_node | sort -u | wc -l" | grep ^[0-9] | sort -u)
5671 if [[ $sriov_numa_mapping == "2" ]]
5672 then
5673 echo -e "${GREEN}the sriov computes network interfaces are mapped to each individual NUMA as expected${NC}"
5674 echo -e "$sriov_numa_mapping" > $logs_dir/sriov_vf_interfaces
5675 else
5676 sriov_vf_interfaces=$(ansible Sriov -b -m shell -a "ip link show | grep 'vf 0' -B2 | grep ^[0-9] | awk -F: '{print \$2}' | xargs -i echo /sys/class/net/{}/device/numa_node")
5677 echo -e "${RED}$sriov_vf_interfaces${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5678
5679 fi
5680 fi
5681 fi
5682 fi
5683 elapsed_time_seconds=$(expr $(date +%s) - $start)
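# note (illustrative): the NUMA affinity of a network interface can be read directly from sysfs;
# replace <interface> with a real SR-IOV PF name on the compute (the name below is only an example):
# cat /sys/class/net/<interface>/device/numa_node     # prints 0, 1, or -1 when unknown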
5684
5685
5686 ####################################################################################################
5687
5688
5689 start=$(date +%s)
5690 STEPS_COUNTER=$((STEPS_COUNTER+1))
5691 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR DISK ERRORS USING SMARTCTL (+$elapsed_time_seconds `date '+%T'`)${NC}"
5692 disk_errors=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "smartctl --scan | cut -d' ' -f1-3 | while read line; do smartctl -x \${line} | awk '{ if ( \$4 > 10 ) print \$1,\$2,\$3,\$4 }'; done | grep 'Device Error Count'" | grep -E -v 'FAILED|non-zero return code')
5693 if [[ -z $disk_errors ]]
5694 then
5695 echo -e "${GREEN}no device errors found${NC}"
5696 else
5697 echo -e "${RED}$disk_errors${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5698 fi
5699 elapsed_time_seconds=$(expr $(date +%s) - $start)
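# illustrative manual equivalent on a single host: enumerate the disks smartctl can see and print the
# error counter of one of them; the check above flags counters greater than 10 (/dev/sda is only an example):
# smartctl --scan
# smartctl -x /dev/sda | grep 'Device Error Count'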
5700
5701
5702 ####################################################################################################
5703
5704
5705 start=$(date +%s)
5706 STEPS_COUNTER=$((STEPS_COUNTER+1))
5707 echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE 'Too many open files' IN /var/log/messages ON ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5708 files=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -i -R 'Too many open files' /var/log/messages" | grep -E -v 'Invoked with warn=True|SUCCESS')
5709 if [[ $files ]]
5710 then
5711 files=$(ansible all --limit '!hypervisor' -b -m shell -a "grep -i -R 'Too many open files' /var/log/messages | grep -v 'Invoked with warn=True'"| grep -E -v 'FAILED|non-zero return code')
5712 echo -e "${RED}$files${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5713 else
5714 echo -e "${GREEN}no 'Too many open files' lines found under /var/log/messages${NC}"
5715 fi
5716 elapsed_time_seconds=$(expr $(date +%s) - $start)
5717
5718
5719 ####################################################################################################
5720
5721
5722 start=$(date +%s)
5723 STEPS_COUNTER=$((STEPS_COUNTER+1))
5724 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK RPMS DIFFERENCES BETWEEN THE HOSTS OF EACH HOST-GROUP (+$elapsed_time_seconds `date '+%T'`)${NC}"
5725 echo -e "${CYAN}checking for rpms delta between all the > controllers < (to cover a possible rpms mismatch after controller replacement)${NC}"
5726 md5sum=$(ansible controller -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5727 if [[ $md5sum == "1" ]]
5728 then
5729 echo -e "${GREEN}no differences are found between the current installed rpms of the controllers${NC}"
5730 else
5731 echo -e "${RED}one or more controllers have a different md5 checksum. compare the latest_installed_rpms.txt file of each controller${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5732 md5sum=$(ansible controller -b -m shell -a "md5sum latest_installed_rpms.txt")
5733 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5734 fi
5735 if [[ $ansible_dpdk_hosts_count != "0" && $ansible_dpdk_hosts_count != "1" ]]
5736 then
5737 echo -e "${CYAN}checking for rpms delta between all the dpdk computes${NC}"
5738 md5sum=$(ansible Dpdk* -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5739 if [[ $md5sum == "1" ]]
5740 then
5741 echo -e "${GREEN}no differences are found between the current installed rpms of the dpdk computes${NC}"
5742 else
5743 echo -e "${RED}one or more computes have a different md5 checksum. compare the latest_installed_rpms.txt file of each compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5744 md5sum=$(ansible Dpdk* -b -m shell -a "md5sum latest_installed_rpms.txt")
5745 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5746 fi
5747 fi
5748 if [[ $ansible_sriov_hosts_count != "0" && $ansible_sriov_hosts_count != "1" ]]
5749 then
5750 echo -e "${CYAN}checking for rpms delta between all the sriov computes${NC}"
5751 md5sum=$(ansible Sriov -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5752 if [[ $md5sum == "1" ]]
5753 then
5754 echo -e "${GREEN}no differences are found between the current installed rpms of the sriov computes${NC}"
5755 else
5756 echo -e "${RED}one or more computes have a different md5 checksum. compare the latest_installed_rpms.txt file of each compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5757 md5sum=$(ansible Sriov -b -m shell -a "md5sum latest_installed_rpms.txt")
5758 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5759 fi
5760 fi
5761 if [[ $ansible_ovs_hosts_count != "0" && $ansible_ovs_hosts_count != "1" ]]
5762 then
5763 echo -e "${CYAN}checking for rpms delta between all the ovs computes${NC}"
5764 md5sum=$(ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5765 if [[ $md5sum == "1" ]]
5766 then
5767 echo -e "${GREEN}no differences are found between the current installed rpms of the ovs computes${NC}"
5768 else
5769 echo -e "${RED}one or more computes have a different md5 checksum. compare the latest_installed_rpms.txt file of each compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5770 md5sum=$(ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum latest_installed_rpms.txt")
5771 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5772 fi
5773 fi
5774 if [[ $ansible_avrs_hosts_count != "0" && $ansible_avrs_hosts_count != "1" && $nuage == "true" ]]
5775 then
5776 echo -e "${CYAN}checking for rpms delta between all the avrs computes${NC}"
5777 md5sum=$(ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5778 if [[ $md5sum == "1" ]]
5779 then
5780 echo -e "${GREEN}no differences are found between the current installed rpms of the avrs computes${NC}"
5781 else
5782 echo -e "${RED}one or more computes have a different md5 checksum. compare the latest_installed_rpms.txt file of each compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5783 md5sum=$(ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum latest_installed_rpms.txt")
5784 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5785 fi
5786 fi
5787 if [[ $ansible_storage_hosts_count != "0" ]]
5788 then
5789 echo -e "${CYAN}checking for rpms delta between all the > storage nodes < (to cover a possible rpms mismatch after scale-out)${NC}"
5790 md5sum=$(ansible *overcloud-[Ss]torage* -b -m shell -a "md5sum latest_installed_rpms.txt" | awk '{print $1}' | grep -vi overcloud | sort -u | wc -l)
5791 if [[ $md5sum == "1" ]]
5792 then
5793 echo -e "${GREEN}no differences are found between the current installed rpms of the storage nodes${NC}"
5794 else
5795 echo -e "${RED}one or more storage nodes have a different md5 checksum. compare the latest_installed_rpms.txt file of each storage node${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5796 md5sum=$(ansible *overcloud-[Ss]torage* -b -m shell -a "md5sum latest_installed_rpms.txt")
5797 echo -e "${RED}$md5sum${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5798 fi
5799 fi
5800 elapsed_time_seconds=$(expr $(date +%s) - $start)
5801
5802
5803 ####################################################################################################
5804
5805
5806 start=$(date +%s)
5807 STEPS_COUNTER=$((STEPS_COUNTER+1))
5808 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT /root/cbis-installer/user_config.yaml EXISTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5809 user_config_hv=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "test -f /root/cbis-installer/user_config.yaml && echo '/root/cbis-installer/user_config.yaml exists in the hypervisor'" | grep exists)
5810 if [[ $user_config_hv ]]
5811 then
5812 echo -e "${GREEN}$user_config_hv${NC}"
5813 else
5814 echo -e "${RED}/root/cbis-installer/user_config.yaml can't be found in the hypervisor${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5815 fi
5816 elapsed_time_seconds=$(expr $(date +%s) - $start)
5817
5818
5819 ####################################################################################################
5820
5821
5822 start=$(date +%s)
5823 STEPS_COUNTER=$((STEPS_COUNTER+1))
5824 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THE TIME IT TAKES FOR nova list TO FINISH PROCESSING FROM BOTH stackrc AND overcloudrc AND REPORT FAILURE IF IT IS GREATER THAN 10 SECONDS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5825 # when using self signed certificate the following error may show: Certificate for 10.55.220.115 has no `subjectAltName`, falling back to check for a `commonName` for now. This feature is being removed by major browsers and deprecated by RFC 2818.
5826 # this certificate warning is expected and not a bug according to Yves Brissette.
5827 source ~/stackrc
5828 for i in {1..3}
5829 do
5830 start_time=$(date +%s)
5831 nova list > /dev/null
5832 end_time=$(date +%s)
5833 result_in_seconds=$(expr $end_time - $start_time)
5834 if [ $result_in_seconds -gt 10 ]
5835 then
5836 echo -e "${RED}try $i/3 - nova list (stackrc) finished processing after $result_in_seconds seconds (fail criteria is > 10 seconds)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5837 else
5838 echo -e "${GREEN}try $i/3 - nova list (stackrc) finished processing after $result_in_seconds seconds${NC}"
5839 fi
5840 done
5841 source ~/overcloudrc
5842 for i in {1..3}
5843 do
5844 start_time=$(date +%s)
5845 nova list > /dev/null
5846 end_time=$(date +%s)
5847 result_in_seconds=$(expr $end_time - $start_time)
5848 if [ $result_in_seconds -gt 10 ]
5849 then
5850 echo -e "${RED}try $i/3 - nova list (overcloudrc) finished processing after $result_in_seconds seconds (fail criteria is > 10 seconds)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5851 else
5852 echo -e "${GREEN}try $i/3 - nova list (overcloudrc) finished processing after $result_in_seconds seconds${NC}"
5853 fi
5854 done
5855 elapsed_time_seconds=$(expr $(date +%s) - $start)
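# note (illustrative): each 'nova list' above is timed with plain epoch arithmetic; the same measurement
# can be reproduced manually (assumes the relevant rc file is sourced):
# source ~/overcloudrc && time nova list > /dev/null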
5856
5857
5858 ####################################################################################################
5859
5860
5861 start=$(date +%s)
5862 STEPS_COUNTER=$((STEPS_COUNTER+1))
5863 echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY admin CORE RESOURCES QUOTA IS -1 (UNLIMITED) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5864 quota=$(source ~/overcloudrc && openstack quota show admin | grep -E -w 'backups|cores|instances|networks|ram|volumes' | awk '{print $4}' | sort -u)
5865 if [[ $quota != "-1" ]]
5866 then
5867 quota=$(source ~/overcloudrc && openstack quota show admin)
5868 echo -e "${RED}$quota${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5869 else
5870 echo -e "${GREEN}core resources (backups, cores, instances, networks, ram and volumes) are all set to -1 (unlimited) quota${NC}"
5871 fi
5872 elapsed_time_seconds=$(expr $(date +%s) - $start)
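# illustrative remediation (not performed by this script): a core resource that is not unlimited can
# usually be reset with the openstack client, e.g.:
# source ~/overcloudrc && openstack quota set --cores -1 --instances -1 --ram -1 admin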
5873
5874
5875 ####################################################################################################
5876
5877
5878 start=$(date +%s)
5879 STEPS_COUNTER=$((STEPS_COUNTER+1))
5880 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT CORE OPENSTACK CLIENTS USE THE EXPECTED VERSION (+$elapsed_time_seconds `date '+%T'`)${NC}"
5881 current=( $(ansible all --limit '!hypervisor' -b -m shell -a "openstack module list | tr -d '\|\+\-' | grep -E -w 'aodhclient|barbicanclient|cinderclient|glanceclient|heatclient|ironicclient|keystoneclient|mistralclient|novaclient|openstack|openstackclient|swiftclient|vitrageclient'" | grep -v SUCCESS | awk '{print $1,$2}' | sort -u | tr -s ' ' '-' | paste -sd' ') )
5882 if [[ $cbis_version == "19.0.0.1" ]]
5883 then
5884 expected=(aodhclient-1.0.0 barbicanclient-4.6.0 cinderclient-3.5.0 glanceclient-2.10.0 heatclient-1.14.0 ironicclient-2.2.1 keystoneclient-3.15.0 mistralclient-3.3.0 novaclient-10.1.0 openstack-0.11.3 openstackclient-3.14.2 swiftclient-3.5.0 vitrageclient-0.0.1)
5885 expected_current_diff=(`echo ${current[@]} ${expected[@]} | tr ' ' '\n' | sort | uniq -u | paste -sd'|'`)
5886 if [[ $expected_current_diff ]]
5887 then
5888 echo -e "${RED}$expected_current_diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5889 else
5890 echo -e "${GREEN}all diagnosed openstack clients are at the expected version${NC}"
5891 fi
5892 fi
5893 if [[ $cbis_version == "19.100.1" ]]
5894 then
5895 expected=(aodhclient-1.1.0 barbicanclient-4.7.2 cinderclient-4.0.2 glanceclient-2.13.1 heatclient-1.16.2 ironicclient-2.5.1 keystoneclient-3.17.0 mistralclient-3.7.0 novaclient-11.0.0 openstack-0.17.2 openstackclient-3.16.2 swiftclient-3.6.0 vitrageclient-2.3.0)
5896 expected_current_diff=(`echo ${current[@]} ${expected[@]} | tr ' ' '\n' | sort | uniq -u | paste -sd'|'`)
5897 if [[ $expected_current_diff ]]
5898 then
5899 echo -e "${RED}$expected_current_diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5900 else
5901 echo -e "${GREEN}all diagnosed openstack clients are at the expected version${NC}"
5902 fi
5903 fi
5904 if [[ $cbis_version == "20.100.1" ]]
5905 then
5906 expected=(aodhclient-1.1.0 barbicanclient-4.7.2 cinderclient-4.0.2 glanceclient-2.13.1 heatclient-1.16.2 ironicclient-2.5.1 keystoneclient-3.17.0 mistralclient-3.7.0 novaclient-11.0.0 openstack-0.17.2 openstackclient-3.16.2 swiftclient-3.6.0)
5907 expected_current_diff=(`echo ${current[@]} ${expected[@]} | tr ' ' '\n' | sort | uniq -u | paste -sd'|'`)
5908 if [[ $expected_current_diff ]]
5909 then
5910 echo -e "${RED}$expected_current_diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5911 else
5912 echo -e "${GREEN}all diagnosed openstack clients are at the expected version${NC}"
5913 fi
5914 fi
5915 elapsed_time_seconds=$(expr $(date +%s) - $start)
5916
5917
5918 ####################################################################################################
5919
5920
5921 start=$(date +%s)
5922 STEPS_COUNTER=$((STEPS_COUNTER+1))
5923 echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT THE CBIS REPO IS ENABLED ON THE OVERCLOUD HOSTS (ICE-575) (+$elapsed_time_seconds `date '+%T'`)${NC}"
5924 cbis_repo_enabled=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "yum repolist all warn=False" | grep -E '^CBIS |^!CBIS ' | grep -v enabled)
5925 if [[ -z $cbis_repo_enabled ]]
5926 then
5927 echo -e "${GREEN}CBIS repo is enabled on all the overcloud hosts${NC}"
5928 else
5929 cbis_repo_enabled=$(ansible all --limit '!hypervisor' -b -m shell -a "yum repolist all | grep -E '^CBIS |^!CBIS ' warn=False")
5930 echo -e "${RED}$cbis_repo_enabled${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5931 fi
5932 elapsed_time_seconds=$(expr $(date +%s) - $start)
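# illustrative remediation (not performed by this script): a disabled repo can typically be re-enabled
# with yum-config-manager; the repo id below is only an example and may differ per CBIS release:
# yum-config-manager --enable CBIS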
5933
5934
5935 ####################################################################################################
5936
5937
5938 start=$(date +%s)
5939 STEPS_COUNTER=$((STEPS_COUNTER+1))
5940 echo -e "${BLUE}\n\n$STEPS_COUNTER) SHOW THE LATEST REBOOT + salt CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
5941 LAST_REBOOT=$(/home/stack/venv/salt-ssh/bin/python /home/stack/venv/salt-ssh/bin/salt-ssh -c /home/stack/salt/etc/salt/ --log-file /home/stack/salt/var/log/salt/ssh --no-host-keys "*" cmd.run "last reboot | grep reboot | head -n 1")
5942 echo -e "${GREEN}$LAST_REBOOT${NC}"
5943 elapsed_time_seconds=$(expr $(date +%s) - $start)
5944
5945
5946 ####################################################################################################
5947
5948 # start=$(date +%s)
5949 # STEPS_COUNTER=$((STEPS_COUNTER+1))
5950 # echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE ATTACHED USB DEVICES ON ALL HOSTS(+$elapsed_time_seconds `date '+%T'`)${NC}"
5951 # usb=$(ansible all -b -m shell -a "lsblk --all -S | grep usb" | grep usb -B 1)
5952 # if [[ $usb ]]
5953 # then
5954 # echo -e "${RED}$usb${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5955 # else
5956 # echo -e "${GREEN}couldn't find attached usb devices${NC}"
5957 # fi
5958 # elapsed_time_seconds=$(expr $(date +%s) - $start)
5959
5960
5961 ####################################################################################################
5962
5963
5964 # start=$(date +%s)
5965 # STEPS_COUNTER=$((STEPS_COUNTER+1))
5966 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT PER-USER OPEN FILES LIMIT IS IDENTICAL FOR ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
5967 # open_files_limit_per_user=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /etc/security/limits.conf" | grep -E 'soft memlock|hard memlock' | awk '{print $NF}' | sort --uniq)
5968 # if [[ $open_files_limit_per_user != "unlimited" ]]
5969 # then
5970 # open_files_limit_per_user=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /etc/security/limits.conf | grep -E 'soft memlock|hard memlock' | awk '{print $NF}' | sort --uniq")
5971 # echo -e "${RED}$open_files_limit_per_user${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5972 # else
5973 # echo -e "${GREEN}all the hosts returned unlimited for both soft memlock and hard memlock under /etc/security/limits.conf${NC}"
5974 # fi
5975 # elapsed_time_seconds=$(expr $(date +%s) - $start)
5976
5977
5978 ####################################################################################################
5979
5980
5981 # start=$(date +%s)
5982 # STEPS_COUNTER=$((STEPS_COUNTER+1))
5983 # echo -e "${BLUE}\n\n$STEPS_COUNTER) ZABBIX LOGIN CHECK (+$elapsed_time_seconds `date '+%T'`)${NC}"
5984 # http_availability=$(nc -v -i1 -w1 135.248.16.107 80 2>&1 | grep 'Connected to')
5985 # if [[ $http_availability ]]
5986 # then
5987 # ansible controller -b -m shell -a "wget -N http://135.248.16.107/testmanager/cbis/scripts/python_scripts/zabbix_connectivity.py warn=False" > /dev/null
5988 # zabbix=$(ansible controller -b -m shell -a "python zabbix_connectivity.py" | grep -c Running)
5989 # if [[ $zabbix == "3" || $zabbix == "1" ]]
5990 # then
5991 # echo -e "${GREEN}zabbix login page is accessible (via API) from all the controllers${NC}"
5992 # else
5993 # zabbix=$(ansible controller -b -m shell -a "python zabbix_connectivity.py")
5994 # echo -e "${RED}$zabbix${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5995 # fi
5996 # else
5997 # echo -e "${RED}can't download the zabbix script from 135.248.16.107 - check for zabbix alarms manually${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
5998 # fi
5999 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6000
6001
6002 ####################################################################################################
6003
6004
6005 # start=$(date +%s)
6006 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6007 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT sshd_migration_fix AND sshd_migration_fix_2 RESIDE WITHIN THE SELINUX POLICY MODULES (+$elapsed_time_seconds `date '+%T'`)${NC}"
6008 # selinux_migrate_fix=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "semodule -l" | grep -E 'sshd_migration_fix_2|sshd_migration_fix' | sort -u | wc -l)
6009 # if [[ $selinux_migrate_fix != "2" ]]
6010 # then
6011 # selinux_migrate_fix=$(ansible compute -b -m shell -a "semodule -l | grep -E 'sshd_migration_fix_2|sshd_migration_fix'")
6012 # echo -e "${RED}$selinux_migrate_fix${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6013 # else
6014 # echo -e "${GREEN}sshd_migration_fix and sshd_migration_fix_2 reside within the SELinux policy modules of the computes${NC}"
6015 # fi
6016 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6017
6018
6019 ####################################################################################################
6020
6021
6022 # start=$(date +%s)
6023 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6024 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK EACH HOST FREE MEMORY AND REPORT FAILURE IF A HOST HAD LESS THEN 2G FREE MEMORY (free) (+$elapsed_time_seconds `date '+%T'`)${NC}"
6025 # free_memory=$(ansible all --limit '!hypervisor' -b -m shell -a "free -g | grep Mem: | awk '{ if ( \$4 < 1 ) print \$4 }'" | grep ^[0-9] -B1)
6026 # if [[ $free_memory ]]
6027 # then
6028 # echo -e "${RED}$free_memory${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6029 # else
6030 # echo -e "${GREEN}all the servers has equal or more then 2G free memory${NC}"
6031 # fi
6032 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6033
6034
6035 ####################################################################################################
6036
6037
6038 # start=$(date +%s)
6039 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6040 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT SYSTEM FILE DESCRIPTOR LIMIT AND USAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
6041 # used_file_descriptor=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /proc/sys/fs/file-nr | awk '{print \$1}'")
6042 # total_file_descriptor=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /proc/sys/fs/file-nr | awk '{print \$3}'")
6043 # if [[ $used_file_descriptor -ge $total_file_descriptor ]]
6044 # then
6045 # file_descriptor=$(ansible all --limit '!hypervisor' -b -m shell -a "cat /proc/sys/fs/file-nr")
6046 # echo -e "${RED}$file_descriptor${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6047 # else
6048 # echo -e "${GREEN}the file descriptor limit is not reached${NC}"
6049 # fi
6050 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6051
6052
6053 ####################################################################################################
6054
6055
6056 # start=$(date +%s)
6057 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6058 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT RABBITMQ FILE DESCRIPTOR/SOCKETS/PROCESSES LIMIT AND USAGE (+$elapsed_time_seconds `date '+%T'`)${NC}"
6059 # ansible controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl report | grep file_descriptors -A5"
6060 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6061
6062
6063 ####################################################################################################
6064
6065
6066 # start=$(date +%s)
6067 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6068 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT MYSQL DATABASES SIZE (+$elapsed_time_seconds `date '+%T'`)${NC}"
6069 # ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT table_schema 'DATABASE', sum(data_length + index_length)/1024/1024 'SIZE_IN_MB' FROM information_schema.TABLES GROUP BY table_schema;\"" | column -t
6070 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6071
6072
6073 ####################################################################################################
6074
6075
6076 # start=$(date +%s)
6077 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6078 # if [[ $ansible_storage_hosts_count != "0" ]]
6079 # then
6080 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK IF THE CephStorage GROUP WITHIN THE /etc/ansible/hosts FILE IS WITHOUT COMPUTE HOSTS (BASED ON CBIS-13164) (+$elapsed_time_seconds `date '+%T'`)${NC}"
6081 # $ansible_hosts=$(sed -n -e '/[[Cc]eph[Ss]torage\]/,/\[/ p' /etc/ansible/hosts | grep overcloud | grep -v [Ss]torage | sort -u)
6082 # if [[ $ansible_hosts ]]
6083 # then
6084 # echo -e "${RED}the following servers should not appear under the CephStorage group in /etc/ansible/hosts${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6085 # echo -e "${RED}$ansible_hosts${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6086 # else
6087 # echo -e "${GREEN}no servers other then storage-nodes found under the cephstorage group within /etc/ansible/hosts ${NC}"
6088 # fi
6089 # fi
6090 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6091
6092
6093 ####################################################################################################
6094
6095
6096 # start=$(date +%s)
6097 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6098 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECKING THE OVERCLOUDRC FILE VALIDITY (+$elapsed_time_seconds `date '+%T'`)${NC}"
6099 # expected_lines=$(cat ~/overcloudrc | grep -c -E 'OS_USERNAME=admin|OS_CLOUDNAME=overcloud|OS_PROJECT_NAME=admin|OS_TENANT_NAME=admin|OS_CACERT=/home/stack/ca.crt.pem')
6100 # if [[ $expected_lines == "5" ]]
6101 # then
6102 # echo -e "${GREEN}the expected configuration in the overcloudrc file was found${NC}"
6103 # else
6104 # echo -e "${RED}one or more from the following lines missing from the overcloudrc file\nOS_USERNAME=admin\nOS_CLOUDNAME=overcloud\nOS_PROJECT_NAME=admin\nOS_TENANT_NAME=admin\nOS_CACERT=/home/stack/ca.crt.pem${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6105 # cat ~/overcloudrc
6106 # fi
6107 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6108
6109
6110 ####################################################################################################
6111
6112
6113 # start=$(date +%s)
6114 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6115 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT VITRAGE ALARMS HISTORY FROM THE DB FROM THE PAST 1 HOUR (+$elapsed_time_seconds `date '+%T'`)${NC}"
6116 # from_date=$(date -d "-1 hour" +%Y"-"%m"-"%d" "%T)
6117 # to_date=$(date -d "-0 hour" +%Y"-"%m"-"%d" "%T)
6118 # vitrage_history=$(ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT created_at,name FROM alarms WHERE start_timestamp BETWEEN '$from_date' AND '$to_date'\G;\" vitrage | grep -v '\*' | grep -v 'rc=0'" | grep created_at -A1)
6119 # if [[ -z $vitrage_history ]]
6120 # then
6121 # echo -e "${GREEN}no alarms were intiated in the past 1 hour${NC}"
6122 # else
6123 # echo -e "${RED}$vitrage_history${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6124 # fi
6125 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6126
6127
6128 ####################################################################################################
6129
6130
6131 # start=$(date +%s)
6132 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6133 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT /etc/sysconfig/iptables IS IDENTICAL BETWEEN ALL THE HOSTS UNDER EACH HOST-GROUP (+$elapsed_time_seconds `date '+%T'`)${NC}"
6134 # echo -e "\n${CYAN}now checking DpdkPerformanceCompute${NC}"
6135 # if [[ $ansible_dpdk_hosts_count != "0" ]]
6136 # then
6137 # iptables=$(ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u | wc -l)
6138 # if [[ $iptables != "1" ]]
6139 # then
6140 # echo -e "${RED}one or more dpdk computes has different content inside /etc/sysconfig/iptables${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6141 # iptables=$(ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables")
6142 # echo -e "${RED}$iptables${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6143 # if [[ ! -f "dpdk_initial_iptables_config" ]]
6144 # then
6145 # ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > dpdk_initial_iptables_config
6146 # else
6147 # ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > dpdk_latest_iptables_config
6148 # diff=$(sudo diff -s dpdk_initial_iptables_config dpdk_latest_iptables_config | grep -c 'Files dpdk_initial_iptables_config and dpdk_latest_iptables_config are identical')
6149 # if [[ $diff != "1" ]]
6150 # then
6151 # echo -e "\n${RED}differences were found between dpdk_initial_iptables_config and dpdk_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6152 # diff=$(diff dpdk_initial_iptables_config dpdk_latest_iptables_config)
6153 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6154 # else
6155 # echo -e "\n${GREEN}no differences were found between dpdk_initial_iptables_config and dpdk_latest_iptables_config${NC}"
6156 # fi
6157 # fi
6158 # else
6159 # echo -e "${GREEN}/etc/sysconfig/iptable is idetical on all the dpdk servers${NC}"
6160 # if [[ ! -f "dpdk_initial_iptables_config" ]]
6161 # then
6162 # ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > dpdk_initial_iptables_config
6163 # else
6164 # ansible *overcloud-[Dd]pdk* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > dpdk_latest_iptables_config
6165 # diff=$(sudo diff -s dpdk_initial_iptables_config dpdk_latest_iptables_config | grep -c 'Files dpdk_initial_iptables_config and dpdk_latest_iptables_config are identical')
6166 # if [[ $diff != "1" ]]
6167 # then
6168 # echo -e "\n${RED}differences were found between dpdk_initial_iptables_config and dpdk_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6169 # diff=$(diff dpdk_initial_iptables_config dpdk_latest_iptables_config)
6170 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6171 # else
6172 # echo -e "${GREEN}no differences were found between dpdk_initial_iptables_config and dpdk_latest_iptables_config${NC}"
6173 # fi
6174 # fi
6175 # fi
6176 # else
6177 # echo -e "${ORANGE}no dpdk computes found${NC}"
6178 # fi
6179 # echo -e "\n${CYAN}now checking OvsCompute${NC}"
6180 # if [[ $ansible_ovs_hosts_count != "0" ]]
6181 # then
6182 # iptables=$(ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u | wc -l)
6183 # if [[ $iptables != "1" ]]
6184 # then
6185 #                 echo -e "${RED}one or more OVS computes have different content inside /etc/sysconfig/iptables${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6186 # iptables=$(ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables")
6187 # echo -e "${RED}$iptables${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6188 # if [[ ! -f "ovs_initial_iptables_config" ]]
6189 # then
6190 # ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > ovs_initial_iptables_config
6191 # else
6192 # ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > ovs_latest_iptables_config
6193 # diff=$(sudo diff -s ovs_initial_iptables_config ovs_latest_iptables_config | grep -c 'Files ovs_initial_iptables_config and ovs_latest_iptables_config are identical')
6194 # if [[ $diff != "1" ]]
6195 # then
6196 # echo -e "\n${RED}differences were found between ovs_initial_iptables_config and ovs_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6197 # diff=$(diff ovs_initial_iptables_config ovs_latest_iptables_config)
6198 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6199 # else
6200 # echo -e "\n${GREEN}no differences were found between ovs_initial_iptables_config and ovs_latest_iptables_config${NC}"
6201 # fi
6202 # fi
6203 # else
6204 #                 echo -e "${GREEN}/etc/sysconfig/iptables is identical on all the OVS servers${NC}"
6205 # if [[ ! -f "ovs_initial_iptables_config" ]]
6206 # then
6207 # ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > ovs_initial_iptables_config
6208 # else
6209 # ansible *overcloud-[Oo]vs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > ovs_latest_iptables_config
6210 # diff=$(sudo diff -s ovs_initial_iptables_config ovs_latest_iptables_config | grep -c 'Files ovs_initial_iptables_config and ovs_latest_iptables_config are identical')
6211 # if [[ $diff != "1" ]]
6212 # then
6213 # echo -e "\n${RED}differences were found between ovs_initial_iptables_config and ovs_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6214 # diff=$(diff ovs_initial_iptables_config ovs_latest_iptables_config)
6215 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6216 # else
6217 # echo -e "${GREEN}no differences were found between ovs_initial_iptables_config and ovs_latest_iptables_config${NC}"
6218 # fi
6219 # fi
6220 # fi
6221 # else
6222 # echo -e "${ORANGE}no OVS computes found${NC}"
6223 # fi
6224 # echo -e "\n${CYAN}now checking SriovPerformanceCompute${NC}"
6225 # if [[ $ansible_sriov_hosts_count != "0" ]]
6226 # then
6227 # iptables=$(ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u | wc -l)
6228 # if [[ $iptables != "1" ]]
6229 # then
6230 #                 echo -e "${RED}one or more sriov computes have different content inside /etc/sysconfig/iptables${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6231 # iptables=$(ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables")
6232 # echo -e "${RED}$iptables${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6233 # if [[ ! -f "sriov_initial_iptables_config" ]]
6234 # then
6235 # ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > sriov_initial_iptables_config
6236 # else
6237 # ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > sriov_latest_iptables_config
6238 # diff=$(sudo diff -s sriov_initial_iptables_config sriov_latest_iptables_config | grep -c 'Files sriov_initial_iptables_config and sriov_latest_iptables_config are identical')
6239 # if [[ $diff != "1" ]]
6240 # then
6241 # echo -e "\n${RED}differences were found between sriov_initial_iptables_config and sriov_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6242 # diff=$(diff sriov_initial_iptables_config sriov_latest_iptables_config)
6243 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6244 # else
6245 # echo -e "\n${GREEN}no differences were found between sriov_initial_iptables_config and sriov_latest_iptables_config${NC}"
6246 # fi
6247 # fi
6248 # else
6249 #                 echo -e "${GREEN}/etc/sysconfig/iptables is identical on all the sriov servers${NC}"
6250 # if [[ ! -f "sriov_initial_iptables_config" ]]
6251 # then
6252 # ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > sriov_initial_iptables_config
6253 # else
6254 # ansible *overcloud-[Ss]riov* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > sriov_latest_iptables_config
6255 # diff=$(sudo diff -s sriov_initial_iptables_config sriov_latest_iptables_config | grep -c 'Files sriov_initial_iptables_config and sriov_latest_iptables_config are identical')
6256 # if [[ $diff != "1" ]]
6257 # then
6258 # echo -e "\n${RED}differences were found between sriov_initial_iptables_config and sriov_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6259 # diff=$(diff sriov_initial_iptables_config sriov_latest_iptables_config)
6260 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6261 # else
6262 # echo -e "${GREEN}no differences were found between sriov_initial_iptables_config and sriov_latest_iptables_config${NC}"
6263 # fi
6264 # fi
6265 # fi
6266 # else
6267 # echo -e "${ORANGE}no sriov computes found${NC}"
6268 # fi
6269 # echo -e "\n${CYAN}now checking AvrsCompute${NC}"
6270 # if [[ $ansible_avrs_hosts_count != "0" && $nuage == "true" ]]
6271 # then
6272 # iptables=$(ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u | wc -l)
6273 # if [[ $iptables != "1" ]]
6274 # then
6275 #                 echo -e "${RED}one or more avrs computes have different content inside /etc/sysconfig/iptables${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6276 # iptables=$(ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables")
6277 # echo -e "${RED}$iptables${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6278 # if [[ ! -f "avrs_initial_iptables_config" ]]
6279 # then
6280 # ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > avrs_initial_iptables_config
6281 # else
6282 # ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > avrs_latest_iptables_config
6283 # diff=$(sudo diff -s avrs_initial_iptables_config avrs_latest_iptables_config | grep -c 'Files avrs_initial_iptables_config and avrs_latest_iptables_config are identical')
6284 # if [[ $diff != "1" ]]
6285 # then
6286 # echo -e "\n${RED}differences were found between avrs_initial_iptables_config and avrs_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6287 # diff=$(diff avrs_initial_iptables_config avrs_latest_iptables_config)
6288 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6289 # else
6290 # echo -e "\n${GREEN}no differences were found between avrs_initial_iptables_config and avrs_latest_iptables_config${NC}"
6291 # fi
6292 # fi
6293 # else
6294 #                 echo -e "${GREEN}/etc/sysconfig/iptables is identical on all the avrs servers${NC}"
6295 # if [[ ! -f "avrs_initial_iptables_config" ]]
6296 # then
6297 # ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > avrs_initial_iptables_config
6298 # else
6299 # ansible *overcloud-[Aa]vrs* -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep ^[a-f,0-9] | awk '{print $1}' | sort -u > avrs_latest_iptables_config
6300 # diff=$(sudo diff -s avrs_initial_iptables_config avrs_latest_iptables_config | grep -c 'Files avrs_initial_iptables_config and avrs_latest_iptables_config are identical')
6301 # if [[ $diff != "1" ]]
6302 # then
6303 # echo -e "\n${RED}differences were found between avrs_initial_iptables_config and avrs_latest_iptables_config${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6304 # diff=$(diff avrs_initial_iptables_config avrs_latest_iptables_config)
6305 # echo -e "${RED}$diff${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6306 # else
6307 # echo -e "${GREEN}no differences were found between avrs_initial_iptables_config and avrs_latest_iptables_config${NC}"
6308 # fi
6309 # fi
6310 # fi
6311 # else
6312 # echo -e "${ORANGE}no avrs computes found${NC}"
6313 # fi
6314 # elapsed_time_seconds=$(expr $(date +%s) - $start)
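## the four host-group checks above repeat the same md5 comparison; a minimal sketch of a single loop covering them (reuses the ansible patterns above; the initial/latest snapshot handling is left out):
# for pattern in '*overcloud-[Dd]pdk*' '*overcloud-[Oo]vs*' '*overcloud-[Ss]riov*' '*overcloud-[Aa]vrs*'
# do
#     # count the distinct md5sums of /etc/sysconfig/iptables across the hosts matching the pattern
#     unique_checksums=$(ansible "$pattern" -b -m shell -a "md5sum /etc/sysconfig/iptables" | grep '^[a-f0-9]' | awk '{print $1}' | sort -u | wc -l)
#     if [[ $unique_checksums == "0" ]]
#     then
#         echo -e "${ORANGE}no hosts matched $pattern${NC}"
#     elif [[ $unique_checksums != "1" ]]
#     then
#         echo -e "${RED}hosts matching $pattern have different /etc/sysconfig/iptables content${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
#     else
#         echo -e "${GREEN}/etc/sysconfig/iptables is identical on all hosts matching $pattern${NC}"
#     fi
# done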
6315
6316
6317 ####################################################################################################
6318
6319
6320 # start=$(date +%s)
6321 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6322 # echo -e "${BLUE}\n\n$STEPS_COUNTER) PRINT ZABBIX EVENTS FROM THE DB FROM THE PAST 1 HOUR (+$elapsed_time_seconds `date '+%T'`)${NC}"
6323 # if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
6324 # then
6325 # epoch_time_now=$(date +%s)
6326 # epoch_time_before=$(date +%s -d "-1 hour")
6327 # zabbix_events=$(ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT name FROM events WHERE clock BETWEEN $epoch_time_before AND $epoch_time_now;\G;\" zabbixdb" | grep -E -v '^name|SUCCESS' | sort -u )
6328 # if [[ -z $zabbix_events ]]
6329 # then
6330 #             echo -e "${GREEN}no events were initiated in the past hour${NC}"
6331 # else
6332 #             echo -e "${RED}$zabbix_events\n\n${ORANGE}Please log in to the zabbix portal and acknowledge the problems history under Monitoring > Problems > History, set the filter timestamps as required and Apply${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6333 # fi
6334 # else
6335 # echo -e "${ORANGE}this zabbix events check is only valid from CBIS 19.100.1${NC}"
6336 # fi
6337 # elapsed_time_seconds=$(expr $(date +%s) - $start)
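## a minimal sketch of the same query restricted to unacknowledged events, assuming the zabbixdb events table exposes the acknowledged column (not verified against every zabbix version):
# zabbix_unacked_events=$(ansible $last_index_controller -b -m shell -a "mysql -e \"SELECT name FROM events WHERE acknowledged=0 AND clock BETWEEN $epoch_time_before AND $epoch_time_now;\" zabbixdb" | grep -E -v '^name|SUCCESS' | sort -u)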
6338
6339
6340 ####################################################################################################
6341
6342
6343 # start=$(date +%s)
6344 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6345 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR CPU VULNERABILITIES (MELTDOWN/SPECTRE) (+$elapsed_time_seconds `date '+%T'`)${NC}"
6346 # cpus_vulnerablities=$(ansible all -b -m shell -a "awk '{print FILENAME\":\"\$0}' /sys/devices/system/cpu/vulnerabilities/* | grep Vulnerable" | grep Vulnerable -B 1)
6347 # if [[ $cpus_vulnerablities ]]
6348 # then
6349 # echo -e "${RED}$cpus_vulnerablities${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6350 # fi
6351 # if [[ -z $kernel_vulnerablities && -z $cpus_vulnerablities ]]
6352 # then
6353 # echo -e "${GREEN}all hosts are MELTDOWN/SPECTRE hardened${NC}"
6354 # fi
6355 # elapsed_time_seconds=$(expr $(date +%s) - $start)
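## a minimal sketch for dumping the full per-host mitigation status (not only the vulnerable entries), useful when a host is flagged above:
# ansible all -b -m shell -a "grep . /sys/devices/system/cpu/vulnerabilities/*"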
6356
6357
6358 ####################################################################################################
6359
6360
6361 # start=$(date +%s)
6362 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6363 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT USER stack CAN MODIFY FILES UNDER /mnt/backup/ TO UNDERSTAND IF THE /usr/bin/find /mnt/backup/... CRONJOB WORKS (CBIS-13758) (+$elapsed_time_seconds `date '+%T'`)${NC}"
6364 # sudo_check_in_backup_cronjob=$(crontab -l | grep -c 'sudo /usr/bin/find /mnt/backup/')
6365 # if [[ $sudo_check_in_backup_cronjob == "0" ]]
6366 # then
6367 # check_if_backup_files_exist=$(sudo /usr/bin/find /mnt/backup/* 2> /dev/null | grep db_backup.enc)
6368 # if [[ $check_if_backup_files_exist ]]
6369 # then
6370 # permissions_check=$(/usr/bin/find /mnt/backup/* | grep db_backup.enc | grep -v orig | awk NR==1 | xargs -i cp {} {}.orig 2>&1 | grep -c 'Permission denied')
6371 # if [[ $permissions_check == "1" ]]
6372 # then
6373 # cronjobs=$(crontab -l)
6374 #                     echo -e "${RED}unable to modify files under /mnt/backup/ as user stack: permission denied${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6375 #                     echo -e "${RED}sudo is required for the /usr/bin/find /mnt/backup/*... cronjob${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6376 # echo -e "${RED}$cronjobs${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6377 # else
6378 # echo -e "${GREEN}user stack can modify files under /mnt/backup/${NC}"
6379 # fi
6380 # else
6381 # echo -e "${ORANGE}couldn't find any db_backup.enc file under /mnt/backup/${NC}"
6382 # fi
6383 # else
6384 #     echo -e "${GREEN}the 'sudo /usr/bin/find /mnt/backup/...' cronjob contains sudo${NC}"
6385 # fi
6386 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6387
6388
6389 ####################################################################################################
6390
6391
6392 # start=$(date +%s)
6393 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6394 # echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE UNEXPECTED CONFIGURATION IN /etc/fstab ON THE OVERCLOUD HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
6395 # fstab=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /etc/fstab" | grep -E -v 'SUCCESS|LABEL=img-rootfs / xfs defaults 0 1|tmpfs /dev/shm tmpfs defaults,nodev,nosuid,noexec 0 0|elk|^#')
6396 # if [[ $fstab ]]
6397 # then
6398 # fstab=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "cat /etc/fstab | grep -v ^#")
6399 # echo -e "${RED}$fstab${NC} " ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6400 # else
6401 # echo -e "${GREEN}/etc/fstab is configured as expected on all the overcloud hosts${NC} "
6402 # fi
6403 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6404
6405
6406 ####################################################################################################
6407
6408
6409 # start=$(date +%s)
6410 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6411 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT SYSTEM-WIDE OPEN FILES LIMIT IS IDENTICAL FOR ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
6412 ## I currently don't know why, but it seems the controllers always have a slightly different limit value; the last two digits of the value are stripped below (sed 's/..$//') so that minor differences don't trigger a failure
6413 # open_files_limit_system_wide=$(ansible all --limit '!hypervisor,!localhost,!controller' -b -m shell -a "cat /proc/sys/fs/file-max" | grep ^[0-9] | sort --uniq | sed 's/..$//' | wc -l)
6414 # if [[ $open_files_limit_system_wide != "1" ]]
6415 # then
6416 # open_files_limit_system_wide=$(ansible all --limit '!hypervisor,!localhost,!controller' -b -m shell -a "cat /proc/sys/fs/file-max")
6417 # echo -e "${RED}$open_files_limit_system_wide${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6418 # else
6419 # echo -e "${GREEN}all the hosts returned the same system-wide open files limit value${NC}"
6420 # fi
6421 # elapsed_time_seconds=$(expr $(date +%s) - $start)
6422
6423
6424 ####################################################################################################
6425
6426
6427 # start=$(date +%s)
6428 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6429 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR INTERFACES DRIVER/VERSION/FIRMWARE INCONSISTENCIES BETWEEN THE OVERCLOUD HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
6430 # if [[ $nuage != "true" ]]
6431 # then
6432 # checks="firmware-version driver version"
6433 # for check in $checks
6434 # do
6435 # echo -e "${CYAN}now checking the interfaces $check${NC}"
6436 # item=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl bond/list | grep tenant-bond | awk '{print \$NF}' | sort -u | xargs -i ethtool -i {}" | grep ^$check | sort -u | wc -l)
6437 # if [[ $item != "1" ]]
6438 # then
6439 # item=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ovs-appctl bond/list | grep tenant-bond | awk '{print \$NF}' | sort -u | xargs -i ethtool -i {} | grep ^$check")
6440 # echo -e "${RED}$item${NC} " ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6441 # else
6442 # echo -e "${GREEN}the $check is the same on all the overcloud hosts${NC} "
6443 # fi
6444 # done
6445 # else
6446 # echo -e "${ORANGE}this check is not applicable for Nuage${NC} "
6447 # fi
6448 # elapsed_time_seconds=$(expr $(date +%s) - $start)
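## a minimal sketch for comparing a single named interface across the overcloud; the interface name eno1 is a placeholder, not taken from the environment:
# nic=eno1
# ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ethtool -i $nic" | grep -E '^(driver|version|firmware-version)' | sort | uniq -c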
6449
6450
6451 ####################################################################################################
6452
6453
6454 # start=$(date +%s)
6455 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6456 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK THAT IPSEC IS CONFIGURED AND WORKING (+$elapsed_time_seconds `date '+%T'`)${NC}"
6457 # if [[ $cbis_version == "19.0.0.1" ]]
6458 # then
6459 # ipsec_execution_time=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "grep -R 'Finished security hardening: Secured Communication Deployment' /var/log/cbis/ | grep -v api.log | awk -F: '{print \$1}' | head -n1 | xargs -i ls -l {} | awk '{print \$6,\$7,\$8}' | xargs -i date -d "{}" +%s" | grep -v rc=0 | grep ^[0-9])
6460 # if [[ -z $ipsec_execution_time ]]
6461 # then
6462 # echo -e "${MAGENTA}IPsec is not enabled - enable IPsec (secured communications) from the security section in CBIS manager${NC}"
6463 # else
6464 # deployment_execution_time=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "ls -l /var/log/cbis/deployment.log | awk '{print \$6,\$7,\$8}' | xargs -i date -d "{}" +%s" | grep -v rc=0 | grep ^[0-9])
6465 # if [ $deployment_execution_time -gt $ipsec_execution_time ]
6466 # then
6467 # echo -e "${ORANGE}IPsec is not enabled - enable IPsec (secured communications) from the security section in CBIS manager${NC}"
6468 # else
6469 # echo -e "${CYAN}tripleo-ipsec pacemaker resources check${NC}"
6470 # ipsec=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep tripleo-ipsec | grep -E -w -c 'internalapi|redis|storage|storagemgmt|ctlplane'" | grep ^[0-9])
6471 # if [[ $ipsec != "5" ]]
6472 # then
6473 # echo -e "${RED}expected 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane) and got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6474 # else
6475 # echo -e "${GREEN}found all 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane)${NC}"
6476 # fi
6477 # echo -e "${CYAN}check for leaked ipsec packets${NC}"
6478 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:" | grep -v rc=0 | sort -u)
6479 # if [[ $ipsec != "enumcheck: leak detective found no leaks" ]]
6480 # then
6481 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:")
6482 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6483 # else
6484 # echo -e "${GREEN}no leaks were detected${NC}"
6485 # fi
6486 # echo -e "${CYAN}capture ESP packets using tcpdump${NC}"
6487 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "timeout 10 tcpdump -nnepi any proto ESP -c 100 2>&1 | grep 'packets captured' | awk '{ if ( \$2 < 100 ) print \$1 }'" | grep ^[0-9] -B 1)
6488 # if [[ $ipsec ]]
6489 # then
6490 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6491 # else
6492 #                     echo -e "${GREEN}successfully captured 100 ESP packets from all the hosts${NC}"
6493 # fi
6494 # echo -e "${CYAN}check iptables IPSEC chain rules${NC}"
6495 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "iptables -L IPSEC -n | grep -E -vc 'Vitrage|elasticsearch'" | grep ^[0-9] | sort -u)
6496 # if [[ $ipsec != "17" ]]
6497 # then
6498 #                     echo -e "${RED}expected (17) iptables rules in the IPSEC chain but got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6499 # else
6500 # echo -e "${GREEN}all the iptables IPSEC chain rules are found${NC}"
6501 # fi
6502 ## Chain IPSEC (4 references)
6503 ## target prot opt source destination
6504 ## ACCEPT all -- 172.31.0.1 0.0.0.0/0 /* Allow ctlplane traffic from UC */
6505 ## ACCEPT all -- 169.254.169.254 0.0.0.0/0 /* Allow ironic metadata traffic from UC */
6506 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6507 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6508 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport sports 16514,49152:49215,5900:5999 /* open ports for nova */
6509 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport dports 16514,49152:49215,5900:5999 /* open ports for nova */
6510 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp spt:22
6511 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:22
6512 ## ACCEPT ah -- 0.0.0.0/0 0.0.0.0/0
6513 ## ACCEPT esp -- 0.0.0.0/0 0.0.0.0/0
6514 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:4500 dpt:4500
6515 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:500 dpt:500
6516 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 policy match dir in pol ipsec
6517 ## LOG all -- 0.0.0.0/0 0.0.0.0/0 limit: avg 2/min burst 5 LOG flags 0 level 4 prefix "IPTables-Dropped:"
6518 ## DROP all -- 0.0.0.0/0 0.0.0.0/0
6519 # echo -e "${CYAN}check that ESP tunnels were created${NC}"
6520 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec whack --trafficstatus | wc -l | awk '{ if ( \$1 < 10 ) print \$1 }'" | grep ^[0-9] -B 1)
6521 # if [[ $ipsec ]]
6522 # then
6523 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6524 # else
6525 # echo -e "${GREEN}each host has at least 10 active ESP tunnels${NC}"
6526 # fi
6527 # fi
6528 # fi
6529 # fi
6530 # if [[ $cbis_version == "19.100.1" ]]
6531 # then
6532 # ipsec=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /usr/share/cbis/seccom_state" | grep -v rc=0)
6533 # if [[ $ipsec != "0" ]]
6534 # then
6535 # echo -e "${MAGENTA}IPsec is not enabled - enable IPsec (secured communications) from the security section in CBIS manager${NC}"
6536 # else
6537 # echo -e "${CYAN}tripleo-ipsec pacemaker resources check${NC}"
6538 # ipsec=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep tripleo-ipsec | grep -E -w -c 'internalapi|redis|storage|storagemgmt|ctlplane'" | grep ^[0-9])
6539 # if [[ $ipsec != "5" ]]
6540 # then
6541 # echo -e "${RED}expected 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane) and got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6542 # else
6543 # echo -e "${GREEN}found all 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane)${NC}"
6544 # fi
6545 # echo -e "${CYAN}check for leaked ipsec packets${NC}"
6546 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:" | grep -v rc=0 | sort -u)
6547 # if [[ $ipsec != "enumcheck: leak detective found no leaks" ]]
6548 # then
6549 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:")
6550 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6551 # else
6552 # echo -e "${GREEN}no leaks were detected${NC}"
6553 # fi
6554 # echo -e "${CYAN}capture ESP packets using tcpdump${NC}"
6555 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "timeout 10 tcpdump -nnepi any proto ESP -c 100 2>&1 | grep 'packets captured' | awk '{ if ( \$2 < 100 ) print \$1 }'" | grep ^[0-9] -B 1)
6556 # if [[ $ipsec ]]
6557 # then
6558 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6559 # else
6560 #             echo -e "${GREEN}successfully captured 100 ESP packets from all the hosts${NC}"
6561 # fi
6562 # echo -e "${CYAN}check iptables IPSEC chain rules${NC}"
6563 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "iptables -L IPSEC -n | grep -E -vc 'Vitrage|elasticsearch'" | grep ^[0-9] | sort -u)
6564 # if [[ $ipsec != "17" ]]
6565 # then
6566 #             echo -e "${RED}expected (17) iptables rules in the IPSEC chain but got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6567 # else
6568 # echo -e "${GREEN}all the iptables IPSEC chain rules are found${NC}"
6569 # fi
6570 ## Chain IPSEC (4 references)
6571 ## target prot opt source destination
6572 ## ACCEPT all -- 172.31.0.1 0.0.0.0/0 /* Allow ctlplane traffic from UC */
6573 ## ACCEPT all -- 169.254.169.254 0.0.0.0/0 /* Allow ironic metadata traffic from UC */
6574 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6575 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6576 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport sports 16514,49152:49215,5900:5999 /* open ports for nova */
6577 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport dports 16514,49152:49215,5900:5999 /* open ports for nova */
6578 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp spt:22
6579 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:22
6580 ## ACCEPT ah -- 0.0.0.0/0 0.0.0.0/0
6581 ## ACCEPT esp -- 0.0.0.0/0 0.0.0.0/0
6582 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:4500 dpt:4500
6583 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:500 dpt:500
6584 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 policy match dir in pol ipsec
6585 ## LOG all -- 0.0.0.0/0 0.0.0.0/0 limit: avg 2/min burst 5 LOG flags 0 level 4 prefix "IPTables-Dropped:"
6586 ## DROP all -- 0.0.0.0/0 0.0.0.0/0
6587 # echo -e "${CYAN}check that ESP tunnels were created${NC}"
6588 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec whack --trafficstatus | wc -l | awk '{ if ( \$1 < 10 ) print \$1 }'" | grep ^[0-9] -B 1)
6589 # if [[ $ipsec ]]
6590 # then
6591 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6592 # else
6593 # echo -e "${GREEN}each host has at least 10 active ESP tunnels${NC}"
6594 # fi
6595 # fi
6596 # fi
6597 # if [[ $cbis_version == "20.100.1" ]]
6598 # then
6599 # ipsec=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /opt/install/data/states/seccom_state" | grep -v rc=0 | xargs)
6600 # if [[ $ipsec != "0" ]]
6601 # then
6602 # echo -e "${MAGENTA}IPsec is not enabled - enable IPsec (secured communications) from the security section in CBIS manager${NC}"
6603 # else
6604 # echo -e "${CYAN}tripleo-ipsec pacemaker resources check${NC}"
6605 # ipsec=$(ansible $last_index_controller -b -m shell -a "pcs resource | grep tripleo-ipsec | grep -E -w -c 'internalapi|redis|storage|storagemgmt|ctlplane'" | grep ^[0-9])
6606 # if [[ $ipsec != "5" ]]
6607 # then
6608 # echo -e "${RED}expected 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane) and got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6609 # else
6610 # echo -e "${GREEN}found all 5 tripleo-ipsec pacemaker resources (internalapi, redis, storage, storagemgmt, ctlplane)${NC}"
6611 # fi
6612 # echo -e "${CYAN}check for leaked ipsec packets${NC}"
6613 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:" | grep -v rc=0 | sort -u)
6614 # if [[ $ipsec != "enumcheck: leak detective found no leaks" ]]
6615 # then
6616 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec enumcheck 2>&1 | grep enumcheck:")
6617 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6618 # else
6619 # echo -e "${GREEN}no leaks were detected${NC}"
6620 # fi
6621 # echo -e "${CYAN}capture ESP packets using tcpdump${NC}"
6622 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "timeout 10 tcpdump -nnepi any proto ESP -c 100 2>&1 | grep 'packets captured' | awk '{ if ( \$2 < 100 ) print \$1 }'" | grep ^[0-9] -B 1)
6623 # if [[ $ipsec ]]
6624 # then
6625 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6626 # else
6627 #             echo -e "${GREEN}successfully captured 100 ESP packets from all the hosts${NC}"
6628 # fi
6629 # echo -e "${CYAN}check iptables IPSEC chain rules${NC}"
6630 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "iptables -L IPSEC -n | grep -E -vc 'Vitrage|elasticsearch'" | grep ^[0-9] | sort -u)
6631 # if [[ $ipsec != "17" ]]
6632 # then
6633 #             echo -e "${RED}expected (17) iptables rules in the IPSEC chain but got ($ipsec)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6634 # else
6635 # echo -e "${GREEN}all the iptables IPSEC chain rules are found${NC}"
6636 # fi
6637 ## Chain IPSEC (4 references)
6638 ## target prot opt source destination
6639 ## ACCEPT all -- 172.31.0.1 0.0.0.0/0 /* Allow ctlplane traffic from UC */
6640 ## ACCEPT all -- 169.254.169.254 0.0.0.0/0 /* Allow ironic metadata traffic from UC */
6641 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6642 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0
6643 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport sports 16514,49152:49215,5900:5999 /* open ports for nova */
6644 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 multiport dports 16514,49152:49215,5900:5999 /* open ports for nova */
6645 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp spt:22
6646 ## ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:22
6647 ## ACCEPT ah -- 0.0.0.0/0 0.0.0.0/0
6648 ## ACCEPT esp -- 0.0.0.0/0 0.0.0.0/0
6649 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:4500 dpt:4500
6650 ## ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp spt:500 dpt:500
6651 ## ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 policy match dir in pol ipsec
6652 ## LOG all -- 0.0.0.0/0 0.0.0.0/0 limit: avg 2/min burst 5 LOG flags 0 level 4 prefix "IPTables-Dropped:"
6653 ## DROP all -- 0.0.0.0/0 0.0.0.0/0
6654 # echo -e "${CYAN}check that ESP tunnels were created${NC}"
6655 # ipsec=$(ansible all --limit '!hypervisor,!localhost' -b -m shell -a "ipsec whack --trafficstatus | wc -l | awk '{ if ( \$1 < 10 ) print \$1 }'" | grep ^[0-9] -B 1)
6656 # if [[ $ipsec ]]
6657 # then
6658 # echo -e "${RED}$ipsec${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6659 # else
6660 # echo -e "${GREEN}each host has at least 10 active ESP tunnels${NC}"
6661 # fi
6662 # fi
6663 # fi
6664 # elapsed_time_seconds=$(expr $(date +%s) - $start)
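## the 19.100.1 and 20.100.1 branches above differ only in where the secured-communication state file lives; a minimal sketch of resolving that path once before running the shared checks (paths taken from the branches above):
# case $cbis_version in
#     19.100.1) seccom_state_file=/usr/share/cbis/seccom_state ;;
#     20.100.1) seccom_state_file=/opt/install/data/states/seccom_state ;;
#     *)        seccom_state_file="" ;;
# esac
# if [[ $seccom_state_file ]]
# then
#     ipsec=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat $seccom_state_file" | grep -v rc=0 | xargs)
# fi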
6665
6666
6667 ####################################################################################################
6668
6669
6670 # start=$(date +%s)
6671 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6672 # echo -e "${BLUE}\n\n$STEPS_COUNTER) OBTAIN THE ZABBIX TEMPLATES OF EACH HOST AND COMPARE THEM WITH THE EXPECTED TEMPLATES TAKEN FROM configure_zabbix_server_monitoring.py (+$elapsed_time_seconds `date '+%T'`)${NC}"
6673 # all_hosts_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/Templates for all hosts/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6674 # all_hosts_zabbix_templates_fixed=$(echo -e "$all_hosts_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6675
6676 # active_controller_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/ACTIVE_CONTROLLER_TEMPLATE=/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6677 # active_controller_zabbix_templates_fixed=$(echo -e "$active_controller_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6678
6679 # computes_hci_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/Templates for compute hosts/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6680 # computes_hci_zabbix_templates_fixed=$(echo -e "$computes_hci_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6681
6682 # computes_non_hci_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/Templates for compute hosts/,/]/' | grep -v \# | grep -v TemplateAppOpenStackCeph | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6683 # computes_non_hci_zabbix_templates_fixed=$(echo -e "$computes_non_hci_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6684
6685 # controllers_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/Templates for controller hosts/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6686 # controllers_zabbix_templates_fixed=$(echo -e "$controllers_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6687
6688 # undercloud_zabbix_templates=$(cat /usr/share/cbis/overcloud/postdeploy/scripts/configure_zabbix_server_monitoring.py | awk '/UNDERCLOUD_TEMPLATE_NAMES = BASIC_TEMPLATE_NAMES/,/]/' | grep -v \# | grep -E '^\s+' | tr -d \'\,\[\] | tr -d " \t" | sort)
6689 # undercloud_zabbix_templates_fixed=$(echo -e "$undercloud_zabbix_templates" | paste -sd'|' | sed '$ s/.$//')
6690
6691 # echo -e "${CYAN}checking the shared templates on all the hosts${NC}"
6692 # for host in $ansible_overcloud_hosts
6693 # do
6694 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6695 # -H 'Content-Type: application/json-rpc' \
6696 # -H 'Cookie: SERVERID='$last_index_controller'' \
6697 # --data '{
6698 # "jsonrpc": "2.0",
6699 # "method": "host.get",
6700 # "params": {
6701 # "output": ["host"],
6702 # "selectParentTemplates": [
6703 # "templateid",
6704 # "name"
6705 # ],
6706 # "filter": {
6707 # "host": "'$host'"
6708 # }
6709 # },
6710 # "id": 1,
6711 # "auth": '$zabbix_auth'
6712 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort)
6713 # diff=$(diff <(echo -e "$all_hosts_zabbix_templates") <(echo -e "$templates") | grep '^<')
6714 # if [[ -z $diff ]]
6715 # then
6716 # echo -e "${GREEN}the templates: ($all_hosts_zabbix_templates_fixed) are found under $host${NC}"
6717 # else
6718 # echo -e "${RED}missing templates on $host:\n$diff${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6719 # fi
6720 # done
6721
6722 # echo -e "${CYAN}checking the dedicated controller templates on the controller hosts${NC}"
6723 # for host in $ansible_controllers_hosts
6724 # do
6725 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6726 # -H 'Content-Type: application/json-rpc' \
6727 # -H 'Cookie: SERVERID='$last_index_controller'' \
6728 # --data '{
6729 # "jsonrpc": "2.0",
6730 # "method": "host.get",
6731 # "params": {
6732 # "output": ["host"],
6733 # "selectParentTemplates": [
6734 # "templateid",
6735 # "name"
6736 # ],
6737 # "filter": {
6738 # "host": "'$host'"
6739 # }
6740 # },
6741 # "id": 1,
6742 # "auth": '$zabbix_auth'
6743 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort)
6744 # diff=$(diff <(echo -e "$controllers_zabbix_templates") <(echo -e "$templates") | grep '^<')
6745 # if [[ -z $diff ]]
6746 # then
6747 # echo -e "${GREEN}the templates: ($controllers_zabbix_templates_fixed) are found under $host${NC}"
6748 # else
6749 # echo -e "${RED}missing templates on $host:\n$diff${NC}\n\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6750 # fi
6751 # done
6752
6753 # echo -e "${CYAN}checking the dedicated compute templates on the compute hosts${NC}"
6754 # for host in $ansible_computes_hosts
6755 # do
6756 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6757 # -H 'Content-Type: application/json-rpc' \
6758 # -H 'Cookie: SERVERID='$last_index_controller'' \
6759 # --data '{
6760 # "jsonrpc": "2.0",
6761 # "method": "host.get",
6762 # "params": {
6763 # "output": ["host"],
6764 # "selectParentTemplates": [
6765 # "templateid",
6766 # "name"
6767 # ],
6768 # "filter": {
6769 # "host": "'$host'"
6770 # }
6771 # },
6772 # "id": 1,
6773 # "auth": '$zabbix_auth'
6774 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort)
6775 # if [[ $hci == "false" ]]
6776 # then
6777 # if [[ " ${computes_non_hci_zabbix_templates[@]} " =~ "$templates" ]]
6778 # then
6779 # echo -e "${GREEN}$host has all the expected templates${NC}"
6780 # else
6781 # echo -e "${RED}> Expecting Templates ($host):\n${computes_non_hci_zabbix_templates[@]}\n> Received Templates ($host):\n$templates${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6782 # fi
6783 # elif [[ $hci == "true" ]]
6784 # then
6785 # if [[ " ${computes_hci_zabbix_templates[@]} " =~ "$templates" ]]
6786 # then
6787 # echo -e "${GREEN}$host has all the expected templates${NC}"
6788 # else
6789 # echo -e "${RED}> Expecting Templates ($host):\n${computes_hci_zabbix_templates[@]}\n> Received Templates ($host):\n$templates${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6790 # fi
6791
6792 # fi
6793 # done
6794
6795 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6796 # -H 'Content-Type: application/json-rpc' \
6797 # -H 'Cookie: SERVERID='$last_index_controller'' \
6798 # --data '{
6799 # "jsonrpc": "2.0",
6800 # "method": "host.get",
6801 # "params": {
6802 # "output": ["host"],
6803 # "selectParentTemplates": [
6804 # "templateid",
6805 # "name"
6806 # ],
6807 # "filter": {
6808 # "host": "active-controller"
6809 # }
6810 # },
6811 # "id": 1,
6812 # "auth": '$zabbix_auth'
6813 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort | paste -sd " ")
6814 # if [[ " ${active_controller_zabbix_templates[@]} " =~ "$templates" ]]
6815 # then
6816 # echo -e "${GREEN}active-controller has all the expected templates${NC}"
6817 # else
6818 # echo -e "${RED}> Expecting Templates (active-controller):\n${active_controller_zabbix_templates[@]}\n> Received Templates (active-controller):\n$templates${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6819 # fi
6820
6821 # templates=$(curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
6822 # -H 'Content-Type: application/json-rpc' \
6823 # -H 'Cookie: SERVERID='$last_index_controller'' \
6824 # --data '{
6825 # "jsonrpc": "2.0",
6826 # "method": "host.get",
6827 # "params": {
6828 # "output": ["host"],
6829 # "selectParentTemplates": [
6830 # "templateid",
6831 # "name"
6832 # ],
6833 # "filter": {
6834 # "host": "undercloud.localdomain"
6835 # }
6836 # },
6837 # "id": 1,
6838 # "auth": '$zabbix_auth'
6839 # }' | jq . | grep \"name\"\: | awk -F: '{print $2}' | tr -d '," ' | sort | paste -sd " ")
6840 # if [[ " ${undercloud_zabbix_templates[@]} " =~ "$templates" ]]
6841 # then
6842 # echo -e "${GREEN}undercloud.localdomain has all the expected templates${NC}"
6843 # else
6844 # echo -e "${RED}> Expecting Templates (undercloud.localdomain):\n${undercloud_zabbix_templates[@]}\n> Received Templates (undercloud.localdomain):\n$templates${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6845 # fi
6846 # elapsed_time_seconds=$(expr $(date +%s) - $start)
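## the repeated host.get calls above could be wrapped in one helper; a minimal sketch (reuses $PublicURL, $last_index_controller and $zabbix_auth from above, and assumes the standard zabbix host.get response shape for the jq filter):
# get_host_templates() {
#     local host=$1
#     curl -g -s -L -X GET 'https://'$PublicURL'/zabbix/api_jsonrpc.php' \
#         -H 'Content-Type: application/json-rpc' \
#         -H 'Cookie: SERVERID='$last_index_controller'' \
#         --data '{"jsonrpc": "2.0", "method": "host.get", "params": {"output": ["host"], "selectParentTemplates": ["templateid", "name"], "filter": {"host": "'$host'"}}, "id": 1, "auth": '$zabbix_auth'}' \
#         | jq -r '.result[].parentTemplates[].name' | sort
# }
# templates=$(get_host_templates "$host")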
6847
6848
6849 ####################################################################################################
6850
6851
6852 # start=$(date +%s)
6853 # STEPS_COUNTER=$((STEPS_COUNTER+1))
6854 # echo -e "${BLUE}\n\n$STEPS_COUNTER) MEASURE THE TIME OF CORE MAINTENANCE OPERATIONS (+$elapsed_time_seconds `date '+%T'`)${NC}"
6855 # echo -e "${CYAN}measuring the time it took for the overcloud deployment to finish${NC}"
6856 # fail_criteria=5.0
6857 # if [[ $cbis_version == "18.0.0.1" || $cbis_version == "19.0.0.1" || $cbis_version == "19.100.1" ]]
6858 # then
6859 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/deployment.log | awk '/undercloud installation finished/,0'" | grep -v '| SUCCESS |')
6860 # else
6861 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/deployment.log" | grep -v '| SUCCESS |')
6862 # fi
6863 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6864 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6865 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6866 # result=$(echo $hours'>'$fail_criteria | bc -l)
6867 # if [[ $result == "1" ]]
6868 # then
6869 # echo -e "${RED}the overcloud deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6870 #     echo -e "\n${ORANGE}the overcloud deployment may take more time than usual if software raid is enabled on one or more host-groups. check if this is the case${NC}"
6871 # elif [[ $result == "0" ]]
6872 # then
6873 # echo -e "${GREEN}the overcloud deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6874 # fi
6875
6876 # echo -e "${CYAN}measuring the time it took for the undercloud deployment to finish${NC}"
6877 # fail_criteria=2.0
6878 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/install_undercloud.log | awk '/Time zone adjusted/,0'" | grep -v '| SUCCESS |')
6879 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6880 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6881 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6882 # result=$(echo $hours'>'$fail_criteria | bc -l)
6883 # if [[ $result == "1" ]]
6884 # then
6885 # echo -e "${RED}the undercloud deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6886 # elif [[ $result == "0" ]]
6887 # then
6888 # echo -e "${GREEN}the undercloud deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6889 # fi
6890
6891 # fail_criteria=5.0
6892 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/add_node.log" | grep -v '| SUCCESS |')
6893 # if [[ $log ]]
6894 # then
6895 # echo -e "${CYAN}measuring the time it took for the latest scale-out to finish${NC}"
6896 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6897 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6898 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6899 # result=$(echo $hours'>'$fail_criteria | bc -l)
6900 # if [[ $result == "1" ]]
6901 # then
6902 # echo -e "${RED}the scale-out operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6903 # elif [[ $result == "0" ]]
6904 # then
6905 # echo -e "${GREEN}the scale-out operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6906 # fi
6907 # fi
6908
6909 # fail_criteria=2.0
6910 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/remove_node.log" | grep -v '| SUCCESS |')
6911 # if [[ $log ]]
6912 # then
6913 # echo -e "${CYAN}measuring the time it took for the latest scale-in to finish${NC}"
6914 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6915 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6916 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6917 # result=$(echo $hours'>'$fail_criteria | bc -l)
6918 # if [[ $result == "1" ]]
6919 # then
6920 # echo -e "${RED}the scale-in operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6921 # elif [[ $result == "0" ]]
6922 # then
6923 # echo -e "${GREEN}the scale-in operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6924 # fi
6925 # fi
6926
6927 # if [[ $cbis_version != "19.0.0.1" && $cbis_version != "18.0.0.1" ]]
6928 # then
6929 # fail_criteria=5.0
6930 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/replace_controller.log" | grep -v '| SUCCESS |')
6931 # if [[ $log ]]
6932 # then
6933 # echo -e "${CYAN}measuring the time it took for the latest replace controller to finish${NC}"
6934 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6935 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6936 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6937 # result=$(echo $hours'>'$fail_criteria | bc -l)
6938 # if [[ $result == "1" ]]
6939 # then
6940 # echo -e "${RED}the replace controller operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6941 # elif [[ $result == "0" ]]
6942 # then
6943 # echo -e "${GREEN}the replace controller operation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6944 # fi
6945 # fi
6946
6947 # fail_criteria=2.0
6948 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/security_hardening.log" | grep -v '| SUCCESS |')
6949 # if [[ $log ]]
6950 # then
6951 # echo -e "${CYAN}measuring the time it took for the latest security hardening deployment to finish${NC}"
6952 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6953 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6954 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6955 # result=$(echo $hours'>'$fail_criteria | bc -l)
6956 # if [[ $result == "1" ]]
6957 # then
6958 # echo -e "${RED}the security hardening deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6959 # elif [[ $result == "0" ]]
6960 # then
6961 # echo -e "${GREEN}the security hardening deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6962 # fi
6963 # fi
6964
6965 # fail_criteria=0.5
6966 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/security_secured_communication.log" | grep -v '| SUCCESS |')
6967 # if [[ $log ]]
6968 # then
6969 # echo -e "${CYAN}measuring the time it took for the latest ipsec deployment to finish${NC}"
6970 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6971 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6972 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6973 # result=$(echo $hours'>'$fail_criteria | bc -l)
6974 # if [[ $result == "1" ]]
6975 # then
6976 # echo -e "${RED}the ipsec deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6977 # elif [[ $result == "0" ]]
6978 # then
6979 # echo -e "${GREEN}the ipsec deployment took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6980 # fi
6981 # fi
6982
6983 # fail_criteria=0.05
6984 # log=$(sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cat /var/log/cbis/add_host_group.log" | grep -v '| SUCCESS |')
6985 # if [[ $log ]]
6986 # then
6987 # echo -e "${CYAN}measuring the time it took for the latest custom host-group creation to finish${NC}"
6988 # start=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | head -n1 | xargs -i date -d "{}" +%s)
6989 # end=$(echo -e "$log" | awk '{print $1,$2}' | grep ^202[0-9]- | tail -n1 | xargs -i date -d "{}" +%s)
6990 # hours=$(expr $end - $start | xargs -i echo "scale = 2; {} / 60 / 60" | bc)
6991 # result=$(echo $hours'>'$fail_criteria | bc -l)
6992 # if [[ $result == "1" ]]
6993 # then
6994 # echo -e "${RED}the custom host-group creation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
6995 # elif [[ $result == "0" ]]
6996 # then
6997 # echo -e "${GREEN}the custom host-group creation took $hours hours - fail criteria is $fail_criteria hours (arbitrary threshold)${NC}"
6998 # fi
6999 # fi
7000 # fi
7001 # elapsed_time_seconds=$(expr $(date +%s) - $start)
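## the per-log measurements above repeat the same head/tail timestamp arithmetic and also reuse $start, which clobbers the step timer read on the line above; a minimal sketch of a helper that avoids both (assumes the same leading "YYYY-MM-DD HH:MM:SS" log timestamps):
# log_duration_hours() {
#     local log=$1 log_start log_end
#     log_start=$(echo -e "$log" | awk '{print $1,$2}' | grep '^202[0-9]-' | head -n1 | xargs -i date -d "{}" +%s)
#     log_end=$(echo -e "$log" | awk '{print $1,$2}' | grep '^202[0-9]-' | tail -n1 | xargs -i date -d "{}" +%s)
#     echo "scale = 2; ($log_end - $log_start) / 60 / 60" | bc
# }
# hours=$(log_duration_hours "$log")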
7002
7003
7004 ####################################################################################################
7005
7006
7007 # start=$(date +%s)
7008 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7009 # echo -e "${BLUE}\n\n$STEPS_COUNTER) BACKUP /cbis-installer/ AND /opt/install/data/ IN THE UNDERCLOUD PHYSICAL SERVER (skip if backup already exists)${NC}"
7010 # echo -e "${CYAN}sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a \"cp -anuR ~/cbis-installer/ ~/cbis-installer-backup/\"${NC}"
7011 # sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cp -anuR ~/cbis-installer/ ~/cbis-installer-backup/"
7012 # echo -e "${CYAN}sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a \"cp -anuR /opt/install/data/ ~/opt-install-data-backup/\"${NC}"
7013 # sshpass -p $hv_cbis_admin_password ansible -k hypervisor -b -m shell -a "cp -anuR /opt/install/data/ ~/opt-install-data-backup/"
7014 # elapsed_time_seconds=$(expr $(date +%s) - $start)
7015
7016
7017 ####################################################################################################
7018
7019
7020 # start=$(date +%s)
7021 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7022 # echo -e "${BLUE}\n\n$STEPS_COUNTER) CHECK FOR INSTANCES CONNECTIVITY LOSS HISTORY (+$elapsed_time_seconds `date '+%T'`)${NC}"
7023 # if [[ $instances != "0" ]]
7024 # then
7025 # if [ -f "$logs_dir/vms_namespace_connectivity_check.log" ]
7026 # then
7027 # file_content=$(cat $logs_dir/vms_namespace_connectivity_check.log)
7028 # if [[ $file_content ]]
7029 # then
7030 # unreachable_addresses=$(cat $logs_dir/vms_namespace_connectivity_check.log | grep -v '(0.00%)' | grep % -B 1 | grep Statistics | awk '{print $NF}' | sort -u | sed 's/.$/ /' | paste -sd' ')
7031 # echo -e "${RED}unreachable addresses: $unreachable_addresses${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
7032 # echo -e "\n${ORANGE}after you resolved the connectivity issue, stop and start the vms_namespace_connectivity_check.sh script to truncate the previous errors${NC}"
7033 # else
7034 #             echo -e "${GREEN}the vms_namespace_connectivity_check.log file is empty, thus no connection failures were found${NC}"
7035 # fi
7036 # else
7037 # script_running_check=$(ps aux | grep vms_namespace_connectivity_check.sh | grep -v grep)
7038 # if [[ -z $script_running_check ]]
7039 # then
7040 # echo -e "${MAGENTA}the vms_namespace_connectivity_check.sh script is not running!${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
7041 # fi
7042 # fi
7043 # else
7044 # echo -e "${MAGENTA}no instances are found in the system!${NC}"
7045 # fi
7046 # elapsed_time_seconds=$(expr $(date +%s) - $start)
7047
7048
7049 ####################################################################################################
7050
7051
7052 # start=$(date +%s)
7053 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7054 # echo -e "${BLUE}\n\n$STEPS_COUNTER) COMPRESS AND SEND THE SCRIPT PRODUCT TO REMOTE HOST (+$elapsed_time_seconds `date '+%T'`)${NC}"
7055 # if [[ $hotfix_name_build ]]
7056 # then
7057 # if [[ $nuage == "true" ]]
7058 # then
7059 # tar -cvzf "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz -P $logs_dir/* > /dev/null
7060 # sshpass -p 'airframe' scp $ssh_params "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs
7061 # else
7062 # tar -cvzf "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz -P $logs_dir/* > /dev/null
7063 # sshpass -p 'airframe' scp $ssh_params "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs
7064 # fi
7065 # else
7066 # if [[ $nuage == "true" ]]
7067 # then
7068 # tar -cvzf "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz -P $logs_dir/* > /dev/null
7069 # sshpass -p 'airframe' scp $ssh_params "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs
7070 # else
7071 # tar -cvzf "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz -P $logs_dir/* > /dev/null
7072 # sshpass -p 'airframe' scp $ssh_params "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs
7073 # fi
7074 # fi
7075 # if [[ $hotfix_name_build ]]
7076 # then
7077 # validate_file_on_remote_server=$(curl -g -s -L -k http://10.104.211.33:88/CBIS/scripts/system_validation_logs | grep "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz)
7078 # if [[ $validate_file_on_remote_server ]]
7079 # then
7080 # echo -e "${GREEN}"$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz is found on the remote server${NC}"
7081 # else
7082 # echo -e "${MAGENTA}unable to find "$undercloud_vm_ip"__"$hw_model"__"$hotfix_name_build"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz on the remote server${NC}"
7083 # fi
7084
7085 # else
7086 # validate_file_on_remote_server=$(curl -g -s -L -k http://10.104.211.33:88/CBIS/scripts/system_validation_logs | grep "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz)
7087 # if [[ $validate_file_on_remote_server ]]
7088 # then
7089 # echo -e "${GREEN}"$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz is found on the remote server${NC}"
7090 # else
7091 # echo -e "${MAGENTA}unable to find "$undercloud_vm_ip"__"$hw_model"__"$cbis_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz on the remote server${NC}"
7092 # fi
7093 # fi
7094 # elapsed_time_seconds=$(expr $(date +%s) - $start)
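## the archive name above is rebuilt in four tar/scp pairs and again for the curl verification; a minimal sketch of composing it once (same variables as above):
# archive_label=${hotfix_name_build:-$cbis_version}
# if [[ $nuage == "true" ]]
# then
#     archive_name="$undercloud_vm_ip"__"$hw_model"__"$archive_label"__"$nuage_version"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz
# else
#     archive_name="$undercloud_vm_ip"__"$hw_model"__"$archive_label"__BIOS-"$bios"__FW-"$firmware"__"$storage_type".tar.gz
# fi
# tar -cvzf "$archive_name" -P $logs_dir/* > /dev/null
# sshpass -p 'airframe' scp $ssh_params "$archive_name" airframe@10.104.211.33:/var/www/html/CBIS/scripts/system_validation_logs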
7095
7096
7097 ####################################################################################################
7098
7099
7100 # start=$(date +%s)
7101 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7102 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT THE LATEST AVAILABLE HOTFIX IS USED (+$elapsed_time_seconds `date '+%T'`)${NC}"
7103 # if [[ $hotfix_install_success ]]
7104 # then
7105 # cbis_short_version=$(echo -e "$cbis_version" | awk -F\. '{print $1}')
7106 # cbis_hotfix_name=$(/var/lib/cbis/cbis_hotfix list -c 'Hotfix name' -f json | jq .[].hotfix_name | head -n1 | tr -d \")
7107 # if [[ $cbis_hotfix_name != "CBIS-19.0-SP3" ]]
7108 # then
7109 # cbis_hotfix_build=$(/var/lib/cbis/cbis_hotfix list -c 'Hotfix name' -f json | jq .[].build_number | head -n1 | tr -d \")
7110 # latest_hotfix_build=$(curl -g -sk https://repo3.cci.nokia.net/cbis-generic-candidates-local/cbis_local_repo/hotfix/CBIS-$cbis_short_version.x/$cbis_hotfix_name/ | awk -F\" '{print $2}' | grep ^[0-9] | tr -d / | sort -n | tail -n1)
7111 # if [[ -z $latest_hotfix_build ]]
7112 # then
7113 # echo -e "${MAGENTA}something went wrong, couldn't get the builds information from https://repo3.cci.nokia.net/cbis-generic-candidates-local/cbis_local_repo/hotfix/CBIS-$cbis_short_version.x/$cbis_hotfix_name/${NC}"
7114 # else
7115 # if [[ $latest_hotfix_build != $cbis_hotfix_build ]]
7116 # then
7117 # echo -e "${MAGENTA}$cbis_hotfix_name build $latest_hotfix_build is available while the system is installed with build $cbis_hotfix_build - check with RnD if build $latest_hotfix_build is valid and if so, install it${NC}"
7118 # else
7119 # echo -e "${GREEN}using the latest $cbis_hotfix_name build $latest_hotfix_build${NC}"
7120 # fi
7121 # fi
7122 # else
7123 # echo -e "${GREEN}CBIS-19.0-SP3 hotfix is already published and is a scratch install${NC}"
7124 # fi
7125 # else
7126 # echo -e "${ORANGE}no hotfix is deployed${NC}"
7127 # fi
7128
7129
7130 ####################################################################################################
7131
7132
7133 # start=$(date +%s)
7134 # STEPS_COUNTER=$((STEPS_COUNTER+1))
7135 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VALIDATE THAT SECURITY HARDENING FINISHED VIA THE SECURITY HARDENING LOG IN CBIS MANAGER (+$elapsed_time_seconds `date '+%T'`)${NC}"
7136
7137 # if [[ $cbis_version == "19.0.0.1" ]]
7138 # then
7139 # hardening_finished_line=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/log/ansible/ansible.log' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' | tail -n1 | grep 'Finished security hardening')
7140 # else
7141 # hardening_finished_line=$(curl -g -s -k -L -X GET 'https://'$HypervisorURL'/log/security_hardening.log' -H 'Content-Type: application/json' -H 'Authorization: Basic '$cbis_manager_token'' | grep '/opt/install/data/states/hardening_state: 0')
7142 # fi
7143 # if [[ $hardening_finished_line ]]
7144 # then
7145 #     echo -e "${GREEN}security hardening finished successfully${NC}"
7146 # else
7147 #     echo -e "${MAGENTA}according to the security hardening log in cbis manager, security hardening was either never deployed or did not finish as expected\nreview the security hardening logs in the cbis manager security tab${NC}"
7148 # fi
7149 # elapsed_time_seconds=$(expr $(date +%s) - $start)
7150
7151
7152 ####################################################################################################
7153
7154
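# disabled check (left commented out): lists the base images under /var/lib/nova/instances/_base/ on every compute and verifies each one still exists in the overcloud image list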
 # start=$(date +%s)
 # STEPS_COUNTER=$((STEPS_COUNTER+1))
 # echo -e "${BLUE}\n\n$STEPS_COUNTER) LOCATE LEFTOVER INSTANCE IMAGES IN /var/lib/nova/instances/ ON THE COMPUTES (+$elapsed_time_seconds `date '+%T'`)${NC}"
 # compute_images=$(ansible compute -b -m shell -a "ls -l /var/lib/nova/instances/_base/ | grep -v total" | grep qemu -B 1)
 # compute_images_sorted=$(echo -e "$compute_images" | awk '{print $NF}' | grep ^[0-9a-f] | sort | uniq)
 # for compute_image in $compute_images_sorted
 # do
 # check_if_image_in_glance=$(echo -e "$overcloud_images_list" | grep $compute_image)
 # if [[ -z $check_if_image_in_glance ]]
 # then
 # show_image_compute=$(echo -e "$compute_images" | grep overcloud- | awk '{print $1}')
 # echo -e "${RED}/var/lib/nova/instances/_base/$compute_image is not found in openstack image list\n$show_image_compute${NC}\n" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 # else
 # echo -e "${GREEN}/var/lib/nova/instances/_base/$compute_image is found in openstack image list${NC}"
 # fi
 # done
 # elapsed_time_seconds=$(expr $(date +%s) - $start)


 ####################################################################################################


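# disabled step (left commented out): backs up /var/lib/config-data/puppet-generated from all overcloud nodes, archives /home/stack/templates and user_config.yaml, and collects the bash histories into $logs_dir/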
 # start=$(date +%s)
 # STEPS_COUNTER=$((STEPS_COUNTER+1))
 # echo -e "\n\n${BLUE}$STEPS_COUNTER) BACKUP CONFIG AND CRITICAL FILES FROM ALL THE HOSTS (+$elapsed_time_seconds `date '+%T'`)${NC}"
 # ansible all --limit '!hypervisor,!localhost' -b -m shell -a "yes | cp -R /var/lib/config-data/puppet-generated/. /home/cbis-admin/conf_backup" > /dev/null 2>&1
 # ansible all --limit '!hypervisor,!localhost' -b -m shell -a "tar -czf /home/cbis-admin/conf_backup_\`hostname\`.tar.gz -P /home/cbis-admin/conf_backup/*" > /dev/null 2>&1
 # ansible all --limit '!hypervisor,!localhost' -b -m shell -a "chmod 777 /home/cbis-admin/conf_backup_*.tar.gz" > /dev/null 2>&1
 # for host in $ansible_overcloud_hosts
 # do
 # scp $ssh_params cbis-admin@$host:/home/cbis-admin/conf_backup_*.tar.gz .
 # done
 # mv conf_backup_*.tar.gz $logs_dir/
 # tar -czf $logs_dir/conf_backup.tar.gz -P $logs_dir/conf_backup_*.tar.gz > /dev/null 2>&1
 # chmod 777 $logs_dir/conf_backup.tar.gz
 # rm -rf $logs_dir/conf_backup_*.tar.gz
 # if [[ -f "$logs_dir/conf_backup.tar.gz" ]]
 # then
 # echo -e "${GREEN}$logs_dir/conf_backup.tar.gz is found${NC}"
 # else
 # echo -e "${RED}$logs_dir/conf_backup.tar.gz can't be found${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 # fi
 # if [[ ! -f $logs_dir/templates_backup.tar.gz ]]
 # then
 # tar -zcf templates_backup.tar.gz -P /home/stack/templates/ user_config.yaml
 # mv templates_backup.tar.gz $logs_dir/
 # fi
 # sshpass -p $hv_cbis_admin_password ansible -k all -b -m shell -a "cat /home/stack/.bash_history /home/cbis-admin/.bash_history /root/.bash_history" > bash_history
 # cat bash_history | grep -E '^overcloud|^172.31.7.254' | xargs -i sed -i 's/{}/\n\n\n\n\n&/g' bash_history
 # mv bash_history $logs_dir/
 # elapsed_time_seconds=$(expr $(date +%s) - $start)


 ####################################################################################################


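# disabled check (left commented out): verifies that no non-admin project has a -1 (unlimited) quota for instances, backups, networks, ram, volumes, cores or snapshots, creating a temporary project when none exist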
 # start=$(date +%s)
 # STEPS_COUNTER=$((STEPS_COUNTER+1))
 # echo -e "${BLUE}\n\n$STEPS_COUNTER) VERIFY non-admin CORE RESOURCES (instances, backups, networks, ram, volumes, cores and snapshots) QUOTA IS NOT -1 (UNLIMITED) (+$elapsed_time_seconds `date '+%T'`)${NC}"
 # projects=$(source ~/overcloudrc && openstack project list -f value | grep -Fvw -e 'service' -e 'admin' -e 'opnfv_bench' | awk '{print $NF}')
 # if [[ $projects ]]
 # then
 # for project in $projects
 # do
 # echo -e "${CYAN}checking quota for project $project${NC}"
 # quota=$(source ~/overcloudrc && openstack quota show $project -f json | jq '{instances,backups,networks,ram,volumes,cores,snapshots}' | grep '\-1')
 # if [[ $quota ]]
 # then
 # echo -e "${RED}$quota${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 # else
 # echo -e "${GREEN}project $project has no -1 (unlimited) quota value for resources instances, backups, networks, ram, volumes, cores and snapshots${NC}"
 # fi
 # done
 # else
 # source ~/overcloudrc && openstack project create system_validation_project > /dev/null
 # echo -e "${CYAN}checking quota for project system_validation_project${NC}"
 # validate_project_creation=$(source ~/overcloudrc && openstack project list | grep system_validation_project)
 # while [[ -z $validate_project_creation ]]
 # do
 # sleep 2
 # validate_project_creation=$(source ~/overcloudrc && openstack project list | grep system_validation_project)
 # done
 # echo -e "${GREEN}project system_validation_project is created successfully${NC}"
 # quota=$(source ~/overcloudrc && openstack quota show system_validation_project -f json | jq '{instances,backups,networks,ram,volumes,cores,snapshots}' | grep '\-1')
 # if [[ $quota ]]
 # then
 # echo -e "${RED}$quota${NC}" ; FAILURE_COUNTER=$((FAILURE_COUNTER+1))
 # else
 # echo -e "${GREEN}project system_validation_project has no -1 (unlimited) quota value for resources instances, backups, networks, ram, volumes, cores and snapshots${NC}"
 # fi
 # source ~/overcloudrc && openstack project delete system_validation_project
 # validate_project_deletion=$(source ~/overcloudrc && openstack project list | grep system_validation_project)
 # while [[ $validate_project_deletion ]]
 # do
 # sleep 2
 # validate_project_deletion=$(source ~/overcloudrc && openstack project list | grep system_validation_project)
 # done
 # echo -e "${GREEN}project system_validation_project is deleted successfully${NC}"
 # fi


 ####################################################################################################


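# disabled step (left commented out): saves the output of rabbitmqctl report from the rabbitmq container on one controller into $logs_dir/rabbitmqctl_report.log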
 # start=$(date +%s)
 # STEPS_COUNTER=$((STEPS_COUNTER+1))
 # echo -e "${BLUE}\n\n$STEPS_COUNTER) SAVE rabbitmqctl report IN $logs_dir/rabbitmqctl_report.log (+$elapsed_time_seconds `date '+%T'`)${NC}"
 # ansible $last_index_controller -b -m shell -a "docker exec \$(sudo docker ps -f name=rabbitmq-bundle -q) rabbitmqctl report" > $logs_dir/rabbitmqctl_report.log
 # if [[ -f "$logs_dir/rabbitmqctl_report.log" ]]
 # then
 # echo -e "${GREEN}$logs_dir/rabbitmqctl_report.log is found${NC}"
 # else
 # echo -e "${RED}$logs_dir/rabbitmqctl_report.log is not found${NC}"
 # fi
 # elapsed_time_seconds=$(expr $(date +%s) - $start)


 ####################################################################################################
fi
 ####################################################################################################


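# convert the total runtime from seconds to minutes (two decimals, via bc) and warn when the run is noticeably slower than the usual ~15 minutes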
elapsed_time_in_minutes=$(expr $(date +%s) - $global_start | xargs -i echo 'scale=2; '{}'/60' | bc)
if (( $(echo "$elapsed_time_in_minutes > 20" | bc -l) && $(echo "$elapsed_time_in_minutes <= 25" | bc -l) ))
then
 echo -e "\n\n${ORANGE}it took $elapsed_time_in_minutes minutes for the script to finish. on average the script finishes in ~15 minutes.${NC}"
elif (( $(echo "$elapsed_time_in_minutes > 25" | bc -l) ))
then
 echo -e "\n\n${MAGENTA}it took $elapsed_time_in_minutes minutes for the script to finish. on average the script finishes in ~15 minutes.${NC}"
fi
date=$(date +"%x %X %Z %Y")


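# print a blinking red summary when any validation step incremented FAILURE_COUNTER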
if [ $FAILURE_COUNTER -gt 0 ]
then
 echo -e "\n\n${RED}${BLINK}==================================================="
 echo -e "Found $FAILURE_COUNTER failures - Please review the script output"
 echo -e "===================================================${NC}"
fi


echo -e "${BLUE}\n\nTHE SYSTEM HEALTH VALIDATION SCRIPT FINISHED AFTER $elapsed_time_in_minutes MINUTES AT ($date)\nTHE SCRIPT OUTPUT IS SAVED UNDER: $logs_dir/system_health_report_"$logs_count"_"$DESCRIPTION"_"$MODE".log\nFOR QUESTIONS, IMPROVEMENTS, SUGGESTIONS, COMPLAINTS AND BUGS, PLEASE CONTACT ${UL}arik.rozenman@nokia.com${NC}"

}
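# run main_function and duplicate its stdout and stderr to the console and to the per-run report file via tee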
main_function 2>&1 | tee $logs_dir/system_health_report_"$logs_count"_"$DESCRIPTION"_"$MODE".log
