Skip to content

Commit 3342a31

Browse files
committed
Add asic presence filtering for container checking in system-health
1 parent fd3966a commit 3342a31

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

src/system-health/health_checker/service_checker.py

+18 -4
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,19 @@ def get_expected_running_containers(self, feature_table):
7171
"""
7272
expected_running_containers = set()
7373
container_feature_dict = {}
74+
75+
# Get current asic presence list. For multi_asic system, multi instance containers
76+
# should be checked only for asics present.
77+
asics_id_presence = multi_asic.get_asic_presence_list()
78+
79+
# Some services may run all the instances irrespective of asic presence.
80+
# Add those to exception list.
81+
# database service: Currently services have dependency on all database services to
82+
# be up irrespective of asic presence.
83+
# bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
84+
# it will be removed from exception list.
85+
run_all_instance_list = ['database', 'bgp']
86+
7487
for feature_name, feature_entry in feature_table.items():
7588
if feature_entry["state"] not in ["disabled", "always_disabled"]:
7689
if multi_asic.is_multi_asic():
@@ -80,8 +93,9 @@ def get_expected_running_containers(self, feature_table):
8093
if feature_entry["has_per_asic_scope"] == "True":
8194
num_asics = multi_asic.get_num_asics()
8295
for asic_id in range(num_asics):
83-
expected_running_containers.add(feature_name + str(asic_id))
84-
container_feature_dict[feature_name + str(asic_id)] = feature_name
96+
if asic_id in asics_id_presence or feature_name in run_all_instance_list:
97+
expected_running_containers.add(feature_name + str(asic_id))
98+
container_feature_dict[feature_name + str(asic_id)] = feature_name
8599
else:
86100
expected_running_containers.add(feature_name)
87101
container_feature_dict[feature_name] = feature_name
@@ -343,7 +357,7 @@ def check_process_existence(self, container_name, critical_process_list, config,
343357
process_status = utils.run_command(cmd)
344358
if process_status is None:
345359
for process_name in critical_process_list:
346-
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
360+
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
347361
self.publish_events(container_name, critical_process_list)
348362
return
349363

@@ -356,6 +370,6 @@ def check_process_existence(self, container_name, critical_process_list, config,
356370
# and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
357371
if process_name in process_status:
358372
if process_status[process_name] != 'RUNNING':
359-
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
373+
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
360374
else:
361375
self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))

0 commit comments

Comments
 (0)