Skip to content

Commit fbef246

Browse files
spilkey-ciscomssonicbld
authored and committed
Add asic presence filtering for container checking in system-health (sonic-net#13497)
Why I did it: On a supervisor card in a chassis, dockers such as syncd, teamd, swss, and lldp are created for each Switch Fabric card. However, not every chassis has all of the Switch Fabric cards present; in that case, dockers are created only for the Switch Fabric cards that are present. system-health reports errors in this scenario because it expects dockers for all Switch Fabric cards (based on NUM_ASIC defined in the asic.conf file). The system-health process error messages were also altered to indicate which container had the issue: multiple containers may run processes with the same name, which can result in identical system-health error messages and cause ambiguity. How I did it: Ported the container_checker logic from sonic-net#11442 into service_checker for system-health. How to verify it: Bring up a Supervisor card with one or more fabric cards missing. Execute 'show system-health summary'. The command should not report a failure due to missing dockers for the ASICs on the fabric cards that are not present.
1 parent 1d155b8 commit fbef246

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

src/system-health/health_checker/service_checker.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,19 @@ def get_expected_running_containers(self, feature_table):
7171
"""
7272
expected_running_containers = set()
7373
container_feature_dict = {}
74+
75+
# Get current asic presence list. For multi_asic system, multi instance containers
76+
# should be checked only for asics present.
77+
asics_id_presence = multi_asic.get_asic_presence_list()
78+
79+
# Some services may run all the instances irrespective of asic presence.
80+
# Add those to exception list.
81+
# database service: Currently services have dependency on all database services to
82+
# be up irrespective of asic presence.
83+
# bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
84+
# it will be removed from exception list.
85+
run_all_instance_list = ['database', 'bgp']
86+
7487
for feature_name, feature_entry in feature_table.items():
7588
if feature_entry["state"] not in ["disabled", "always_disabled"]:
7689
if multi_asic.is_multi_asic():
@@ -80,8 +93,9 @@ def get_expected_running_containers(self, feature_table):
8093
if feature_entry["has_per_asic_scope"] == "True":
8194
num_asics = multi_asic.get_num_asics()
8295
for asic_id in range(num_asics):
83-
expected_running_containers.add(feature_name + str(asic_id))
84-
container_feature_dict[feature_name + str(asic_id)] = feature_name
96+
if asic_id in asics_id_presence or feature_name in run_all_instance_list:
97+
expected_running_containers.add(feature_name + str(asic_id))
98+
container_feature_dict[feature_name + str(asic_id)] = feature_name
8599
else:
86100
expected_running_containers.add(feature_name)
87101
container_feature_dict[feature_name] = feature_name
@@ -343,7 +357,7 @@ def check_process_existence(self, container_name, critical_process_list, config,
343357
process_status = utils.run_command(cmd)
344358
if process_status is None:
345359
for process_name in critical_process_list:
346-
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
360+
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
347361
self.publish_events(container_name, critical_process_list)
348362
return
349363

@@ -356,6 +370,6 @@ def check_process_existence(self, container_name, critical_process_list, config,
356370
# and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
357371
if process_name in process_status:
358372
if process_status[process_name] != 'RUNNING':
359-
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
373+
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
360374
else:
361375
self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))

0 commit comments

Comments
 (0)