Skip to content

Commit 51712aa

Browse files
Junchao-Mellanoxpull[bot]
authored andcommitted
[system-health] Fix error log system_service'state' while doing confi… (#11225)
- Why I did it While doing config reload, FEATURE table may be removed and re-add. During this process, updating FEATURE table is not atomic. It could be that the FEATURE table has entry, but each entry has no field. This PR introduces a retry mechanism to avoid this. - How I did it Introduces a retry mechanism to avoid this. - How to verify it New unit test added to verify the flow as well as running some manual test.
1 parent 8db60ac commit 51712aa

File tree

2 files changed

+57
-6
lines changed

2 files changed

+57
-6
lines changed

src/system-health/health_checker/sysmonitor.py

+37-6
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import os
44
import sys
5+
import time
56
import glob
67
import multiprocessing
78
from datetime import datetime
@@ -145,12 +146,7 @@ def get_all_service_list(self):
145146
dir_list += [os.path.basename(i) for i in glob.glob('{}/*.service'.format(path))]
146147

147148
#add the enabled docker services from config db feature table
148-
feature_table = self.config_db.get_table("FEATURE")
149-
for srv in feature_table.keys():
150-
if feature_table[srv]["state"] not in ["disabled", "always_disabled"]:
151-
srvext = srv + ".service"
152-
if srvext not in dir_list:
153-
dir_list.append(srvext)
149+
self.get_service_from_feature_table(dir_list)
154150

155151
self.config.load_config()
156152
if self.config and self.config.ignore_services:
@@ -161,6 +157,41 @@ def get_all_service_list(self):
161157
dir_list.sort()
162158
return dir_list
163159

160+
def get_service_from_feature_table(self, dir_list):
161+
"""Get service from CONFIG DB FEATURE table. During "config reload" command, filling FEATURE table
162+
is not an atomic operation, sonic-cfggen do it with two steps:
163+
1. Add an empty table entry to CONFIG DB
164+
2. Add all fields to the table
165+
166+
So, if system health read db on middle of step 1 and step 2, it might read invalid data. A retry
167+
mechanism is here to avoid such issue.
168+
169+
Args:
170+
dir_list (list): service list
171+
"""
172+
max_retry = 3
173+
retry_delay = 1
174+
success = True
175+
176+
while max_retry > 0:
177+
success = True
178+
feature_table = self.config_db.get_table("FEATURE")
179+
for srv, fields in feature_table.items():
180+
if 'state' not in fields:
181+
success = False
182+
logger.log_warning("FEATURE table is not fully ready: {}, retrying".format(feature_table))
183+
break
184+
if fields["state"] not in ["disabled", "always_disabled"]:
185+
srvext = srv + ".service"
186+
if srvext not in dir_list:
187+
dir_list.append(srvext)
188+
if not success:
189+
max_retry -= 1
190+
time.sleep(retry_delay)
191+
else:
192+
break
193+
if not success:
194+
logger.log_error("FEATURE table is not fully ready: {}, max retry reached".format(feature_table))
164195

165196
#Checks FEATURE table from config db for the service' check_up_status flag
166197
#if marked to true, then read the service up_status from FEATURE table of state db.

src/system-health/tests/test_system_health.py

+20
Original file line numberDiff line numberDiff line change
@@ -720,3 +720,23 @@ def test_system_service():
720720
sysmon.task_run()
721721
assert sysmon._task_process is not None
722722
sysmon.task_stop()
723+
724+
725+
def test_get_service_from_feature_table():
726+
sysmon = Sysmonitor()
727+
sysmon.config_db = MagicMock()
728+
sysmon.config_db.get_table = MagicMock()
729+
sysmon.config_db.get_table.side_effect = [
730+
{
731+
'bgp': {},
732+
'swss': {}
733+
},
734+
{
735+
'bgp': {'state': 'enabled'},
736+
'swss': {'state': 'disabled'}
737+
}
738+
]
739+
dir_list = []
740+
sysmon.get_service_from_feature_table(dir_list)
741+
assert 'bgp.service' in dir_list
742+
assert 'swss.service' not in dir_list

0 commit comments

Comments
 (0)