Skip to content

Commit 4372ced

Browse files
authored
Add lock to config reload/load_minigraph (sonic-net#3475)
What I did In some cases, if multiple config reload/load_minigraph are running in parallel, they might leave the system in an error state. In this PR, a flock is added to config reload/load_minigraph so they will not run in parallel. The file lock is binding to /etc/sonic/reload.lock. This is to fix issue: sonic-net#19855 Microsoft ADO (number only): 28877643 Signed-off-by: Longxiang Lyu [email protected] How I did it Add flock utility and decoate the reload and load_minigraph with the try_lock to ensure the lock is acquired before reload/load_minigraph. How to verify it UT and on testbed. New command output (if the output of a command-line utility has changed) reload with locking success # config reload Acquired lock on /etc/sonic/reload.lock Clear current config and reload config in config_db format from the default config file(s) ? [y/N]: y Disabling container monitoring ... Stopping SONiC target ... Running command: /usr/local/bin/sonic-cfggen -j /etc/sonic/init_cfg.json -j /etc/sonic/config_db.json --write-to-db Running command: /usr/local/bin/db_migrator.py -o migrate Running command: /usr/local/bin/sonic-cfggen -d -y /etc/sonic/sonic_version.yml -t /usr/share/sonic/templates/sonic-environment.j2,/etc/sonic/sonic-environment Restarting SONiC target ... Enabling container monitoring ... Reloading Monit configuration ... Reinitializing monit daemon Released lock on /etc/sonic/reload.lock reload with locking failure # config reload Failed to acquire lock on /etc/sonic/reload.lock
1 parent 1c4300f commit 4372ced

File tree

4 files changed

+448
-12
lines changed

4 files changed

+448
-12
lines changed

config/main.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
from utilities_common.general import load_db_config, load_module_from_source
4343
from .validated_config_db_connector import ValidatedConfigDBConnector
4444
import utilities_common.multi_asic as multi_asic_util
45+
from utilities_common.flock import try_lock
4546

4647
from .utils import log
4748

@@ -124,6 +125,12 @@
124125
GRE_TYPE_RANGE = click.IntRange(min=0, max=65535)
125126
ADHOC_VALIDATION = True
126127

128+
if os.environ.get("UTILITIES_UNIT_TESTING", "0") in ("1", "2"):
129+
temp_system_reload_lockfile = tempfile.NamedTemporaryFile()
130+
SYSTEM_RELOAD_LOCK = temp_system_reload_lockfile.name
131+
else:
132+
SYSTEM_RELOAD_LOCK = "/etc/sonic/reload.lock"
133+
127134
# Load sonic-cfggen from source since /usr/local/bin/sonic-cfggen does not have .py extension.
128135
sonic_cfggen = load_module_from_source('sonic_cfggen', '/usr/local/bin/sonic-cfggen')
129136

@@ -1753,9 +1760,11 @@ def list_checkpoints(ctx, verbose):
17531760
@click.option('-n', '--no_service_restart', default=False, is_flag=True, help='Do not restart docker services')
17541761
@click.option('-f', '--force', default=False, is_flag=True, help='Force config reload without system checks')
17551762
@click.option('-t', '--file_format', default='config_db',type=click.Choice(['config_yang', 'config_db']),show_default=True,help='specify the file format')
1763+
@click.option('-b', '--bypass-lock', default=False, is_flag=True, help='Do reload without acquiring lock')
17561764
@click.argument('filename', required=False)
17571765
@clicommon.pass_db
1758-
def reload(db, filename, yes, load_sysinfo, no_service_restart, force, file_format):
1766+
@try_lock(SYSTEM_RELOAD_LOCK, timeout=0)
1767+
def reload(db, filename, yes, load_sysinfo, no_service_restart, force, file_format, bypass_lock):
17591768
"""Clear current configuration and import a previous saved config DB dump file.
17601769
<filename> : Names of configuration file(s) to load, separated by comma with no spaces in between
17611770
"""
@@ -1968,8 +1977,10 @@ def load_mgmt_config(filename):
19681977
@click.option('-t', '--traffic_shift_away', default=False, is_flag=True, help='Keep device in maintenance with TSA')
19691978
@click.option('-o', '--override_config', default=False, is_flag=True, help='Enable config override. Proceed with default path.')
19701979
@click.option('-p', '--golden_config_path', help='Provide golden config path to override. Use with --override_config')
1980+
@click.option('-b', '--bypass-lock', default=False, is_flag=True, help='Do load minigraph without acquiring lock')
19711981
@clicommon.pass_db
1972-
def load_minigraph(db, no_service_restart, traffic_shift_away, override_config, golden_config_path):
1982+
@try_lock(SYSTEM_RELOAD_LOCK, timeout=0)
1983+
def load_minigraph(db, no_service_restart, traffic_shift_away, override_config, golden_config_path, bypass_lock):
19731984
"""Reconfigure based on minigraph."""
19741985
argv_str = ' '.join(['config', *sys.argv[1:]])
19751986
log.log_notice(f"'load_minigraph' executing with command: {argv_str}")

tests/config_test.py

+159-10
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from click.testing import CliRunner
1919

2020
from sonic_py_common import device_info, multi_asic
21+
from utilities_common import flock
2122
from utilities_common.db import Db
2223
from utilities_common.general import load_module_from_source
2324
from mock import call, patch, mock_open, MagicMock
@@ -45,6 +46,23 @@
4546
load_minigraph_platform_false_path = os.path.join(load_minigraph_input_path, "platform_false")
4647

4748
load_minigraph_command_output="""\
49+
Acquired lock on {0}
50+
Stopping SONiC target ...
51+
Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db
52+
Running command: config qos reload --no-dynamic-buffer --no-delay
53+
Running command: pfcwd start_default
54+
Restarting SONiC target ...
55+
Reloading Monit configuration ...
56+
Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`.
57+
Released lock on {0}
58+
"""
59+
60+
load_minigraph_lock_failure_output = """\
61+
Failed to acquire lock on {0}
62+
"""
63+
64+
load_minigraph_command_bypass_lock_output = """\
65+
Bypass lock on {}
4866
Stopping SONiC target ...
4967
Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db
5068
Running command: config qos reload --no-dynamic-buffer --no-delay
@@ -55,6 +73,7 @@
5573
"""
5674

5775
load_minigraph_platform_plugin_command_output="""\
76+
Acquired lock on {0}
5877
Stopping SONiC target ...
5978
Running command: /usr/local/bin/sonic-cfggen -H -m --write-to-db
6079
Running command: config qos reload --no-dynamic-buffer --no-delay
@@ -63,6 +82,7 @@
6382
Restarting SONiC target ...
6483
Reloading Monit configuration ...
6584
Please note setting loaded from minigraph will be lost after system reboot. To preserve setting, run `config save`.
85+
Released lock on {0}
6686
"""
6787

6888
load_mgmt_config_command_ipv4_only_output="""\
@@ -137,51 +157,76 @@
137157
"""
138158

139159
RELOAD_CONFIG_DB_OUTPUT = """\
160+
Acquired lock on {0}
161+
Stopping SONiC target ...
162+
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db
163+
Restarting SONiC target ...
164+
Reloading Monit configuration ...
165+
Released lock on {0}
166+
"""
167+
168+
RELOAD_CONFIG_DB_LOCK_FAILURE_OUTPUT = """\
169+
Failed to acquire lock on {0}
170+
"""
171+
172+
RELOAD_CONFIG_DB_BYPASS_LOCK_OUTPUT = """\
173+
Bypass lock on {0}
140174
Stopping SONiC target ...
141175
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db
142176
Restarting SONiC target ...
143177
Reloading Monit configuration ...
144178
"""
145179

146180
RELOAD_YANG_CFG_OUTPUT = """\
181+
Acquired lock on {0}
147182
Stopping SONiC target ...
148183
Running command: /usr/local/bin/sonic-cfggen -Y /tmp/config.json --write-to-db
149184
Restarting SONiC target ...
150185
Reloading Monit configuration ...
186+
Released lock on {0}
151187
"""
152188

153189
RELOAD_MASIC_CONFIG_DB_OUTPUT = """\
190+
Acquired lock on {0}
154191
Stopping SONiC target ...
155192
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db
156193
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json -n asic0 --write-to-db
157194
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json -n asic1 --write-to-db
158195
Restarting SONiC target ...
159196
Reloading Monit configuration ...
197+
Released lock on {0}
160198
"""
161199

162200
reload_config_with_sys_info_command_output="""\
201+
Acquired lock on {0}
163202
Running command: /usr/local/bin/sonic-cfggen -H -k Seastone-DX010-25-50 --write-to-db"""
164203

165204
reload_config_with_disabled_service_output="""\
205+
Acquired lock on {0}
166206
Stopping SONiC target ...
167207
Running command: /usr/local/bin/sonic-cfggen -j /tmp/config.json --write-to-db
168208
Restarting SONiC target ...
169209
Reloading Monit configuration ...
210+
Released lock on {0}
170211
"""
171212

172213
reload_config_masic_onefile_output = """\
214+
Acquired lock on {0}
173215
Stopping SONiC target ...
174216
Restarting SONiC target ...
175217
Reloading Monit configuration ...
218+
Released lock on {0}
176219
"""
177220

178221
reload_config_masic_onefile_gen_sysinfo_output = """\
222+
Acquired lock on {0}
179223
Stopping SONiC target ...
180224
Running command: /usr/local/bin/sonic-cfggen -H -k Mellanox-SN3800-D112C8 --write-to-db
181225
Running command: /usr/local/bin/sonic-cfggen -H -k multi_asic -n asic0 --write-to-db
182226
Running command: /usr/local/bin/sonic-cfggen -H -k multi_asic -n asic1 --write-to-db
183227
Restarting SONiC target ...
184228
Reloading Monit configuration ...
229+
Released lock on {0}
185230
"""
186231

187232
save_config_output = """\
@@ -601,7 +646,8 @@ def test_config_reload(self, get_cmd_module, setup_single_broadcom_asic):
601646

602647
assert result.exit_code == 0
603648

604-
assert "\n".join([l.rstrip() for l in result.output.split('\n')][:1]) == reload_config_with_sys_info_command_output
649+
assert "\n".join([line.rstrip() for line in result.output.split('\n')][:2]) == \
650+
reload_config_with_sys_info_command_output.format(config.SYSTEM_RELOAD_LOCK)
605651

606652
def test_config_reload_stdin(self, get_cmd_module, setup_single_broadcom_asic):
607653
def mock_json_load(f):
@@ -641,7 +687,8 @@ def mock_json_load(f):
641687

642688
assert result.exit_code == 0
643689

644-
assert "\n".join([l.rstrip() for l in result.output.split('\n')][:1]) == reload_config_with_sys_info_command_output
690+
assert "\n".join([line.rstrip() for line in result.output.split('\n')][:2]) == \
691+
reload_config_with_sys_info_command_output.format(config.SYSTEM_RELOAD_LOCK)
645692

646693
@classmethod
647694
def teardown_class(cls):
@@ -747,7 +794,8 @@ def read_json_file_side_effect(filename):
747794
traceback.print_tb(result.exc_info[2])
748795

749796
assert result.exit_code == 0
750-
assert "\n".join([li.rstrip() for li in result.output.split('\n')]) == reload_config_masic_onefile_output
797+
assert "\n".join([li.rstrip() for li in result.output.split('\n')]) == \
798+
reload_config_masic_onefile_output.format(config.SYSTEM_RELOAD_LOCK)
751799

752800
def test_config_reload_onefile_gen_sysinfo_masic(self):
753801
def read_json_file_side_effect(filename):
@@ -823,7 +871,7 @@ def read_json_file_side_effect(filename):
823871
assert result.exit_code == 0
824872
assert "\n".join(
825873
[li.rstrip() for li in result.output.split('\n')]
826-
) == reload_config_masic_onefile_gen_sysinfo_output
874+
) == reload_config_masic_onefile_gen_sysinfo_output.format(config.SYSTEM_RELOAD_LOCK)
827875

828876
def test_config_reload_onefile_bad_format_masic(self):
829877
def read_json_file_side_effect(filename):
@@ -878,11 +926,58 @@ def test_load_minigraph(self, get_cmd_module, setup_single_broadcom_asic):
878926
print(result.output)
879927
traceback.print_tb(result.exc_info[2])
880928
assert result.exit_code == 0
881-
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_command_output
929+
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \
930+
(load_minigraph_command_output.format(config.SYSTEM_RELOAD_LOCK))
882931
# Verify "systemctl reset-failed" is called for services under sonic.target
883932
mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss'])
884933
assert mock_run_command.call_count == 12
885934

935+
@mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs',
936+
mock.MagicMock(return_value=("dummy_path", None)))
937+
def test_load_minigraph_lock_failure(self, get_cmd_module, setup_single_broadcom_asic):
938+
with mock.patch("utilities_common.cli.run_command",
939+
mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command:
940+
(config, _) = get_cmd_module
941+
942+
fd = open(config.SYSTEM_RELOAD_LOCK, 'r')
943+
assert flock.acquire_flock(fd, 0)
944+
945+
try:
946+
runner = CliRunner()
947+
result = runner.invoke(config.config.commands["load_minigraph"], ["-y"])
948+
print(result.exit_code)
949+
print(result.output)
950+
traceback.print_tb(result.exc_info[2])
951+
assert result.exit_code != 0
952+
assert result.output == \
953+
(load_minigraph_lock_failure_output.format(config.SYSTEM_RELOAD_LOCK))
954+
assert mock_run_command.call_count == 0
955+
finally:
956+
flock.release_flock(fd)
957+
958+
@mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs',
959+
mock.MagicMock(return_value=("dummy_path", None)))
960+
def test_load_minigraph_bypass_lock(self, get_cmd_module, setup_single_broadcom_asic):
961+
with mock.patch("utilities_common.cli.run_command",
962+
mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command:
963+
(config, _) = get_cmd_module
964+
965+
fd = open(config.SYSTEM_RELOAD_LOCK, 'r')
966+
assert flock.acquire_flock(fd, 0)
967+
968+
try:
969+
runner = CliRunner()
970+
result = runner.invoke(config.config.commands["load_minigraph"], ["-y", "-b"])
971+
print(result.exit_code)
972+
print(result.output)
973+
traceback.print_tb(result.exc_info[2])
974+
assert result.exit_code == 0
975+
assert result.output == \
976+
load_minigraph_command_bypass_lock_output.format(config.SYSTEM_RELOAD_LOCK)
977+
assert mock_run_command.call_count == 12
978+
finally:
979+
flock.release_flock(fd)
980+
886981
@mock.patch('sonic_py_common.device_info.get_paths_to_platform_and_hwsku_dirs', mock.MagicMock(return_value=(load_minigraph_platform_path, None)))
887982
def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broadcom_asic):
888983
with mock.patch("utilities_common.cli.run_command", mock.MagicMock(side_effect=mock_run_command_side_effect)) as mock_run_command:
@@ -893,7 +988,8 @@ def test_load_minigraph_platform_plugin(self, get_cmd_module, setup_single_broad
893988
print(result.output)
894989
traceback.print_tb(result.exc_info[2])
895990
assert result.exit_code == 0
896-
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == load_minigraph_platform_plugin_command_output
991+
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \
992+
(load_minigraph_platform_plugin_command_output.format(config.SYSTEM_RELOAD_LOCK))
897993
# Verify "systemctl reset-failed" is called for services under sonic.target
898994
mock_run_command.assert_any_call(['systemctl', 'reset-failed', 'swss'])
899995
assert mock_run_command.call_count == 12
@@ -1171,7 +1267,59 @@ def test_reload_config(self, get_cmd_module, setup_single_broadcom_asic):
11711267
traceback.print_tb(result.exc_info[2])
11721268
assert result.exit_code == 0
11731269
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \
1174-
== RELOAD_CONFIG_DB_OUTPUT
1270+
== RELOAD_CONFIG_DB_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)
1271+
1272+
def test_reload_config_lock_failure(self, get_cmd_module, setup_single_broadcom_asic):
1273+
self.add_sysinfo_to_cfg_file()
1274+
with mock.patch(
1275+
"utilities_common.cli.run_command",
1276+
mock.MagicMock(side_effect=mock_run_command_side_effect)
1277+
):
1278+
(config, show) = get_cmd_module
1279+
runner = CliRunner()
1280+
1281+
fd = open(config.SYSTEM_RELOAD_LOCK, 'r')
1282+
assert flock.acquire_flock(fd, 0)
1283+
1284+
try:
1285+
result = runner.invoke(
1286+
config.config.commands["reload"],
1287+
[self.dummy_cfg_file, '-y', '-f'])
1288+
1289+
print(result.exit_code)
1290+
print(result.output)
1291+
traceback.print_tb(result.exc_info[2])
1292+
assert result.exit_code != 0
1293+
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) \
1294+
== RELOAD_CONFIG_DB_LOCK_FAILURE_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)
1295+
finally:
1296+
flock.release_flock(fd)
1297+
1298+
def test_reload_config_bypass_lock(self, get_cmd_module, setup_single_broadcom_asic):
1299+
self.add_sysinfo_to_cfg_file()
1300+
with mock.patch(
1301+
"utilities_common.cli.run_command",
1302+
mock.MagicMock(side_effect=mock_run_command_side_effect)
1303+
):
1304+
(config, show) = get_cmd_module
1305+
runner = CliRunner()
1306+
1307+
fd = open(config.SYSTEM_RELOAD_LOCK, 'r')
1308+
assert flock.acquire_flock(fd, 0)
1309+
1310+
try:
1311+
result = runner.invoke(
1312+
config.config.commands["reload"],
1313+
[self.dummy_cfg_file, '-y', '-f', '-b'])
1314+
1315+
print(result.exit_code)
1316+
print(result.output)
1317+
traceback.print_tb(result.exc_info[2])
1318+
assert result.exit_code == 0
1319+
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) \
1320+
== RELOAD_CONFIG_DB_BYPASS_LOCK_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)
1321+
finally:
1322+
flock.release_flock(fd)
11751323

11761324
def test_config_reload_disabled_service(self, get_cmd_module, setup_single_broadcom_asic):
11771325
self.add_sysinfo_to_cfg_file()
@@ -1191,7 +1339,8 @@ def test_config_reload_disabled_service(self, get_cmd_module, setup_single_broad
11911339

11921340
assert result.exit_code == 0
11931341

1194-
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) == reload_config_with_disabled_service_output
1342+
assert "\n".join([line.rstrip() for line in result.output.split('\n')]) == \
1343+
reload_config_with_disabled_service_output.format(config.SYSTEM_RELOAD_LOCK)
11951344

11961345
def test_reload_config_masic(self, get_cmd_module, setup_multi_broadcom_masic):
11971346
self.add_sysinfo_to_cfg_file()
@@ -1215,7 +1364,7 @@ def test_reload_config_masic(self, get_cmd_module, setup_multi_broadcom_masic):
12151364
traceback.print_tb(result.exc_info[2])
12161365
assert result.exit_code == 0
12171366
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \
1218-
== RELOAD_MASIC_CONFIG_DB_OUTPUT
1367+
== RELOAD_MASIC_CONFIG_DB_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)
12191368

12201369
def test_reload_yang_config(self, get_cmd_module,
12211370
setup_single_broadcom_asic):
@@ -1234,7 +1383,7 @@ def test_reload_yang_config(self, get_cmd_module,
12341383
traceback.print_tb(result.exc_info[2])
12351384
assert result.exit_code == 0
12361385
assert "\n".join([l.rstrip() for l in result.output.split('\n')]) \
1237-
== RELOAD_YANG_CFG_OUTPUT
1386+
== RELOAD_YANG_CFG_OUTPUT.format(config.SYSTEM_RELOAD_LOCK)
12381387

12391388
@classmethod
12401389
def teardown_class(cls):

0 commit comments

Comments
 (0)