Skip to content

Commit 9b5a439

Browse files
Add new test case for container based warm restart (#6054)
Added a new test case test_advanced_reboot::test_service_warm_restart. This test tries to run a warm restart for each service in the FEATURE table. If a URL is provided via the CLI option "--new_docker_image", the test case will try to download a new docker image from that URL and perform an in-service warm upgrade. The new test case reuses the advanced-reboot infrastructure in sonic-mgmt and adds some special logic for container-based warm restart.
1 parent 83d7643 commit 9b5a439

File tree

6 files changed

+267
-14
lines changed

6 files changed

+267
-14
lines changed

ansible/roles/test/files/ptftests/advanced-reboot.py

+91-7
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,17 @@ def __init__(self):
253253
self.kvm_test = True
254254
else:
255255
self.kvm_test = False
256+
if "service-warm-restart" in self.test_params['reboot_type']:
257+
self.check_param('service_list', None, required=True)
258+
self.check_param('service_data', None, required=True)
259+
self.service_data = self.test_params['service_data']
260+
for service_name in self.test_params['service_list']:
261+
cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name)
262+
stdout, _, _ = self.dut_connection.execCommand(cmd)
263+
if service_name not in self.service_data:
264+
self.service_data[service_name] = {}
265+
self.service_data[service_name]['service_start_time'] = str(stdout[0]).strip()
266+
self.log("Service start time for {} is {}".format(service_name, self.service_data[service_name]['service_start_time']))
256267
return
257268

258269
def read_json(self, name):
@@ -437,7 +448,7 @@ def build_vlan_if_port_mapping(self):
437448
portchannel_names = [pc['name'] for pc in portchannel_content.values()]
438449

439450
vlan_content = self.read_json('vlan_ports_file')
440-
451+
441452
vlan_if_port = []
442453
for vlan in self.vlan_ip_range:
443454
for ifname in vlan_content[vlan]['members']:
@@ -926,6 +937,31 @@ def wait_until_control_plane_up(self):
926937
self.no_control_stop = datetime.datetime.now()
927938
self.log("Dut reboots: control plane up at %s" % str(self.no_control_stop))
928939

940+
def wait_until_service_restart(self):
    """Wait until every service under test reports a new start timestamp.

    Each pending service in ``self.test_params['service_list']`` is polled
    on the DUT with ``systemctl show -p ExecMainStartTimestamp``. A service
    counts as restarted once that value differs from the baseline kept in
    ``self.service_data[name]['service_start_time']``.

    Side effects:
        Sets ``self.reboot_start`` to the time the wait began and, on
        timeout, records a failure in ``self.fails['dut']``.

    Raises:
        TimeoutError: if at least one service still shows its baseline
            timestamp after the whole polling budget has elapsed.
    """
    self.log("Wait until sevice restart")
    self.reboot_start = datetime.datetime.now()

    timeout = 120       # total polling budget, in seconds
    poll_interval = 10  # pause between polling rounds, in seconds

    service_list = self.test_params['service_list']
    pending = set(service_list)
    remaining = timeout
    while remaining > 0:
        # Snapshot the pending services (in list order) so the set can be
        # shrunk safely while we walk through them.
        for service_name in [name for name in service_list if name in pending]:
            cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name)
            stdout, _, _ = self.dut_connection.execCommand(cmd)
            current_start = str(stdout[0]).strip()
            if self.service_data[service_name]['service_start_time'] != current_start:
                pending.discard(service_name)
        if not pending:
            break
        remaining -= poll_interval
        time.sleep(poll_interval)

    if pending:
        # Report the full budget that was waited; the countdown variable is
        # exhausted (0) by the time we get here.
        self.fails['dut'].add(
            "Container {} hasn't come back up in {} seconds".format(','.join(sorted(pending)), timeout))
        raise TimeoutError

    # TODO: add timestamp
    self.log("Service has restarted")
964+
929965
def handle_fast_reboot_health_check(self):
930966
self.log("Check that device is still forwarding data plane traffic")
931967
self.fails['dut'].add("Data plane has a forwarding problem after CPU went down")
@@ -1017,6 +1053,10 @@ def wait_for_ssh_threads(signal):
10171053
# verify there are no interface flaps after warm boot
10181054
self.neigh_lag_status_check()
10191055

1056+
if 'service-warm-restart' == self.reboot_type:
1057+
# verify there are no interface flaps after warm boot
1058+
self.neigh_lag_status_check()
1059+
10201060
def handle_advanced_reboot_health_check_kvm(self):
10211061
self.log("Wait until data plane stops")
10221062
forward_stop_signal = multiprocessing.Event()
@@ -1193,8 +1233,11 @@ def runTest(self):
11931233
thr = threading.Thread(target=self.reboot_dut)
11941234
thr.setDaemon(True)
11951235
thr.start()
1196-
self.wait_until_control_plane_down()
1197-
self.no_control_start = self.cpu_state.get_state_time('down')
1236+
if self.reboot_type != 'service-warm-restart':
1237+
self.wait_until_control_plane_down()
1238+
self.no_control_start = self.cpu_state.get_state_time('down')
1239+
else:
1240+
self.wait_until_service_restart()
11981241

11991242
if 'warm-reboot' in self.reboot_type:
12001243
finalizer_timeout = 60 + self.test_params['reboot_limit_in_seconds']
@@ -1210,7 +1253,7 @@ def runTest(self):
12101253
else:
12111254
if self.reboot_type == 'fast-reboot':
12121255
self.handle_fast_reboot_health_check()
1213-
if 'warm-reboot' in self.reboot_type:
1256+
if 'warm-reboot' in self.reboot_type or 'service-warm-restart' == self.reboot_type:
12141257
self.handle_warm_reboot_health_check()
12151258
self.handle_post_reboot_health_check()
12161259

@@ -1276,15 +1319,20 @@ def reboot_dut(self):
12761319
time.sleep(self.reboot_delay)
12771320

12781321
if not self.kvm_test and\
1279-
(self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type):
1322+
(self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type):
12801323
self.sender_thr = threading.Thread(target = self.send_in_background)
12811324
self.sniff_thr = threading.Thread(target = self.sniff_in_background)
12821325
self.sniffer_started = threading.Event() # Event for the sniff_in_background status.
12831326
self.sniff_thr.start()
12841327
self.sender_thr.start()
12851328

12861329
self.log("Rebooting remote side")
1287-
stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30)
1330+
if self.reboot_type != 'service-warm-restart':
1331+
stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30)
1332+
else:
1333+
self.restart_service()
1334+
return
1335+
12881336
if stdout != []:
12891337
self.log("stdout from %s: %s" % (self.reboot_type, str(stdout)))
12901338
if stderr != []:
@@ -1300,6 +1348,42 @@ def reboot_dut(self):
13001348

13011349
return
13021350

1351+
def restart_service(self):
    """Warm-restart (or warm-upgrade) every service in the test's service list.

    For each service name in ``self.test_params['service_list']``:

    * if ``self.service_data[name]`` carries a usable ``image_path_on_dut``,
      run ``sonic-installer upgrade-docker ... --warm`` with that image;
    * otherwise enable warm restart for the service, run the per-service
      preparation in ``pre_service_warm_restart`` and restart the service.

    An ``image_path_on_dut`` that is present but ``None``/empty is treated
    the same as a missing key, since the key may be populated with ``None``
    when no image was staged for the service.

    Raises:
        Exception: if the restart/upgrade command wrote anything to stderr.
            The failure is also recorded in ``self.fails['dut']`` and the
            main thread is interrupted.
    """
    for service_name in self.test_params['service_list']:
        image_path = self.service_data[service_name].get('image_path_on_dut')
        if image_path:
            upgrade_cmd = "sudo sonic-installer upgrade-docker {} {} -y --warm".format(service_name, image_path)
            stdout, stderr, return_code = self.dut_connection.execCommand(upgrade_cmd, timeout=30)
        else:
            self.dut_connection.execCommand('sudo config warm_restart enable {}'.format(service_name))
            self.pre_service_warm_restart(service_name)
            stdout, stderr, return_code = self.dut_connection.execCommand('sudo service {} restart'.format(service_name))

        if stdout != []:
            self.log("stdout from %s %s: %s" % (self.reboot_type, service_name, str(stdout)))
        if stderr != []:
            self.log("stderr from %s %s: %s" % (self.reboot_type, service_name, str(stderr)))
            self.fails['dut'].add("service warm restart {} failed with error {}".format(service_name, stderr))
            thread.interrupt_main()
            raise Exception("{} failed with error {}".format(self.reboot_type, stderr))
        self.log("return code from %s %s: %s" % (self.reboot_type, service_name, str(return_code)))
        # Only return codes 0 and 255 are tolerated; anything else aborts
        # the main test thread.
        if return_code not in [0, 255]:
            thread.interrupt_main()
1371+
def pre_service_warm_restart(self, service_name):
    """Run the container-specific preparation step before a warm restart.

    The steps are copied from src/sonic-utilities/sonic_installer/main.py.
    Only 'swss', 'bgp' and 'teamd' need special handling; any other service
    name is a no-op.
    """
    run_on_dut = self.dut_connection.execCommand

    if service_name == 'bgp':
        for kill_cmd in ('docker exec -i bgp pkill -9 zebra',
                         'docker exec -i bgp pkill -9 bgpd'):
            run_on_dut(kill_cmd)
        return

    if service_name == 'teamd':
        run_on_dut('docker exec -i teamd pkill -USR1 teamd > /dev/null')
        return

    if service_name != 'swss':
        return

    # swss: run the orchagent restart check; a non-zero result is only
    # logged, it does not stop the restart.
    cmd = 'docker exec -i swss orchagent_restart_check -w 2000 -r 5'
    out, err, rc = run_on_dut(cmd)
    if rc == 0:
        return
    self.log('stdout from {}: {}'.format(cmd, str(out)))
    self.log('stderr from {}: {}'.format(cmd, str(err)))
    self.log('orchagent is not in clean state, RESTARTCHECK failed: {}'.format(rc))
1386+
13031387
def cmd(self, cmds):
13041388
process = subprocess.Popen(cmds,
13051389
shell=False,
@@ -1325,7 +1409,7 @@ def peer_state_check(self, ip, queue):
13251409
lacp_pdu_down_times and len(lacp_pdu_down_times) > 0 else None
13261410
lacp_pdu_after_reboot = float(lacp_pdu_up_times[0]) if\
13271411
lacp_pdu_up_times and len(lacp_pdu_up_times) > 0 else None
1328-
if 'warm-reboot' in self.reboot_type and lacp_pdu_before_reboot and lacp_pdu_after_reboot:
1412+
if ('warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type) and lacp_pdu_before_reboot and lacp_pdu_after_reboot:
13291413
lacp_time_diff = lacp_pdu_after_reboot - lacp_pdu_before_reboot
13301414
if lacp_time_diff >= 90 and not self.kvm_test:
13311415
self.fails['dut'].add("LACP session likely terminated by neighbor ({})".format(ip) +\

ansible/roles/test/files/ptftests/arista.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,8 @@ def run(self):
241241
log_data = self.parse_logs(log_lines)
242242
if (self.reboot_type == 'fast-reboot' and \
243243
any(k.startswith('BGP') for k in log_data) and any(k.startswith('PortChannel') for k in log_data)) \
244-
or (self.reboot_type == 'warm-reboot' and any(k.startswith('BGP') for k in log_data)):
244+
or (self.reboot_type == 'warm-reboot' and any(k.startswith('BGP') for k in log_data)) \
245+
or (self.reboot_type == 'service-warm-restart' and any(k.startswith('BGP') for k in log_data)):
245246
log_present = True
246247
break
247248
time.sleep(1) # wait until logs are populated
@@ -324,6 +325,8 @@ def parse_logs(self, data):
324325
return result
325326
elif self.reboot_type == 'warm-reboot' and initial_time_bgp == -1:
326327
return result
328+
elif self.reboot_type == 'service-warm-restart' and initial_time_bgp == -1:
329+
return result
327330

328331
for events in result_bgp.values():
329332
if events[-1][1] != 'Established':
@@ -592,7 +595,7 @@ def change_bgp_neigh_state(self, asn, is_up=True):
592595
if is_up == True:
593596
self.do_cmd('%s' % state[is_up])
594597
else:
595-
# shutdown BGP will pop confirm message, the message is
598+
# shutdown BGP will pop confirm message, the message is
596599
# "You are attempting to shutdown BGP. Are you sure you want to shutdown? [confirm]"
597600
self.do_cmd('%s' % state[is_up], prompt = '[confirm]')
598601
self.do_cmd('y')
@@ -707,6 +710,10 @@ def check_series_status(self, output, entity, what):
707710
self.fails.add("%s must be up when the test stops" % what)
708711
return 0, 0
709712

713+
if len(sorted_keys) == 1:
714+
# for service warm restart, the down count could be 0
715+
return 0, 0
716+
710717
start = sorted_keys[0]
711718
cur_state = True
712719
res = defaultdict(list)

tests/common/fixtures/advanced_reboot.py

+79-4
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs
4242
@param tbinfo: fixture provides information about testbed
4343
@param kwargs: extra parameters including reboot type
4444
'''
45-
assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType']) , (
45+
assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType'] or 'service-warm-restart' in kwargs['rebootType']) , (
4646
"Please set rebootType var."
4747
)
4848

@@ -90,6 +90,10 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs
9090

9191
self.__buildTestbedData(tbinfo)
9292

93+
if self.rebootType == 'service-warm-restart':
94+
assert hasattr(self, 'service_list')
95+
self.service_data = {}
96+
9397
def __extractTestParam(self):
9498
'''
9599
Extract test parameters from pytest request object. Note that all the parameters have default values.
@@ -106,6 +110,7 @@ def __extractTestParam(self):
106110
self.replaceFastRebootScript = self.request.config.getoption("--replace_fast_reboot_script")
107111
self.postRebootCheckScript = self.request.config.getoption("--post_reboot_check_script")
108112
self.bgpV4V6TimeDiff = self.request.config.getoption("--bgp_v4_v6_time_diff")
113+
self.new_docker_image = self.request.config.getoption("--new_docker_image")
109114

110115
# Set default reboot limit if it is not given
111116
if self.rebootLimit is None:
@@ -326,6 +331,35 @@ def __handleRebootImage(self):
326331
logger.info('Remove downloaded tempfile')
327332
self.duthost.shell('rm -f {}'.format(tempfile))
328333

334+
def __handleDockerImage(self):
    """Download the new docker image for each service and stage it on the DUT.

    Does nothing when no ``--new_docker_image`` URL was supplied. Otherwise,
    for every service in ``self.service_list`` it records the currently
    running image (id, name, tag), tries to fetch ``<url>/<image_name>.gz``
    to the local host and, on success, copies it to a temp file on the DUT.
    The per-service result is stored in ``self.service_data``;
    ``image_path_on_dut`` is ``None`` when no image could be downloaded.
    """
    if self.new_docker_image is None:
        return

    for service_name in self.service_list:
        data = {}
        docker_image_name = self.duthost.shell(
            "docker ps | grep {} | awk '{{print $2}}'".format(service_name))['stdout']
        # Raw string: the backslashes are passed through to the remote shell
        # verbatim and '\{' is not a valid Python escape sequence.
        cmd = r'docker images {} --format \{{\{{.ID\}}\}}'.format(docker_image_name)
        data['image_id'] = self.duthost.shell(cmd)['stdout']
        data['image_name'], data['image_tag'] = docker_image_name.split(':')

        local_image_path = '/tmp/{}.gz'.format(data['image_name'])
        logger.info('Downloading new docker image for {} to {}'.format(service_name, local_image_path))
        # curl prints only the HTTP status code on stdout (--write-out).
        output = self.localhost.shell(
            'curl --silent --write-out "%{{http_code}}" {0}/{1}.gz --output {2}'.format(
                self.new_docker_image, data['image_name'], local_image_path),
            module_ignore_errors=True)['stdout']
        if '404' not in output and os.path.exists(local_image_path):
            temp_file = self.duthost.shell('mktemp')['stdout']
            self.duthost.copy(src=local_image_path, dest=temp_file)
            data['image_path_on_dut'] = temp_file
            logger.info('Successfully downloaded docker image, will perform in-service upgrade')
        else:
            data['image_path_on_dut'] = None
            logger.info('Docker image not found, will perform in-service reboot')
        # Always drop the local download, including after a failed attempt:
        # a leftover file (e.g. the saved error body) would otherwise satisfy
        # the os.path.exists() check on a later run.
        self.localhost.shell('rm -f /tmp/{}.gz'.format(data['image_name']))
        self.service_data[service_name] = data

    logger.info('service data = {}'.format(json.dumps(self.service_data, indent=2)))
362+
329363
def __setupTestbed(self):
330364
'''
331365
Sets testbed up. It tranfers test data files, ARP responder, and runs script to update IPs and MAC addresses.
@@ -438,7 +472,10 @@ def imageInstall(self, prebootList=None, inbootList=None, prebootFiles=None):
438472
self.__setupTestbed()
439473

440474
# Download and install new sonic image
441-
self.__handleRebootImage()
475+
if self.rebootType != 'service-warm-restart':
476+
self.__handleRebootImage()
477+
else:
478+
self.__handleDockerImage()
442479

443480
# Handle mellanox platform
444481
self.__handleMellanoxDut()
@@ -586,7 +623,9 @@ def __runPtfRunner(self, rebootOper=None):
586623
"asic_type": self.duthost.facts["asic_type"],
587624
"allow_mac_jumping": self.allowMacJump,
588625
"preboot_files" : self.prebootFiles,
589-
"alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword")
626+
"alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword"),
627+
"service_list": None if self.rebootType != 'service-warm-restart' else self.service_list,
628+
"service_data": None if self.rebootType != 'service-warm-restart' else self.service_data,
590629
}
591630

592631
if not isinstance(rebootOper, SadOperation):
@@ -639,6 +678,36 @@ def __restorePrevImage(self):
639678
wait = self.readyTimeout
640679
)
641680

681+
def __restorePrevDockerImage(self):
    """Put the DUT back on the docker images recorded before the test.

    Services that were only warm-restarted (no image staged) just get warm
    restart disabled again. Services that were upgraded have the staged
    image file removed, the container stopped and deleted, one image that
    is not the recorded original removed, and the original image re-tagged.
    The DUT is cold-rebooted at the end.
    """
    run = self.duthost.shell

    for service_name, data in self.service_data.items():
        staged_image = data['image_path_on_dut']
        if staged_image is None:
            run('sudo config warm_restart disable {}'.format(service_name))
            continue

        # We don't use sonic-installer rollback-docker CLI here because:
        # 1. it does not remove the docker image which causes the running container still using the old image
        # 2. it runs service restart for some containers which enlarge the test duration
        run('rm -f {}'.format(staged_image))
        logger.info('Restore docker image for {}'.format(service_name))
        run('service {} stop'.format(service_name))
        run('docker rm {}'.format(service_name))

        original_id = data['image_id']
        listing = run('docker images {} --format \{{\{{.ID\}}\}}'.format(data['image_name']))
        # Remove only the first image whose id differs from the original.
        stale_id = next((found for found in listing['stdout_lines'] if found != original_id), None)
        if stale_id is not None:
            run('docker rmi -f {}'.format(stale_id))

        run('docker tag {} {}:{}'.format(original_id, data['image_name'], data['image_tag']))

    rebootDut(self.duthost, self.localhost, reboot_type='cold', wait=300)
710+
642711
def tearDown(self):
643712
'''
644713
Tears down test case. It also verifies that config_db.json exists.
@@ -658,7 +727,13 @@ def tearDown(self):
658727
self.__runScript([self.postRebootCheckScript], self.duthost)
659728

660729
if not self.stayInTargetImage:
661-
self.__restorePrevImage()
730+
logger.info('Restoring previous image')
731+
if self.rebootType != 'service-warm-restart':
732+
self.__restorePrevImage()
733+
else:
734+
self.__restorePrevDockerImage()
735+
else:
736+
logger.info('Stay in new image')
662737

663738
@pytest.fixture
664739
def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, tbinfo, creds):

tests/platform_tests/args/advanced_reboot_args.py

+16
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,22 @@ def add_advanced_reboot_args(parser):
6969
help="URL of new sonic image",
7070
)
7171

72+
parser.addoption(
73+
"--new_docker_image",
74+
action="store",
75+
type=str,
76+
default=None,
77+
help="URL of new docker image",
78+
)
79+
80+
parser.addoption(
81+
"--ignore_service",
82+
action="store",
83+
type=str,
84+
default=None,
85+
help="Services that ignore for warm restart test",
86+
)
87+
7288
parser.addoption(
7389
"--ready_timeout",
7490
action="store",

tests/platform_tests/test_advanced_reboot.py

-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
pytest.mark.topology('t0')
1414
]
1515

16-
1716
def pytest_generate_tests(metafunc):
1817
input_sad_cases = metafunc.config.getoption("sad_case_list")
1918
input_sad_list = list()

0 commit comments

Comments
 (0)