Skip to content

Commit 8810864

Browse files
jipanyanglguohan
authored andcommitted
[warm-reboot] add docker upgrade --warm option and roll back support (#559)
* [warm-reboot] add docker upgrade --warm option and roll back support Signed-off-by: Jipan Yang <[email protected]> * load docker image before disruptive operations to shorten control plane frozen time. Signed-off-by: Jipan Yang <[email protected]>
1 parent 0fe279f commit 8810864

File tree

1 file changed

+165
-62
lines changed

1 file changed

+165
-62
lines changed

sonic_installer/main.py

+165-62
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import time
99
import click
1010
import urllib
11+
import syslog
1112
import subprocess
1213
from swsssdk import ConfigDBConnector
1314
from swsssdk import SonicV2Connector
@@ -265,6 +266,37 @@ def abort_if_false(ctx, param, value):
265266
if not value:
266267
ctx.abort()
267268

269+
def get_container_image_name(container_name):
    """Return the image name (without tag) a container was created from.

    Runs ``docker inspect`` on ``container_name`` to obtain the image
    reference recorded in the container config (for example
    ``docker-lldp-sv2:latest``) and returns the part before the first ':'
    (``docker-lldp-sv2``).

    Exits the process with docker's return code if ``docker inspect`` fails.
    """
    # example image: docker-lldp-sv2:latest
    cmd = "docker inspect --format '{{.Config.Image}}' " + container_name
    # universal_newlines=True makes the pipe yield text (not bytes) so the
    # string handling below behaves the same on Python 2 and Python 3.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True,
                            universal_newlines=True)
    out, _ = proc.communicate()
    if proc.returncode != 0:
        sys.exit(proc.returncode)
    image_latest = out.rstrip()

    # example image_name: docker-lldp-sv2
    # Split in Python rather than piping through `echo | cut`: no extra
    # shell is spawned, no unreaped child is left behind, and docker's
    # output is never re-interpreted by a shell.
    return image_latest.partition(':')[0]
283+
284+
def get_container_image_id(image_tag):
    """Return the docker image ID for a tagged image reference.

    ``image_tag`` is an image reference including its tag, like
    'docker-teamd:latest'. The result is the output of
    ``docker images --format '{{.ID}}' <image_tag>`` with trailing
    whitespace removed; it is an empty string when docker prints nothing.
    """
    # TODO: extract common docker info fetching functions
    # this is image_id for image with tag, like 'docker-teamd:latest'
    cmd = "docker images --format '{{.ID}}' " + image_tag
    # communicate() drains the pipe and waits for the child, so the process
    # is reaped and its pipe closed; universal_newlines=True yields text on
    # both Python 2 and Python 3.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True,
                            universal_newlines=True)
    out, _ = proc.communicate()
    return out.rstrip()
291+
292+
def get_container_image_id_all(image_name):
    """Return the set of docker image IDs listed for an image name.

    ``image_name`` is a repository name without a tag, like 'docker-teamd'.
    Each output line of ``docker images --format '{{.ID}}' <image_name>``
    becomes one element; duplicate IDs collapse into a single entry.
    """
    # All images id under the image name like 'docker-teamd'
    cmd = "docker images --format '{{.ID}}' " + image_name
    # communicate() drains the pipe and waits for the child, so the process
    # is reaped and its pipe closed; universal_newlines=True yields text on
    # both Python 2 and Python 3.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True,
                            universal_newlines=True)
    out, _ = proc.communicate()
    return set(out.splitlines())
268300

269301
# Main entrypoint
270302
@click.group()
@@ -433,27 +465,19 @@ def cleanup():
433465
@cli.command()
434466
@click.option('-y', '--yes', is_flag=True, callback=abort_if_false,
435467
expose_value=False, prompt='New docker image will be installed, continue?')
436-
@click.option('--cleanup_image', is_flag=True, help="Clean up old docker image(s)")
437-
@click.option('--enforce_check', is_flag=True, help="Enforce pending task check for docker upgrade")
468+
@click.option('--cleanup_image', is_flag=True, help="Clean up old docker image")
469+
@click.option('--skip_check', is_flag=True, help="Skip task check for docker upgrade")
438470
@click.option('--tag', type=str, help="Tag for the new docker image")
471+
@click.option('--warm', is_flag=True, help="Perform warm upgrade")
439472
@click.argument('container_name', metavar='<container_name>', required=True,
440-
type=click.Choice(["swss", "snmp", "lldp", "bgp", "pmon", "dhcp_relay", "telemetry", "teamd"]))
473+
type=click.Choice(["swss", "snmp", "lldp", "bgp", "pmon", "dhcp_relay", "telemetry", "teamd", "radv", "amon"]))
441474
@click.argument('url')
442-
def upgrade_docker(container_name, url, cleanup_image, enforce_check, tag):
475+
def upgrade_docker(container_name, url, cleanup_image, skip_check, tag, warm):
443476
""" Upgrade docker image from local binary or URL"""
444477

445-
# example image: docker-lldp-sv2:latest
446-
cmd = "docker inspect --format '{{.Config.Image}}' " + container_name
447-
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
448-
(out, err) = proc.communicate()
449-
if proc.returncode != 0:
450-
sys.exit(proc.returncode)
451-
image_latest = out.rstrip()
452-
453-
# example image_name: docker-lldp-sv2
454-
cmd = "echo " + image_latest + " | cut -d ':' -f 1"
455-
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
456-
image_name = proc.stdout.read().rstrip()
478+
image_name = get_container_image_name(container_name)
479+
image_latest = image_name + ":latest"
480+
image_id_previous = get_container_image_id(image_latest)
457481

458482
DEFAULT_IMAGE_PATH = os.path.join("/tmp/", image_name)
459483
if url.startswith('http://') or url.startswith('https://'):
@@ -474,87 +498,166 @@ def upgrade_docker(container_name, url, cleanup_image, enforce_check, tag):
474498
click.echo("Image file '{}' does not exist or is not a regular file. Aborting...".format(image_path))
475499
raise click.Abort()
476500

477-
warm = False
501+
warm_configured = False
478502
# warm restart enable/disable config is put in stateDB, not persistent across cold reboot, not saved to config_DB.json file
479503
state_db = SonicV2Connector(host='127.0.0.1')
480504
state_db.connect(state_db.STATE_DB, False)
481505
TABLE_NAME_SEPARATOR = '|'
482506
prefix = 'WARM_RESTART_ENABLE_TABLE' + TABLE_NAME_SEPARATOR
483507
_hash = '{}{}'.format(prefix, container_name)
484508
if state_db.get(state_db.STATE_DB, _hash, "enable") == "true":
485-
warm = True
509+
warm_configured = True
486510
state_db.close(state_db.STATE_DB)
487511

512+
if container_name == "swss" or container_name == "bgp" or container_name == "teamd":
513+
if warm_configured == False and warm:
514+
run_command("config warm_restart enable %s" % container_name)
515+
516+
# Fetch tag of current running image
517+
tag_previous = get_docker_tag_name(image_latest)
518+
# Load the new image beforehand to shorten disruption time
519+
run_command("docker load < %s" % image_path)
520+
warm_app_names = []
488521
# warm restart specific processing for swss, bgp and teamd dockers.
489-
if warm == True:
522+
if warm_configured == True or warm:
490523
# make sure orchagent is in clean state if swss is to be upgraded
491524
if container_name == "swss":
492-
skipPendingTaskCheck = " -s"
493-
if enforce_check:
494-
skipPendingTaskCheck = ""
495-
496-
cmd = "docker exec -i swss orchagent_restart_check -w 1000 " + skipPendingTaskCheck
497-
for i in range(1, 6):
498-
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
499-
(out, err) = proc.communicate()
500-
if proc.returncode != 0:
501-
if enforce_check:
502-
click.echo("Orchagent is not in clean state, RESTARTCHECK failed {}".format(i))
503-
if i == 5:
504-
sys.exit(proc.returncode)
505-
else:
506-
click.echo("Orchagent is not in clean state, upgrading it anyway")
507-
break
525+
skipPendingTaskCheck = ""
526+
if skip_check:
527+
skipPendingTaskCheck = " -s"
528+
529+
cmd = "docker exec -i swss orchagent_restart_check -w 2000 -r 5 " + skipPendingTaskCheck
530+
531+
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
532+
(out, err) = proc.communicate()
533+
if proc.returncode != 0:
534+
if not skip_check:
535+
click.echo("Orchagent is not in clean state, RESTARTCHECK failed")
536+
# Restore original config before exit
537+
if warm_configured == False and warm:
538+
run_command("config warm_restart disable %s" % container_name)
539+
# Clean the image loaded earlier
540+
image_id_latest = get_container_image_id(image_latest)
541+
run_command("docker rmi -f %s" % image_id_latest)
542+
# Re-point latest tag to previous tag
543+
run_command("docker tag %s:%s %s" % (image_name, tag_previous, image_latest))
544+
545+
sys.exit(proc.returncode)
508546
else:
509-
click.echo("Orchagent is in clean state and frozen for warm upgrade")
510-
break
511-
run_command("sleep 1")
547+
click.echo("Orchagent is not in clean state, upgrading it anyway")
548+
else:
549+
click.echo("Orchagent is in clean state and frozen for warm upgrade")
550+
551+
warm_app_names = ["orchagent", "neighsyncd"]
512552

513553
elif container_name == "bgp":
514554
# Kill bgpd to restart the bgp graceful restart procedure
515555
click.echo("Stopping bgp ...")
516556
run_command("docker exec -i bgp pkill -9 zebra")
517557
run_command("docker exec -i bgp pkill -9 bgpd")
518-
run_command("sleep 2") # wait 2 seconds for bgp to settle down
558+
warm_app_names = ["bgp"]
519559
click.echo("Stopped bgp ...")
520560

521561
elif container_name == "teamd":
522562
click.echo("Stopping teamd ...")
523563
# Send USR1 signal to all teamd instances to stop them
524564
# It will prepare teamd for warm-reboot
525565
run_command("docker exec -i teamd pkill -USR1 teamd > /dev/null")
526-
run_command("sleep 2") # wait 2 seconds for teamd to settle down
566+
warm_app_names = ["teamsyncd"]
527567
click.echo("Stopped teamd ...")
528568

529-
run_command("systemctl stop %s" % container_name)
569+
# clean app reconciliation state from last warm start if it exists
570+
for warm_app_name in warm_app_names:
571+
cmd = "docker exec -i database redis-cli -n 6 hdel 'WARM_RESTART_TABLE|" + warm_app_name + "' state"
572+
run_command(cmd)
573+
574+
run_command("docker kill %s > /dev/null" % container_name)
530575
run_command("docker rm %s " % container_name)
531-
run_command("docker rmi %s " % image_latest)
532-
run_command("docker load < %s" % image_path)
533576
if tag == None:
534577
# example image: docker-lldp-sv2:latest
535578
tag = get_docker_tag_name(image_latest)
536579
run_command("docker tag %s:latest %s:%s" % (image_name, image_name, tag))
537580
run_command("systemctl restart %s" % container_name)
538581

539-
# Clean up old docker images
540-
if cleanup_image:
541-
# All images id under the image name
542-
cmd = "docker images --format '{{.ID}}' " + image_name
543-
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
544-
image_id_all = proc.stdout.read()
545-
image_id_all = image_id_all.splitlines()
546-
image_id_all = set(image_id_all)
547-
548-
# this is image_id for image with "latest" tag
549-
cmd = "docker images --format '{{.ID}}' " + image_latest
550-
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
551-
image_id_latest = proc.stdout.read().rstrip()
552-
553-
for id in image_id_all:
554-
if id != image_id_latest:
555-
run_command("docker rmi -f %s" % id)
556-
557-
run_command("sleep 5") # wait 5 seconds for application to sync
582+
# All images id under the image name
583+
image_id_all = get_container_image_id_all(image_name)
584+
585+
# this is image_id for image with "latest" tag
586+
image_id_latest = get_container_image_id(image_latest)
587+
588+
for id in image_id_all:
589+
if id != image_id_latest:
590+
# Unless requested, the previous docker image will be preserved
591+
if not cleanup_image and id == image_id_previous:
592+
continue
593+
run_command("docker rmi -f %s" % id)
594+
595+
exp_state = "reconciled"
596+
state = ""
597+
# post warm restart specific processing for swss, bgp and teamd dockers, wait for reconciliation state.
598+
if warm_configured == True or warm:
599+
count = 0
600+
for warm_app_name in warm_app_names:
601+
state = ""
602+
cmd = "docker exec -i database redis-cli -n 6 hget 'WARM_RESTART_TABLE|" + warm_app_name + "' state"
603+
# Wait up to 180 seconds for reconciled state
604+
while state != exp_state and count < 90:
605+
sys.stdout.write("\r {}: ".format(warm_app_name))
606+
sys.stdout.write("[%-s" % ('='*count))
607+
sys.stdout.flush()
608+
count += 1
609+
time.sleep(2)
610+
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
611+
state = proc.stdout.read().rstrip()
612+
syslog.syslog("%s reached %s state"%(warm_app_name, state))
613+
sys.stdout.write("]\n\r")
614+
if state != exp_state:
615+
click.echo("%s failed to reach %s state"%(warm_app_name, exp_state))
616+
syslog.syslog(syslog.LOG_ERR, "%s failed to reach %s state"%(warm_app_name, exp_state))
617+
else:
618+
exp_state = "" # this is cold upgrade
619+
620+
# Restore to previous cold restart setting
621+
if warm_configured == False and warm:
622+
if container_name == "swss" or container_name == "bgp" or container_name == "teamd":
623+
run_command("config warm_restart disable %s" % container_name)
624+
625+
if state == exp_state:
626+
click.echo('Done')
627+
else:
628+
click.echo('Failed')
629+
sys.exit(1)
630+
631+
# rollback docker image
632+
@cli.command()
@click.option('-y', '--yes', is_flag=True, callback=abort_if_false,
              expose_value=False, prompt='Docker image will be rolled back, continue?')
@click.argument('container_name', metavar='<container_name>', required=True,
                type=click.Choice(["swss", "snmp", "lldp", "bgp", "pmon", "dhcp_relay", "telemetry", "teamd", "radv", "amon"]))
def rollback_docker(container_name):
    """ Rollback docker image to previous version"""
    image_name = get_container_image_name(container_name)

    # Rollback needs exactly two distinct image IDs under this image name:
    # the one currently tagged "latest" and the one to roll back to.
    known_image_ids = get_container_image_id_all(image_name)
    image_count = len(known_image_ids)
    if image_count != 2:
        click.echo("Two images required, but there are '{}' images for '{}'. Aborting...".format(image_count, image_name))
        raise click.Abort()

    # ID of the image that currently carries the "latest" tag.
    current_image_id = get_container_image_id(image_name + ":latest")

    # Look up the tag of the image that is not the current "latest" one.
    version_tag = ""
    for image_id in known_image_ids:
        if image_id == current_image_id:
            continue
        version_tag = get_docker_tag_name(image_id)

    # Re-point the "latest" tag at the other image.
    run_command("docker tag %s:%s %s:latest" % (image_name, version_tag, image_name))

    if container_name in ("swss", "bgp", "teamd"):
        # These containers are not restarted here; the operator is told a
        # cold reboot is required instead.
        click.echo("Cold reboot is required to restore system state after '{}' rollback !!".format(container_name))
    else:
        run_command("systemctl restart %s" % container_name)

    click.echo('Done')
559662

560663
if __name__ == '__main__':

0 commit comments

Comments
 (0)