From d819b27428bd5f2daee4d0e38e7c14d2e5933fc9 Mon Sep 17 00:00:00 2001 From: Yun Li Date: Fri, 26 May 2023 11:24:01 +0800 Subject: [PATCH 1/6] Add health check probe for k8s upgrade containers --- .../Dockerfile.j2 | 1 + .../docker-config-engine-buster/Dockerfile.j2 | 1 + rules/docker-config-engine-bullseye.mk | 1 + rules/docker-config-engine-buster.mk | 1 + rules/sonic-ctrmgrd.mk | 4 +++ src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh | 25 +++++++++++++++++++ 6 files changed, 33 insertions(+) create mode 100644 src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh diff --git a/dockers/docker-config-engine-bullseye/Dockerfile.j2 b/dockers/docker-config-engine-bullseye/Dockerfile.j2 index 84c4df100ec5..56bc2e2d0f93 100644 --- a/dockers/docker-config-engine-bullseye/Dockerfile.j2 +++ b/dockers/docker-config-engine-bullseye/Dockerfile.j2 @@ -44,6 +44,7 @@ RUN pip3 install redis==4.5.4 # Copy files COPY ["files/swss_vars.j2", "/usr/share/sonic/templates/"] +COPY ["files/readiness_probe.sh", "/usr/bin/"] ## Clean up RUN apt-get purge -y \ diff --git a/dockers/docker-config-engine-buster/Dockerfile.j2 b/dockers/docker-config-engine-buster/Dockerfile.j2 index 9b8fefeed9f2..7c09f3678fb3 100644 --- a/dockers/docker-config-engine-buster/Dockerfile.j2 +++ b/dockers/docker-config-engine-buster/Dockerfile.j2 @@ -44,6 +44,7 @@ RUN pip3 install redis==4.5.4 # Copy files COPY ["files/swss_vars.j2", "/usr/share/sonic/templates/"] +COPY ["files/readiness_probe.sh", "/usr/bin/"] ## Clean up RUN apt-get purge -y \ diff --git a/rules/docker-config-engine-bullseye.mk b/rules/docker-config-engine-bullseye.mk index 9548391a6917..bd6957f2a8fb 100644 --- a/rules/docker-config-engine-bullseye.mk +++ b/rules/docker-config-engine-bullseye.mk @@ -19,6 +19,7 @@ $(DOCKER_CONFIG_ENGINE_BULLSEYE)_LOAD_DOCKERS += $(DOCKER_BASE_BULLSEYE) $(DOCKER_CONFIG_ENGINE_BULLSEYE)_FILES += $(SWSS_VARS_TEMPLATE) $(DOCKER_CONFIG_ENGINE_BULLSEYE)_FILES += $(RSYSLOG_PLUGIN_CONF_J2) $(DOCKER_CONFIG_ENGINE_BULLSEYE)_FILES += $($(SONIC_CTRMGRD)_CONTAINER_SCRIPT) +$(DOCKER_CONFIG_ENGINE_BUSTER)_FILES += $($(SONIC_CTRMGRD)_HEALTH_PROBE) $(DOCKER_CONFIG_ENGINE_BULLSEYE)_DBG_DEPENDS = $($(DOCKER_BASE_BULLSEYE)_DBG_DEPENDS) \ $(LIBSWSSCOMMON_DBG) \ diff --git a/rules/docker-config-engine-buster.mk b/rules/docker-config-engine-buster.mk index cd6d6f43b46d..fed475952ae6 100644 --- a/rules/docker-config-engine-buster.mk +++ b/rules/docker-config-engine-buster.mk @@ -18,6 +18,7 @@ $(DOCKER_CONFIG_ENGINE_BUSTER)_LOAD_DOCKERS += $(DOCKER_BASE_BUSTER) $(DOCKER_CONFIG_ENGINE_BUSTER)_FILES += $(SWSS_VARS_TEMPLATE) $(DOCKER_CONFIG_ENGINE_BUSTER)_FILES += $(RSYSLOG_PLUGIN_CONF_J2) $(DOCKER_CONFIG_ENGINE_BUSTER)_FILES += $($(SONIC_CTRMGRD)_CONTAINER_SCRIPT) +$(DOCKER_CONFIG_ENGINE_BUSTER)_FILES += $($(SONIC_CTRMGRD)_HEALTH_PROBE) $(DOCKER_CONFIG_ENGINE_BUSTER)_DBG_DEPENDS = $($(DOCKER_BASE_BUSTER)_DBG_DEPENDS) \ $(LIBSWSSCOMMON_DBG) \ diff --git a/rules/sonic-ctrmgrd.mk b/rules/sonic-ctrmgrd.mk index 659a2cf4ace1..167d78c43c88 100644 --- a/rules/sonic-ctrmgrd.mk +++ b/rules/sonic-ctrmgrd.mk @@ -20,12 +20,16 @@ $($(SONIC_CTRMGRD)_CFG_JSON)_PATH = $($(SONIC_CTRMGRD)_FILES_PATH) $(SONIC_CTRMGRD)_SERVICE = ctrmgrd.service $($(SONIC_CTRMGRD)_SERVICE)_PATH = $($(SONIC_CTRMGRD)_FILES_PATH) +$(SONIC_CTRMGRD)_HEALTH_PROBE = readiness_probe.sh +$($(SONIC_CTRMGRD)_HEALTH_PROBE)_PATH = $($(SONIC_CTRMGRD)_FILES_PATH) + SONIC_PYTHON_WHEELS += $(SONIC_CTRMGRD) $(SONIC_CTRMGRD)_FILES = $($(SONIC_CTRMGRD)_CONTAINER_SCRIPT) $(SONIC_CTRMGRD)_FILES += $($(SONIC_CTRMGRD)_STARTUP_SCRIPT) $(SONIC_CTRMGRD)_FILES += $($(SONIC_CTRMGRD)_CFG_JSON) $(SONIC_CTRMGRD)_FILES += $($(SONIC_CTRMGRD)_SERVICE) +$(SONIC_CTRMGRD)_FILES += $($(SONIC_CTRMGRD)_HEALTH_PROBE) SONIC_COPY_FILES += $($(SONIC_CTRMGRD)_FILES) diff --git a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh new file mode 100644 index 000000000000..d39286504187 --- /dev/null +++ b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# This script is used to check the readiness of containers +# Check if the container is ready or not + +#### exit code +# 0: ready +# 1: python script crach exit code +# 2: supervisor start service doesn't exit normally +#### other: return code of post_check_script, define in the script, should not include 1,2 + +if [ $(supervisorctl status start |awk '{print $2}') != 'EXITED' ]; then + exit 2 +fi + +# feature owner can add their own readiness check script +post_check_script="/usr/bin/readiness_probe.py" +if [ -f $post_check_script ]; then + python3 $post_check_script + check_result=$? + if [ $check_result != 0 ]; then + exit $check_result + fi +fi + +exit 0 From 4b53a1293e942a0c1a931b7a2633920abbd2523e Mon Sep 17 00:00:00 2001 From: Yun Li Date: Fri, 26 May 2023 13:29:42 +0800 Subject: [PATCH 2/6] Update --- rules/docker-config-engine-bullseye.mk | 2 +- src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/rules/docker-config-engine-bullseye.mk b/rules/docker-config-engine-bullseye.mk index bd6957f2a8fb..f587160b94a2 100644 --- a/rules/docker-config-engine-bullseye.mk +++ b/rules/docker-config-engine-bullseye.mk @@ -19,7 +19,7 @@ $(DOCKER_CONFIG_ENGINE_BULLSEYE)_LOAD_DOCKERS += $(DOCKER_BASE_BULLSEYE) $(DOCKER_CONFIG_ENGINE_BULLSEYE)_FILES += $(SWSS_VARS_TEMPLATE) $(DOCKER_CONFIG_ENGINE_BULLSEYE)_FILES += $(RSYSLOG_PLUGIN_CONF_J2) $(DOCKER_CONFIG_ENGINE_BULLSEYE)_FILES += $($(SONIC_CTRMGRD)_CONTAINER_SCRIPT) -$(DOCKER_CONFIG_ENGINE_BUSTER)_FILES += $($(SONIC_CTRMGRD)_HEALTH_PROBE) +$(DOCKER_CONFIG_ENGINE_BULLSEYE)_FILES += $($(SONIC_CTRMGRD)_HEALTH_PROBE) $(DOCKER_CONFIG_ENGINE_BULLSEYE)_DBG_DEPENDS = $($(DOCKER_BASE_BULLSEYE)_DBG_DEPENDS) \ $(LIBSWSSCOMMON_DBG) \ diff --git a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh index d39286504187..ae71fe2e1bcf 100644 --- a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh +++ b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh @@ -8,7 +8,14 @@ # 2: supervisor start service doesn't exit normally #### other: return code of post_check_script, define in the script, should not include 1,2 -if [ $(supervisorctl status start |awk '{print $2}') != 'EXITED' ]; then +# supervisorctl exit code +# 0: Running +# 3: Exited +# 4: ERROR (no such process) +pre_check_service_name="start" +supervisorctl status |awk '{print $1}' |grep -w $pre_check_service_name > /dev/null +start_check_result=$? +if [ $start_check_result = 0 ] && [ $(supervisorctl status $pre_check_service_name |awk '{print $2}') != 'EXITED' ]; then exit 2 fi @@ -16,9 +23,9 @@ fi post_check_script="/usr/bin/readiness_probe.py" if [ -f $post_check_script ]; then python3 $post_check_script - check_result=$? - if [ $check_result != 0 ]; then - exit $check_result + post_check_result=$? + if [ $post_check_result != 0 ]; then + exit $post_check_result fi fi From a74b4083381ca063ac1ba25d5b356bb67260b763 Mon Sep 17 00:00:00 2001 From: Yun Li Date: Fri, 26 May 2023 17:13:36 +0800 Subject: [PATCH 3/6] Update comments --- src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh index ae71fe2e1bcf..52b7c4af0afc 100644 --- a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh +++ b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh @@ -1,17 +1,16 @@ #!/bin/bash -# This script is used to check the readiness of containers -# Check if the container is ready or not +# This script is used by k8s to check the readiness of containers +# Check if the container is readiness or not, exit code 0 means readiness, others mean not readiness -#### exit code -# 0: ready +#### exit code contract, k8s only cares zero or not none-zero, but we want to use none-zero code to indicate different error +# 0: readiness # 1: python script crach exit code # 2: supervisor start service doesn't exit normally -#### other: return code of post_check_script, define in the script, should not include 1,2 +# other exit code: returned by post_check_script, define in the post_check_script, should not include 1,2 -# supervisorctl exit code -# 0: Running -# 3: Exited -# 4: ERROR (no such process) +# check if the start service exists +# if the start service exists, check if it exits normally +# if the start service doesn't exist normally, exit with code 2 pre_check_service_name="start" supervisorctl status |awk '{print $1}' |grep -w $pre_check_service_name > /dev/null start_check_result=$? @@ -20,6 +19,9 @@ if [ $start_check_result = 0 ] && [ $(supervisorctl status $pre_check_service_na fi # feature owner can add their own readiness check script +# check if the post_check_script exists +# if the post_check_script exists, run it +# if the post_check_script exits with non-zero code, exit with the code post_check_script="/usr/bin/readiness_probe.py" if [ -f $post_check_script ]; then python3 $post_check_script From 28ae09bed6c38597e75bef189650e5173d982448 Mon Sep 17 00:00:00 2001 From: Yun Li Date: Fri, 23 Jun 2023 00:13:30 +0800 Subject: [PATCH 4/6] Fix typo issue --- src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh index 52b7c4af0afc..da38bfc38740 100644 --- a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh +++ b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh @@ -4,7 +4,7 @@ #### exit code contract, k8s only cares zero or not none-zero, but we want to use none-zero code to indicate different error # 0: readiness -# 1: python script crach exit code +# 1: python script crash exit code # 2: supervisor start service doesn't exit normally # other exit code: returned by post_check_script, define in the post_check_script, should not include 1,2 From 30b4c0644016bd7fb119cd298e79b4bd2c7bd0f8 Mon Sep 17 00:00:00 2001 From: Yun Li Date: Mon, 3 Jul 2023 18:52:04 +0800 Subject: [PATCH 5/6] Update the hook script to an executable file --- src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh index da38bfc38740..bb7a1fcec44e 100644 --- a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh +++ b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh @@ -4,7 +4,7 @@ #### exit code contract, k8s only cares zero or not none-zero, but we want to use none-zero code to indicate different error # 0: readiness -# 1: python script crash exit code +# 1: if the hook script is python code, the default crash exit code is 1 # 2: supervisor start service doesn't exit normally # other exit code: returned by post_check_script, define in the post_check_script, should not include 1,2 @@ -22,9 +22,9 @@ fi # check if the post_check_script exists # if the post_check_script exists, run it # if the post_check_script exits with non-zero code, exit with the code -post_check_script="/usr/bin/readiness_probe.py" -if [ -f $post_check_script ]; then - python3 $post_check_script +post_check_script="/usr/bin/readiness_probe_hook" +if [ -x $post_check_script ]; then + $post_check_script post_check_result=$? if [ $post_check_result != 0 ]; then exit $post_check_result From 353c0c005951dec419cb2987e8c34809d4c5135d Mon Sep 17 00:00:00 2001 From: Yun Li Date: Tue, 4 Jul 2023 13:50:58 +0800 Subject: [PATCH 6/6] Update check start service logic --- src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh index bb7a1fcec44e..9e796ca03816 100644 --- a/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh +++ b/src/sonic-ctrmgrd/ctrmgr/readiness_probe.sh @@ -9,12 +9,13 @@ # other exit code: returned by post_check_script, define in the post_check_script, should not include 1,2 # check if the start service exists +# if the start service doesn't exist, do nothing # if the start service exists, check if it exits normally -# if the start service doesn't exist normally, exit with code 2 +# if the start service doesn't exit normally, exit with code 2 pre_check_service_name="start" -supervisorctl status |awk '{print $1}' |grep -w $pre_check_service_name > /dev/null -start_check_result=$? -if [ $start_check_result = 0 ] && [ $(supervisorctl status $pre_check_service_name |awk '{print $2}') != 'EXITED' ]; then +no_process_string="ERROR (no such process)" +service_status=$(supervisorctl status $pre_check_service_name) +if [[ $service_status != *"$no_process_string"* ]] && [[ $(echo $service_status |awk '{print $2}') != 'EXITED' ]]; then exit 2 fi