Skip to content

Commit e1440f0

Browse files
authored
Improve feature mode switch process (#12188)
* Fix kube mode to local mode long duration issue * Remove IPV6 parameters which are not necessary * Fix read node labels bug * Tag the running image to latest if it's stable * Disable image_version_higher check * Change image_version_higher checker test case Signed-off-by: Yun Li <[email protected]>
1 parent a31a4e7 commit e1440f0

10 files changed

+165
-51
lines changed

src/sonic-ctrmgrd/.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ tests/__pycache__/
1010
ctrmgr/__pycache__/
1111
venv
1212
tests/.coverage*
13+
.pytest_cache/

src/sonic-ctrmgrd/ctrmgr/container

+20-5
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ STATE = "state"
3030

3131
KUBE_LABEL_TABLE = "KUBE_LABELS"
3232
KUBE_LABEL_SET_KEY = "SET"
33+
SERVER_TABLE = "KUBERNETES_MASTER"
34+
SERVER_KEY = "SERVER"
35+
ST_SER_CONNECTED = "connected"
36+
ST_SER_UPDATE_TS = "update_time"
3337

3438
# Get seconds to wait for remote docker to start.
3539
# If not, revert to local
@@ -75,8 +79,10 @@ def read_data(is_config, feature, fields):
7579
ret = []
7680

7781
db = cfg_db if is_config else state_db
78-
79-
tbl = swsscommon.Table(db, FEATURE_TABLE)
82+
if feature == SERVER_KEY:
83+
tbl = swsscommon.Table(db, SERVER_TABLE)
84+
else:
85+
tbl = swsscommon.Table(db, FEATURE_TABLE)
8086

8187
data = dict(tbl.get(feature)[1])
8288
for (field, default) in fields:
@@ -104,6 +110,13 @@ def read_state(feature):
104110
[(CURRENT_OWNER, "none"), (REMOTE_STATE, "none"), (CONTAINER_ID, "")])
105111

106112

113+
# Reads the SERVER entry rather than a feature entry: read_data() is called
# with is_config=False (so it uses state_db) and with SERVER_KEY, which makes
# it open SERVER_TABLE instead of FEATURE_TABLE.
def read_server_state():
114+
""" Read the Kubernetes server's "connected" and "update_time" state fields """
115+
116+
# Defaults used when a field is absent: "false" for connected, "" for update_time.
# The caller in container_start() unpacks the result as two values, in this field order.
return read_data(False, SERVER_KEY,
117+
[(ST_SER_CONNECTED, "false"), (ST_SER_UPDATE_TS, "")])
118+
119+
107120
def docker_action(action, feature, **kwargs):
108121
""" Execute docker action """
109122
try:
@@ -192,9 +205,10 @@ def container_start(feature, **kwargs):
192205

193206
set_owner, fallback, _ = read_config(feature)
194207
_, remote_state, _ = read_state(feature)
208+
server_connected, _ = read_server_state()
195209

196-
debug_msg("{}: set_owner:{} fallback:{} remote_state:{}".format(
197-
feature, set_owner, fallback, remote_state))
210+
debug_msg("{}: set_owner:{} fallback:{} remote_state:{} server_connected:{}".format(
211+
feature, set_owner, fallback, remote_state, server_connected))
198212

199213
data = {
200214
SYSTEM_STATE: "up",
@@ -207,8 +221,9 @@ def container_start(feature, **kwargs):
207221
start_val = START_LOCAL
208222
else:
209223
start_val = START_KUBE
210-
if fallback and (remote_state == "none"):
224+
if fallback and (remote_state == "none" or server_connected == "false"):
211225
start_val |= START_LOCAL
226+
data[REMOTE_STATE] = "none"
212227

213228
if start_val == START_LOCAL:
214229
# Implies *only* local.

src/sonic-ctrmgrd/ctrmgr/container_startup.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -232,14 +232,14 @@ def container_up(feature, owner, version):
232232
do_freeze(feature, "This version is marked disabled. Exiting ...")
233233
return
234234

235-
if not instance_higher(feature, state_data[VERSION], version):
236-
# TODO: May Remove label <feature_name>_<version>_enabled
237-
# Else kubelet will continue to re-deploy every 5 mins, until
238-
# master removes the lable to un-deploy.
239-
#
240-
do_freeze(feature, "bail out as current deploy version {} is not higher".
241-
format(version))
242-
return
235+
# if not instance_higher(feature, state_data[VERSION], version):
236+
# # TODO: May Remove label <feature_name>_<version>_enabled
237+
# # Else kubelet will continue to re-deploy every 5 mins, until
238+
# # master removes the label to un-deploy.
239+
# #
240+
# do_freeze(feature, "bail out as current deploy version {} is not higher".
241+
# format(version))
242+
# return
243243

244244
update_data(state_db, feature, { VERSION: version })
245245

src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py

+70-5
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
CFG_SER_IP: "",
6161
CFG_SER_PORT: "6443",
6262
CFG_SER_DISABLE: "false",
63-
CFG_SER_INSECURE: "false"
63+
CFG_SER_INSECURE: "true"
6464
}
6565

6666
dflt_st_ser = {
@@ -88,18 +88,20 @@
8888
JOIN_LATENCY = "join_latency_on_boot_seconds"
8989
JOIN_RETRY = "retry_join_interval_seconds"
9090
LABEL_RETRY = "retry_labels_update_seconds"
91+
TAG_IMAGE_LATEST = "tag_latest_image_on_wait_seconds"
9192
USE_K8S_PROXY = "use_k8s_as_http_proxy"
9293

9394
remote_ctr_config = {
9495
JOIN_LATENCY: 10,
9596
JOIN_RETRY: 10,
9697
LABEL_RETRY: 2,
98+
TAG_IMAGE_LATEST: 30,
9799
USE_K8S_PROXY: ""
98100
}
99101

100102
def log_debug(m):
101103
msg = "{}: {}".format(inspect.stack()[1][3], m)
102-
print(msg)
104+
#print(msg)
103105
syslog.syslog(syslog.LOG_DEBUG, msg)
104106

105107

@@ -148,6 +150,8 @@ def init():
148150
with open(SONIC_CTR_CONFIG, "r") as s:
149151
d = json.load(s)
150152
remote_ctr_config.update(d)
153+
if UNIT_TESTING:
154+
remote_ctr_config[TAG_IMAGE_LATEST] = 0
151155

152156

153157
class MainServer:
@@ -172,11 +176,11 @@ def register_db(self, db_name):
172176
self.db_connectors[db_name] = swsscommon.DBConnector(db_name, 0)
173177

174178

175-
def register_timer(self, ts, handler):
179+
def register_timer(self, ts, handler, args=()):
176180
""" Register timer based handler.
177181
The handler will be called on/after give timestamp, ts
178182
"""
179-
self.timer_handlers[ts].append(handler)
183+
self.timer_handlers[ts].append((handler, args))
180184

181185

182186
def register_handler(self, db_name, table_name, handler):
@@ -235,7 +239,7 @@ def run(self):
235239
lst = self.timer_handlers[k]
236240
del self.timer_handlers[k]
237241
for fn in lst:
238-
fn()
242+
fn[0](*fn[1])
239243
else:
240244
timeout = (k - ct_ts).seconds
241245
break
@@ -426,6 +430,54 @@ def do_join(self, ip, port, insecure):
426430
format(remote_ctr_config[JOIN_RETRY], self.start_time))
427431

428432

433+
# Tag the image behind a feature's container as "<repo>:latest".
#
# Arguments: the server object (only used for mod_db_entry), the feature name,
# the container id and the image version (the version is used in log text only).
# Returns 0 on success and a non-zero value otherwise: res starts at 1 and is
# only overwritten by the `docker tag` exit status or by the unit-test branch.
#
# NOTE(review): this view has lost the original indentation, so the exact
# nesting of the branches below cannot be confirmed from here.
# NOTE(review): every command is built with str.format and run through a
# shell; docker_id and feat originate from STATE-DB data - confirm they are
# trusted before relying on this.
def tag_latest_image(server, feat, docker_id, image_ver):
434+
res = 1
435+
# Real run: drive the docker CLI through the shell.
if not UNIT_TESTING:
436+
# Exit status 0 means grep found docker_id in the `docker ps` listing.
status = os.system("docker ps |grep {} >/dev/null".format(docker_id))
437+
if status:
438+
syslog.syslog(syslog.LOG_ERR,
439+
"Feature {}:{} is not stable".format(feat, image_ver))
440+
else:
441+
image_item = os.popen("docker inspect {} |jq -r .[].Image".format(docker_id)).read().strip()
442+
if image_item:
443+
# Keep the text after the first ':' and truncate it to 12 characters.
# NOTE(review): raises IndexError if the inspect output contains no ':'
# - confirm the value always has an "<algo>:<digest>" form.
image_id = image_item.split(":")[1][:12]
444+
# The first whitespace-separated token of the grep output is used as the repository name.
image_info = os.popen("docker images |grep {}".format(image_id)).read().split()
445+
if image_info:
446+
image_rep = image_info[0]
447+
res = os.system("docker tag {} {}:latest".format(image_id, image_rep))
448+
if res != 0:
449+
syslog.syslog(syslog.LOG_ERR,
450+
"Failed to tag {}:{} to latest".format(image_rep, image_ver))
451+
else:
452+
syslog.syslog(syslog.LOG_INFO,
453+
"Successfully tag {}:{} to latest".format(image_rep, image_ver))
454+
# Look at the container named after the feature itself (not docker_id):
# it is stopped if it reports running, and removed.
# NOTE(review): whether `docker rm` sits under the == 'true' test or only
# under `if feat_status` is not recoverable from this view - verify.
feat_status = os.popen("docker inspect {} |jq -r .[].State.Running".format(feat)).read().strip()
455+
if feat_status:
456+
if feat_status == 'true':
457+
os.system("docker stop {}".format(feat))
458+
syslog.syslog(syslog.LOG_ERR,
459+
"{} should not run, stop it".format(feat))
460+
os.system("docker rm {}".format(feat))
461+
syslog.syslog(syslog.LOG_INFO,
462+
"Delete previous {} container".format(feat))
463+
else:
464+
syslog.syslog(syslog.LOG_ERR,
465+
"Failed to docker images |grep {} to get image repo".format(image_id))
466+
else:
467+
syslog.syslog(syslog.LOG_ERR,
468+
"Failed to inspect container:{} to get image id".format(docker_id))
469+
# Under UNIT_TESTING no docker command is run; a "tag_latest" marker is
# written to the feature's STATE-DB entry instead and success is reported.
else:
470+
server.mod_db_entry(STATE_DB_NAME,
471+
FEATURE_TABLE, feat, {"tag_latest": "true"})
472+
res = 0
473+
if res:
474+
log_debug("failed to tag {}:{} to latest".format(feat, image_ver))
475+
else:
476+
log_debug("successfully tag {}:{} to latest".format(feat, image_ver))
477+
478+
return res
479+
480+
429481
#
430482
# Feature changes
431483
#
@@ -523,6 +575,19 @@ def on_state_update(self, key, op, data):
523575
self.st_data[key] = _update_entry(dflt_st_feat, data)
524576
remote_state = self.st_data[key][ST_FEAT_REMOTE_STATE]
525577

578+
if (old_remote_state != remote_state) and (remote_state == "running"):
579+
# Tag latest
580+
start_time = datetime.datetime.now() + datetime.timedelta(
581+
seconds=remote_ctr_config[TAG_IMAGE_LATEST])
582+
self.server.register_timer(start_time, tag_latest_image, (
583+
self.server,
584+
key,
585+
self.st_data[key][ST_FEAT_CTR_ID],
586+
self.st_data[key][ST_FEAT_CTR_VER]))
587+
588+
log_debug("try to tag latest label after {} seconds @{}".format(
589+
remote_ctr_config[TAG_IMAGE_LATEST], start_time))
590+
526591
if (not init) and (
527592
(old_remote_state == remote_state) or (remote_state != "pending")):
528593
# no change or nothing to do.

src/sonic-ctrmgrd/ctrmgr/kube_commands.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def _run_command(cmd, timeout=5):
8484

8585
def kube_read_labels():
8686
""" Read current labels on node and return as dict. """
87-
KUBECTL_GET_CMD = "kubectl --kubeconfig {} get nodes {} --show-labels |tr -s ' ' | cut -f6 -d' '"
87+
KUBECTL_GET_CMD = "kubectl --kubeconfig {} get nodes {} --show-labels --no-headers |tr -s ' ' | cut -f6 -d' '"
8888

8989
labels = {}
9090
ret, out, _ = _run_command(KUBECTL_GET_CMD.format(
@@ -332,12 +332,12 @@ def _do_reset(pending_join = False):
332332

333333

334334
def _do_join(server, port, insecure):
335-
KUBEADM_JOIN_CMD = "kubeadm join --discovery-file {} --node-name {} --apiserver-advertise-address {}"
335+
KUBEADM_JOIN_CMD = "kubeadm join --discovery-file {} --node-name {}"
336336
err = ""
337337
out = ""
338338
ret = 0
339339
try:
340-
local_ipv6 = _get_local_ipv6()
340+
#local_ipv6 = _get_local_ipv6()
341341
#_download_file(server, port, insecure)
342342
_gen_cli_kubeconf(server, port, insecure)
343343
_do_reset(True)
@@ -349,7 +349,7 @@ def _do_join(server, port, insecure):
349349

350350
if ret == 0:
351351
(ret, out, err) = _run_command(KUBEADM_JOIN_CMD.format(
352-
KUBE_ADMIN_CONF, get_device_name(), local_ipv6), timeout=60)
352+
KUBE_ADMIN_CONF, get_device_name()), timeout=60)
353353
log_debug("ret = {}".format(ret))
354354

355355
except IOError as e:

src/sonic-ctrmgrd/ctrmgr/remote_ctr.config.json

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"retry_join_interval_seconds": 30,
44
"retry_labels_update_seconds": 5,
55
"revert_to_local_on_wait_seconds": 60,
6+
"tag_latest_image_on_wait_seconds": 600,
67
"use_k8s_as_http_proxy": "n"
78
}
89

src/sonic-ctrmgrd/tests/container_startup_test.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@
169169
common_test.FEATURE_TABLE: {
170170
"snmp": {
171171
"container_id": "no_change",
172-
"container_version": "20201230.77",
172+
"container_version": "20201230.11",
173173
"current_owner": "no_change",
174174
"remote_state": "no_change",
175175
"system_state": "up"

src/sonic-ctrmgrd/tests/container_test.py

+5
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,11 @@
125125
"current_owner": "none",
126126
"container_id": ""
127127
}
128+
},
129+
common_test.SERVER_TABLE: {
130+
"SERVER": {
131+
"connected": "true"
132+
}
128133
}
129134
}
130135
},

src/sonic-ctrmgrd/tests/ctrmgrd_test.py

+47-2
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@
106106
common_test.KUBE_JOIN: {
107107
"ip": "10.10.10.10",
108108
"port": "6443",
109-
"insecure": "false"
109+
"insecure": "true"
110110
}
111111
}
112112
},
@@ -151,7 +151,7 @@
151151
common_test.KUBE_JOIN: {
152152
"ip": "10.10.10.10",
153153
"port": "6443",
154-
"insecure": "false"
154+
"insecure": "true"
155155
},
156156
common_test.KUBE_RESET: {
157157
"flag": "true"
@@ -276,6 +276,51 @@
276276
}
277277
}
278278
}
279+
},
280+
3: {
281+
common_test.DESCR: "Tag image latest when remote_state changes to running",
282+
common_test.ARGS: "ctrmgrd",
283+
common_test.PRE: {
284+
common_test.CONFIG_DB_NO: {
285+
common_test.FEATURE_TABLE: {
286+
"snmp": {
287+
"set_owner": "kube"
288+
}
289+
}
290+
},
291+
common_test.STATE_DB_NO: {
292+
common_test.FEATURE_TABLE: {
293+
"snmp": {
294+
"remote_state": "pending"
295+
}
296+
}
297+
}
298+
},
299+
common_test.UPD: {
300+
common_test.CONFIG_DB_NO: {
301+
common_test.FEATURE_TABLE: {
302+
"snmp": {
303+
"set_owner": "kube"
304+
}
305+
}
306+
},
307+
common_test.STATE_DB_NO: {
308+
common_test.FEATURE_TABLE: {
309+
"snmp": {
310+
"remote_state": "running"
311+
}
312+
}
313+
}
314+
},
315+
common_test.POST: {
316+
common_test.STATE_DB_NO: {
317+
common_test.FEATURE_TABLE: {
318+
"snmp": {
319+
"tag_latest": "true"
320+
}
321+
}
322+
}
323+
}
279324
}
280325
}
281326

0 commit comments

Comments
 (0)