Skip to content

Commit 60fcab9

Browse files
Enhance SONiC with kubernetes management commands (sonic-net#962)
* Enhance SONiC with kubernetes management commands Config commands: 1) sudo config kubernetes server ip <IP address> 2) sudo config kubernetes server insecure <True/False> 3) sudo config kubernetes server disable <True/False> 4) sudo config kubernetes label add <key> <value> 5) sudo config kubernetes label drop <key> 6) sudo config kubernetes join [-a/--async] [-f/--force] 7) sudo config kubernetes reset Show commands: 1) show kube server 2) show kube status 3) show kube nodes 4) show kube pods A python wrapper for kube_join for invocation by other tools.
1 parent 5c173f7 commit 60fcab9

File tree

7 files changed

+494
-5
lines changed

7 files changed

+494
-5
lines changed

config/kube.py

+319
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
#!/usr/bin/python -u
2+
# -*- coding: utf-8 -*-
3+
4+
import fcntl
5+
import os
6+
import shutil
7+
import tempfile
8+
9+
import click
10+
import netaddr
11+
import requests
12+
import urllib3
13+
import yaml
14+
from urlparse import urlparse
15+
16+
from sonic_py_common import device_info
17+
from swsssdk import ConfigDBConnector
18+
from utilities_common.db import Db
19+
import utilities_common.cli as clicommon
20+
21+
from .utils import log
22+
23+
KUBE_ADMIN_CONF = "/etc/sonic/kube_admin.conf"
24+
KUBELET_YAML = "/var/lib/kubelet/config.yaml"
25+
KUBELET_SERVICE = "/etc/systemd/system/multi-user.target.wants/kubelet.service"
26+
27+
SERVER_ADMIN_URL = "https://{}/admin.conf"
28+
KUBEADM_JOIN_CMD = "kubeadm join --discovery-file {} --node-name {}"
29+
30+
LOCK_FILE = "/var/lock/kube_join.lock"
31+
32+
33+
def _update_kube_server(field, val):
34+
config_db = ConfigDBConnector()
35+
config_db.connect()
36+
table = "KUBERNETES_MASTER"
37+
key = "SERVER"
38+
db_data = Db().get_data(table, key)
39+
def_data = {
40+
"IP": "",
41+
"insecure": "False",
42+
"disable": "False"
43+
}
44+
for f in def_data:
45+
if db_data and f in db_data:
46+
if f == field and db_data[f] != val:
47+
config_db.mod_entry(table, key, {field: val})
48+
log.log_info("modify kubernetes server entry {}={}".format(field,val))
49+
else:
50+
# Missing field. Set to default or given value
51+
v = val if f == field else def_data[f]
52+
config_db.mod_entry(table, key, {f: v})
53+
log.log_info("set kubernetes server entry {}={}".format(f,v))
54+
55+
56+
def _take_lock():
57+
lock_fd = None
58+
try:
59+
lock_fd = open(LOCK_FILE, "w")
60+
fcntl.lockf(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
61+
log.log_info("Lock taken {}".format(LOCK_FILE))
62+
except IOError as e:
63+
lock_fd = None
64+
log.log_error("Lock {} failed: {}".format(LOCK_FILE, str(e)))
65+
return lock_fd
66+
67+
68+
def _download_file(server, insecure):
69+
fname = ""
70+
if insecure:
71+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
72+
73+
r = requests.get(SERVER_ADMIN_URL.format(server), verify=not insecure)
74+
if r.status_code == 200:
75+
(h, fname) = tempfile.mkstemp(suffix="_kube_join")
76+
os.write(h, r.text)
77+
os.close(h)
78+
else:
79+
clicommon.do_exit("Failed to download {}".format(
80+
SERVER_ADMIN_URL.format(server)))
81+
82+
# Ensure the admin.conf has given VIP as server-IP.
83+
update_file = "{}.upd".format(fname)
84+
cmd = 'sed "s/server:.*:6443/server: https:\/\/{}:6443/" {} > {}'.format(
85+
server, fname, update_file)
86+
clicommon.run_command(cmd)
87+
88+
shutil.copyfile(update_file, KUBE_ADMIN_CONF)
89+
90+
clicommon.run_command("rm -f {} {}".format(fname, update_file))
91+
92+
93+
def _is_connected(server=""):
94+
if (os.path.exists(KUBE_ADMIN_CONF) and
95+
os.path.exists(KUBELET_YAML) and
96+
os.path.exists(KUBELET_SERVICE)):
97+
98+
with open(KUBE_ADMIN_CONF, 'r') as s:
99+
d = yaml.load(s)
100+
d = d['clusters'] if 'clusters' in d else []
101+
d = d[0] if len(d) > 0 else {}
102+
d = d['cluster'] if 'cluster' in d else {}
103+
d = d['server'] if 'server' in d else ""
104+
if d:
105+
o = urlparse(d)
106+
if o.hostname:
107+
return not server or server == o.hostname
108+
return False
109+
110+
111+
def _get_labels():
112+
labels = []
113+
114+
hwsku = device_info.get_hwsku()
115+
version_info = device_info.get_sonic_version_info()
116+
117+
labels.append("sonic_version={}".format(version_info['build_version']))
118+
labels.append("hwsku={}".format(hwsku))
119+
lh = Db().get_data('DEVICE_METADATA', 'localhost')
120+
labels.append("deployment_type={}".format(
121+
lh['type'] if lh and 'type' in lh else "Unknown"))
122+
labels.append("enable_pods=True")
123+
124+
return labels
125+
126+
127+
def _label_node(label):
128+
cmd = "kubectl --kubeconfig {} label nodes {} {}".format(
129+
KUBE_ADMIN_CONF, device_info.get_hostname(), label)
130+
clicommon.run_command(cmd, ignore_error=True)
131+
132+
133+
def _troubleshoot_tips():
134+
msg = """
135+
if join fails, check the following
136+
137+
a) Ensure both master & node run same or compatible k8s versions
138+
139+
b) Check if this node already exists in master
140+
Use 'sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf get nodes' to list nodes at master.
141+
142+
If yes, delete it, as the node is attempting a new join.
143+
'kubectl --kubeconfig=/etc/kubernetes/admin.conf drain <node name> --ignore-daemonsets'
144+
'kubectl --kubeconfig=/etc/kubernetes/admin.conf delete node <node name>'
145+
146+
c) In Master check if all system pods are running good.
147+
'kubectl get pods --namespace kube-system'
148+
149+
If any not running properly, say READY column has 0/1, decribe pod for more detail.
150+
'kubectl --namespace kube-system describe pod <pod name>'
151+
152+
For additional details, look into pod's logs.
153+
@ node: /var/log/pods/<podname>/...
154+
@ master: 'kubectl logs -n kube-system <pod name>'
155+
"""
156+
157+
(h, fname) = tempfile.mkstemp(suffix="kube_hints_")
158+
os.write(h, msg)
159+
os.close(h)
160+
161+
log.log_error("Refer file {} for troubleshooting tips".format(fname))
162+
163+
164+
def _do_join(server, insecure):
165+
try:
166+
_download_file(server, insecure)
167+
168+
clicommon.run_command("systemctl enable kubelet")
169+
170+
clicommon.run_command("modprobe br_netfilter")
171+
172+
clicommon.run_command(KUBEADM_JOIN_CMD.format(
173+
KUBE_ADMIN_CONF, device_info.get_hostname()), ignore_error=True)
174+
175+
if _is_connected(server):
176+
labels = _get_labels()
177+
for label in labels:
178+
_label_node(label)
179+
180+
except requests.exceptions.RequestException as e:
181+
clicommon.do_exit("Download failed: {}".format(str(e)))
182+
183+
except OSError as e:
184+
clicommon.do_exit("Download failed: {}".format(str(e)))
185+
186+
_troubleshoot_tips()
187+
188+
189+
def kube_reset():
190+
lock_fd = _take_lock()
191+
if not lock_fd:
192+
log.log_error("Lock {} is active; Bail out".format(LOCK_FILE))
193+
return
194+
195+
# Remove a key label and drain/delete self from cluster
196+
# If not, the next join would fail
197+
#
198+
if os.path.exists(KUBE_ADMIN_CONF):
199+
_label_node("enable_pods-")
200+
clicommon.run_command(
201+
"kubectl --kubeconfig {} --request-timeout 20s drain {} --ignore-daemonsets".format(
202+
KUBE_ADMIN_CONF, device_info.get_hostname()),
203+
ignore_error=True)
204+
clicommon.run_command(
205+
"kubectl --kubeconfig {} --request-timeout 20s delete node {}".format(
206+
KUBE_ADMIN_CONF, device_info.get_hostname()),
207+
ignore_error=True)
208+
209+
clicommon.run_command("kubeadm reset -f", ignore_error=True)
210+
clicommon.run_command("rm -rf /etc/cni/net.d")
211+
clicommon.run_command("rm -f {}".format(KUBE_ADMIN_CONF))
212+
clicommon.run_command("systemctl stop kubelet")
213+
clicommon.run_command("systemctl disable kubelet")
214+
215+
216+
def kube_join(force=False):
217+
lock_fd = _take_lock()
218+
if not lock_fd:
219+
log.log_error("Lock {} is active; Bail out".format(LOCK_FILE))
220+
return
221+
222+
db_data = Db().get_data('KUBERNETES_MASTER', 'SERVER')
223+
if not db_data or 'IP' not in db_data or not db_data['IP']:
224+
log.log_error("Kubernetes server is not configured")
225+
226+
if db_data['disable'].lower() != "false":
227+
log.log_error("kube join skipped as kubernetes server is marked disabled")
228+
return
229+
230+
if not force:
231+
if _is_connected(db_data['IP']):
232+
# Already connected. No-Op
233+
return
234+
235+
kube_reset()
236+
_do_join(db_data['IP'], db_data['insecure'])
237+
238+
239+
@click.group(cls=clicommon.AbbreviationGroup)
240+
def kubernetes():
241+
"""kubernetes command line"""
242+
pass
243+
244+
245+
# cmd kubernetes join [-f/--force]
246+
@kubernetes.command()
247+
@click.option('-f', '--force', help='Force a join', is_flag=True)
248+
def join(force):
249+
kube_join(force=force)
250+
251+
252+
# cmd kubernetes reset
253+
@kubernetes.command()
254+
def reset():
255+
kube_reset()
256+
257+
258+
# cmd kubernetes server
259+
@kubernetes.group()
260+
def server():
261+
""" Server configuration """
262+
pass
263+
264+
265+
# cmd kubernetes server IP
266+
@server.command()
267+
@click.argument('vip')
268+
def ip(vip):
269+
"""Specify a kubernetes cluster VIP"""
270+
if not netaddr.IPAddress(vip):
271+
click.echo('Invalid IP address %s' % vip)
272+
return
273+
_update_kube_server('IP', vip)
274+
275+
276+
# cmd kubernetes server insecure
277+
@server.command()
278+
@click.argument('option', type=click.Choice(["on", "off"]))
279+
def insecure(option):
280+
"""Specify a kubernetes cluster VIP access as insecure or not"""
281+
_update_kube_server('insecure', option == "on")
282+
283+
284+
# cmd kubernetes server disable
285+
@server.command()
286+
@click.argument('option', type=click.Choice(["on", "off"]))
287+
def disable(option):
288+
"""Specify a kubernetes cluster VIP access is disabled or not"""
289+
_update_kube_server('disable', option == "on")
290+
291+
292+
# cmd kubernetes label
293+
@kubernetes.group()
294+
def label():
295+
""" label configuration """
296+
pass
297+
298+
299+
# cmd kubernetes label add <key> <val>
300+
@label.command()
301+
@click.argument('key', required=True)
302+
@click.argument('val', required=True)
303+
def add(key, val):
304+
"""Add a label to this node"""
305+
if not key or not val:
306+
click.echo('Require key & val')
307+
return
308+
_label_node("{}={}".format(key, val))
309+
310+
311+
# cmd kubernetes label drop <key>
312+
@label.command()
313+
@click.argument('key', required=True)
314+
def drop(key):
315+
"""Drop a label from this node"""
316+
if not key:
317+
click.echo('Require key to drop')
318+
return
319+
_label_node("{}-".format(key))

config/main.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,10 @@
2424

2525

2626
import aaa
27+
import feature
28+
import kube
2729
import mlnx
2830
import nat
29-
import feature
3031
import vlan
3132
from config_mgmt import ConfigMgmtDPB
3233

@@ -801,15 +802,15 @@ def _restart_services(config_db):
801802
execute_systemctl(services_to_restart, SYSTEMCTL_ACTION_RESTART)
802803

803804

804-
def interface_is_in_vlan(vlan_member_table, interface_name):
805+
def interface_is_in_vlan(vlan_member_table, interface_name):
805806
""" Check if an interface is in a vlan """
806807
for _,intf in vlan_member_table.keys():
807808
if intf == interface_name:
808809
return True
809810

810811
return False
811812

812-
def interface_is_in_portchannel(portchannel_member_table, interface_name):
813+
def interface_is_in_portchannel(portchannel_member_table, interface_name):
813814
""" Check if an interface is part of portchannel """
814815
for _,intf in portchannel_member_table.keys():
815816
if intf == interface_name:
@@ -922,13 +923,16 @@ def config(ctx):
922923

923924
ctx.obj = Db()
924925

926+
927+
# Add groups from other modules
925928
config.add_command(aaa.aaa)
926929
config.add_command(aaa.tacacs)
927930
config.add_command(feature.feature)
928-
# === Add NAT Configuration ==========
931+
config.add_command(kube.kubernetes)
929932
config.add_command(nat.nat)
930933
config.add_command(vlan.vlan)
931934

935+
932936
@config.command()
933937
@click.option('-y', '--yes', is_flag=True, callback=_abort_if_false,
934938
expose_value=False, prompt='Existing files will be overwritten, continue?')

0 commit comments

Comments
 (0)