|
| 1 | +#!/usr/bin/python -u |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +import fcntl |
| 5 | +import os |
| 6 | +import shutil |
| 7 | +import tempfile |
| 8 | + |
| 9 | +import click |
| 10 | +import netaddr |
| 11 | +import requests |
| 12 | +import urllib3 |
| 13 | +import yaml |
| 14 | +from urlparse import urlparse |
| 15 | + |
| 16 | +from sonic_py_common import device_info |
| 17 | +from swsssdk import ConfigDBConnector |
| 18 | +from utilities_common.db import Db |
| 19 | +import utilities_common.cli as clicommon |
| 20 | + |
| 21 | +from .utils import log |
| 22 | + |
| 23 | +KUBE_ADMIN_CONF = "/etc/sonic/kube_admin.conf" |
| 24 | +KUBELET_YAML = "/var/lib/kubelet/config.yaml" |
| 25 | +KUBELET_SERVICE = "/etc/systemd/system/multi-user.target.wants/kubelet.service" |
| 26 | + |
| 27 | +SERVER_ADMIN_URL = "https://{}/admin.conf" |
| 28 | +KUBEADM_JOIN_CMD = "kubeadm join --discovery-file {} --node-name {}" |
| 29 | + |
| 30 | +LOCK_FILE = "/var/lock/kube_join.lock" |
| 31 | + |
| 32 | + |
def _update_kube_server(field, val):
    """Set one field of the KUBERNETES_MASTER|SERVER entry in CONFIG_DB.

    Fields absent from the DB are seeded with their defaults; the requested
    field is rewritten only when its stored value actually differs.
    """
    defaults = {
        "IP": "",
        "insecure": "False",
        "disable": "False"
    }

    db = ConfigDBConnector()
    db.connect()
    tbl = "KUBERNETES_MASTER"
    ky = "SERVER"
    current = Db().get_data(tbl, ky)

    for name, default in defaults.items():
        if current and name in current:
            # Field already present: update only the target field, and
            # only when the value really changes.
            if name == field and current[name] != val:
                db.mod_entry(tbl, ky, {field: val})
                log.log_info("modify kubernetes server entry {}={}".format(field, val))
        else:
            # Field missing: seed with the requested value or the default.
            value = val if name == field else default
            db.mod_entry(tbl, ky, {name: value})
            log.log_info("set kubernetes server entry {}={}".format(name, value))
| 54 | + |
| 55 | + |
def _take_lock():
    """Try to take an exclusive, non-blocking lock on LOCK_FILE.

    Returns the open file object holding the lock, or None when the lock
    could not be taken. The caller must keep the returned object alive,
    since closing it releases the lock.
    """
    try:
        fd = open(LOCK_FILE, "w")
        fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError as err:
        log.log_error("Lock {} failed: {}".format(LOCK_FILE, str(err)))
        return None
    log.log_info("Lock taken {}".format(LOCK_FILE))
    return fd
| 66 | + |
| 67 | + |
def _download_file(server, insecure):
    """Fetch admin.conf from the master and install it as KUBE_ADMIN_CONF.

    server   -- master VIP/hostname serving https://<server>/admin.conf
    insecure -- when truthy, skip TLS certificate verification
    Exits via clicommon.do_exit on a non-200 HTTP response.
    """
    fname = ""
    if insecure:
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    r = requests.get(SERVER_ADMIN_URL.format(server), verify=not insecure)
    if r.status_code == 200:
        (h, fname) = tempfile.mkstemp(suffix="_kube_join")
        # os.write requires bytes; r.text is str, so encode explicitly
        # (works on both py2 unicode and py3 str).
        os.write(h, r.text.encode("utf-8"))
        os.close(h)
    else:
        clicommon.do_exit("Failed to download {}".format(
            SERVER_ADMIN_URL.format(server)))

    # Ensure the admin.conf has given VIP as server-IP.
    update_file = "{}.upd".format(fname)
    # Raw string so the escaped slashes reach sed unmangled instead of
    # being treated as (invalid) Python escape sequences.
    cmd = r'sed "s/server:.*:6443/server: https:\/\/{}:6443/" {} > {}'.format(
        server, fname, update_file)
    clicommon.run_command(cmd)

    shutil.copyfile(update_file, KUBE_ADMIN_CONF)

    clicommon.run_command("rm -f {} {}".format(fname, update_file))
| 91 | + |
| 92 | + |
def _is_connected(server=""):
    """Return True when this node looks joined to a kubernetes master.

    Checks that the admin config, kubelet config and kubelet service files
    all exist, and that the cluster server hostname in admin.conf matches
    `server` (any hostname matches when server is "").
    """
    if not (os.path.exists(KUBE_ADMIN_CONF) and
            os.path.exists(KUBELET_YAML) and
            os.path.exists(KUBELET_SERVICE)):
        return False

    with open(KUBE_ADMIN_CONF, 'r') as s:
        # safe_load: admin.conf is plain data; yaml.load without an explicit
        # Loader is deprecated and can construct arbitrary objects.
        d = yaml.safe_load(s)
        d = d['clusters'] if 'clusters' in d else []
        d = d[0] if len(d) > 0 else {}
        d = d['cluster'] if 'cluster' in d else {}
        d = d['server'] if 'server' in d else ""
        if d:
            o = urlparse(d)
            if o.hostname:
                return not server or server == o.hostname
    return False
| 109 | + |
| 110 | + |
def _get_labels():
    """Build the list of node labels to apply after a successful join."""
    version_info = device_info.get_sonic_version_info()
    metadata = Db().get_data('DEVICE_METADATA', 'localhost')
    deployment = metadata['type'] if metadata and 'type' in metadata else "Unknown"

    return [
        "sonic_version={}".format(version_info['build_version']),
        "hwsku={}".format(device_info.get_hwsku()),
        "deployment_type={}".format(deployment),
        "enable_pods=True",
    ]
| 125 | + |
| 126 | + |
def _label_node(label):
    """Apply a label to this node (a trailing '-' in `label` removes it)."""
    node = device_info.get_hostname()
    clicommon.run_command(
        "kubectl --kubeconfig {} label nodes {} {}".format(
            KUBE_ADMIN_CONF, node, label),
        ignore_error=True)
| 131 | + |
| 132 | + |
def _troubleshoot_tips():
    """Write join-troubleshooting hints to a temp file and log its path."""
    msg = """
if join fails, check the following

a) Ensure both master & node run same or compatible k8s versions

b) Check if this node already exists in master
   Use 'sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf get nodes' to list nodes at master.

   If yes, delete it, as the node is attempting a new join.
   'kubectl --kubeconfig=/etc/kubernetes/admin.conf drain <node name> --ignore-daemonsets'
   'kubectl --kubeconfig=/etc/kubernetes/admin.conf delete node <node name>'

c) In Master check if all system pods are running good.
   'kubectl get pods --namespace kube-system'

   If any not running properly, say READY column has 0/1, describe pod for more detail.
   'kubectl --namespace kube-system describe pod <pod name>'

   For additional details, look into pod's logs.
   @ node: /var/log/pods/<podname>/...
   @ master: 'kubectl logs -n kube-system <pod name>'
    """

    # NOTE(review): "kube_hints_" is passed as a suffix although it is named
    # like a prefix — kept as-is to preserve existing file naming; confirm.
    (h, fname) = tempfile.mkstemp(suffix="kube_hints_")
    # os.write requires bytes; encode the str message explicitly.
    os.write(h, msg.encode("utf-8"))
    os.close(h)

    log.log_error("Refer file {} for troubleshooting tips".format(fname))
| 162 | + |
| 163 | + |
def _do_join(server, insecure):
    """Download admin.conf and run kubeadm join; label the node on success.

    Troubleshooting hints are always written at the end, since the join
    command itself runs with ignore_error and may have failed silently.
    """
    try:
        _download_file(server, insecure)

        clicommon.run_command("systemctl enable kubelet")
        clicommon.run_command("modprobe br_netfilter")
        clicommon.run_command(KUBEADM_JOIN_CMD.format(
            KUBE_ADMIN_CONF, device_info.get_hostname()), ignore_error=True)

        if _is_connected(server):
            for label in _get_labels():
                _label_node(label)

    except (requests.exceptions.RequestException, OSError) as e:
        clicommon.do_exit("Download failed: {}".format(str(e)))

    _troubleshoot_tips()
| 187 | + |
| 188 | + |
def kube_reset():
    """Disconnect this node from the cluster and wipe local kube state."""
    lock_fd = _take_lock()
    if not lock_fd:
        log.log_error("Lock {} is active; Bail out".format(LOCK_FILE))
        return

    # Remove a key label and drain/delete self from cluster
    # If not, the next join would fail
    #
    if os.path.exists(KUBE_ADMIN_CONF):
        _label_node("enable_pods-")
        host = device_info.get_hostname()
        for action in ("drain {} --ignore-daemonsets".format(host),
                       "delete node {}".format(host)):
            clicommon.run_command(
                "kubectl --kubeconfig {} --request-timeout 20s {}".format(
                    KUBE_ADMIN_CONF, action),
                ignore_error=True)

    clicommon.run_command("kubeadm reset -f", ignore_error=True)
    for cleanup in ("rm -rf /etc/cni/net.d",
                    "rm -f {}".format(KUBE_ADMIN_CONF),
                    "systemctl stop kubelet",
                    "systemctl disable kubelet"):
        clicommon.run_command(cleanup)
| 214 | + |
| 215 | + |
def kube_join(force=False):
    """Join this node to the configured kubernetes master.

    force -- rejoin even when already connected to the configured server.
    Silently no-ops (with a log message) when no server is configured or
    the server entry is marked disabled.
    """
    lock_fd = _take_lock()
    if not lock_fd:
        log.log_error("Lock {} is active; Bail out".format(LOCK_FILE))
        return

    db_data = Db().get_data('KUBERNETES_MASTER', 'SERVER')
    if not db_data or 'IP' not in db_data or not db_data['IP']:
        log.log_error("Kubernetes server is not configured")
        # Bail out: without a configured entry the checks below would
        # crash on missing data.
        return

    if db_data.get('disable', 'False').lower() != "false":
        log.log_error("kube join skipped as kubernetes server is marked disabled")
        return

    if not force:
        if _is_connected(db_data['IP']):
            # Already connected. No-Op
            return

    kube_reset()
    # CONFIG_DB stores flags as the strings "True"/"False"; a bare
    # truthiness test would treat "False" as True and silently disable
    # TLS verification in _download_file.
    insecure = db_data.get('insecure', 'False').lower() != "false"
    _do_join(db_data['IP'], insecure)
| 237 | + |
| 238 | + |
# Top-level click group: all subcommands below attach under "kubernetes".
# AbbreviationGroup presumably allows unambiguous command-name prefixes —
# confirm in utilities_common.cli.
@click.group(cls=clicommon.AbbreviationGroup)
def kubernetes():
    """kubernetes command line"""
    pass
| 243 | + |
| 244 | + |
# cmd kubernetes join [-f/--force]
@kubernetes.command()
@click.option('-f', '--force', help='Force a join', is_flag=True)
def join(force):
    # Thin CLI wrapper; all join logic lives in kube_join().
    kube_join(force=force)
| 250 | + |
| 251 | + |
# cmd kubernetes reset
@kubernetes.command()
def reset():
    # Thin CLI wrapper; all reset logic lives in kube_reset().
    kube_reset()
| 256 | + |
| 257 | + |
# cmd kubernetes server
# Subgroup for KUBERNETES_MASTER|SERVER settings (ip / insecure / disable).
@kubernetes.group()
def server():
    """ Server configuration """
    pass
| 263 | + |
| 264 | + |
# cmd kubernetes server IP
@server.command()
@click.argument('vip')
def ip(vip):
    """Specify a kubernetes cluster VIP"""
    # netaddr.IPAddress raises AddrFormatError on a malformed address
    # instead of returning a falsy value, so the friendly message below
    # was previously unreachable — catch the exception explicitly.
    try:
        netaddr.IPAddress(vip)
    except (netaddr.AddrFormatError, ValueError):
        click.echo('Invalid IP address %s' % vip)
        return
    _update_kube_server('IP', vip)
| 274 | + |
| 275 | + |
# cmd kubernetes server insecure
@server.command()
@click.argument('option', type=click.Choice(["on", "off"]))
def insecure(option):
    """Specify a kubernetes cluster VIP access as insecure or not"""
    # Store the canonical string form used by the defaults in
    # _update_kube_server ("True"/"False") rather than a Python bool,
    # so the CONFIG_DB value does not depend on implicit str() conversion.
    _update_kube_server('insecure', "True" if option == "on" else "False")
| 282 | + |
| 283 | + |
# cmd kubernetes server disable
@server.command()
@click.argument('option', type=click.Choice(["on", "off"]))
def disable(option):
    """Specify a kubernetes cluster VIP access is disabled or not"""
    # Store the canonical string form used by the defaults in
    # _update_kube_server ("True"/"False") rather than a Python bool,
    # matching what kube_join reads back ("...".lower() != "false").
    _update_kube_server('disable', "True" if option == "on" else "False")
| 290 | + |
| 291 | + |
# cmd kubernetes label
# Subgroup for managing kubectl labels on this node (add / drop).
@kubernetes.group()
def label():
    """ label configuration """
    pass
| 297 | + |
| 298 | + |
# cmd kubernetes label add <key> <val>
@label.command()
@click.argument('key', required=True)
@click.argument('val', required=True)
def add(key, val):
    """Add a label to this node"""
    # Guard against empty strings, which click's required= does not reject.
    if key and val:
        _label_node("{}={}".format(key, val))
    else:
        click.echo('Require key & val')
| 309 | + |
| 310 | + |
# cmd kubernetes label drop <key>
@label.command()
@click.argument('key', required=True)
def drop(key):
    """Drop a label from this node"""
    # A trailing '-' on the key tells kubectl to remove the label.
    if key:
        _label_node("{}-".format(key))
    else:
        click.echo('Require key to drop')