#!/bin/sh
# SPDX-FileCopyrightText: 2020, 2024 k0s authors
# SPDX-License-Identifier: Apache-2.0
#shellcheck disable=SC3040,SC3043,SC3052

# Strict mode: abort on unhandled command failures (-e), on expansion of unset
# variables (-u), and when any stage of a pipeline fails (pipefail).
# The shellcheck directive above silences the warnings about features that are
# not part of strict POSIX sh (pipefail, local, base#n arithmetic).
set -euo pipefail

# Prints the help text of this entry point script to standard output.
# The here-document delimiter is unquoted, so $0 expands to the script name.
usage() {
  cat <<EOF
Usage: $0 ARGS...

The container entry point script for k0s. It sets the stage for k0s to work in
a containerized environment. This includes possible cgroup and iptables rule
customizations.

Arguments:
  help, -h, --help  Print this help message and exit

Environment variables:
  K0S_CONFIG
    Optional configuration for k0s, written to /etc/k0s/config.yaml if set.

  K0S_ENTRYPOINT_REMOUNT_CGROUP2FS
    Set to 1 to force remounting of the cgroup2 filesystem in read-write mode.
    Set to 0 to disable remounting.
    The default is automatic detection.

  K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING
    Set to 1 to always enable all available cgroup controllers for the root cgroup.
    Set to 0 to disable cgroup nesting.
    The default is automatic detection.

  K0S_ENTRYPOINT_DNS_FIXUP
    Set to 1 to apply DNS fixes required for the Docker embedded DNS setup.
    Set to 0 to disable.
    Default is automatic detection.

  K0S_ENTRYPOINT_ROLE
    Specifies the role for k0s. Can be 'worker', 'controller', or 'controller+worker'.
    Default is to autodetect the role from the arguments.
    Depending on the role, some of the above features will be disabled by default.

  K0S_ENTRYPOINT_QUIET
    Set to 1 to suppress printing status messages.

EOF
}

# Prints the effective capability set of a process as a decimal bit mask.
#
# Arguments:
#   $1 - status file to parse (optional, defaults to /proc/self/status)
# Outputs:
#   The value of the hexadecimal CapEff field, converted to decimal.
# Returns:
#   0 on success, 1 if no CapEff line was found.
get_effective_capabilities() {
  local field value
  while read -r field value; do
    if [ "$field" = CapEff: ]; then
      # A 0x-prefixed constant is standard shell arithmetic, whereas the
      # base#number notation is an extension not every /bin/sh implements.
      echo "$((0x$value))"
      return 0
    fi
  done <"${1:-/proc/self/status}"
  return 1
}

# Checks whether a capability is part of this process's effective set.
#
# Arguments:
#   $1 - the capability's bit number
# Returns:
#   0 if the capability is set, 1 if it is not set,
#   2 if the effective capabilities could not be determined.
has_effective_capability() {
  local cap_mask
  cap_mask=$(get_effective_capabilities) || return 2
  # Check if the requested bit is set.
  [ "$((cap_mask & (1 << $1)))" != 0 ]
}

# Checks if this process has CAP_NET_ADMIN.
# Returns 0 if it has, 1 if it has not, 2 if that could not be determined.
has_cap_net_admin() {
  # CAP_NET_ADMIN is bit 12 (https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/include/uapi/linux/capability.h?h=v3.10#n188)
  has_effective_capability 12
}

# Checks if this process has CAP_SYS_ADMIN.
# Returns 0 if it has, 1 if it has not, 2 if that could not be determined.
has_cap_sys_admin() {
  # CAP_SYS_ADMIN is bit 21 (https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/include/uapi/linux/capability.h?h=v3.10#n263)
  has_effective_capability 21
}

# Checks if the cgroup2 file system is mounted at its well-known path.
# Returns 0 if it is, 1 otherwise (including when stat fails).
is_cgroupv2() {
  local fs_magic
  fs_magic=$(stat -f -c %t /sys/fs/cgroup) || return 1
  # Check for the magic number of the cgroup2 fs.
  # https://www.kernel.org/doc/html/v5.16/admin-guide/cgroup-v2.html#mounting
  [ "$fs_magic" = 63677270 ]
}

# Checks if the file system mounted at the well-known cgroup2 path is read-write.
#
# Arguments:
#   $1 - mount table to inspect (optional, defaults to /proc/mounts)
# Returns:
#   0 if the first /sys/fs/cgroup entry carries the "rw" mount option,
#   1 if it does not or if there is no such entry.
is_cgroupfs_rw() {
  local _device mount_point _fstype mount_opts _rest
  while read -r _device mount_point _fstype mount_opts _rest; do
    [ "$mount_point" = /sys/fs/cgroup ] || continue
    # Match "rw" as a whole option only, wherever it sits in the list.
    case "$mount_opts" in
    rw | rw,* | *,rw | *,rw,*) return 0 ;;
    esac
    return 1
  done <"${1:-/proc/mounts}"
  return 1
}

# Remounts the cgroup2 file system in read-write mode, if necessary.
#
# Globals:
#   K0S_ENTRYPOINT_REMOUNT_CGROUP2FS (read): 0 disables, 1 forces, anything
#     else selects automatic detection
#   K0S_ENTRYPOINT_QUIET (read): 1 suppresses the status message
# Returns:
#   0 if nothing had to be done or the remount succeeded; otherwise the status
#   of the failing mount command.
remount_cgroup2fs() {
  case "${K0S_ENTRYPOINT_REMOUNT_CGROUP2FS-}" in
  0) return 0 ;; # disabled
  1) ;;          # enabled
  *)             # auto detect
    # Only a read-only cgroup2 file system needs remounting.
    if ! is_cgroupv2 || is_cgroupfs_rw; then
      return 0
    fi

    local cap_status=0
    has_cap_sys_admin || cap_status=$?
    if [ "$cap_status" -eq 1 ]; then
      echo "$0: won't remount /sys/fs/cgroup without CAP_SYS_ADMIN (disable with K0S_ENTRYPOINT_REMOUNT_CGROUP2FS=0)" >&2
      return 0
    elif [ "$cap_status" -ne 0 ]; then
      # The capabilities are unknown: report that, but try the remount anyway.
      echo "$0: failed to determine capabilities (disable with K0S_ENTRYPOINT_REMOUNT_CGROUP2FS=0)" >&2
    fi
    ;;
  esac

  mount --make-rslave / # don't propagate mount events to other namespaces
  mount -o remount,rw /sys/fs/cgroup

  [ "${K0S_ENTRYPOINT_QUIET-}" = '1' ] || echo "$0: remounted /sys/fs/cgroup" >&2
}

# Prints the cgroup v2 path of a process.
#
# Arguments:
#   $1 - cgroup membership file to parse (optional, defaults to /proc/self/cgroup)
# Outputs:
#   The path part of the first "0::/..." entry.
# Returns:
#   0 on success, 1 if there is no cgroup v2 entry.
get_process_cgroupv2() {
  local entry
  while read -r entry; do
    case "$entry" in
    # The entry for cgroup v2 is always in the format "0::$PATH"
    # https://www.kernel.org/doc/html/v5.16/admin-guide/cgroup-v2.html#processes
    0::/*)
      echo "${entry#0::}"
      return 0
      ;;
    esac
  done <"${1:-/proc/self/cgroup}"
  return 1
}

# Enables all available controllers for the root cgroup.
#
# Globals:
#   K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING (read): 0 disables, 1 forces,
#     anything else selects automatic detection
#   K0S_ENTRYPOINT_QUIET (read): 1 suppresses the status message
# Returns:
#   0 if nothing had to be done or the controllers were enabled,
#   non-zero if the process cgroup or its controllers could not be determined
#   or a cgroup file could not be written.
enable_cgroupv2_nesting() {
  local force
  case "${K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING-}" in
  0) return 0 ;; # disabled
  1) force=1 ;;  # enabled
  *) force=0 ;;  # auto detect
  esac

  # Nothing to do without cgroup v2. Return success explicitly: a bare `return`
  # would pass on the status of the failed check and abort the whole script
  # under `set -e`.
  if [ "$force" != 1 ] && ! is_cgroupv2; then
    return 0
  fi

  local cg
  cg="$(get_process_cgroupv2)" || {
    echo "$0: failed to determine process cgroup (disable with K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING=0)" >&2
    return 1
  }
  local cg_path=/sys/fs/cgroup"$cg"

  local all_controllers
  read -r all_controllers <"$cg_path"/cgroup.controllers || {
    echo "$0: failed to load available controllers for cgroup $cg (disable with K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING=0)" >&2
    return 1
  }

  if [ "$force" != 1 ]; then
    # Initialized, so that a failing read can't leave it unset under `set -u`.
    local enabled_controllers=''
    read -r enabled_controllers <"$cg_path"/cgroup.subtree_control || true # file may be empty

    # Everything is enabled already: that's a success, not a failure.
    [ "$all_controllers" != "$enabled_controllers" ] || return 0

    is_cgroupfs_rw || {
      echo "$0: won't enable all available cgroup controllers for cgroup $cg as the cgroup2 file system is read-only (disable with K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING=0)" >&2
      return 0
    }
  fi

  # move all processes out of the root cgroup, otherwise the controllers can't be enabled
  if [ "$cg" = / ]; then
    mkdir -p /sys/fs/cgroup/entrypoint.scope
    local pid
    while read -r pid; do
      echo "$pid" >/sys/fs/cgroup/entrypoint.scope/cgroup.procs
    done </sys/fs/cgroup/cgroup.procs
  fi

  # enable all available controllers
  # (the positional parameters of this function serve as the list of "+name" words)
  set --
  local controller
  for controller in $all_controllers; do set -- "$@" +"$controller"; done
  # NOTE(review): the controllers are read from $cg_path, but enabled in the
  # root of the cgroup2 mount. Both are the same if $cg is "/" - confirm that
  # this is intended for any other process cgroup.
  echo "$@" >/sys/fs/cgroup/cgroup.subtree_control
  [ "${K0S_ENTRYPOINT_QUIET-}" = '1' ] || {
    echo "$0: enabled all available controllers for cgroup $cg: $all_controllers" >&2
  }
}

# DNS fixup adapted from kind.
#
# Rewrites Docker's embedded DNS iptables rules and /etc/resolv.conf so that
# the Docker host IP (or the default gateway) is used instead of 127.0.0.11.
#
# Globals:
#   K0S_ENTRYPOINT_DNS_FIXUP (read): 0 disables, 1 forces, anything else
#     selects automatic detection based on CAP_NET_ADMIN
#   K0S_ENTRYPOINT_QUIET (read): 1 suppresses the status message
# Returns:
#   0 if the fixes were skipped or applied; a failing command aborts the
#   script when `set -e` is in effect.
dns_fixup() {
  case "${K0S_ENTRYPOINT_DNS_FIXUP-}" in
  0) return 0 ;; # disabled
  1) ;;          # enabled
  *)             # auto detect
    local cap_status=0
    has_cap_net_admin || cap_status=$?
    if [ "$cap_status" -eq 1 ]; then
      echo "$0: won't apply DNS fixes without CAP_NET_ADMIN (disable with K0S_ENTRYPOINT_DNS_FIXUP=0)" >&2
      return 0
    elif [ "$cap_status" -ne 0 ]; then
      # The capabilities are unknown: report that, but try the fixes anyway.
      echo "$0: failed to determine capabilities (disable with K0S_ENTRYPOINT_DNS_FIXUP=0)" >&2
    fi
    ;;
  esac

  # SPDX-SnippetBegin
  # SPDX-License-Identifier: Apache-2.0
  # SPDX-SnippetCopyrightText: 2019 The Kubernetes Authors.
  # SPDX-SnippetCopyrightText: 2020 the k0s authors
  # SPDX-SnippetName: Modified parts of the enable_network_magic function from kind's entrypoint script
  # SPDX-SnippetComment: https://github.com/kubernetes-sigs/kind/blob/7568bf728147c1253e651f25edfd0e0a75534b8a/images/base/files/usr/local/bin/entrypoint#L447-L487

  local docker_embedded_dns_ip docker_host_ip iptables

  # well-known docker embedded DNS is at 127.0.0.11:53
  docker_embedded_dns_ip=127.0.0.11

  # first we need to detect an IP to use for reaching the docker host
  docker_host_ip=$(timeout 5 getent ahostsv4 host.docker.internal | head -n1 | cut -d' ' -f1 || true)
  # if the ip doesn't exist or is a loopback address use the default gateway
  case "$docker_host_ip" in
  '' | 127.*) docker_host_ip=$(ip -4 route show default | cut -d' ' -f3) ;;
  esac

  for iptables in iptables iptables-nft; do
    # patch docker's iptables rules to switch out the DNS IP; the sed
    # expressions, in order:
    #  1. switch docker DNS DNAT rules to our chosen IP
    #  2. we need to also apply these rules to non-local traffic (from pods)
    #  3. switch docker DNS SNAT rules to our chosen IP
    #  4. nftables incompatibility between 1.8.8 and 1.8.7 omit the --dport
    #     flag on DNAT rules; ensure --dport on DNS rules, due to
    #     https://github.com/kubernetes-sigs/kind/issues/3054
    "$iptables"-save \
      | sed \
        -e "s/-d ${docker_embedded_dns_ip}/-d ${docker_host_ip}/g" \
        -e 's/-A OUTPUT \(.*\) -j DOCKER_OUTPUT/\0\n-A PREROUTING \1 -j DOCKER_OUTPUT/' \
        -e "s/--to-source :53/--to-source ${docker_host_ip}:53/g" \
        -e "s/p -j DNAT --to-destination ${docker_embedded_dns_ip}/p --dport 53 -j DNAT --to-destination ${docker_embedded_dns_ip}/g" \
      | "$iptables"-restore
  done

  # now we can ensure that DNS is configured to use our IP
  cp /etc/resolv.conf /etc/resolv.conf.original
  sed -e "s/${docker_embedded_dns_ip}/${docker_host_ip}/g" /etc/resolv.conf.original >/etc/resolv.conf

  # SPDX-SnippetEnd

  # Like every other status message, this one honors K0S_ENTRYPOINT_QUIET.
  [ "${K0S_ENTRYPOINT_QUIET-}" = '1' ] || {
    echo "$0: applied DNS fixes ($docker_embedded_dns_ip -> $docker_host_ip)" >&2
  }
}

# Writes the k0s config from the environment variable to the config file.
#
# Globals:
#   K0S_CONFIG (read): the configuration; nothing is written if it is unset
#     or empty
# Returns:
#   0 if there was nothing to write or /etc/k0s/config.yaml was written.
write_k0s_config() {
  [ -n "${K0S_CONFIG-}" ] || return 0

  mkdir -p /etc/k0s
  # %s writes the value verbatim, without a trailing newline.
  printf %s "$K0S_CONFIG" >/etc/k0s/config.yaml
}

# Determines the k0s role from the given command line.
#
# Globals:
#   K0S_ENTRYPOINT_ROLE (read): printed as is if set and non-empty
# Arguments:
#   the command line that is about to be executed
# Outputs:
#   'worker', 'controller' or 'controller+worker'; nothing if the command line
#   is not a k0s worker or controller invocation.
# Returns:
#   0, whether or not a role was detected.
k0s_role() {
  if [ -n "${K0S_ENTRYPOINT_ROLE-}" ]; then
    echo "$K0S_ENTRYPOINT_ROLE"
    return 0
  fi

  # scan cmdline if k0s is the executable
  [ "$(basename -- "${1-}")" = k0s ] || return 0
  shift

  while [ $# -gt 0 ]; do
    case "$1" in
    -*) shift ;; # skip all flags before first command
    worker)      # a worker is a worker
      echo worker
      return 0
      ;;
    controller) # examine controller flags
      shift
      while [ $# -gt 0 ]; do
        case "$1" in
        --single | --enable-worker)
          echo controller+worker
          return 0
          ;;
        --single=* | --enable-worker=*)
          # boolean flags may carry an explicit value, e.g. --single=true
          case "${1#*=}" in
          1 | t | T | true | TRUE | True)
            echo controller+worker
            return 0
            ;;
          esac
          ;;
        esac
        shift
      done

      echo controller
      return 0
      ;;
    *) return 0 ;; # some other command
    esac
  done
}

# Prepares the container environment and then replaces this shell with the
# given command.
#
# Arguments:
#   the command line to execute
# Globals:
#   K0S_ENTRYPOINT_REMOUNT_CGROUP2FS, K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING,
#   K0S_ENTRYPOINT_DNS_FIXUP (written): set to 0 if unset or empty, unless the
#     role is 'worker' or 'controller+worker'
#   K0S_ENTRYPOINT_QUIET (read): 1 suppresses the status message
main() {
  case "$(k0s_role "$@")" in
  worker | controller+worker)
    # Don't disable anything.
    ;;

  *)
    # Plain controllers and everything else:
    # Disable everything that's only required when running nested containers.
    : "${K0S_ENTRYPOINT_REMOUNT_CGROUP2FS:=0}"
    : "${K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING:=0}"
    : "${K0S_ENTRYPOINT_DNS_FIXUP:=0}"
    ;;
  esac

  remount_cgroup2fs
  enable_cgroupv2_nesting
  dns_fixup
  write_k0s_config

  [ "${K0S_ENTRYPOINT_QUIET-}" = '1' ] || echo "$0: executing ${1-}" >&2

  # Don't pass the entry point's own settings on to the executed command.
  exec env \
    -u K0S_ENTRYPOINT_QUIET \
    -u K0S_ENTRYPOINT_ROLE \
    -u K0S_ENTRYPOINT_REMOUNT_CGROUP2FS \
    -u K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING \
    -u K0S_ENTRYPOINT_DNS_FIXUP \
    -u K0S_CONFIG \
    -- "$@"
}

# Script entry point: print the help text if "help", "-h" or "--help" is the
# one and only argument, otherwise hand all arguments over to main.
case "$*" in
help | -h | --help) usage && exit 0 ;;
*) main "$@" ;;
esac