Skip to content

Commit 18d3545

Browse files
committed
Overhaul container entrypoint script and Docker docs
Add some extra smarts to the k0s container entrypoint script. By adding cgroup setup capabilities, it's no longer necessary to use the host's cgroups when running k0s in a container. Track down and document the necessary security-related container flags, so there's a slightly more audited alternative to just using --privileged. Re-add the /run directory as tmpfs, as the data in it is not meant to be persistent and could potentially cause problems when containers are restarted. Signed-off-by: Tom Wieczorek <[email protected]>
1 parent c4feec0 commit 18d3545

File tree

4 files changed

+518
-99
lines changed

4 files changed

+518
-99
lines changed

Dockerfile

+2
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,6 @@ ADD ./k0s-${TARGETARCH}/k0s /usr/local/bin/k0s
1515

1616
ENTRYPOINT ["/sbin/tini", "--", "/entrypoint.sh" ]
1717

18+
# Start CMD
1819
CMD ["k0s", "controller", "--enable-worker"]
20+
# End CMD

docker-entrypoint.sh

+321-38
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,327 @@
11
#!/bin/sh
2+
# SPDX-FileCopyrightText: 2020, 2024 k0s authors
3+
# SPDX-License-Identifier: Apache-2.0
4+
#shellcheck disable=SC3040,SC3043,SC3052
25

3-
set -eu
6+
set -euo pipefail
47

5-
# DNS fixup adapted from kind
6-
# https://github.com/kubernetes-sigs/kind/blob/7568bf728147c1253e651f25edfd0e0a75534b8a/images/base/files/usr/local/bin/entrypoint#L447-L487
8+
# Prints the help text of this entry point script to standard output.
usage() {
  # Only the first line depends on the script name. Everything else is
  # literal text, hence the quoted here-document delimiter.
  printf 'Usage: %s ARGS...\n' "$0"
  cat <<'EOF'

The container entry point script for k0s. It sets the stage for k0s to work in
a containerized environment. This includes possible cgroup and iptables rule
customizations.

Arguments:
help, -h, --help Print this help message and exit

Environment variables:
K0S_CONFIG
Optional configuration for k0s, written to /etc/k0s/config.yaml if set.

K0S_ENTRYPOINT_REMOUNT_CGROUP2FS
Set to 1 to force remounting of the cgroup2 filesystem in read-write mode.
Set to 0 to disable remounting.
The default is automatic detection.

K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING
Set to 1 to always enable all available cgroup controllers for the root cgroup.
Set to 0 to disable cgroup nesting.
The default is automatic detection.

K0S_ENTRYPOINT_DNS_FIXUP
Set to 1 to apply DNS fixes required for the Docker embedded DNS setup.
Set to 0 to disable.
Default is automatic detection.

K0S_ENTRYPOINT_ROLE
Specifies the role for k0s. Can be 'worker', 'controller', or 'controller+worker'.
Default is to autodetect the role from the arguments.
Depending on the role, some of the above features will be disabled by default.

K0S_ENTRYPOINT_QUIET
Set to 1 to suppress printing status messages.

EOF
}
48+
49+
# Prints the effective capability set of this process as a decimal number.
# The value is taken from the CapEff line of /proc/self/status, where it is
# given in hexadecimal. Returns 1 if there is no such line.
get_effective_capabilities() {
  local field hex_mask
  while read -r field hex_mask; do
    [ "$field" = CapEff: ] || continue
    # Base 16 arithmetic expansion converts the hexadecimal mask to decimal.
    echo $((16#$hex_mask))
    return
  done </proc/self/status
  return 1
}
60+
61+
# Tests whether a capability is part of the effective capability set of this
# process.
# Arguments: $1 - bit number of the capability
# Returns:   0 if the bit is set, 1 if it is not set,
#            2 if the capability set could not be determined
has_effective_capability() {
  local effective mask
  effective=$(get_effective_capabilities) || return 2
  mask=$((1 << $1))
  [ $((effective & mask)) -ne 0 ]
}
67+
68+
# Tells whether CAP_NET_ADMIN is in the effective capability set of this
# process. The exit status is that of has_effective_capability.
has_cap_net_admin() {
  # Bit number of CAP_NET_ADMIN, see
  # https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/include/uapi/linux/capability.h?h=v3.10#n188
  local cap_net_admin=12
  has_effective_capability "$cap_net_admin"
}
73+
74+
# Tells whether CAP_SYS_ADMIN is in the effective capability set of this
# process. The exit status is that of has_effective_capability.
has_cap_sys_admin() {
  # Bit number of CAP_SYS_ADMIN, see
  # https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/include/uapi/linux/capability.h?h=v3.10#n263
  local cap_sys_admin=21
  has_effective_capability "$cap_sys_admin"
}
79+
80+
# Succeeds if the file system mounted at /sys/fs/cgroup, the well-known
# cgroup2 path, reports the cgroup2 file system type.
is_cgroupv2() {
  # 63677270 is the magic number of the cgroup2 fs in hexadecimal, as printed
  # by stat's %t format:
  # https://www.kernel.org/doc/html/v5.16/admin-guide/cgroup-v2.html#mounting
  case "$(stat -f -c %t /sys/fs/cgroup)" in
    63677270) return 0 ;;
  esac
  return 1
}
86+
87+
# Succeeds if /proc/mounts lists the file system mounted at /sys/fs/cgroup,
# the well-known cgroup2 path, with the rw mount option.
is_cgroupfs_rw() {
  local _source target _type options _remainder
  while read -r _source target _type options _remainder; do
    [ "$target" = /sys/fs/cgroup ] || continue
    # Only the first entry for the mount point is taken into account.
    case "$options" in
      rw* | *,rw | *,rw,*) return 0 ;;
      *) return 1 ;;
    esac
  done </proc/mounts
  return 1
}
100+
101+
# Remounts the cgroup2 file system at /sys/fs/cgroup in read-write mode.
#
# Controlled by K0S_ENTRYPOINT_REMOUNT_CGROUP2FS: 0 disables this step and 1
# forces it. Any other value enables auto detection, which only remounts a
# cgroup2 file system that is not yet read-write, and refuses to do so if the
# process is known to lack CAP_SYS_ADMIN.
remount_cgroup2fs() {
  local setting="${K0S_ENTRYPOINT_REMOUNT_CGROUP2FS-}"

  [ "$setting" != 0 ] || return 0 # disabled

  if [ "$setting" != 1 ]; then # auto detect
    is_cgroupv2 || return 0
    ! is_cgroupfs_rw || return 0

    local cap_status=0
    has_cap_sys_admin || cap_status=$?
    if [ "$cap_status" -eq 1 ]; then
      echo "$0: won't remount /sys/fs/cgroup without CAP_SYS_ADMIN (disable with K0S_ENTRYPOINT_REMOUNT_CGROUP2FS=0)" >&2
      return
    elif [ "$cap_status" -ne 0 ]; then
      # The capabilities are unknown, so the remount is attempted anyway.
      echo "$0: failed to determine capabilities (disable with K0S_ENTRYPOINT_REMOUNT_CGROUP2FS=0)" >&2
    fi
  fi

  mount --make-rslave / # don't propagate mount events to other namespaces
  mount -o remount,rw /sys/fs/cgroup

  if [ "${K0S_ENTRYPOINT_QUIET-}" != '1' ]; then
    echo "$0: remounted /sys/fs/cgroup" >&2
  fi
}
126+
127+
# Prints the path of the cgroup v2 this process belongs to, as listed in
# /proc/self/cgroup. Returns 1 if there is no cgroup v2 entry.
get_process_cgroupv2() {
  local entry
  while read -r entry; do
    # The entry for cgroup v2 is always in the format "0::$PATH"
    # https://www.kernel.org/doc/html/v5.16/admin-guide/cgroup-v2.html#processes
    if [ "${entry#0::/}" != "$entry" ]; then
      echo "${entry#0::}" && return 0
    fi
  done </proc/self/cgroup
  return 1
}
139+
140+
# Enables all available controllers for the root cgroup.
#
# Controlled by K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING: 0 disables this step
# and 1 forces it. Any other value enables auto detection, in which case
# nothing is done if /sys/fs/cgroup is not a cgroup2 file system, if the
# controllers enabled for the process cgroup already equal the available ones,
# or if the cgroup2 file system is read-only.
#
# Returns 0 on success and whenever there is nothing to do, non-zero if the
# process cgroup or its available controllers can't be determined.
enable_cgroupv2_nesting() {
  local force
  case "${K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING-}" in
    0) return 0 ;; # disabled
    1) force=1 ;;  # enabled
    *) force=0 ;;  # auto detect
  esac

  # The exit status has to be explicit here: a bare `return` would pass on the
  # failure status of the preceding check, which aborts the whole script when
  # it is running with `set -e`.
  [ "$force" = 1 ] || is_cgroupv2 || return 0

  local cg
  cg="$(get_process_cgroupv2)" || {
    echo "$0: failed to determine process cgroup (disable with K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING=0)" >&2
    return 1
  }
  local cg_path=/sys/fs/cgroup"$cg"

  local all_controllers
  read -r all_controllers <"$cg_path"/cgroup.controllers || {
    echo "$0: failed to load available controllers for cgroup $cg (disable with K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING=0)" >&2
    return 1
  }

  if [ "$force" != 1 ]; then
    # Initialized so that the comparison below also works with `set -u` when
    # the file can't be opened at all.
    local enabled_controllers=''
    read -r enabled_controllers <"$cg_path"/cgroup.subtree_control || true # file may be empty
    # Nothing to do if everything is enabled already. This is not an error.
    [ "$all_controllers" != "$enabled_controllers" ] || return 0

    is_cgroupfs_rw || {
      echo "$0: won't enable all available cgroup controllers for cgroup $cg as the cgroup2 file system is read-only (disable with K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING=0)" >&2
      return 0
    }
  fi

  # move all processes out of the root cgroup, otherwise the controllers can't be enabled
  if [ "$cg" = / ]; then
    mkdir -p /sys/fs/cgroup/entrypoint.scope
    local pid
    while read -r pid; do
      echo "$pid" >/sys/fs/cgroup/entrypoint.scope/cgroup.procs
    done </sys/fs/cgroup/cgroup.procs
  fi

  # enable all available controllers
  # NOTE(review): the available controllers are read from the process cgroup,
  # but they are enabled for /sys/fs/cgroup itself. Presumably the two are
  # expected to be the same directory here; confirm for non-root cgroups.
  set --
  local controller
  for controller in $all_controllers; do set -- "$@" +"$controller"; done
  echo "$@" >/sys/fs/cgroup/cgroup.subtree_control
  [ "${K0S_ENTRYPOINT_QUIET-}" = '1' ] || {
    echo "$0: enabled all available controllers for cgroup $cg: $all_controllers" >&2
  }
}
192+
193+
# DNS fixup adapted from kind.
#
# Replaces the address of Docker's embedded DNS (127.0.0.11) with the address
# of the Docker host in the iptables rules and in /etc/resolv.conf. The
# original resolver configuration is kept as /etc/resolv.conf.original.
#
# Controlled by K0S_ENTRYPOINT_DNS_FIXUP: 0 disables this step and 1 forces
# it. Any other value enables auto detection, which skips the fixes if the
# process is known to lack CAP_NET_ADMIN.
dns_fixup() {
  case "${K0S_ENTRYPOINT_DNS_FIXUP-}" in
    0) return 0 ;; # disabled
    1) ;;          # enabled
    *)             # auto detect
      has_cap_net_admin || {
        if [ $? -eq 1 ]; then
          echo "$0: won't apply DNS fixes without CAP_NET_ADMIN (disable with K0S_ENTRYPOINT_DNS_FIXUP=0)" >&2
          return 0
        fi

        # The capabilities are unknown, so the fixes are attempted anyway.
        echo "$0: failed to determine capabilities (disable with K0S_ENTRYPOINT_DNS_FIXUP=0)" >&2
      }
      ;;
  esac

  # SPDX-SnippetBegin
  # SPDX-License-Identifier: Apache-2.0
  # SPDX-SnippetCopyrightText: 2019 The Kubernetes Authors.
  # SPDX-SnippetCopyrightText: 2020 the k0s authors
  # SPDX-SnippetName: Modified parts of the enable_network_magic function from kind's entrypoint script
  # SPDX-SnippetComment: https://github.com/kubernetes-sigs/kind/blob/7568bf728147c1253e651f25edfd0e0a75534b8a/images/base/files/usr/local/bin/entrypoint#L447-L487

  local docker_embedded_dns_ip docker_host_ip iptables

  # well-known docker embedded DNS is at 127.0.0.11:53
  docker_embedded_dns_ip=127.0.0.11

  # first we need to detect an IP to use for reaching the docker host
  docker_host_ip=$(timeout 5 getent ahostsv4 host.docker.internal | head -n1 | cut -d' ' -f1 || true)
  # if the ip doesn't exist or is a loopback address use the default gateway
  case "$docker_host_ip" in
    '' | 127.*) docker_host_ip=$(ip -4 route show default | cut -d' ' -f3) ;;
  esac

  # Without an address, the substitutions below would insert an empty string
  # into the iptables rules and into resolv.conf, so leave everything as is.
  if [ -z "$docker_host_ip" ]; then
    echo "$0: won't apply DNS fixes as the IP address of the Docker host couldn't be determined (disable with K0S_ENTRYPOINT_DNS_FIXUP=0)" >&2
    return 0
  fi

  for iptables in iptables iptables-nft; do
    # patch docker's iptables rules to switch out the DNS IP, in this order:
    # 1. switch docker DNS DNAT rules to our chosen IP
    # 2. we need to also apply these rules to non-local traffic (from pods)
    # 3. switch docker DNS SNAT rules to our chosen IP
    # 4. nftables incompatibility between 1.8.8 and 1.8.7 omit the --dport
    #    flag on DNAT rules; ensure --dport on DNS rules, due to
    #    https://github.com/kubernetes-sigs/kind/issues/3054
    "$iptables"-save \
      | sed \
        -e "s/-d ${docker_embedded_dns_ip}/-d ${docker_host_ip}/g" \
        -e 's/-A OUTPUT \(.*\) -j DOCKER_OUTPUT/\0\n-A PREROUTING \1 -j DOCKER_OUTPUT/' \
        -e "s/--to-source :53/--to-source ${docker_host_ip}:53/g" \
        -e "s/p -j DNAT --to-destination ${docker_embedded_dns_ip}/p --dport 53 -j DNAT --to-destination ${docker_embedded_dns_ip}/g" \
      | "$iptables"-restore
  done

  # now we can ensure that DNS is configured to use our IP
  cp /etc/resolv.conf /etc/resolv.conf.original
  sed -e "s/${docker_embedded_dns_ip}/${docker_host_ip}/g" /etc/resolv.conf.original >/etc/resolv.conf

  # SPDX-SnippetEnd

  # Status messages are suppressed in quiet mode.
  [ "${K0S_ENTRYPOINT_QUIET-}" = '1' ] || {
    echo "$0: applied DNS fixes ($docker_embedded_dns_ip -> $docker_host_ip)" >&2
  }
}
252+
253+
# Stores the k0s configuration given in the K0S_CONFIG environment variable
# in /etc/k0s/config.yaml. Does nothing if K0S_CONFIG is unset or empty.
write_k0s_config() {
  [ -n "${K0S_CONFIG-}" ] || return 0

  mkdir -p /etc/k0s
  printf %s "$K0S_CONFIG" >/etc/k0s/config.yaml
}
260+
261+
# Determines the k0s role from the given command line.
#
# Prints 'worker', 'controller' or 'controller+worker' to standard output, or
# nothing if the command line is not a k0s worker or controller invocation.
# A non-empty K0S_ENTRYPOINT_ROLE takes precedence and is printed as is.
#
# Arguments: the command line to be executed, starting with the executable
# Returns:   non-zero if the executable is not k0s, 0 otherwise
k0s_role() {
  [ -z "${K0S_ENTRYPOINT_ROLE-}" ] || {
    echo "$K0S_ENTRYPOINT_ROLE"
    return
  }

  # scan cmdline if k0s is the executable
  [ "$(basename -- "${1-}")" = k0s ] || return
  shift

  while [ $# -gt 0 ]; do
    case "$1" in
      -*) shift ;;                     # skip all flags before first command
      worker) echo worker && return ;; # a worker is a worker
      controller)                      # examine controller flags
        shift
        while [ $# -gt 0 ]; do
          case "$1" in
            --single | --enable-worker) echo controller+worker && return ;;
            --single=* | --enable-worker=*)
              # Boolean flags may carry an explicit value. The accepted
              # spellings of "true" are those of Go's strconv.ParseBool.
              case "${1#*=}" in
                1 | t | T | true | TRUE | True) echo controller+worker && return ;;
              esac
              ;;
          esac
          shift
        done

        echo controller
        return
        ;;
      *) return ;; # some other command
    esac
  done
}
292+
293+
# Prepares the container environment and then replaces this script with the
# given command line.
#
# The preparation steps are, in this order: remount_cgroup2fs,
# enable_cgroupv2_nesting, dns_fixup and write_k0s_config. The first three are
# disabled by default unless the detected role is 'worker' or
# 'controller+worker'. The environment variables consumed by this script are
# removed from the environment of the executed command.
#
# Arguments: the command line to be executed
main() {
  local role
  role=$(k0s_role "$@") || : # an unknown role is treated like a controller

  if [ "$role" != worker ] && [ "$role" != controller+worker ]; then
    # Disable everything that's only required when running nested containers.
    # Values that have been set explicitly are left alone.
    K0S_ENTRYPOINT_REMOUNT_CGROUP2FS="${K0S_ENTRYPOINT_REMOUNT_CGROUP2FS:-0}"
    K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING="${K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING:-0}"
    K0S_ENTRYPOINT_DNS_FIXUP="${K0S_ENTRYPOINT_DNS_FIXUP:-0}"
  fi

  remount_cgroup2fs
  enable_cgroupv2_nesting
  dns_fixup
  write_k0s_config

  if [ "${K0S_ENTRYPOINT_QUIET-}" != '1' ]; then
    echo "$0: executing ${1-}" >&2
  fi

  # Prepend the options for env to the command line.
  set -- \
    -u K0S_ENTRYPOINT_QUIET \
    -u K0S_ENTRYPOINT_ROLE \
    -u K0S_ENTRYPOINT_REMOUNT_CGROUP2FS \
    -u K0S_ENTRYPOINT_ENABLE_CGROUPV2_NESTING \
    -u K0S_ENTRYPOINT_DNS_FIXUP \
    -u K0S_CONFIG \
    -- "$@"
  exec env "$@"
}
323+
324+
# Script entry point: print the help text if the whole command line is a
# single help request, otherwise set up the container and run the command.
if [ "$*" = help ] || [ "$*" = -h ] || [ "$*" = --help ]; then
  usage && exit 0
else
  main "$@"
fi

0 commit comments

Comments
 (0)