Skip to content

Commit de8145a

Browse files
committed
Add support for mounting other filesystems in user namespaces
Signed-off-by: David Leadbeater <[email protected]>
1 parent aac15ef commit de8145a

File tree

7 files changed

+232
-8
lines changed

7 files changed

+232
-8
lines changed

README.md

Lines changed: 97 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ chmod: /: Bad message
4242
```
4343

4444
## Demo on Kubernetes
45-
Before you install the demo on k8s, please ensure all [the requirements](./docs/install.md) are satisfied
45+
Before you install the demo on k8s, please ensure all [the requirements](./docs/install.md) are satisfied.
4646

4747
This demo shows that the Seccomp Agent can have different behaviour depending on the Kubernetes pod (in this case, the pod's namespace and name).
4848

@@ -82,11 +82,16 @@ apiVersion: v1
8282
kind: Pod
8383
metadata:
8484
name: mynotifypod
85-
# /var/lib/kubelet/seccomp/notify.json
85+
# For older versions of Kubernetes:
8686
annotations:
8787
seccomp.security.alpha.kubernetes.io/pod: localhost/notify.json
8888
spec:
8989
restartPolicy: Never
90+
securityContext:
91+
# /var/lib/kubelet/seccomp/notify.json
92+
seccompProfile:
93+
type: Localhost
94+
localhostProfile: notify.json
9095
containers:
9196
- name: container1
9297
image: busybox
@@ -108,3 +113,93 @@ proc on /root type proc (rw,relatime)
108113
/ # time -f %E /bin/true
109114
0m 2.00s
110115
```
116+
117+
## Combining with user namespaces
118+
119+
By combining this with Kubernetes's user namespace support it is possible to
120+
allow a user within a user namespace to perform some operations which would
121+
otherwise be limited to host root.
122+
123+
One example is mounting other filesystem types. This is most useful combined
124+
with user namespaces to allow mounting network file systems while a pod is
125+
running. This is far safer than giving the container `privileged` access but
126+
does expose more of the kernel to the pod, so you should consider the security implications
127+
carefully.
128+
129+
Configure a policy, similar to above, but with the following metadata:
130+
```json
131+
{
132+
"architectures" : [
133+
"SCMP_ARCH_X86",
134+
"SCMP_ARCH_X32"
135+
],
136+
"defaultAction" : "SCMP_ACT_ALLOW",
137+
"listenerPath": "/run/seccomp-agent.socket",
138+
"listenerMetadata": "MOUNT_OTHER_FS_LIST=cifs\nMOUNT_NEED_CAP_ADMIN=true",
139+
"syscalls" : [
140+
{
141+
"action" : "SCMP_ACT_NOTIFY",
142+
"names" : [
143+
"mount"
144+
]
145+
},
146+
{
147+
"action" : "SCMP_ACT_ALLOW",
148+
"names" : [
149+
"umount"
150+
]
151+
}
152+
]
153+
}
154+
```
155+
156+
(The policy is cut down for the sake of the example; it is recommended to use a full policy that
157+
additionally configures notify for mount and allows umount.)
158+
159+
This has currently been successfully tested with cifs. Other filesystem types
160+
should work; NFS will need NFS client utilities installed within the container
161+
*and* on the host (e.g. to make upcalls work).
162+
163+
* Deploy a pod with the seccomp policy and user namespaces:
164+
```yaml
165+
apiVersion: v1
166+
kind: Pod
167+
metadata:
168+
name: mynotifypod-userns
169+
spec:
170+
restartPolicy: Never
171+
# Needs "UserNamespacesSupport" feature gate currently
172+
hostUsers: false
173+
securityContext:
174+
# /var/lib/kubelet/seccomp/notify.json
175+
seccompProfile:
176+
type: Localhost
177+
localhostProfile: notify.json
178+
containers:
179+
- name: container1
180+
image: alpine
181+
command: ["sh"]
182+
args: ["-c", "sleep infinity"]
183+
securityContext:
184+
capabilities:
185+
# This is safe combined with hostUsers: false
186+
add: [SYS_ADMIN]
187+
```
188+
189+
* Run commands in the pod:
190+
```shell
191+
$ kubectl exec -it mynotifypod-userns -- /bin/sh
192+
/ # mkdir /mnt
193+
/ # mount -t cifs -o username=user,password=pass '//10.0.0.1/C' /mnt
194+
/ # df -h /mnt
195+
/mnt # df -h /mnt
196+
Filesystem Size Used Available Use% Mounted on
197+
//10.0.0.1/C 95.4G 85.3G 10.1G 89% /mnt
198+
/ # ls /mnt
199+
$Recycle.Bin Documents and Settings Program files
200+
[...]
201+
/ # sed -i 's!^\(nobody.*/\)false!\1sh!' /etc/passwd
202+
/ # su nobody
203+
/ $ mount -t cifs -o username=user,password=pass '//10.0.0.1/C' /mnt
204+
mount: permission denied (are you root?)
205+
```

cmd/seccompagent/seccompagent.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//go:build linux && cgo
1516
// +build linux,cgo
1617

1718
package main
@@ -121,7 +122,7 @@ func main() {
121122
// / # ls /root/self/cmdline
122123
// /root/self/cmdline
123124
allowedFilesystems := map[string]struct{}{"proc": struct{}{}}
124-
r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems)
125+
r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems, false /* do not check capabilities */)
125126

126127
// Example:
127128
// # chmod 777 /
@@ -214,8 +215,19 @@ func main() {
214215
if v, ok := metadata["MOUNT_SYSFS"]; ok && v == "true" {
215216
allowedFilesystems["sysfs"] = struct{}{}
216217
}
218+
if v, ok := metadata["MOUNT_OTHER_FS_LIST"]; ok {
219+
for _, fs := range strings.Split(v, ",") {
220+
allowedFilesystems[fs] = struct{}{}
221+
}
222+
}
223+
224+
requireCapsForMount := false
225+
if v, ok := metadata["MOUNT_NEED_CAP_ADMIN"]; ok && v == "true" {
226+
requireCapsForMount = true
227+
}
228+
217229
if len(allowedFilesystems) > 0 {
218-
r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems)
230+
r.SyscallHandler["mount"] = handlers.Mount(allowedFilesystems, requireCapsForMount)
219231
}
220232
return r
221233
}

deploy/seccompagent.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ spec:
4242
hostPID: true
4343
containers:
4444
- name: seccomp-agent
45-
image: quay.io/kinvolk/seccompagent:latest
45+
image: docker.i.d.cx/seccomp-agent-dgl:1
4646
command: [ "/bin/seccompagent", "-resolver=kubernetes", "-log=trace" ]
47-
imagePullPolicy: Always
47+
imagePullPolicy: Never
4848
env:
4949
- name: NODE_NAME
5050
valueFrom:

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ require (
6161
k8s.io/klog/v2 v2.70.1 // indirect
6262
k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 // indirect
6363
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect
64+
kernel.org/pub/linux/libs/security/libcap/cap v1.2.69 // indirect
65+
kernel.org/pub/linux/libs/security/libcap/psx v1.2.69 // indirect
6466
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
6567
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
6668
sigs.k8s.io/yaml v1.3.0 // indirect

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,10 @@ k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkI
329329
k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU=
330330
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed h1:jAne/RjBTyawwAy0utX5eqigAwz/lQhTmy+Hr/Cpue4=
331331
k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA=
332+
kernel.org/pub/linux/libs/security/libcap/cap v1.2.69 h1:N0m3tKYbkRMmDobh/47ngz+AWeV7PcfXMDi8xu3Vrag=
333+
kernel.org/pub/linux/libs/security/libcap/cap v1.2.69/go.mod h1:Tk5Ip2TuxaWGpccL7//rAsLRH6RQ/jfqTGxuN/+i/FQ=
334+
kernel.org/pub/linux/libs/security/libcap/psx v1.2.69 h1:IdrOs1ZgwGw5CI+BH6GgVVlOt+LAXoPyh7enr8lfaXs=
335+
kernel.org/pub/linux/libs/security/libcap/psx v1.2.69/go.mod h1:+l6Ee2F59XiJ2I6WR5ObpC1utCQJZ/VLsEbQCD8RG24=
332336
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k=
333337
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
334338
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE=

pkg/handlers/mount.go

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
//go:build linux && cgo
1516
// +build linux,cgo
1617

1718
package handlers
@@ -28,6 +29,7 @@ import (
2829
"github.com/kinvolk/seccompagent/pkg/nsenter"
2930
"github.com/kinvolk/seccompagent/pkg/readarg"
3031
"github.com/kinvolk/seccompagent/pkg/registry"
32+
"github.com/kinvolk/seccompagent/pkg/userns"
3133
)
3234

3335
var _ = nsenter.RegisterModule("mount", runMountInNamespaces)
@@ -37,6 +39,8 @@ type mountModuleParams struct {
3739
Source string `json:"source,omitempty"`
3840
Dest string `json:"dest,omitempty"`
3941
Filesystem string `json:"filesystem,omitempty"`
42+
Flags int64 `json:"flags,omitempty"`
43+
Options string `json:"options,omitempty"`
4044
}
4145

4246
func runMountInNamespaces(param []byte) string {
@@ -46,14 +50,14 @@ func runMountInNamespaces(param []byte) string {
4650
return fmt.Sprintf("%d", int(unix.ENOSYS))
4751
}
4852

49-
err = unix.Mount(params.Source, params.Dest, params.Filesystem, 0, "")
53+
err = unix.Mount(params.Source, params.Dest, params.Filesystem, 0, params.Options)
5054
if err != nil {
5155
return fmt.Sprintf("%d", int(err.(unix.Errno)))
5256
}
5357
return "0"
5458
}
5559

56-
func Mount(allowedFilesystems map[string]struct{}) registry.HandlerFunc {
60+
func Mount(allowedFilesystems map[string]struct{}, requireUserNamespaceAdmin bool) registry.HandlerFunc {
5761
return func(fd libseccomp.ScmpFd, req *libseccomp.ScmpNotifReq) (result registry.HandlerResult) {
5862
memFile, err := readarg.OpenMem(req.Pid)
5963
if err != nil {
@@ -96,12 +100,17 @@ func Mount(allowedFilesystems map[string]struct{}) registry.HandlerFunc {
96100
return registry.HandlerResultErrno(unix.EFAULT)
97101
}
98102

103+
// We don't handle flags, we may want to consider allowing a few.
104+
// This is here so the debug logging makes it possible to see flags used.
105+
flags := int64(req.Data.Args[3])
106+
99107
log.WithFields(log.Fields{
100108
"fd": fd,
101109
"pid": req.Pid,
102110
"source": source,
103111
"dest": dest,
104112
"filesystem": filesystem,
113+
"flags": flags,
105114
}).Debug("Mount")
106115

107116
if _, ok := allowedFilesystems[filesystem]; !ok {
@@ -110,11 +119,69 @@ func Mount(allowedFilesystems map[string]struct{}) registry.HandlerFunc {
110119
return registry.HandlerResultContinue()
111120
}
112121

122+
var options string
123+
if req.Data.Args[4] != 0/* NULL */ && filesystem != "proc" && filesystem != "sysfs" {
124+
// Get options, we assume because this is specified in
125+
// allowedFilesystems that the data argument to mount(2)
126+
// is a string so this is safe now.
127+
options, err = readarg.ReadString(memFile, int64(req.Data.Args[4]))
128+
if err != nil {
129+
log.WithFields(log.Fields{
130+
"fd": fd,
131+
"pid": req.Pid,
132+
"arg": 4,
133+
"err": err,
134+
}).Error("Cannot read argument")
135+
return registry.HandlerResultErrno(unix.EFAULT)
136+
}
137+
138+
// Log this at trace level only as it could have user credentials.
139+
log.WithFields(log.Fields{
140+
"fd": fd,
141+
"pid": req.Pid,
142+
"source": source,
143+
"dest": dest,
144+
"filesystem": filesystem,
145+
"flags": flags,
146+
"options": options,
147+
}).Trace("Handle mount")
148+
}
149+
150+
if requireUserNamespaceAdmin {
151+
ok, err := userns.IsPIDAdminCapable(req.Pid)
152+
if err != nil {
153+
log.WithFields(log.Fields{
154+
"fd": fd,
155+
"pid": req.Pid,
156+
"err": err,
157+
}).Error("Cannot check user namespace capabilities")
158+
return registry.HandlerResultErrno(unix.EFAULT)
159+
}
160+
if !ok {
161+
log.WithFields(log.Fields{
162+
"fd": fd,
163+
"pid": req.Pid,
164+
}).Info("Mount attempted without CAP_SYS_ADMIN")
165+
return registry.HandlerResultErrno(unix.EPERM)
166+
}
167+
168+
// Ensure the notification is still valid after checking user namespace capabilities.
169+
if err := libseccomp.NotifIDValid(fd, req.ID); err != nil {
170+
log.WithFields(log.Fields{
171+
"fd": fd,
172+
"req": req,
173+
"err": err,
174+
}).Debug("Notification no longer valid")
175+
return registry.HandlerResultIntr()
176+
}
177+
}
178+
113179
params := mountModuleParams{
114180
Module: "mount",
115181
Source: source,
116182
Dest: dest,
117183
Filesystem: filesystem,
184+
Options: options,
118185
}
119186

120187
mntns, err := nsenter.OpenNamespace(req.Pid, "mnt")

pkg/userns/check.go

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package userns
2+
3+
import (
4+
"fmt"
5+
6+
"golang.org/x/sys/unix"
7+
"kernel.org/pub/linux/libs/security/libcap/cap"
8+
)
9+
10+
// IsPIDAdminCapable returns true if the PID is considered an admin of a user
11+
// namespace, that is, it is either in the init user namespace or in one created
12+
// by the host root, and it has CAP_SYS_ADMIN. This protects against a less
13+
// privileged user either mounting a directory over a tree that gives them more
14+
// access (e.g. /etc/sudoers.d) or hiding files.
15+
func IsPIDAdminCapable(pid uint32) (bool, error) {
16+
// We unfortunately need to reimplement some of the kernel's user namespace logic.
17+
// Our goal is to allow a user with CAP_SYS_ADMIN inside the first user
18+
// namespace to call mount(). If the user nests a user namespace below that,
19+
// we don't want to allow that process to call mount.
20+
21+
// This is security sensitive code, however TOCTOU isn't a concern in this case
22+
// as this is designed to be used while blocked on a syscall and the kernel
23+
// does not let multi-threaded processes change their user namespace (see
24+
// setns() and unshare() docs).
25+
fd, err := unix.Open(fmt.Sprintf("/proc/%d/ns/user", pid), unix.O_RDONLY, 0)
26+
if err != nil {
27+
return false, err
28+
}
29+
defer unix.Close(fd)
30+
31+
uid, err := unix.IoctlGetInt(fd, unix.NS_GET_OWNER_UID) // owner UID of the PID's user namespace
32+
if err != nil {
33+
return false, err
34+
}
35+
if uid != 0 {
36+
return false, err // err is nil here: a namespace not owned by UID 0 is a plain "no", not a failure
37+
}
38+
set, err := cap.GetPID(int(pid))
39+
if err != nil {
40+
return false, err
41+
}
42+
43+
return set.GetFlag(cap.Effective, cap.SYS_ADMIN) // whether CAP_SYS_ADMIN is in the effective set
44+
}

0 commit comments

Comments
 (0)