|
7 | 7 | package syscall
|
8 | 8 |
|
9 | 9 | import (
|
| 10 | + errpkg "errors" |
10 | 11 | "internal/itoa"
|
11 | 12 | "runtime"
|
12 | 13 | "unsafe"
|
@@ -330,6 +331,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
|
330 | 331 | if clone3 != nil {
|
331 | 332 | pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
|
332 | 333 | } else {
|
| 334 | + // N.B. Keep in sync with doCheckClonePidfd. |
333 | 335 | flags |= uintptr(SIGCHLD)
|
334 | 336 | if runtime.GOARCH == "s390x" {
|
335 | 337 | // On Linux/s390, the first two arguments of clone(2) are swapped.
|
@@ -758,3 +760,82 @@ func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
|
758 | 760 | *sys.PidFD = -1
|
759 | 761 | }
|
760 | 762 | }
|
| 763 | + |
| 764 | +// checkClonePidfd verifies that clone(CLONE_PIDFD) works by actually doing a |
| 765 | +// clone. |
| 766 | +// |
| 767 | +//go:linkname os_checkClonePidfd os.checkClonePidfd |
| 768 | +func os_checkClonePidfd() error { |
| 769 | + pidfd := int32(-1) |
| 770 | + pid, errno := doCheckClonePidfd(&pidfd) |
| 771 | + if errno != 0 { |
| 772 | + return errno |
| 773 | + } |
| 774 | + |
| 775 | + if pidfd == -1 { |
| 776 | + // Bad: CLONE_PIDFD failed to provide a pidfd. Reap the process |
| 777 | + // before returning. |
| 778 | + |
| 779 | + var err error |
| 780 | + for { |
| 781 | + var status WaitStatus |
| 782 | + _, err = Wait4(int(pid), &status, 0, nil) |
| 783 | + if err != EINTR { |
| 784 | + break |
| 785 | + } |
| 786 | + } |
| 787 | + if err != nil { |
| 788 | + return err |
| 789 | + } |
| 790 | + |
| 791 | + return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd") |
| 792 | + } |
| 793 | + |
| 794 | + // Good: CLONE_PIDFD provided a pidfd. Reap the process and close the |
| 795 | + // pidfd. |
| 796 | + defer Close(int(pidfd)) |
| 797 | + |
| 798 | + for { |
| 799 | + const _P_PIDFD = 3 |
| 800 | + _, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), 0, WEXITED, 0, 0) |
| 801 | + if errno != EINTR { |
| 802 | + break |
| 803 | + } |
| 804 | + } |
| 805 | + if errno != 0 { |
| 806 | + return errno |
| 807 | + } |
| 808 | + |
| 809 | + return nil |
| 810 | +} |
| 811 | + |
// doCheckClonePidfd implements the actual clone call of os_checkClonePidfd and
// child execution. This is a separate function so we can separate the child's
// and parent's stack frames if we're using vfork.
//
// pidfd is handed to the clone call as its third syscall argument. The caller
// initializes *pidfd to -1 beforehand and treats an unchanged value as
// CLONE_PIDFD having failed to provide a pidfd. (Per clone(2), that argument
// is where the kernel stores the pidfd when CLONE_PIDFD is set.)
//
// In the parent, the pid and errno results of rawVforkSyscall are returned
// as-is. The child never returns from this function: it issues SYS_EXIT with
// status 0 in a loop.
//
// This is go:noinline because the point is to keep the stack frames of this
// and os_checkClonePidfd separate.
//
//go:noinline
func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) {
	// N.B. forkAndExecInChild1 carries a note asking that its clone call
	// be kept in sync with this function.
	flags := uintptr(CLONE_VFORK|CLONE_VM|CLONE_PIDFD|SIGCHLD)
	if runtime.GOARCH == "s390x" {
		// On Linux/s390, the first two arguments of clone(2) are swapped.
		pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd)))
	} else {
		pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd)))
	}
	if errno != 0 || pid != 0 {
		// If we're in the parent, we must return immediately
		// so we're not in the same stack frame as the child.
		// This can at most use the return PC, which the child
		// will not modify, and the results of
		// rawVforkSyscall, which must have been written after
		// the child was replaced.
		return
	}

	// Only the child gets here (errno == 0 and pid == 0). It has no work
	// to do, so it exits with status 0 right away. The exit call sits in
	// a loop so that control can never fall out of this function in the
	// child.
	for {
		RawSyscall(SYS_EXIT, 0, 0, 0)
	}
}
0 commit comments