|
7 | 7 | package syscall
|
8 | 8 |
|
9 | 9 | import (
|
| 10 | + errpkg "errors" |
10 | 11 | "internal/itoa"
|
11 | 12 | "runtime"
|
12 | 13 | "unsafe"
|
@@ -328,6 +329,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
|
328 | 329 | if clone3 != nil {
|
329 | 330 | pid, err1 = rawVforkSyscall(_SYS_clone3, uintptr(unsafe.Pointer(clone3)), unsafe.Sizeof(*clone3), 0)
|
330 | 331 | } else {
|
| 332 | + // N.B. Keep in sync with doCheckClonePidfd. |
331 | 333 | flags |= uintptr(SIGCHLD)
|
332 | 334 | if runtime.GOARCH == "s390x" {
|
333 | 335 | // On Linux/s390, the first two arguments of clone(2) are swapped.
|
@@ -743,3 +745,82 @@ func forkAndExecFailureCleanup(attr *ProcAttr, sys *SysProcAttr) {
|
743 | 745 | *sys.PidFD = -1
|
744 | 746 | }
|
745 | 747 | }
|
| 748 | + |
| 749 | +// os_checkClonePidfd, linked into package os as checkClonePidfd, verifies that clone(CLONE_PIDFD) works by actually doing a clone (via doCheckClonePidfd) and then reaping the child. |
| 750 | +// It returns nil if a pidfd was provided and the child was reaped through it; otherwise it returns the clone or wait error, or a descriptive error if clone succeeded without providing a pidfd. |
| 751 | +// |
| 752 | +//go:linkname os_checkClonePidfd os.checkClonePidfd |
| 753 | +func os_checkClonePidfd() error { |
| 754 | + pidfd := int32(-1) |
| 755 | + pid, errno := doCheckClonePidfd(&pidfd) |
| 756 | + if errno != 0 { |
| 757 | + return errno |
| 758 | + } |
| 759 | + |
| 760 | + if pidfd == -1 { |
| 761 | + // Bad: CLONE_PIDFD failed to provide a pidfd (pidfd still holds its initial -1). Reap the process by pid |
| 762 | + // before returning, retrying Wait4 for as long as it reports EINTR; a Wait4 failure takes precedence over the pidfd error. |
| 763 | + |
| 764 | + var err error |
| 765 | + for { |
| 766 | + var status WaitStatus |
| 767 | + _, err = Wait4(int(pid), &status, 0, nil) |
| 768 | + if err != EINTR { |
| 769 | + break |
| 770 | + } |
| 771 | + } |
| 772 | + if err != nil { |
| 773 | + return err |
| 774 | + } |
| 775 | + |
| 776 | + return errpkg.New("clone(CLONE_PIDFD) failed to return pidfd") |
| 777 | + } |
| 778 | + |
| 779 | + // Good: CLONE_PIDFD provided a pidfd. Reap the process with waitid on the pidfd (id type _P_PIDFD, option WEXITED), |
| 780 | + // retrying on EINTR, and close the pidfd when this function returns. |
| 781 | + defer Close(int(pidfd)) |
| 782 | + |
| 783 | + for { |
| 784 | + const _P_PIDFD = 3 |
| 785 | + _, _, errno = Syscall6(SYS_WAITID, _P_PIDFD, uintptr(pidfd), 0, WEXITED, 0, 0) |
| 786 | + if errno != EINTR { |
| 787 | + break |
| 788 | + } |
| 789 | + } |
| 790 | + if errno != 0 { |
| 791 | + return errno |
| 792 | + } |
| 793 | + |
| 794 | + return nil |
| 795 | +} |
| 796 | + |
| 797 | +// doCheckClonePidfd implements the actual clone call of os_checkClonePidfd and the child's execution: it clones with CLONE_VFORK|CLONE_VM|CLONE_PIDFD|SIGCHLD, passing pidfd as the location where the pidfd is expected to be stored. |
| 798 | +// In the parent (or on failure) it returns the pid and errno reported by rawVforkSyscall; in the child (pid == 0, errno == 0) it never returns and instead loops on the exit system call with status 0. |
| 799 | +// This is a separate function so we can separate the child's and parent's stack frames if we're using vfork. |
| 800 | +// |
| 801 | +// This is go:noinline because the point is to keep the stack frames of this |
| 802 | +// and os_checkClonePidfd separate. |
| 803 | +// |
| 804 | +//go:noinline |
| 805 | +func doCheckClonePidfd(pidfd *int32) (pid uintptr, errno Errno) { |
| 806 | + flags := uintptr(CLONE_VFORK|CLONE_VM|CLONE_PIDFD|SIGCHLD) |
| 807 | + if runtime.GOARCH == "s390x" { |
| 808 | + // On Linux/s390, the first two arguments of clone(2) are swapped. |
| 809 | + pid, errno = rawVforkSyscall(SYS_CLONE, 0, flags, uintptr(unsafe.Pointer(pidfd))) |
| 810 | + } else { |
| 811 | + pid, errno = rawVforkSyscall(SYS_CLONE, flags, 0, uintptr(unsafe.Pointer(pidfd))) |
| 812 | + } |
| 813 | + if errno != 0 || pid != 0 { |
| 814 | + // If we're in the parent, we must return immediately |
| 815 | + // so we're not in the same stack frame as the child. |
| 816 | + // This can at most use the return PC, which the child |
| 817 | + // will not modify, and the results of |
| 818 | + // rawVforkSyscall, which must have been written after |
| 819 | + // the child was replaced. |
| 820 | + return |
| 821 | + } |
| 822 | + |
| 823 | + for { |
| 824 | + RawSyscall(SYS_EXIT, 0, 0, 0) |
| 825 | + } |
| 826 | +} |
0 commit comments