package libcontainer

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"maps"
	"net"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runtime-spec/specs-go"

	"github.com/opencontainers/cgroups"
	"github.com/opencontainers/cgroups/fs2"
	"github.com/opencontainers/runc/internal/cmsg"
	"github.com/opencontainers/runc/internal/linux"
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/intelrdt"
	"github.com/opencontainers/runc/libcontainer/internal/userns"
	"github.com/opencontainers/runc/libcontainer/logs"
	"github.com/opencontainers/runc/libcontainer/system"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// parentProcess is the set of operations the rest of the package performs on
// a process that was spawned on behalf of a container.
type parentProcess interface {
	// pid returns the pid for the running process.
	pid() int

	// start starts the process execution.
	start() error

	// terminate sends a SIGKILL to the process and waits for it to exit.
	terminate() error

	// wait waits on the process returning the process state.
	wait() (*os.ProcessState, error)

	// startTime returns the process start time.
	startTime() (uint64, error)
	// signal delivers the given signal to the process.
	signal(os.Signal) error
	// externalDescriptors returns the descriptor names most recently stored
	// with setExternalDescriptors.
	externalDescriptors() []string
	// setExternalDescriptors stores the given descriptor names for later
	// retrieval through externalDescriptors.
	setExternalDescriptors(fds []string)
	// forwardChildLogs starts forwarding the child's log output and returns
	// an error channel (presumably signalling when forwarding has finished
	// -- see logs.ForwardLogs for the exact contract).
	forwardChildLogs() chan error
}

// processComm bundles the parent and child ends of every channel that is
// shared between the parent process and "runc init".
type processComm struct {
	// Used to send initial configuration to "runc init" and for "runc init" to
	// indicate that it is ready.
	initSockParent *os.File
	initSockChild  *os.File
	// Used for control messages between parent and "runc init".
	syncSockParent *syncSocket
	syncSockChild  *syncSocket
	// Used for log forwarding from "runc init" to the parent.
	logPipeParent *os.File
	logPipeChild  *os.File
}

// newProcessComm creates all channels shared with "runc init": the init
// socket pair, the sync socket pair and the log pipe (in that order).
//
// If one of the steps fails, every descriptor created by the earlier steps is
// closed again before the error is returned, so nothing leaks on failure.
func newProcessComm() (_ *processComm, retErr error) {
	initParent, initChild, err := utils.NewSockPair("init")
	if err != nil {
		return nil, fmt.Errorf("unable to create init pipe: %w", err)
	}

	syncParent, syncChild, err := newSyncSockpair("sync")
	if err != nil {
		initParent.Close()
		initChild.Close()
		return nil, fmt.Errorf("unable to create sync pipe: %w", err)
	}

	logParent, logChild, err := os.Pipe()
	if err != nil {
		// Tear down in reverse order of creation.
		syncParent.Close()
		syncChild.Close()
		initParent.Close()
		initChild.Close()
		return nil, fmt.Errorf("unable to create log pipe: %w", err)
	}

	return &processComm{
		initSockParent: initParent,
		initSockChild:  initChild,
		syncSockParent: syncParent,
		syncSockChild:  syncChild,
		logPipeParent:  logParent,
		logPipeChild:   logChild,
	}, nil
}

// closeChild closes the child ends of the init socket, the sync socket and
// the log pipe. Close errors are deliberately ignored.
func (c *processComm) closeChild() {
	_ = c.initSockChild.Close()
	_ = c.syncSockChild.Close()
	_ = c.logPipeChild.Close()
}

// closeParent closes the parent ends of the init socket and the sync socket.
// Close errors are deliberately ignored. The parent end of the log pipe is
// left open on purpose.
func (c *processComm) closeParent() {
	_ = c.initSockParent.Close()
	_ = c.syncSockParent.Close()
	// c.logPipeParent is kept alive for ForwardLogs
}

// containerProcess holds the state shared by the process flavours defined in
// this file (setnsProcess and initProcess, which both embed it).
type containerProcess struct {
	cmd           *exec.Cmd       // command being run; cmd.Process identifies the tracked process
	comm          *processComm    // channels shared with "runc init"
	config        *initConfig     // configuration written as JSON to the init socket
	manager       cgroups.Manager // cgroup manager used to place and configure the process
	fds           []string        // external descriptor names, see externalDescriptors
	process       *Process        // the Process this containerProcess backs (its ops field points back here)
	bootstrapData io.Reader       // data copied to the init socket during start
	container     *Container      // container this process belongs to
}

// pid returns the pid of p.cmd.Process. It must only be called once
// p.cmd.Process has been set, otherwise it dereferences a nil pointer.
func (p *containerProcess) pid() int {
	return p.cmd.Process.Pid
}

// startTime returns the StartTime value that system.Stat reports for the
// process, together with any error from system.Stat.
func (p *containerProcess) startTime() (uint64, error) {
	stat, err := system.Stat(p.pid())
	return stat.StartTime, err
}

// signal sends sig to the process.
func (p *containerProcess) signal(sig os.Signal) error {
	return p.cmd.Process.Signal(sig)
}

// externalDescriptors returns the descriptor names stored by
// setExternalDescriptors. The slice is returned as-is, not copied.
func (p *containerProcess) externalDescriptors() []string {
	return p.fds
}

// setExternalDescriptors replaces the stored descriptor names with newFds.
// The slice is stored as-is, not copied.
func (p *containerProcess) setExternalDescriptors(newFds []string) {
	p.fds = newFds
}

// forwardChildLogs hands the parent end of the log pipe to logs.ForwardLogs
// and returns the channel it produces.
func (p *containerProcess) forwardChildLogs() chan error {
	return logs.ForwardLogs(p.comm.logPipeParent)
}

// terminate kills the process with SIGKILL and then reaps it so that it does
// not linger as a zombie. It is a no-op if the process was never started.
//
// The wait is performed even if the kill failed. An error from the kill takes
// precedence over an error from the wait.
func (p *containerProcess) terminate() error {
	proc := p.cmd.Process
	if proc == nil {
		return nil
	}

	killErr := proc.Kill()
	_, waitErr := p.wait()
	if killErr != nil {
		return killErr
	}
	return waitErr
}

// wait calls p.cmd.Wait and returns the resulting process state together
// with the error from Wait. The state is returned even when Wait fails.
func (p *containerProcess) wait() (*os.ProcessState, error) { //nolint:unparam
	err := p.cmd.Wait()

	// Return actual ProcessState even on Wait error
	return p.cmd.ProcessState, err
}

// setnsProcess is a process that joins an already existing container.
type setnsProcess struct {
	containerProcess
	rootlessCgroups bool   // when set, failures to join a cgroup are ignored
	intelRdtPath    string // Intel RDT path to add the process to; empty means none
	initProcessPid  int    // pid of the container's init process; 0 if unknown
}

// tryResetCPUAffinity makes a best-effort attempt to widen the CPU affinity
// of the process identified by pid to every possible CPU (notwithstanding
// cgroup cpuset restrictions, isolated CPUs and CPU online status).
//
// A failure is only logged as a warning; it is never returned to the caller.
func tryResetCPUAffinity(pid int) {
	// The mask should allow every possible CPU in the system, including
	// CPUs that are currently not in cpuset.cpus, not online, or not even
	// present (hot-plugged). Using a cpumask any tighter than that may
	// disallow using those CPUs if they are added to cpuset.cpus later.
	//
	// sched_setaffinity(2) implicitly:
	//
	//  * clamps the cpumask so that it matches the number of CPUs
	//    supported by the kernel;
	//
	//  * masks out any CPUs that are not a member of the target task's
	//    configured cgroup cpuset. This only affects the task's effective
	//    affinity; masked-out CPUs are not forgotten should the cgroup
	//    cpuset change later.
	//
	// This means there is no need to read /sys/devices/system/cpu/possible
	// and kernel_max to build the mask. A huge all-ones buffer is used
	// instead, similarly to what the go 1.25 runtime does in getCPUCount().
	const (
		maxCPUs   = 64 * 1024
		maskBytes = maxCPUs / 8
	)
	allCPUs := bytes.Repeat([]byte{0xff}, maskBytes)

	err := linux.SchedSetaffinity(pid, allCPUs)
	if err == nil {
		return
	}
	logrus.WithError(err).Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
}

// startWithCPUAffinity starts p.cmd, applying the configured initial CPU
// affinity (if any) so that the new process inherits it.
//
// Without an initial affinity the command is started directly. Otherwise the
// command is started from a dedicated, locked OS thread whose affinity has
// been set first.
func (p *setnsProcess) startWithCPUAffinity() error {
	aff := p.config.CPUAffinity
	if aff == nil || aff.Initial == nil {
		return p.cmd.Start()
	}

	result := make(chan error)
	defer close(result)

	// A separate goroutine is used so an OS thread can be dedicated to it.
	go func() {
		runtime.LockOSThread()
		// runtime.UnlockOSThread is deliberately never called.
		// https://pkg.go.dev/runtime#LockOSThread says:
		// "If the calling goroutine exits without unlocking the
		// thread, the thread will be terminated".

		// The started command inherits this thread's CPU affinity.
		err := unix.SchedSetaffinity(unix.Gettid(), aff.Initial)
		if err != nil {
			err = fmt.Errorf("error setting initial CPU affinity: %w", err)
		} else {
			err = p.cmd.Start()
		}
		result <- err
	}()

	return <-result
}

// setFinalCPUAffinity applies the configured final CPU affinity to the
// process.
//
//   - No affinity configured at all (neither initial nor final): the affinity
//     is reset on a best-effort basis so no unexpected one is inherited.
//   - Only an initial affinity configured: nothing is done.
//   - A final affinity configured: it is applied, and a failure is returned.
func (p *setnsProcess) setFinalCPUAffinity() error {
	aff := p.config.CPUAffinity

	switch {
	case aff == nil, aff.Final == nil && aff.Initial == nil:
		tryResetCPUAffinity(p.pid())
		return nil
	case aff.Final == nil:
		return nil
	}

	err := unix.SchedSetaffinity(p.pid(), aff.Final)
	if err != nil {
		return fmt.Errorf("error setting final CPU affinity: %w", err)
	}
	return nil
}

// addIntoCgroupV1 moves the process into the container's cgroups on a
// cgroup v1 (or hybrid) host.
//
// If the same sub-cgroup is requested for all controllers (key "") or no
// sub-cgroup is requested at all, the cgroup manager's AddPid is used.
// Otherwise the per-controller sub-cgroup paths are resolved against the
// manager's paths and the pid is written to each cgroup directly.
//
// When p.rootlessCgroups is set, failures to join a cgroup are ignored.
// Invalid entries in SubCgroupPaths (unknown controller, path escaping the
// controller's base cgroup) are always reported.
func (p *setnsProcess) addIntoCgroupV1() error {
	if sub, ok := p.process.SubCgroupPaths[""]; ok || len(p.process.SubCgroupPaths) == 0 {
		// Either same sub-cgroup for all paths, or no sub-cgroup.
		err := p.manager.AddPid(sub, p.pid())
		if err != nil && !p.rootlessCgroups {
			return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
		}
		return nil
	}

	// Per-controller sub-cgroup paths. Not supported by AddPid (or systemd),
	// so we have to calculate and check all sub-cgroup paths, and write
	// directly to cgroupfs.
	paths := maps.Clone(p.manager.GetPaths())
	for ctrl, sub := range p.process.SubCgroupPaths {
		base, ok := paths[ctrl]
		if !ok {
			return fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
		}
		cgPath := path.Join(base, sub)
		// The joined path must be base itself or live below it. A plain
		// prefix comparison is not enough: with base ".../foo", a sub of
		// "../foo-other" resolves to the sibling ".../foo-other", which
		// shares the string prefix but is outside of base. Therefore the
		// remainder after base must start at a path separator.
		rest, under := strings.CutPrefix(cgPath, base)
		if !under || (rest != "" && rest[0] != '/' && !strings.HasSuffix(base, "/")) {
			return fmt.Errorf("bad sub cgroup path: %s", sub)
		}
		paths[ctrl] = cgPath
	}

	for _, cgPath := range paths {
		if err := cgroups.WriteCgroupProc(cgPath, p.pid()); err != nil && !p.rootlessCgroups {
			return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
		}
	}

	return nil
}

// initProcessCgroupPath reads /proc/PID/cgroup of the container's init
// process and returns the corresponding path below the unified cgroup
// mountpoint. It only works for cgroup v2.
//
// An empty string is returned whenever the path can not be determined: the
// init pid is unknown, the host is not in cgroup v2 unified mode, the file
// can not be parsed, or it has no unified ("") entry.
//
// This is used by runc exec in these cases:
//
//  1. On cgroup v2 + nesting + domain controllers, adding to initial cgroup
//     may fail with EBUSY (https://github.com/opencontainers/runc/issues/2356);
//
//  2. A container init process with no cgroupns and /sys/fs/cgroup rw access
//     may move itself to any other cgroup, and the original cgroup will disappear.
func (p *setnsProcess) initProcessCgroupPath() string {
	if p.initProcessPid == 0 {
		return ""
	}
	if !cgroups.IsCgroup2UnifiedMode() {
		return ""
	}

	procFile := "/proc/" + strconv.Itoa(p.initProcessPid) + "/cgroup"
	entries, err := cgroups.ParseCgroupFile(procFile)
	if err != nil {
		return ""
	}

	if unified, found := entries[""]; found {
		return fs2.UnifiedMountpoint + unified
	}
	return ""
}

// addIntoCgroupV2 moves the process into the container's cgroup on a
// cgroup v2 host.
//
// The configured cgroup (plus the optional sub-cgroup) is tried first. If
// that fails and no sub-cgroup was explicitly requested, the cgroup the
// container's init process currently lives in is tried as a fallback.
//
// If nothing worked, the most recent error is returned, unless
// p.rootlessCgroups is set, in which case the failure is ignored.
func (p *setnsProcess) addIntoCgroupV2() error {
	sub := p.process.SubCgroupPaths[""]

	err := p.manager.AddPid(sub, p.pid())
	if err == nil {
		return nil
	}

	// Joining the configured cgroup failed. Fall back to container init's
	// cgroup, but only if no sub-cgroup was explicitly asked for.
	if sub == "" {
		if initCgroup := p.initProcessCgroupPath(); initCgroup != "" {
			logrus.Debugf("adding pid %d to configured cgroup failed (%v), will join container init cgroup %q", p.pid(), err, initCgroup)
			// NOTE: the path may be gone by now, as the container is not paused.
			err = cgroups.WriteCgroupProc(initCgroup, p.pid())
			if err == nil {
				return nil
			}
		}
	}

	if p.rootlessCgroups {
		// Cgroup join errors are not fatal when rootless.
		return nil
	}
	return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
}

// addIntoCgroup moves the already started process into the container's
// cgroup, unless it has been placed there at clone time via a cgroup fd.
// It dispatches to the cgroup v2 or v1 implementation depending on the host.
func (p *setnsProcess) addIntoCgroup() error {
	// SysProcAttr is not guaranteed to be allocated (prepareCgroupFD only
	// allocates it when a cgroup fd is actually used), so guard against nil.
	if attr := p.cmd.SysProcAttr; attr != nil && attr.UseCgroupFD {
		// We've used cgroupfd successfully, so the process is
		// already in the proper cgroup, nothing to do here.
		return nil
	}
	if cgroups.IsCgroup2UnifiedMode() {
		return p.addIntoCgroupV2()
	}
	return p.addIntoCgroupV1()
}

// prepareCgroupFD sets up p.cmd to use clone3 with CLONE_INTO_CGROUP
// to join cgroup early, in p.cmd.Start.
//
// It returns:
//   - a non-nil *os.File on success, which must be closed by the caller
//     after p.cmd.Start returns;
//   - nil, nil if a cgroup fd is not applicable (not cgroup v2, no cgroup to
//     join) or if opening the cgroup failed while p.rootlessCgroups is set;
//   - nil and an error if the sub-cgroup path is invalid or the cgroup can
//     not be opened.
func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
	const openFlags = unix.O_PATH | unix.O_DIRECTORY | unix.O_CLOEXEC

	if !cgroups.IsCgroup2UnifiedMode() {
		return nil, nil
	}

	base := p.manager.Path("")
	if base == "" { // No cgroup to join.
		return nil, nil
	}
	// Indexing a nil map is fine and yields "".
	sub := p.process.SubCgroupPaths[""]
	cgroup := path.Join(base, sub)
	// The joined path must be base itself or live below it. A plain prefix
	// comparison is not enough: with base ".../foo", a sub of "../foo-other"
	// resolves to the sibling ".../foo-other", which shares the string
	// prefix but is outside of base. Therefore the remainder after base
	// must start at a path separator.
	rest, under := strings.CutPrefix(cgroup, base)
	if !under || (rest != "" && rest[0] != '/' && !strings.HasSuffix(base, "/")) {
		return nil, fmt.Errorf("bad sub cgroup path: %s", sub)
	}

	fd, err := cgroups.OpenFile(base, sub, openFlags)
	if err == nil {
		goto success
	}
	// Failed to open the configured cgroup. Fall back to container init's cgroup
	// unless sub-cgroup is explicitly requested. The fallback logic should be
	// the same as in addIntoCgroupV2.
	if sub != "" {
		goto fail
	}
	cgroup = p.initProcessCgroupPath()
	if cgroup == "" {
		goto fail
	}
	logrus.Debugf("failed to open configured cgroup (%v), will open container init cgroup %q", err, cgroup)
	// NOTE: path is not guaranteed to exist because we didn't pause the container.
	fd, err = cgroups.OpenFile(cgroup, "", openFlags)
	if err != nil {
		goto fail
	}

success:
	logrus.Debugf("using CLONE_INTO_CGROUP %q", cgroup)
	if p.cmd.SysProcAttr == nil {
		p.cmd.SysProcAttr = &syscall.SysProcAttr{}
	}
	p.cmd.SysProcAttr.UseCgroupFD = true
	p.cmd.SysProcAttr.CgroupFD = int(fd.Fd())

	return fd, nil

fail:
	// Ignore cgroup join error for rootless.
	if p.rootlessCgroups {
		return nil, nil
	}
	return nil, fmt.Errorf("can't open cgroup: %w", err)
}

// startWithCgroupFD starts a process via clone3 with CLONE_INTO_CGROUP,
// with a fallback if it fails (e.g. not available).
//
// The child ends of the communication channels are closed when this function
// returns, regardless of the outcome. If the first start attempt used a
// cgroup fd and failed, a pristine copy of the command is started once more
// without it; the error of the last attempt is returned.
func (p *setnsProcess) startWithCgroupFD() error {
	// Close the child side of the pipes.
	defer p.comm.closeChild()

	fd, err := p.prepareCgroupFD()
	if err != nil {
		return err
	}
	if fd != nil {
		defer fd.Close()
	}

	// Take the copy before the first start attempt: an exec.Cmd can not be
	// started twice.
	cmdCopy := cloneCmd(p.cmd)
	err = p.startWithCPUAffinity()
	// SysProcAttr is only guaranteed to be allocated when prepareCgroupFD
	// decided to use a cgroup fd, so guard against nil before reading it.
	if err != nil && p.cmd.SysProcAttr != nil && p.cmd.SysProcAttr.UseCgroupFD {
		logrus.Debugf("exec with CLONE_INTO_CGROUP failed: %v; retrying without", err)
		// SysProcAttr.CgroupFD is never used when UseCgroupFD is unset.
		cmdCopy.SysProcAttr.UseCgroupFD = false
		// Must not reuse exec.Cmd.
		p.cmd = cmdCopy
		err = p.startWithCPUAffinity()
	}

	return err
}

// start launches the setns process and drives it until it has reported
// procReady and has been told to run.
//
// The steps are, in order: start the command, send the bootstrap data, wait
// for the intermediate process and pick up the final child's pid
// (execSetns), place the child into the cgroup, apply the final CPU affinity
// and Intel RDT settings, send the configuration, and finally serve the sync
// messages of the child.
//
// The parent ends of the init and sync sockets are closed on return. If an
// error is returned after the command was started, the process is terminated
// and, if the cgroup's OOM kill count changed in the meantime, the error is
// annotated with "(possibly OOM-killed)".
func (p *setnsProcess) start() (retErr error) {
	defer p.comm.closeParent()

	// Get the "before" value of oom kill count.
	oom, _ := p.manager.OOMKillCount()

	if err := p.startWithCgroupFD(); err != nil {
		return fmt.Errorf("error starting setns process: %w", err)
	}

	// From here on the process exists, so any failure must clean it up.
	defer func() {
		if retErr != nil {
			if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom {
				// Someone in this cgroup was killed, this _might_ be us.
				retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr)
			}
			err := ignoreTerminateErrors(p.terminate())
			if err != nil {
				logrus.WithError(err).Warn("unable to terminate setnsProcess")
			}
		}
	}()

	if p.bootstrapData != nil {
		if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
			return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
		}
	}
	// After execSetns, p.cmd.Process (and therefore p.pid()) refers to the
	// final child instead of the process that was originally started.
	if err := p.execSetns(); err != nil {
		return fmt.Errorf("error executing setns process: %w", err)
	}
	if err := p.addIntoCgroup(); err != nil {
		return err
	}
	// Set final CPU affinity right after the process is moved into container's cgroup.
	if err := p.setFinalCPUAffinity(); err != nil {
		return err
	}
	if p.intelRdtPath != "" {
		// if Intel RDT "resource control" filesystem path exists
		_, err := os.Stat(p.intelRdtPath)
		if err == nil {
			if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
				return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err)
			}
		}
	}

	if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
		return fmt.Errorf("error writing config to pipe: %w", err)
	}

	// Serve the sync messages sent by the child. The callback is invoked
	// once per message, so the defers inside it run per message as well.
	var seenProcReady bool
	ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
		switch sync.Type {
		case procReady:
			seenProcReady = true
			// Set rlimits, this has to be done here because we lose permissions
			// to raise the limits once we enter a user-namespace
			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
				return fmt.Errorf("error setting rlimits for ready process: %w", err)
			}

			// Sync with child.
			if err := writeSync(p.comm.syncSockParent, procRun); err != nil {
				return err
			}
		case procHooks:
			// This shouldn't happen.
			panic("unexpected procHooks in setns")
		case procMountPlease:
			// This shouldn't happen.
			panic("unexpected procMountPlease in setns")
		case procSeccomp:
			if p.config.Config.Seccomp.ListenerPath == "" {
				return errors.New("seccomp listenerPath is not set")
			}
			if sync.Arg == nil {
				return fmt.Errorf("sync %q is missing an argument", sync.Type)
			}
			// The argument is the number of the seccomp fd inside the child.
			var srcFd int
			if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
				return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
			}
			seccompFd, err := pidGetFd(p.pid(), srcFd)
			if err != nil {
				return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
			}
			defer seccompFd.Close()
			// We have a copy, the child can keep working. We don't need to
			// wait for the seccomp notify listener to get the fd before we
			// permit the child to continue because the child will happily wait
			// for the listener if it hits SCMP_ACT_NOTIFY.
			if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil {
				return err
			}

			// Describe the process and its container to the seccomp agent
			// and hand over our copy of the fd.
			bundle, annotations := utils.Annotations(p.config.Config.Labels)
			containerProcessState := &specs.ContainerProcessState{
				Version:  specs.Version,
				Fds:      []string{specs.SeccompFdName},
				Pid:      p.cmd.Process.Pid,
				Metadata: p.config.Config.Seccomp.ListenerMetadata,
				State: specs.State{
					Version:     specs.Version,
					ID:          p.config.ContainerID,
					Status:      specs.StateRunning,
					Pid:         p.initProcessPid,
					Bundle:      bundle,
					Annotations: annotations,
				},
			}
			if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
				containerProcessState, seccompFd); err != nil {
				return err
			}
		default:
			return errors.New("invalid JSON payload from child")
		}
		return nil
	})

	// A Shutdown error is only reported if the sync exchange itself went
	// fine; otherwise the earlier error is the more interesting one.
	if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
		return err
	}
	if !seenProcReady && ierr == nil {
		ierr = errors.New("procReady not received")
	}
	// Must be done after Shutdown so the child will exit and we can wait for it.
	if ierr != nil {
		_, _ = p.wait()
		return ierr
	}
	return nil
}

// execSetns waits for the process that executes the C code performing the
// setns calls. Because setns has to happen before the Go runtime boots, that
// C process forks off children to do the actual work, so this function waits
// for the started process to exit and then receives the pids of its children
// over the init socket.
//
// On success, p.cmd.Process refers to the final child and p.process.ops is
// set to p. An error is returned if waiting fails, if the process exits
// unsuccessfully (as *exec.ExitError), or if the pids can not be read.
func (p *setnsProcess) execSetns() error {
	status, err := p.cmd.Process.Wait()
	if err != nil {
		_ = p.cmd.Wait()
		return fmt.Errorf("error waiting on setns process to finish: %w", err)
	}
	if !status.Success() {
		_ = p.cmd.Wait()
		return &exec.ExitError{ProcessState: status}
	}
	// Decode into a value rather than a pointer: decoding a JSON null into
	// a pointer leaves it nil without reporting an error, which would make
	// the field accesses below panic.
	var pids pid
	if err := json.NewDecoder(p.comm.initSockParent).Decode(&pids); err != nil {
		_ = p.cmd.Wait()
		return fmt.Errorf("error reading pid from init pipe: %w", err)
	}

	// Clean up the zombie parent process
	// On Unix systems FindProcess always succeeds.
	firstChildProcess, _ := os.FindProcess(pids.PidFirstChild)

	// Ignore the error in case the child has already been reaped for any reason
	_, _ = firstChildProcess.Wait()

	process, err := os.FindProcess(pids.Pid)
	if err != nil {
		return err
	}
	p.cmd.Process = process
	p.process.ops = p
	return nil
}

// initProcess is the process that becomes the init process of a new
// container.
type initProcess struct {
	containerProcess
	intelRdtManager *intelrdt.Manager // optional Intel RDT manager; nil means Intel RDT is not used
}

// getChildPid reads the pids reported by the child over the init socket and
// returns the pid of the final child.
//
// The first child reported alongside it is reaped here so that it does not
// stay around as a zombie. If the pids can not be decoded, p.cmd is waited
// for and -1 is returned together with the decoding error.
func (p *initProcess) getChildPid() (int, error) {
	var pids pid
	dec := json.NewDecoder(p.comm.initSockParent)
	if err := dec.Decode(&pids); err != nil {
		_ = p.cmd.Wait()
		return -1, err
	}

	// Reap the first child. FindProcess can not fail on Unix systems, and a
	// Wait error is ignored since the child may have been reaped already.
	stage1, _ := os.FindProcess(pids.PidFirstChild)
	_, _ = stage1.Wait()

	return pids.Pid, nil
}

// waitForChildExit waits for the process that was originally started to
// exit and then switches p.cmd.Process over to the process with the given
// childPid. p.process.ops is set to p on success.
//
// A wait failure is returned as-is; an unsuccessful exit is returned as an
// *exec.ExitError. In both cases p.cmd.Wait is called as well.
func (p *initProcess) waitForChildExit(childPid int) error {
	state, waitErr := p.cmd.Process.Wait()
	switch {
	case waitErr != nil:
		_ = p.cmd.Wait()
		return waitErr
	case !state.Success():
		_ = p.cmd.Wait()
		return &exec.ExitError{ProcessState: state}
	}

	child, err := os.FindProcess(childPid)
	if err != nil {
		return err
	}

	p.cmd.Process = child
	p.process.ops = p
	return nil
}

// mountSourceRequestFn resolves a mount configuration into a mountSource.
// It is the request function type returned by goCreateMountSources.
type mountSourceRequestFn func(*configs.Mount) (*mountSource, error)

// goCreateMountSources spawns a goroutine which creates open_tree(2)-style
// mountfds based on the requested configs.Mount configuration. The returned
// requestFn and cancelFn are used to interact with the goroutine.
//
// The goroutine runs on a dedicated OS thread that has joined the mount
// namespace of the process. It stops when cancelFn is called, when the
// passed ctx is done, or after one minute, whichever happens first. Once it
// has stopped, requestFn only returns errors.
//
// An error is returned if the goroutine fails to set itself up; in that case
// the returned functions are nil.
//
// The caller of the returned mountSourceRequestFn is responsible for closing
// the returned file.
func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) {
	type response struct {
		src *mountSource
		err error
	}

	errCh := make(chan error, 1)
	requestCh := make(chan *configs.Mount)
	responseCh := make(chan response)

	// NOTE: requestCh is deliberately never closed. Both sides watch
	// ctx.Done() to learn about shutdown. Closing the channel would make the
	// "requestCh <- m" case in requestFn ready, and selecting it would panic
	// with "send on closed channel".
	ctx, cancelFn := context.WithTimeout(ctx, 1*time.Minute)

	go func() {
		// We lock this thread because we need to setns(2) here. There is no
		// UnlockOSThread() here, to ensure that the Go runtime will kill this
		// thread once this goroutine returns (ensuring no other goroutines run
		// in this context).
		runtime.LockOSThread()

		// Detach from the shared fs of the rest of the Go process in order to
		// be able to CLONE_NEWNS.
		if err := unix.Unshare(unix.CLONE_FS); err != nil {
			err = os.NewSyscallError("unshare(CLONE_FS)", err)
			errCh <- fmt.Errorf("mount source thread: %w", err)
			return
		}

		// Attach to the container's mount namespace.
		nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid()))
		if err != nil {
			errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err)
			return
		}
		defer nsFd.Close()
		if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil {
			err = os.NewSyscallError("setns", err)
			errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err)
			return
		}

		// No errors during setup!
		close(errCh)
		logrus.Debugf("mount source thread: successfully running in container mntns")

		nsHandles := new(userns.Handles)
		defer nsHandles.Release()
	loop:
		for {
			select {
			case m := <-requestCh:
				src, err := mountFd(nsHandles, m)
				logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err)
				select {
				case responseCh <- response{
					src: src,
					err: err,
				}:
				case <-ctx.Done():
					// The requester gave up waiting, so nobody is going
					// to take ownership of the file. Close it here and
					// stop, rather than blocking on the send forever.
					if src != nil && src.file != nil {
						_ = src.file.Close()
					}
					break loop
				}
			case <-ctx.Done():
				break loop
			}
		}
		logrus.Debugf("mount source thread: closing thread: %v", ctx.Err())
		close(responseCh)
	}()

	// Check for setup errors. A closed errCh yields nil, meaning success.
	err := <-errCh
	if err != nil {
		cancelFn()
		return nil, nil, err
	}

	requestFn := func(m *configs.Mount) (*mountSource, error) {
		var err error
		select {
		case requestCh <- m:
			select {
			case resp, ok := <-responseCh:
				if ok {
					return resp.src, resp.err
				}
				err = fmt.Errorf("response channel closed unexpectedly")
			case <-ctx.Done():
				err = fmt.Errorf("receive mount source context cancelled: %w", ctx.Err())
			}
		case <-ctx.Done():
			err = fmt.Errorf("send mount request cancelled: %w", ctx.Err())
		}
		return nil, err
	}
	return requestFn, cancelFn, nil
}

// start launches the container's init process and drives it until it has
// reported procReady and has been told to run.
//
// The steps are, in order: start the command, apply cgroup and Intel RDT
// configuration, send the bootstrap data, pick up the final child's pid,
// record its standard descriptors, wait for the intermediate process to
// exit, set up networking, send the configuration, and finally serve the
// sync messages of the child (mount requests, seccomp fd hand-over, hooks
// and readiness).
//
// The parent ends of the init and sync sockets are closed on return. If an
// error is returned after the command was started, the process is terminated
// and its cgroup and Intel RDT state are destroyed. If the cgroup reports OOM
// kills at that point, the error is replaced by (or, at debug log level,
// wrapped in) an OOM error.
func (p *initProcess) start() (retErr error) {
	defer p.comm.closeParent()
	err := p.cmd.Start()
	p.process.ops = p
	// close the child-side of the pipes (controlled by child)
	p.comm.closeChild()
	if err != nil {
		p.process.ops = nil
		return fmt.Errorf("unable to start init: %w", err)
	}

	// From here on the process exists, so any failure must clean it up.
	defer func() {
		if retErr != nil {
			// Find out if init is killed by the kernel's OOM killer.
			// Get the count before killing init as otherwise cgroup
			// might be removed by systemd.
			oom, err := p.manager.OOMKillCount()
			if err != nil {
				logrus.WithError(err).Warn("unable to get oom kill count")
			} else if oom > 0 {
				// Does not matter what the particular error was,
				// its cause is most probably OOM, so report that.
				const oomError = "container init was OOM-killed (memory limit too low?)"

				if logrus.GetLevel() >= logrus.DebugLevel {
					// Only show the original error if debug is set,
					// as it is not generally very useful.
					retErr = fmt.Errorf(oomError+": %w", retErr)
				} else {
					retErr = errors.New(oomError)
				}
			}

			// Terminate the process to ensure we can remove cgroups.
			if err := ignoreTerminateErrors(p.terminate()); err != nil {
				logrus.WithError(err).Warn("unable to terminate initProcess")
			}

			_ = p.manager.Destroy()
			if p.intelRdtManager != nil {
				_ = p.intelRdtManager.Destroy()
			}
		}
	}()

	// Do this before syncing with child so that no children can escape the
	// cgroup. We don't need to worry about not doing this and not being root
	// because we'd be using the rootless cgroup manager in that case.
	if err := p.manager.Apply(p.pid()); err != nil {
		if errors.Is(err, cgroups.ErrRootless) {
			// ErrRootless is to be ignored except when
			// the container doesn't have private pidns.
			if !p.config.Config.Namespaces.IsPrivate(configs.NEWPID) {
				// TODO: make this an error in runc 1.3.
				logrus.Warn("Creating a rootless container with no cgroup and no private pid namespace. " +
					"Such configuration is strongly discouraged (as it is impossible to properly kill all container's processes) " +
					"and will result in an error in a future runc version.")
			}
		} else {
			return fmt.Errorf("unable to apply cgroup configuration: %w", err)
		}
	}
	// Reset the CPU affinity after cgroups are configured to make sure it
	// matches any configured cpuset.
	tryResetCPUAffinity(p.pid())
	if p.intelRdtManager != nil {
		if err := p.intelRdtManager.Apply(p.pid()); err != nil {
			return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
		}
	}
	if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
		return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
	}

	childPid, err := p.getChildPid()
	if err != nil {
		return fmt.Errorf("can't get final child's PID from pipe: %w", err)
	}

	// Save the standard descriptor names before the container process
	// can potentially move them (e.g., via dup2()).  If we don't do this now,
	// we won't know at checkpoint time which file descriptor to look up.
	fds, err := getPipeFds(childPid)
	if err != nil {
		return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
	}
	p.setExternalDescriptors(fds)

	// Wait for our first child to exit. Afterwards p.cmd.Process (and
	// therefore p.pid()) refers to the final child.
	if err := p.waitForChildExit(childPid); err != nil {
		return fmt.Errorf("error waiting for our first child to exit: %w", err)
	}

	// Spin up a goroutine to handle remapping mount requests by runc init.
	// There is no point doing this for rootless containers because they cannot
	// configure MOUNT_ATTR_IDMAP, nor do OPEN_TREE_CLONE. We could just
	// service plain-open requests for plain bind-mounts but there's no need
	// (rootless containers will never have permission issues on a source mount
	// that the parent process can help with -- they are the same user).
	var mountRequest mountSourceRequestFn
	if !p.container.config.RootlessEUID {
		request, cancel, err := p.goCreateMountSources(context.Background())
		if err != nil {
			return fmt.Errorf("error spawning mount remapping thread: %w", err)
		}
		defer cancel()
		mountRequest = request
	}

	if err := p.createNetworkInterfaces(); err != nil {
		return fmt.Errorf("error creating network interfaces: %w", err)
	}

	// Use a message distinct from the one above, so that a failure to move
	// a network device is not mistaken for a failure to create an interface.
	if err := p.setupNetworkDevices(); err != nil {
		return fmt.Errorf("error setting up network devices: %w", err)
	}

	// initConfig.SpecState is only needed to run hooks that are executed
	// inside a container, i.e. CreateContainer and StartContainer.
	if p.config.Config.HasHook(configs.CreateContainer, configs.StartContainer) {
		p.config.SpecState, err = p.container.currentOCIState()
		if err != nil {
			return fmt.Errorf("error getting current state: %w", err)
		}
	}

	if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
		return fmt.Errorf("error sending config to init process: %w", err)
	}

	// Serve the sync messages sent by the child. The callback is invoked
	// once per message, so the defers inside it run per message as well.
	var seenProcReady bool
	ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
		switch sync.Type {
		case procMountPlease:
			if mountRequest == nil {
				return fmt.Errorf("cannot fulfil mount requests as a rootless user")
			}
			var m *configs.Mount
			if sync.Arg == nil {
				return fmt.Errorf("sync %q is missing an argument", sync.Type)
			}
			if err := json.Unmarshal(*sync.Arg, &m); err != nil {
				return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err)
			}
			mnt, err := mountRequest(m)
			if err != nil {
				return fmt.Errorf("failed to fulfil mount request: %w", err)
			}
			// Our copy of the file is no longer needed once it has been
			// passed to the child.
			defer mnt.file.Close()

			arg, err := json.Marshal(mnt)
			if err != nil {
				return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err)
			}
			argMsg := json.RawMessage(arg)
			if err := doWriteSync(p.comm.syncSockParent, syncT{
				Type: procMountFd,
				Arg:  &argMsg,
				File: mnt.file,
			}); err != nil {
				return err
			}
		case procSeccomp:
			if p.config.Config.Seccomp.ListenerPath == "" {
				return errors.New("seccomp listenerPath is not set")
			}
			// The argument is the number of the seccomp fd inside the child.
			var srcFd int
			if sync.Arg == nil {
				return fmt.Errorf("sync %q is missing an argument", sync.Type)
			}
			if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
				return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
			}
			seccompFd, err := pidGetFd(p.pid(), srcFd)
			if err != nil {
				return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
			}
			defer seccompFd.Close()
			// We have a copy, the child can keep working. We don't need to
			// wait for the seccomp notify listener to get the fd before we
			// permit the child to continue because the child will happily wait
			// for the listener if it hits SCMP_ACT_NOTIFY.
			if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil {
				return err
			}

			s, err := p.container.currentOCIState()
			if err != nil {
				return err
			}

			// initProcessStartTime hasn't been set yet.
			s.Pid = p.cmd.Process.Pid
			s.Status = specs.StateCreating
			containerProcessState := &specs.ContainerProcessState{
				Version:  specs.Version,
				Fds:      []string{specs.SeccompFdName},
				Pid:      s.Pid,
				Metadata: p.config.Config.Seccomp.ListenerMetadata,
				State:    *s,
			}
			if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
				containerProcessState, seccompFd); err != nil {
				return err
			}
		case procReady:
			seenProcReady = true
			// Set rlimits, this has to be done here because we lose permissions
			// to raise the limits once we enter a user-namespace
			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
				return fmt.Errorf("error setting rlimits for ready process: %w", err)
			}

			// generate a timestamp indicating when the container was started
			p.container.created = time.Now().UTC()
			p.container.state = &createdState{
				c: p.container,
			}

			// NOTE: If the procRun state has been synced and the
			// runc-create process has been killed for some reason,
			// the runc-init[2:stage] process will be leaky. And
			// the runc command also fails to parse root directory
			// because the container doesn't have state.json.
			//
			// In order to cleanup the runc-init[2:stage] by
			// runc-delete/stop, we should store the status before
			// procRun sync.
			state, uerr := p.container.updateState(p)
			if uerr != nil {
				return fmt.Errorf("unable to store init state: %w", uerr)
			}
			p.container.initProcessStartTime = state.InitProcessStartTime

			// Sync with child.
			if err := writeSync(p.comm.syncSockParent, procRun); err != nil {
				return err
			}
		case procHooks:
			// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
			if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
				return fmt.Errorf("error setting cgroup config for procHooks process: %w", err)
			}
			if p.intelRdtManager != nil {
				if err := p.intelRdtManager.Set(p.config.Config); err != nil {
					return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err)
				}
			}
			if p.config.Config.HasHook(configs.Prestart, configs.CreateRuntime) {
				s, err := p.container.currentOCIState()
				if err != nil {
					return err
				}
				// initProcessStartTime hasn't been set yet.
				s.Pid = p.cmd.Process.Pid
				s.Status = specs.StateCreating
				hooks := p.config.Config.Hooks

				if err := hooks.Run(configs.Prestart, s); err != nil {
					return err
				}
				if err := hooks.Run(configs.CreateRuntime, s); err != nil {
					return err
				}
			}
			// Sync with child.
			if err := writeSync(p.comm.syncSockParent, procHooksDone); err != nil {
				return err
			}
		default:
			return errors.New("invalid JSON payload from child")
		}
		return nil
	})

	// A Shutdown error is only reported if the sync exchange itself went
	// fine; otherwise the earlier error is the more interesting one.
	if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
		return err
	}
	if !seenProcReady && ierr == nil {
		ierr = errors.New("procReady not received")
	}
	if ierr != nil {
		return fmt.Errorf("error during container init: %w", ierr)
	}
	return nil
}

// createNetworkInterfaces walks the networks listed in the container
// configuration. For each one it resolves the strategy matching the network
// type, asks that strategy to create the interface for the init process pid,
// and records the resulting network in the init config. The first failure
// aborts the walk and is returned unchanged.
func (p *initProcess) createNetworkInterfaces() error {
	nets := p.config.Config.Networks
	for idx := range nets {
		netConf := nets[idx]

		driver, err := getStrategy(netConf.Type)
		if err != nil {
			return err
		}

		// The strategy works on its own copy of the configured network.
		created := &network{Network: *netConf}
		err = driver.create(created, p.pid())
		if err != nil {
			return err
		}

		p.config.Networks = append(p.config.Networks, created)
	}
	return nil
}

// setupNetworkDevices moves every network device listed in the container
// configuration into the network namespace of the container init process.
// It is a no-op when the configuration does not include a network namespace.
func (p *initProcess) setupNetworkDevices() error {
	// Containers sharing the host network do not get any devices moved.
	hasNetNS := p.config.Config.Namespaces.Contains(configs.NEWNET)
	if !hasNetNS {
		return nil
	}

	// The init process has already joined the requested net namespace, so
	// its /proc entry can be used as the namespace path.
	nsPath := "/proc/" + strconv.Itoa(p.pid()) + "/ns/net"

	// Stop at the first device that cannot be moved. Per the runtime spec,
	// the kernel is responsible for moving back any devices that were moved
	// successfully before the failure.
	// See: https://github.com/opencontainers/runtime-spec/blob/27cb0027fd92ef81eda1ea3a8153b8337f56d94a/config-linux.md#namespace-lifecycle-and-container-termination
	for devName, dev := range p.config.Config.NetDevices {
		if err := devChangeNetNamespace(devName, nsPath, *dev); err != nil {
			return fmt.Errorf("move netDevice %s to namespace %s: %w", devName, nsPath, err)
		}
	}
	return nil
}

// pidGetFd obtains a copy of descriptor srcFd from the process identified by
// pid. It opens a pidfd for the process, uses it for pidfd_getfd, and wraps
// the resulting descriptor in an *os.File. Failures of either syscall are
// returned as os.SyscallError values named after the failing call.
func pidGetFd(pid, srcFd int) (*os.File, error) {
	procFd, openErr := unix.PidfdOpen(pid, 0)
	if openErr != nil {
		return nil, os.NewSyscallError("pidfd_open", openErr)
	}
	// The pidfd is only needed for the duration of this call.
	defer unix.Close(procFd)

	copied, getErr := unix.PidfdGetfd(procFd, srcFd, 0)
	if getErr != nil {
		return nil, os.NewSyscallError("pidfd_getfd", getErr)
	}

	result := os.NewFile(uintptr(copied), "[pidfd_getfd]")
	return result, nil
}

// sendContainerProcessState connects to the unix socket at listenerPath and
// sends the JSON encoding of state together with the descriptor of file in a
// single message. The connection is closed before returning.
func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error {
	agentConn, err := net.Dial("unix", listenerPath)
	if err != nil {
		return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
	}
	defer agentConn.Close()

	// A "unix" dial yields a *net.UnixConn, which can hand out an *os.File
	// for the underlying socket.
	unixConn := agentConn.(*net.UnixConn)
	sockFile, err := unixConn.File()
	if err != nil {
		return fmt.Errorf("cannot get seccomp socket: %w", err)
	}
	defer sockFile.Close()

	payload, err := json.Marshal(state)
	if err != nil {
		return fmt.Errorf("cannot marshall seccomp state: %w", err)
	}

	err = cmsg.SendRawFd(sockFile, string(payload), file.Fd())
	if err != nil {
		return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
	}
	// Keep file reachable until the raw descriptor has been sent.
	runtime.KeepAlive(file)
	return nil
}

// getPipeFds resolves the /proc/<pid>/fd symlinks for descriptors 0, 1 and 2
// of the given process and returns their targets in a three-element slice
// indexed by descriptor number. Entries that cannot be read because of a
// permission error are left empty. Any other error stops the scan and is
// returned together with the entries collected so far.
func getPipeFds(pid int) ([]string, error) {
	const stdioCount = 3

	fdDir := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
	targets := make([]string, stdioCount)
	for n := range targets {
		// XXX: Readlink fails when the entry is not a valid symlink, which
		//      can happen in some unlucky mount namespace setups.
		dest, err := os.Readlink(filepath.Join(fdDir, strconv.Itoa(n)))
		switch {
		case err == nil:
			targets[n] = dest
		case errors.Is(err, os.ErrPermission):
			// Rootless containers and other non-dumpable processes deny
			// access here; there is nothing more we can do for this fd, so
			// leave the entry empty and move on.
		default:
			return targets, err
		}
	}
	return targets, nil
}

// InitializeIO creates pipes for use with the process's stdio and returns the
// opposite side for each. Do not use this if you want to have a pseudoterminal
// set up for you by libcontainer (TODO: fix that too).
// TODO: This is mostly unnecessary, and should be handled by clients.
//
// Ownership of every pipe end is changed to rootuid:rootgid. On success the
// process side of each pipe is stored in p.Stdin, p.Stdout and p.Stderr and
// the opposite ends are returned in the IO value. On failure every pipe end
// created so far is closed, p is left untouched, and a nil IO is returned
// together with the error.
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
	// Every pipe end created so far, so they can all be released on error.
	var files []*os.File
	defer func() {
		if err == nil {
			return
		}
		// Close through the *os.File values rather than the raw descriptor
		// numbers: closing a raw fd behind the file's back would let a later
		// finalizer or Close on that file close an unrelated descriptor
		// that happened to reuse the same number.
		for _, f := range files {
			_ = f.Close()
		}
	}()

	// STDIN
	stdinR, stdinW, err := os.Pipe()
	if err != nil {
		return nil, err
	}
	files = append(files, stdinR, stdinW)

	// STDOUT
	stdoutR, stdoutW, err := os.Pipe()
	if err != nil {
		return nil, err
	}
	files = append(files, stdoutR, stdoutW)

	// STDERR
	stderrR, stderrW, err := os.Pipe()
	if err != nil {
		return nil, err
	}
	files = append(files, stderrR, stderrW)

	// change ownership of the pipes in case we are in a user namespace
	for _, f := range files {
		fd := int(f.Fd())
		if err := unix.Fchown(fd, rootuid, rootgid); err != nil {
			return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(fd), Err: err}
		}
	}

	// Only publish the pipe ends once every step has succeeded, so that a
	// failure never leaves p referring to already-closed files.
	p.Stdin, p.Stdout, p.Stderr = stdinR, stdoutW, stderrW
	i = &IO{}
	i.Stdin, i.Stdout, i.Stderr = stdinW, stdoutR, stderrR
	return i, nil
}
