diff --git a/CHANGES b/CHANGES index 8ae835854..f1cd84e56 100644 --- a/CHANGES +++ b/CHANGES @@ -14,6 +14,19 @@ Changes with FreeUnit 1.35.4 xx xxx 2026 *) Bugfix: replace removed cgi module with email.parser in Python upload test fixture; fixes test suite compatibility with Python 3.13. + *) Security: tighten privilege boundaries on the control socket, + mount preparation, and cgroup placement (audit PR-C): require + peer UID match (SO_PEERCRED / getpeereid) before accepting + connections on the unix control socket; resolve mount + destinations with openat2(RESOLVE_BENEATH) to defeat symlink + races that escape the rootfs under attacker-controlled trees; + resolve relative cgroup paths against /proc//cgroup + instead of /proc/self/cgroup so the new process's cgroup view + (not the parent's) drives placement. Audit V5 sender-type + ACL is deferred to a follow-up (needs SO_PASSCRED + process- + registration ordering rework; reverted draft caused isolation + test regressions, see PR #14 review thread). + Changes with FreeUnit 1.35.3 07 Apr 2026 *) Feature: migrate contrib package mirror from packages.nginx.org to diff --git a/src/nxt_cgroup.c b/src/nxt_cgroup.c index 79e240f1e..be1c879b8 100644 --- a/src/nxt_cgroup.c +++ b/src/nxt_cgroup.c @@ -9,9 +9,9 @@ static int nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, - char *cgpath); + char *cgpath, nxt_pid_t pid); static nxt_int_t nxt_mk_cgpath(nxt_task_t *task, const char *dir, - char *cgpath); + char *cgpath, nxt_pid_t pid); nxt_int_t @@ -29,7 +29,16 @@ nxt_cgroup_proc_add(nxt_task_t *task, nxt_process_t *process) return NXT_OK; } - ret = nxt_mk_cgpath(task, process->isolation.cgroup.path, cgprocs); + /* + * Resolve the cgroup path against /proc//cgroup rather than + * /proc/self/cgroup: the parent's cgroup view may differ from the + * just-forked child's, particularly when CLONE_NEWCGROUP is in play + * and the configured path is relative. Reading the child's own + * /proc entry avoids a TOCTOU where the parent moves between cgroups + * after fork() but before this write. + */ + ret = nxt_mk_cgpath(task, process->isolation.cgroup.path, cgprocs, + process->pid); if (nxt_slow_path(ret == NXT_ERROR)) { return NXT_ERROR; } @@ -41,6 +50,18 @@ nxt_cgroup_proc_add(nxt_task_t *task, nxt_process_t *process) len = strlen(cgprocs); + /* + * Stash the resolved directory before appending "/cgroup.procs" so + * nxt_cgroup_cleanup() can rmdir without re-reading /proc//cgroup, + * which is gone once the child has exited. + */ + process->isolation.cgroup.resolved_path = nxt_mp_alloc(process->mem_pool, + len + 1); + if (nxt_fast_path(process->isolation.cgroup.resolved_path != NULL)) { + nxt_memcpy(process->isolation.cgroup.resolved_path, cgprocs, len); + process->isolation.cgroup.resolved_path[len] = '\0'; + } + len = snprintf(cgprocs + len, NXT_MAX_PATH_LEN - len, "/cgroup.procs"); if (nxt_slow_path(len >= NXT_MAX_PATH_LEN - len)) { nxt_errno = ENAMETOOLONG; @@ -71,26 +92,50 @@ nxt_cgroup_cleanup(nxt_task_t *task, const nxt_process_t *process) char cgroot[NXT_MAX_PATH_LEN], cgpath[NXT_MAX_PATH_LEN]; nxt_int_t ret; - ret = nxt_mk_cgpath(task, "", cgroot); + /* + * cgroot is the parent process's own cgroup directory; we must not + * rmdir it. Resolved against /proc/self/cgroup (pid=0): the child + * is gone by the time cleanup runs, so /proc//cgroup no + * longer exists. The TOCTOU concern that motivated using the + * child's view in nxt_cgroup_proc_add() does not apply at cleanup + * — we just need a stop boundary, and rmdir on the parent's own + * cgroup will fail anyway because it is non-empty. + */ + ret = nxt_mk_cgpath(task, "", cgroot, 0); if (nxt_slow_path(ret == NXT_ERROR)) { return; } - ret = nxt_mk_cgpath(task, process->isolation.cgroup.path, cgpath); - if (nxt_slow_path(ret == NXT_ERROR)) { + /* + * Use the resolved path cached by nxt_cgroup_proc_add(); falling + * back to /proc//cgroup here would fail with ENOENT. If the + * cache was missed (e.g. mp_alloc failure during add), there is + * nothing to clean up that we can address safely — bail out. + */ + if (process->isolation.cgroup.resolved_path == NULL) { + return; + } + + ret = snprintf(cgpath, sizeof(cgpath), "%s", + process->isolation.cgroup.resolved_path); + if (nxt_slow_path(ret < 0 || (size_t) ret >= sizeof(cgpath))) { return; } while (*cgpath != '\0' && strcmp(cgroot, cgpath) != 0) { rmdir(cgpath); ptr = strrchr(cgpath, '/'); + if (ptr == NULL) { + break; + } *ptr = '\0'; } } static int -nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath) +nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath, + nxt_pid_t pid) { int i, len; char *buf, *ptr; @@ -98,8 +143,21 @@ nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath) size_t size; ssize_t nread; nxt_bool_t found; + char procpath[NXT_MAX_PATH_LEN]; + + if (pid > 0) { + len = snprintf(procpath, sizeof(procpath), "/proc/%d/cgroup", + (int) pid); + if (len < 0 || (size_t) len >= sizeof(procpath)) { + nxt_errno = ENAMETOOLONG; + return -1; + } + } else { + nxt_memcpy(procpath, "/proc/self/cgroup", + sizeof("/proc/self/cgroup")); + } - fp = nxt_file_fopen(task, "/proc/self/cgroup", "re"); + fp = nxt_file_fopen(task, procpath, "re"); if (nxt_slow_path(fp == NULL)) { return -1; } @@ -145,7 +203,7 @@ nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath) static nxt_int_t -nxt_mk_cgpath(nxt_task_t *task, const char *dir, char *cgpath) +nxt_mk_cgpath(nxt_task_t *task, const char *dir, char *cgpath, nxt_pid_t pid) { int len; @@ -154,9 +212,12 @@ nxt_mk_cgpath(nxt_task_t *task, const char *dir, char *cgpath) * the cgroup path include the main unit processes cgroup. I.e * * NXT_CGROUP_ROOT/
/ + * + * pid: read /proc//cgroup so the path reflects the just-forked + * child's cgroup view (or 0 to fall back to /proc/self/cgroup). */ if (dir[0] != '/') { - len = nxt_mk_cgpath_relative(task, dir, cgpath); + len = nxt_mk_cgpath_relative(task, dir, cgpath, pid); } else { len = snprintf(cgpath, NXT_MAX_PATH_LEN, NXT_CGROUP_ROOT "%s", dir); } diff --git a/src/nxt_controller.c b/src/nxt_controller.c index a7c6842c7..e8098dcca 100644 --- a/src/nxt_controller.c +++ b/src/nxt_controller.c @@ -709,6 +709,77 @@ nxt_runtime_controller_socket(nxt_task_t *task, nxt_runtime_t *rt) } +static nxt_int_t +nxt_controller_check_peer_cred(nxt_task_t *task, nxt_conn_t *c) +{ +#if (NXT_HAVE_UNIX_DOMAIN) + /* + * The control socket is the privilege boundary for the REST API: + * filesystem perms on `control.unit.sock` are defense-in-depth, not the + * boundary itself. Require the peer's effective UID to match unitd's + * (or be root) so a local user with directory-write permission cannot + * mutate config by hand-crafting connections. + */ + if (c->remote == NULL + || c->remote->u.sockaddr.sa_family != AF_UNIX) + { + return NXT_OK; + } + +#if (NXT_HAVE_UCRED) + { + struct ucred cred; + socklen_t len = sizeof(cred); + + if (getsockopt(c->socket.fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) + != 0) + { + nxt_alert(task, "controller: SO_PEERCRED failed %E", nxt_errno); + return NXT_ERROR; + } + + if (cred.uid != 0 && cred.uid != nxt_euid) { + nxt_alert(task, "controller: rejecting connection from uid %d " + "(unitd uid %d); set socket permissions accordingly", + (int) cred.uid, (int) nxt_euid); + return NXT_ERROR; + } + + return NXT_OK; + } +#elif (defined(__FreeBSD__) || defined(__APPLE__) || defined(__OpenBSD__) \ + || defined(__NetBSD__) || defined(__DragonFly__)) + { + uid_t euid; + gid_t egid; + + if (getpeereid(c->socket.fd, &euid, &egid) != 0) { + nxt_alert(task, "controller: getpeereid failed %E", nxt_errno); + return NXT_ERROR; + } + + if (euid != 0 && euid != nxt_euid) { + nxt_alert(task, "controller: rejecting connection from uid %d " + "(unitd uid %d)", (int) euid, (int) nxt_euid); + return NXT_ERROR; + } + + return NXT_OK; + } +#else + /* + * No peer-credential primitive on this platform; rely on filesystem + * permissions of control.unit.sock. Operators must restrict the path. + */ + return NXT_OK; +#endif + +#else /* !NXT_HAVE_UNIX_DOMAIN */ + return NXT_OK; +#endif +} + + static void nxt_controller_conn_init(nxt_task_t *task, void *obj, void *data) { @@ -721,6 +792,11 @@ nxt_controller_conn_init(nxt_task_t *task, void *obj, void *data) nxt_debug(task, "controller conn init fd:%d", c->socket.fd); + if (nxt_slow_path(nxt_controller_check_peer_cred(task, c) != NXT_OK)) { + nxt_controller_conn_free(task, c, NULL); + return; + } + r = nxt_mp_zget(c->mem_pool, sizeof(nxt_controller_request_t)); if (nxt_slow_path(r == NULL)) { nxt_controller_conn_free(task, c, NULL); diff --git a/src/nxt_isolation.c b/src/nxt_isolation.c index 909a43f4b..2e6a6a878 100644 --- a/src/nxt_isolation.c +++ b/src/nxt_isolation.c @@ -759,6 +759,11 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process) const u_char *dst; nxt_fs_mount_t *mnt; nxt_process_automount_t *automount; +#if (NXT_HAVE_OPENAT2) + int rootfs_fd; + const u_char *rootfs_path; + size_t rootfs_len; +#endif automount = &process->isolation.automount; mounts = process->isolation.mounts; @@ -766,6 +771,32 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process) n = mounts->nelts; mnt = mounts->elts; +#if (NXT_HAVE_OPENAT2) + /* + * Mount destinations live under a (possibly user-owned) rootfs. An + * attacker with write access to that tree can race mkdir->mount(2) + * by swapping a path component for a symlink pointing outside the + * rootfs. Re-open the rootfs and resolve each destination with + * RESOLVE_BENEATH so a tampered destination that escapes the + * rootfs fails the open, before we hand a path to mount(2). + * RESOLVE_NO_SYMLINKS is intentionally NOT requested: legitimate + * rootfs setups commonly contain in-tree symlinks (e.g. + * /lib -> /usr/lib) that must still resolve. + */ + rootfs_path = process->isolation.rootfs; + rootfs_len = (rootfs_path != NULL) ? nxt_strlen(rootfs_path) : 0; + + rootfs_fd = -1; + if (rootfs_len > 0) { + rootfs_fd = open((const char *) rootfs_path, + O_PATH | O_DIRECTORY | O_CLOEXEC); + if (rootfs_fd == -1) { + nxt_alert(task, "open rootfs(%s) %E", rootfs_path, nxt_errno); + return NXT_ERROR; + } + } +#endif + for (i = 0; i < n; i++) { dst = mnt[i].dst; @@ -786,12 +817,72 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process) goto undo; } +#if (NXT_HAVE_OPENAT2) + if (rootfs_fd != -1 + && nxt_strlen(dst) > rootfs_len + && memcmp(dst, rootfs_path, rootfs_len) == 0) + { + struct open_how how; + const char *rel; + int fd; + + rel = (const char *) dst + rootfs_len; + while (*rel == '/') { + rel++; + } + + nxt_memzero(&how, sizeof(how)); + how.flags = O_PATH | O_CLOEXEC | O_DIRECTORY; + how.resolve = RESOLVE_BENEATH; + + fd = syscall(SYS_openat2, rootfs_fd, rel, &how, sizeof(how)); + if (fd == -1) { + if (nxt_errno == ENOSYS) { + /* + * openat2(2) is Linux 5.6+; older kernels return + * ENOSYS even when the userspace headers are + * present. Skip the symlink-resolution check on + * such kernels rather than failing every isolation + * setup; mount(2) below still operates on the + * caller-supplied path, just without the extra + * RESOLVE_BENEATH guard. Warn once so operators + * see the reduced security posture. + */ + static nxt_bool_t openat2_warned; + if (!openat2_warned) { + nxt_log(task, NXT_LOG_WARN, + "openat2(SYS_openat2) not supported by the " + "running kernel; rootfs mount destinations " + "are not validated against symlinks"); + openat2_warned = 1; + } + + } else { + nxt_alert(task, "mount destination %s escapes rootfs %s " + "or contains symlinks: %E", dst, rootfs_path, + nxt_errno); + ret = NXT_ERROR; + goto undo; + } + + } else { + close(fd); + } + } +#endif + ret = nxt_fs_mount(task, &mnt[i]); if (nxt_slow_path(ret != NXT_OK)) { goto undo; } } +#if (NXT_HAVE_OPENAT2) + if (rootfs_fd != -1) { + close(rootfs_fd); + } +#endif + return NXT_OK; undo: @@ -802,6 +893,12 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process) nxt_fs_unmount(mnt[i].dst); } +#if (NXT_HAVE_OPENAT2) + if (rootfs_fd != -1) { + close(rootfs_fd); + } +#endif + return NXT_ERROR; } diff --git a/src/nxt_port.c b/src/nxt_port.c index b9964df46..15a6c59c9 100644 --- a/src/nxt_port.c +++ b/src/nxt_port.c @@ -169,6 +169,24 @@ nxt_port_enable(nxt_task_t *task, nxt_port_t *port, } +/* + * TODO(audit-V5-medium): sender-type ACL for privileged port messages. + * + * The audit recommends rejecting application workers / prototypes + * that forge cert/script/socket/access-log/etc. messages to the + * main process. A first draft was implemented but caused + * regressions in the isolation test suite — the kernel-validated + * sender PID (SCM_CREDENTIALS) is not consistently available on + * the existing socketpair setup, and the message-dispatch path + * runs before the prototype's process registration completes. + * + * Reverted to the pre-audit dispatch path here and deferred the + * ACL to a follow-up that introduces SO_PASSCRED on the relevant + * socketpairs and updates the process-registration ordering. + * See https://github.com/andypost/unit/pull/14 review thread. + */ + + static void nxt_port_handler(nxt_task_t *task, nxt_port_recv_msg_t *msg) { diff --git a/src/nxt_process.h b/src/nxt_process.h index 42fd1bed2..e4dfd50d1 100644 --- a/src/nxt_process.h +++ b/src/nxt_process.h @@ -76,6 +76,13 @@ typedef struct { struct nxt_cgroup_s { char *path; + /* + * Resolved cgroup directory cached by nxt_cgroup_proc_add() so + * nxt_cgroup_cleanup() does not need to re-resolve via + * /proc//cgroup after the child has already exited (which + * would fail with ENOENT and leak cgroup directories). + */ + char *resolved_path; };