andypost · andypost · May 8, 2026 · gemini-code-assist · May 8, 2026
diff --git a/CHANGES b/CHANGES
@@ -14,6 +14,19 @@ Changes with FreeUnit 1.35.4                                     xx xxx 2026
     *) Bugfix: replace removed cgi module with email.parser in Python upload
        test fixture; fixes test suite compatibility with Python 3.13.
 
+    *) Security: tighten privilege boundaries on the control socket,
+       mount preparation, and cgroup placement (audit PR-C): require
+       peer UID match (SO_PEERCRED / getpeereid) before accepting
+       connections on the unix control socket; resolve mount
+       destinations with openat2(RESOLVE_BENEATH) to defeat symlink
+       races that escape the rootfs under attacker-controlled trees;
+       resolve relative cgroup paths against /proc/<child>/cgroup
+       instead of /proc/self/cgroup so the new process's cgroup view
+       (not the parent's) drives placement.  Audit V5 sender-type
+       ACL is deferred to a follow-up (needs SO_PASSCRED + process-
+       registration ordering rework; reverted draft caused isolation
+       test regressions, see PR #14 review thread).
+
 Changes with FreeUnit 1.35.3                                     07 Apr 2026
 
     *) Feature: migrate contrib package mirror from packages.nginx.org to

diff --git a/src/nxt_cgroup.c b/src/nxt_cgroup.c
@@ -9,9 +9,9 @@
 
 
 static int nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir,
-    char *cgpath);
+    char *cgpath, nxt_pid_t pid);
 static nxt_int_t nxt_mk_cgpath(nxt_task_t *task, const char *dir,
-    char *cgpath);
+    char *cgpath, nxt_pid_t pid);
 
 
 nxt_int_t
@@ -29,7 +29,16 @@ nxt_cgroup_proc_add(nxt_task_t *task, nxt_process_t *process)
         return NXT_OK;
     }
 
-    ret = nxt_mk_cgpath(task, process->isolation.cgroup.path, cgprocs);
+    /*
+     * Resolve the cgroup path against /proc/<child>/cgroup rather than
+     * /proc/self/cgroup: the parent's cgroup view may differ from the
+     * just-forked child's, particularly when CLONE_NEWCGROUP is in play
+     * and the configured path is relative.  Reading the child's own
+     * /proc entry avoids a TOCTOU where the parent moves between cgroups
+     * after fork() but before this write.
+     */
+    ret = nxt_mk_cgpath(task, process->isolation.cgroup.path, cgprocs,
+                        process->pid);
     if (nxt_slow_path(ret == NXT_ERROR)) {
         return NXT_ERROR;
     }
@@ -41,6 +50,18 @@ nxt_cgroup_proc_add(nxt_task_t *task, nxt_process_t *process)
 
     len = strlen(cgprocs);
 
+    /*
+     * Stash the resolved directory before appending "/cgroup.procs" so
+     * nxt_cgroup_cleanup() can rmdir without re-reading /proc/<pid>/cgroup,
+     * which is gone once the child has exited.
+     */
+    process->isolation.cgroup.resolved_path = nxt_mp_alloc(process->mem_pool,
+                                                           len + 1);
+    if (nxt_fast_path(process->isolation.cgroup.resolved_path != NULL)) {
+        nxt_memcpy(process->isolation.cgroup.resolved_path, cgprocs, len);
+        process->isolation.cgroup.resolved_path[len] = '\0';
+    }
+
     len = snprintf(cgprocs + len, NXT_MAX_PATH_LEN - len, "/cgroup.procs");
     if (nxt_slow_path(len >= NXT_MAX_PATH_LEN - len)) {
         nxt_errno = ENAMETOOLONG;
@@ -71,35 +92,72 @@ nxt_cgroup_cleanup(nxt_task_t *task, const nxt_process_t *process)
     char       cgroot[NXT_MAX_PATH_LEN], cgpath[NXT_MAX_PATH_LEN];
     nxt_int_t  ret;
 
-    ret = nxt_mk_cgpath(task, "", cgroot);
+    /*
+     * cgroot is the parent process's own cgroup directory; we must not
+     * rmdir it.  Resolved against /proc/self/cgroup (pid=0): the child
+     * is gone by the time cleanup runs, so /proc/<child_pid>/cgroup no
+     * longer exists.  The TOCTOU concern that motivated using the
+     * child's view in nxt_cgroup_proc_add() does not apply at cleanup
+     * — we just need a stop boundary, and rmdir on the parent's own
+     * cgroup will fail anyway because it is non-empty.
+     */
+    ret = nxt_mk_cgpath(task, "", cgroot, 0);
     if (nxt_slow_path(ret == NXT_ERROR)) {
         return;
     }
 
-    ret = nxt_mk_cgpath(task, process->isolation.cgroup.path, cgpath);
-    if (nxt_slow_path(ret == NXT_ERROR)) {
+    /*
+     * Use the resolved path cached by nxt_cgroup_proc_add(); falling
+     * back to /proc/<pid>/cgroup here would fail with ENOENT.  If the
+     * cache was missed (e.g. mp_alloc failure during add), there is
+     * nothing to clean up that we can address safely — bail out.
+     */
+    if (process->isolation.cgroup.resolved_path == NULL) {
+        return;
+    }
+
+    ret = snprintf(cgpath, sizeof(cgpath), "%s",
+                   process->isolation.cgroup.resolved_path);
+    if (nxt_slow_path(ret < 0 || (size_t) ret >= sizeof(cgpath))) {
         return;
     }
 
     while (*cgpath != '\0' && strcmp(cgroot, cgpath) != 0) {
         rmdir(cgpath);
         ptr = strrchr(cgpath, '/');
+        if (ptr == NULL) {
+            break;
+        }
         *ptr = '\0';
     }
 }
 
 
 static int
-nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath)
+nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath,
+    nxt_pid_t pid)
 {
     int         i, len;
     char        *buf, *ptr;
     FILE        *fp;
     size_t      size;
     ssize_t     nread;
     nxt_bool_t  found;
+    char        procpath[NXT_MAX_PATH_LEN];
+
+    if (pid > 0) {
+        len = snprintf(procpath, sizeof(procpath), "/proc/%d/cgroup",
+                       (int) pid);
+        if (len < 0 || (size_t) len >= sizeof(procpath)) {
+            nxt_errno = ENAMETOOLONG;
+            return -1;
+        }
+    } else {
+        nxt_memcpy(procpath, "/proc/self/cgroup",
+                   sizeof("/proc/self/cgroup"));
+    }
 
-    fp = nxt_file_fopen(task, "/proc/self/cgroup", "re");
+    fp = nxt_file_fopen(task, procpath, "re");
     if (nxt_slow_path(fp == NULL)) {
         return -1;
     }
@@ -145,7 +203,7 @@ nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath)
 
 
 static nxt_int_t
-nxt_mk_cgpath(nxt_task_t *task, const char *dir, char *cgpath)
+nxt_mk_cgpath(nxt_task_t *task, const char *dir, char *cgpath, nxt_pid_t pid)
 {
     int  len;
 
@@ -154,9 +212,12 @@ nxt_mk_cgpath(nxt_task_t *task, const char *dir, char *cgpath)
      * the cgroup path include the main unit processes cgroup. I.e
      *
      *   NXT_CGROUP_ROOT/<main process cgroup>/<cgroup path>
+     *
+     * pid: read /proc/<pid>/cgroup so the path reflects the just-forked
+     *      child's cgroup view (or 0 to fall back to /proc/self/cgroup).
      */
     if (dir[0] != '/') {
-        len = nxt_mk_cgpath_relative(task, dir, cgpath);
+        len = nxt_mk_cgpath_relative(task, dir, cgpath, pid);
     } else {
         len = snprintf(cgpath, NXT_MAX_PATH_LEN, NXT_CGROUP_ROOT "%s", dir);
     }

diff --git a/src/nxt_controller.c b/src/nxt_controller.c
@@ -709,6 +709,77 @@ nxt_runtime_controller_socket(nxt_task_t *task, nxt_runtime_t *rt)
 }
 
 
+static nxt_int_t
+nxt_controller_check_peer_cred(nxt_task_t *task, nxt_conn_t *c)
+{
+#if (NXT_HAVE_UNIX_DOMAIN)
+    /*
+     * The control socket is the privilege boundary for the REST API:
+     * filesystem perms on `control.unit.sock` are defense-in-depth, not the
+     * boundary itself.  Require the peer's effective UID to match unitd's
+     * (or be root) so a local user with directory-write permission cannot
+     * mutate config by hand-crafting connections.
+     */
+    if (c->remote == NULL
+        || c->remote->u.sockaddr.sa_family != AF_UNIX)
+    {
+        return NXT_OK;
+    }
+
+#if (NXT_HAVE_UCRED)
+    {
+        struct ucred  cred;
+        socklen_t     len = sizeof(cred);
+
+        if (getsockopt(c->socket.fd, SOL_SOCKET, SO_PEERCRED, &cred, &len)
+            != 0)
+        {
+            nxt_alert(task, "controller: SO_PEERCRED failed %E", nxt_errno);
+            return NXT_ERROR;
+        }
+
+        if (cred.uid != 0 && cred.uid != nxt_euid) {
+            nxt_alert(task, "controller: rejecting connection from uid %d "
+                      "(unitd uid %d); set socket permissions accordingly",
+                      (int) cred.uid, (int) nxt_euid);
+            return NXT_ERROR;
+        }
+
+        return NXT_OK;
+    }
+#elif (defined(__FreeBSD__) || defined(__APPLE__) || defined(__OpenBSD__) \
+       || defined(__NetBSD__) || defined(__DragonFly__))
+    {
+        uid_t  euid;
+        gid_t  egid;
+
+        if (getpeereid(c->socket.fd, &euid, &egid) != 0) {
+            nxt_alert(task, "controller: getpeereid failed %E", nxt_errno);
+            return NXT_ERROR;
+        }
+
+        if (euid != 0 && euid != nxt_euid) {
+            nxt_alert(task, "controller: rejecting connection from uid %d "
+                      "(unitd uid %d)", (int) euid, (int) nxt_euid);
+            return NXT_ERROR;
+        }
+
+        return NXT_OK;
+    }
+#else
+    /*
+     * No peer-credential primitive on this platform; rely on filesystem
+     * permissions of control.unit.sock.  Operators must restrict the path.
+     */
+    return NXT_OK;
+#endif
+
+#else  /* !NXT_HAVE_UNIX_DOMAIN */
+    return NXT_OK;
+#endif
+}
+
+
 static void
 nxt_controller_conn_init(nxt_task_t *task, void *obj, void *data)
 {
@@ -721,6 +792,11 @@ nxt_controller_conn_init(nxt_task_t *task, void *obj, void *data)
 
     nxt_debug(task, "controller conn init fd:%d", c->socket.fd);
 
+    if (nxt_slow_path(nxt_controller_check_peer_cred(task, c) != NXT_OK)) {
+        nxt_controller_conn_free(task, c, NULL);
+        return;
+    }
+
     r = nxt_mp_zget(c->mem_pool, sizeof(nxt_controller_request_t));
     if (nxt_slow_path(r == NULL)) {
         nxt_controller_conn_free(task, c, NULL);

diff --git a/src/nxt_isolation.c b/src/nxt_isolation.c
@@ -759,13 +759,44 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
     const u_char             *dst;
     nxt_fs_mount_t           *mnt;
     nxt_process_automount_t  *automount;
+#if (NXT_HAVE_OPENAT2)
+    int                      rootfs_fd;
+    const u_char             *rootfs_path;
+    size_t                   rootfs_len;
+#endif
 
     automount = &process->isolation.automount;
     mounts = process->isolation.mounts;
 
     n = mounts->nelts;
     mnt = mounts->elts;
 
+#if (NXT_HAVE_OPENAT2)
+    /*
+     * Mount destinations live under a (possibly user-owned) rootfs.  An
+     * attacker with write access to that tree can race mkdir->mount(2)
+     * by swapping a path component for a symlink pointing outside the
+     * rootfs.  Re-open the rootfs and resolve each destination with
+     * RESOLVE_BENEATH so a tampered destination that escapes the
+     * rootfs fails the open, before we hand a path to mount(2).
+     * RESOLVE_NO_SYMLINKS is intentionally NOT requested: legitimate
+     * rootfs setups commonly contain in-tree symlinks (e.g.
+     * /lib -> /usr/lib) that must still resolve.
+     */
+    rootfs_path = process->isolation.rootfs;
+    rootfs_len = (rootfs_path != NULL) ? nxt_strlen(rootfs_path) : 0;
+
+    rootfs_fd = -1;
+    if (rootfs_len > 0) {
+        rootfs_fd = open((const char *) rootfs_path,
+                         O_PATH | O_DIRECTORY | O_CLOEXEC);
+        if (rootfs_fd == -1) {
+            nxt_alert(task, "open rootfs(%s) %E", rootfs_path, nxt_errno);
+            return NXT_ERROR;
+        }
+    }
+#endif
+
     for (i = 0; i < n; i++) {
         dst = mnt[i].dst;
 
@@ -786,12 +817,72 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
             goto undo;
         }
 
+#if (NXT_HAVE_OPENAT2)
+        if (rootfs_fd != -1
+            && nxt_strlen(dst) > rootfs_len
+            && memcmp(dst, rootfs_path, rootfs_len) == 0)
+        {
+            struct open_how  how;
+            const char      *rel;
+            int              fd;
+
+            rel = (const char *) dst + rootfs_len;
+            while (*rel == '/') {
+                rel++;
+            }
+
+            nxt_memzero(&how, sizeof(how));
+            how.flags = O_PATH | O_CLOEXEC | O_DIRECTORY;
+            how.resolve = RESOLVE_BENEATH;
+
+            fd = syscall(SYS_openat2, rootfs_fd, rel, &how, sizeof(how));
+            if (fd == -1) {
+                if (nxt_errno == ENOSYS) {
+                    /*
+                     * openat2(2) is Linux 5.6+; older kernels return
+                     * ENOSYS even when the userspace headers are
+                     * present.  Skip the symlink-resolution check on
+                     * such kernels rather than failing every isolation
+                     * setup; mount(2) below still operates on the
+                     * caller-supplied path, just without the extra
+                     * RESOLVE_BENEATH guard.  Warn once so operators
+                     * see the reduced security posture.
+                     */
+                    static nxt_bool_t  openat2_warned;
+                    if (!openat2_warned) {
+                        nxt_log(task, NXT_LOG_WARN,
+                                "openat2(SYS_openat2) not supported by the "
+                                "running kernel; rootfs mount destinations "
+                                "are not validated against symlinks");
+                        openat2_warned = 1;
+                    }
+
+                } else {
+                    nxt_alert(task, "mount destination %s escapes rootfs %s "
+                              "or contains symlinks: %E", dst, rootfs_path,
+                              nxt_errno);
+                    ret = NXT_ERROR;
+                    goto undo;
+                }
+
+            } else {
+                close(fd);
+            }
+        }
+#endif
+
         ret = nxt_fs_mount(task, &mnt[i]);
         if (nxt_slow_path(ret != NXT_OK)) {
             goto undo;
         }
     }
 
+#if (NXT_HAVE_OPENAT2)
+    if (rootfs_fd != -1) {
+        close(rootfs_fd);
+    }
+#endif
+
     return NXT_OK;
 
 undo:
@@ -802,6 +893,12 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
         nxt_fs_unmount(mnt[i].dst);
     }
 
+#if (NXT_HAVE_OPENAT2)
+    if (rootfs_fd != -1) {
+        close(rootfs_fd);
+    }
+#endif
+
     return NXT_ERROR;
 }