Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,19 @@ Changes with FreeUnit 1.35.4 xx xxx 2026
*) Bugfix: replace removed cgi module with email.parser in Python upload
test fixture; fixes test suite compatibility with Python 3.13.

*) Security: tighten privilege boundaries on the control socket,
mount preparation, and cgroup placement (audit PR-C): require
peer UID match (SO_PEERCRED / getpeereid) before accepting
connections on the unix control socket; resolve mount
destinations with openat2(RESOLVE_BENEATH) to defeat symlink
races that escape the rootfs under attacker-controlled trees;
resolve relative cgroup paths against /proc/<child>/cgroup
instead of /proc/self/cgroup so the new process's cgroup view
(not the parent's) drives placement. Audit V5 sender-type
ACL is deferred to a follow-up (needs SO_PASSCRED + process-
registration ordering rework; reverted draft caused isolation
test regressions, see PR #14 review thread).

Changes with FreeUnit 1.35.3 07 Apr 2026

*) Feature: migrate contrib package mirror from packages.nginx.org to
Expand Down
81 changes: 71 additions & 10 deletions src/nxt_cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@


static int nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir,
char *cgpath);
char *cgpath, nxt_pid_t pid);
static nxt_int_t nxt_mk_cgpath(nxt_task_t *task, const char *dir,
char *cgpath);
char *cgpath, nxt_pid_t pid);


nxt_int_t
Expand All @@ -29,7 +29,16 @@ nxt_cgroup_proc_add(nxt_task_t *task, nxt_process_t *process)
return NXT_OK;
}

ret = nxt_mk_cgpath(task, process->isolation.cgroup.path, cgprocs);
/*
* Resolve the cgroup path against /proc/<child>/cgroup rather than
* /proc/self/cgroup: the parent's cgroup view may differ from the
* just-forked child's, particularly when CLONE_NEWCGROUP is in play
* and the configured path is relative. Reading the child's own
* /proc entry avoids a TOCTOU where the parent moves between cgroups
* after fork() but before this write.
*/
ret = nxt_mk_cgpath(task, process->isolation.cgroup.path, cgprocs,
process->pid);
if (nxt_slow_path(ret == NXT_ERROR)) {
return NXT_ERROR;
}
Expand All @@ -41,6 +50,18 @@ nxt_cgroup_proc_add(nxt_task_t *task, nxt_process_t *process)

len = strlen(cgprocs);

/*
* Stash the resolved directory before appending "/cgroup.procs" so
* nxt_cgroup_cleanup() can rmdir without re-reading /proc/<pid>/cgroup,
* which is gone once the child has exited.
*/
process->isolation.cgroup.resolved_path = nxt_mp_alloc(process->mem_pool,
len + 1);
if (nxt_fast_path(process->isolation.cgroup.resolved_path != NULL)) {
nxt_memcpy(process->isolation.cgroup.resolved_path, cgprocs, len);
process->isolation.cgroup.resolved_path[len] = '\0';
}

len = snprintf(cgprocs + len, NXT_MAX_PATH_LEN - len, "/cgroup.procs");
if (nxt_slow_path(len >= NXT_MAX_PATH_LEN - len)) {
nxt_errno = ENAMETOOLONG;
Expand Down Expand Up @@ -71,35 +92,72 @@ nxt_cgroup_cleanup(nxt_task_t *task, const nxt_process_t *process)
char cgroot[NXT_MAX_PATH_LEN], cgpath[NXT_MAX_PATH_LEN];
nxt_int_t ret;

ret = nxt_mk_cgpath(task, "", cgroot);
/*
* cgroot is the parent process's own cgroup directory; we must not
* rmdir it. Resolved against /proc/self/cgroup (pid=0): the child
* is gone by the time cleanup runs, so /proc/<child_pid>/cgroup no
* longer exists. The TOCTOU concern that motivated using the
* child's view in nxt_cgroup_proc_add() does not apply at cleanup
* — we just need a stop boundary, and rmdir on the parent's own
* cgroup will fail anyway because it is non-empty.
*/
ret = nxt_mk_cgpath(task, "", cgroot, 0);
if (nxt_slow_path(ret == NXT_ERROR)) {
return;
}

ret = nxt_mk_cgpath(task, process->isolation.cgroup.path, cgpath);
if (nxt_slow_path(ret == NXT_ERROR)) {
/*
* Use the resolved path cached by nxt_cgroup_proc_add(); falling
* back to /proc/<pid>/cgroup here would fail with ENOENT. If the
* cache was missed (e.g. mp_alloc failure during add), there is
* nothing to clean up that we can address safely — bail out.
*/
if (process->isolation.cgroup.resolved_path == NULL) {
return;
}

ret = snprintf(cgpath, sizeof(cgpath), "%s",
process->isolation.cgroup.resolved_path);
if (nxt_slow_path(ret < 0 || (size_t) ret >= sizeof(cgpath))) {
return;
}

while (*cgpath != '\0' && strcmp(cgroot, cgpath) != 0) {
rmdir(cgpath);
ptr = strrchr(cgpath, '/');
if (ptr == NULL) {
break;
}
*ptr = '\0';
}
}


static int
nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath)
nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath,
nxt_pid_t pid)
{
int i, len;
char *buf, *ptr;
FILE *fp;
size_t size;
ssize_t nread;
nxt_bool_t found;
char procpath[NXT_MAX_PATH_LEN];

if (pid > 0) {
len = snprintf(procpath, sizeof(procpath), "/proc/%d/cgroup",
(int) pid);
if (len < 0 || (size_t) len >= sizeof(procpath)) {
nxt_errno = ENAMETOOLONG;
return -1;
}
} else {
nxt_memcpy(procpath, "/proc/self/cgroup",
sizeof("/proc/self/cgroup"));
}

fp = nxt_file_fopen(task, "/proc/self/cgroup", "re");
fp = nxt_file_fopen(task, procpath, "re");
if (nxt_slow_path(fp == NULL)) {
return -1;
}
Expand Down Expand Up @@ -145,7 +203,7 @@ nxt_mk_cgpath_relative(nxt_task_t *task, const char *dir, char *cgpath)


static nxt_int_t
nxt_mk_cgpath(nxt_task_t *task, const char *dir, char *cgpath)
nxt_mk_cgpath(nxt_task_t *task, const char *dir, char *cgpath, nxt_pid_t pid)
{
int len;

Expand All @@ -154,9 +212,12 @@ nxt_mk_cgpath(nxt_task_t *task, const char *dir, char *cgpath)
* the cgroup path include the main unit processes cgroup. I.e
*
* NXT_CGROUP_ROOT/<main process cgroup>/<cgroup path>
*
* pid: read /proc/<pid>/cgroup so the path reflects the just-forked
* child's cgroup view (or 0 to fall back to /proc/self/cgroup).
*/
if (dir[0] != '/') {
len = nxt_mk_cgpath_relative(task, dir, cgpath);
len = nxt_mk_cgpath_relative(task, dir, cgpath, pid);
} else {
len = snprintf(cgpath, NXT_MAX_PATH_LEN, NXT_CGROUP_ROOT "%s", dir);
}
Expand Down
76 changes: 76 additions & 0 deletions src/nxt_controller.c
Original file line number Diff line number Diff line change
Expand Up @@ -709,6 +709,77 @@ nxt_runtime_controller_socket(nxt_task_t *task, nxt_runtime_t *rt)
}


static nxt_int_t
nxt_controller_check_peer_cred(nxt_task_t *task, nxt_conn_t *c)
{
#if (NXT_HAVE_UNIX_DOMAIN)
/*
* The control socket is the privilege boundary for the REST API:
* filesystem perms on `control.unit.sock` are defense-in-depth, not the
* boundary itself. Require the peer's effective UID to match unitd's
* (or be root) so a local user with directory-write permission cannot
* mutate config by hand-crafting connections.
*/
if (c->remote == NULL
|| c->remote->u.sockaddr.sa_family != AF_UNIX)
{
return NXT_OK;
}

#if (NXT_HAVE_UCRED)
{
struct ucred cred;
socklen_t len = sizeof(cred);

if (getsockopt(c->socket.fd, SOL_SOCKET, SO_PEERCRED, &cred, &len)
!= 0)
{
nxt_alert(task, "controller: SO_PEERCRED failed %E", nxt_errno);
return NXT_ERROR;
}

if (cred.uid != 0 && cred.uid != nxt_euid) {
nxt_alert(task, "controller: rejecting connection from uid %d "
"(unitd uid %d); set socket permissions accordingly",
(int) cred.uid, (int) nxt_euid);
return NXT_ERROR;
}

return NXT_OK;
}
#elif (defined(__FreeBSD__) || defined(__APPLE__) || defined(__OpenBSD__) \
|| defined(__NetBSD__) || defined(__DragonFly__))
{
uid_t euid;
gid_t egid;

if (getpeereid(c->socket.fd, &euid, &egid) != 0) {
nxt_alert(task, "controller: getpeereid failed %E", nxt_errno);
return NXT_ERROR;
}

if (euid != 0 && euid != nxt_euid) {
nxt_alert(task, "controller: rejecting connection from uid %d "
"(unitd uid %d)", (int) euid, (int) nxt_euid);
return NXT_ERROR;
}

return NXT_OK;
}
#else
/*
* No peer-credential primitive on this platform; rely on filesystem
* permissions of control.unit.sock. Operators must restrict the path.
*/
return NXT_OK;
#endif

#else /* !NXT_HAVE_UNIX_DOMAIN */
return NXT_OK;
#endif
}


static void
nxt_controller_conn_init(nxt_task_t *task, void *obj, void *data)
{
Expand All @@ -721,6 +792,11 @@ nxt_controller_conn_init(nxt_task_t *task, void *obj, void *data)

nxt_debug(task, "controller conn init fd:%d", c->socket.fd);

if (nxt_slow_path(nxt_controller_check_peer_cred(task, c) != NXT_OK)) {
nxt_controller_conn_free(task, c, NULL);
return;
}

r = nxt_mp_zget(c->mem_pool, sizeof(nxt_controller_request_t));
if (nxt_slow_path(r == NULL)) {
nxt_controller_conn_free(task, c, NULL);
Expand Down
97 changes: 97 additions & 0 deletions src/nxt_isolation.c
Original file line number Diff line number Diff line change
Expand Up @@ -759,13 +759,44 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
const u_char *dst;
nxt_fs_mount_t *mnt;
nxt_process_automount_t *automount;
#if (NXT_HAVE_OPENAT2)
int rootfs_fd;
const u_char *rootfs_path;
size_t rootfs_len;
#endif

automount = &process->isolation.automount;
mounts = process->isolation.mounts;

n = mounts->nelts;
mnt = mounts->elts;

#if (NXT_HAVE_OPENAT2)
/*
* Mount destinations live under a (possibly user-owned) rootfs. An
* attacker with write access to that tree can race mkdir->mount(2)
* by swapping a path component for a symlink pointing outside the
* rootfs. Re-open the rootfs and resolve each destination with
* RESOLVE_BENEATH so a tampered destination that escapes the
* rootfs fails the open, before we hand a path to mount(2).
* RESOLVE_NO_SYMLINKS is intentionally NOT requested: legitimate
* rootfs setups commonly contain in-tree symlinks (e.g.
* /lib -> /usr/lib) that must still resolve.
*/
rootfs_path = process->isolation.rootfs;
rootfs_len = (rootfs_path != NULL) ? nxt_strlen(rootfs_path) : 0;

rootfs_fd = -1;
if (rootfs_len > 0) {
rootfs_fd = open((const char *) rootfs_path,
O_PATH | O_DIRECTORY | O_CLOEXEC);
if (rootfs_fd == -1) {
nxt_alert(task, "open rootfs(%s) %E", rootfs_path, nxt_errno);
return NXT_ERROR;
}
}
#endif

for (i = 0; i < n; i++) {
dst = mnt[i].dst;

Expand All @@ -786,12 +817,72 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
goto undo;
}

#if (NXT_HAVE_OPENAT2)
if (rootfs_fd != -1
&& nxt_strlen(dst) > rootfs_len
&& memcmp(dst, rootfs_path, rootfs_len) == 0)
{
struct open_how how;
const char *rel;
int fd;

rel = (const char *) dst + rootfs_len;
while (*rel == '/') {
rel++;
}

nxt_memzero(&how, sizeof(how));
how.flags = O_PATH | O_CLOEXEC | O_DIRECTORY;
how.resolve = RESOLVE_BENEATH;

fd = syscall(SYS_openat2, rootfs_fd, rel, &how, sizeof(how));
if (fd == -1) {
if (nxt_errno == ENOSYS) {
/*
* openat2(2) is Linux 5.6+; older kernels return
* ENOSYS even when the userspace headers are
* present. Skip the symlink-resolution check on
* such kernels rather than failing every isolation
* setup; mount(2) below still operates on the
* caller-supplied path, just without the extra
* RESOLVE_BENEATH guard. Warn once so operators
* see the reduced security posture.
*/
static nxt_bool_t openat2_warned;
if (!openat2_warned) {
nxt_log(task, NXT_LOG_WARN,
"openat2(SYS_openat2) not supported by the "
"running kernel; rootfs mount destinations "
"are not validated against symlinks");
openat2_warned = 1;
}

} else {
nxt_alert(task, "mount destination %s escapes rootfs %s "
"or contains symlinks: %E", dst, rootfs_path,
nxt_errno);
ret = NXT_ERROR;
goto undo;
}

} else {
close(fd);
}
Comment on lines +838 to +870
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The syscall(SYS_openat2, ...) call does not handle the ENOSYS error. If Unit is compiled on a system where openat2 is supported in the headers but run on an older kernel that lacks the syscall, applications with rootfs isolation will fail to start with an alert.

It would be safer to check for ENOSYS and either provide a more descriptive error message or fall back to a different validation method if the security policy allows.

}
#endif

ret = nxt_fs_mount(task, &mnt[i]);
if (nxt_slow_path(ret != NXT_OK)) {
goto undo;
}
}

#if (NXT_HAVE_OPENAT2)
if (rootfs_fd != -1) {
close(rootfs_fd);
}
#endif

return NXT_OK;

undo:
Expand All @@ -802,6 +893,12 @@ nxt_isolation_prepare_rootfs(nxt_task_t *task, nxt_process_t *process)
nxt_fs_unmount(mnt[i].dst);
}

#if (NXT_HAVE_OPENAT2)
if (rootfs_fd != -1) {
close(rootfs_fd);
}
#endif

return NXT_ERROR;
}

Expand Down
Loading
Loading