diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -1017,9 +1017,9 @@ size_t namelen, onamelen, pnamelen; int created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; - int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; + int gotchildmax, gotenforce, gothid, gotpcpuset, gotrsnum, gotslevel; int deadid, jfd_in, jfd_out, jfd_pos, jid, jsys, len, level; - int childmax, osreldt, rsnum, slevel; + int childmax, osreldt, pcpusetid, rsnum, slevel; #ifdef INET int ip4s; bool redo_ip4; @@ -1156,6 +1156,15 @@ else gotchildmax = 1; + error = vfs_copyopt(opts, "cpuset.parent", &pcpusetid, + sizeof(pcpusetid)); + if (error == ENOENT) + gotpcpuset = 0; + else if (error != 0) + goto done_free; + else + gotpcpuset = 1; + error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; @@ -1819,10 +1828,29 @@ } #endif /* - * Allocate a dedicated cpuset for each jail. + * Allocate a dedicated cpuset for each jail. Our jail + * topology matches our cpuset topology by default, but we allow + * jails to be created with other cpusets that are visible + * to the creating process to create other hierarchies if they + * would prefer and know what they are doing. + * * Unlike other initial settings, this may return an error. 
*/ - error = cpuset_create_root(ppr, &pr->pr_cpuset); + if (gotpcpuset) { + struct cpuset *pcpuset; + + error = cpuset_which(CPU_WHICH_CPUSET, pcpusetid, NULL, + NULL, &pcpuset); + if (error) + goto done_deref; + + error = cpuset_create_root_from(pcpuset, + &pr->pr_cpuset); + cpuset_rel(pcpuset); + } else { + error = cpuset_create_root(ppr, &pr->pr_cpuset); + } + if (error) goto done_deref; @@ -1837,6 +1865,14 @@ prison_hold(pr); drflags |= PD_DEREF; } + + if (gotpcpuset) { + error = EINVAL; + vfs_opterror(opts, + "cpuset.parent cannot be changed after creation"); + goto done_deref; + } + #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { @@ -2660,6 +2696,10 @@ sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done; + error = vfs_setopt(opts, "cpuset.parent", + &pr->pr_cpuset->cs_parent->cs_id, sizeof(pr->pr_cpuset->cs_id)); + if (error != 0 && error != ENOENT) + goto done; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done; @@ -4993,6 +5033,8 @@ SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); +SYSCTL_JAIL_PARAM(_cpuset, parent, CTLTYPE_INT | CTLFLAG_RD, "I", + "Jail parent cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, diff --git a/usr.sbin/jail/jail.8 b/usr.sbin/jail/jail.8 --- a/usr.sbin/jail/jail.8 +++ b/usr.sbin/jail/jail.8 @@ -26,7 +26,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd November 13, 2025 +.Dd December 6, 2025 .Dt JAIL 8 .Os .Sh NAME @@ -501,6 +501,11 @@ pseudo-parameter set. .It Va cpuset.id The ID of the cpuset associated with this jail (read-only). +.It Va cpuset.parent +The ID of the parent of the cpuset associated with this jail.
+This may be used to create different cpuset topologies than the default scheme +that matches the jail layout. +The cpuset parent may be set at jail creation time, but is read-only afterwards. .It Va dying This is true if the jail is in the process of shutting down (read-only). .It Va parent diff --git a/usr.sbin/jail/tests/jail_basic_test.sh b/usr.sbin/jail/tests/jail_basic_test.sh --- a/usr.sbin/jail/tests/jail_basic_test.sh +++ b/usr.sbin/jail/tests/jail_basic_test.sh @@ -24,6 +24,19 @@ # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. +clean_jails() +{ + if [ ! -s jails.lst ]; then + return 0 + fi + + while read jail; do + if jls -c -j "$jail"; then + jail -r "$jail" + fi + done < jails.lst +} + atf_test_case "basic" "cleanup" basic_head() { @@ -165,6 +178,53 @@ fi } +atf_test_case "cpuset_parent" "cleanup" +cpuset_parent_head() +{ + atf_set descr 'cpuset.parent jail test' + atf_set require.user root + # XXX A require.jailed 'true/false' could be useful here if we want to + # start annotating test requirements like that. +} + +cpuset_parent_body() +{ + # We try hard not to assume the CPU IDs available to prison0, so we'll + # process cpuset(1) output. + cpus_avail=$(cpuset -gr -p $$ | \ + sed -n -Ee 's/^[^:]+: //' -e 's/,//g' -e 1p) + + set -- $cpus_avail + if [ $# -le 1 ]; then + atf_skip "This test requires a multi-core system" + fi + + # We'll remove the first cpu from the list and create a jail with the + # resulting cpuset as its parent. + shift + cpulist=$(echo -n "$@" | tr -s '[:space:]' ',') + + echo basejail >> jails.lst + + cat <<EOF > mkjail.sh +setid=\$(cpuset -gi -p \$$ | sed -Ee 's/^[^:]+:[[:space:]]+//') +jail -c name=basejail path=/ cpuset.parent=\$setid command=cpuset -gr +EOF + + # Run the script inside a new cpuset limited to $cpulist and confirm that + # the jail it creates reports that same list as its root cpuset mask.
+ fetchid="$(atf_get_srcdir)/cpuset_id.sh" + atf_check -o save:cpulist cpuset -cl "$cpulist" sh mkjail.sh + + rootcpus=$(sed -n -Ee 's/^[^:]+: //' -e 's/[[:space:]]+//g' -e 1p cpulist) + atf_check_equal "$cpulist" "$rootcpus" +} + +cpuset_parent_cleanup() +{ + clean_jails +} + atf_test_case "jid_name_set" "cleanup" jid_name_set_head() { @@ -191,18 +251,6 @@ echo "$jid" | tee -a jails.lst } -clean_jails() -{ - if [ ! -s jails.lst ]; then - return 0 - fi - - while read jail; do - if jls -c -j "$jail"; then - jail -r "$jail" - fi - done < jails.lst -} jid_name_set_body() { @@ -331,6 +379,7 @@ atf_add_test_case "list" atf_add_test_case "nested" atf_add_test_case "commands" + atf_add_test_case "cpuset_parent" atf_add_test_case "jid_name_set" atf_add_test_case "param_consistency" atf_add_test_case "setaudit"