steam: Allow the use of sub-namespaces

...and add a seccomp_unsafe profile just for these apps.

Steam Runtime uses Bubblewrap to isolate the games on its own. There is
no way to make Bubblewrap work in our containers other than to allow
unprivileged namespaces inside them.
This commit is contained in:
Peter Cai 2023-06-07 15:44:19 -04:00
parent ef3c3c001f
commit 92285258b0
5 changed files with 808 additions and 13 deletions

View file

@ -8,6 +8,4 @@ USER user
ENV PROTON_NO_FSYNC=1
# We disable CEF sandbox because we already have podman
# and our seccomp filters do not permit another layer of namespaces
ENTRYPOINT [ "/usr/games/steam", "-no-cef-sandbox" ]
ENTRYPOINT [ "/usr/games/steam" ]

View file

@ -1,4 +1,8 @@
#!/usr/bin/env bash
INVALIDATE_CACHE_UPSTREAM_UBUNTU="steam"
DESKTOP_FILE_PATH="/usr/share/applications/steam.desktop"
# Steam Runtime requires its own namespaces
# so we have to allow them in our seccomp filter.
# Don't use it if you don't trust their sandboxing (pressure-vessel).
UNSAFE_I_KNOW_WHAT_I_AM_DOING_ALLOW_NAMESPACES=true

775
assets/seccomp_unsafe.json Normal file
View file

@ -0,0 +1,775 @@
{
"defaultAction": "SCMP_ACT_ERRNO",
"defaultErrnoRet": 1,
"archMap": [
{
"architecture": "SCMP_ARCH_X86_64",
"subArchitectures": [
"SCMP_ARCH_X86",
"SCMP_ARCH_X32"
]
},
{
"architecture": "SCMP_ARCH_AARCH64",
"subArchitectures": [
"SCMP_ARCH_ARM"
]
},
{
"architecture": "SCMP_ARCH_MIPS64",
"subArchitectures": [
"SCMP_ARCH_MIPS",
"SCMP_ARCH_MIPS64N32"
]
},
{
"architecture": "SCMP_ARCH_MIPS64N32",
"subArchitectures": [
"SCMP_ARCH_MIPS",
"SCMP_ARCH_MIPS64"
]
},
{
"architecture": "SCMP_ARCH_MIPSEL64",
"subArchitectures": [
"SCMP_ARCH_MIPSEL",
"SCMP_ARCH_MIPSEL64N32"
]
},
{
"architecture": "SCMP_ARCH_MIPSEL64N32",
"subArchitectures": [
"SCMP_ARCH_MIPSEL",
"SCMP_ARCH_MIPSEL64"
]
},
{
"architecture": "SCMP_ARCH_S390X",
"subArchitectures": [
"SCMP_ARCH_S390"
]
},
{
"architecture": "SCMP_ARCH_RISCV64",
"subArchitectures": null
}
],
"syscalls": [
{
"names": [
"accept",
"accept4",
"access",
"adjtimex",
"alarm",
"bind",
"brk",
"capget",
"capset",
"chdir",
"chmod",
"chown",
"chown32",
"clock_adjtime",
"clock_adjtime64",
"clock_getres",
"clock_getres_time64",
"clock_gettime",
"clock_gettime64",
"clock_nanosleep",
"clock_nanosleep_time64",
"close",
"close_range",
"connect",
"copy_file_range",
"creat",
"dup",
"dup2",
"dup3",
"epoll_create",
"epoll_create1",
"epoll_ctl",
"epoll_ctl_old",
"epoll_pwait",
"epoll_pwait2",
"epoll_wait",
"epoll_wait_old",
"eventfd",
"eventfd2",
"execve",
"execveat",
"exit",
"exit_group",
"faccessat",
"faccessat2",
"fadvise64",
"fadvise64_64",
"fallocate",
"fanotify_mark",
"fchdir",
"fchmod",
"fchmodat",
"fchown",
"fchown32",
"fchownat",
"fcntl",
"fcntl64",
"fdatasync",
"fgetxattr",
"flistxattr",
"flock",
"fork",
"fremovexattr",
"fsetxattr",
"fstat",
"fstat64",
"fstatat64",
"fstatfs",
"fstatfs64",
"fsync",
"ftruncate",
"ftruncate64",
"futex",
"futex_time64",
"futex_waitv",
"futimesat",
"getcpu",
"getcwd",
"getdents",
"getdents64",
"getegid",
"getegid32",
"geteuid",
"geteuid32",
"getgid",
"getgid32",
"getgroups",
"getgroups32",
"getitimer",
"getpeername",
"getpgid",
"getpgrp",
"getpid",
"getppid",
"getpriority",
"getrandom",
"getresgid",
"getresgid32",
"getresuid",
"getresuid32",
"getrlimit",
"get_robust_list",
"getrusage",
"getsid",
"getsockname",
"getsockopt",
"get_thread_area",
"gettid",
"gettimeofday",
"getuid",
"getuid32",
"getxattr",
"inotify_add_watch",
"inotify_init",
"inotify_init1",
"inotify_rm_watch",
"io_cancel",
"ioctl",
"io_destroy",
"io_getevents",
"io_pgetevents",
"io_pgetevents_time64",
"ioprio_get",
"ioprio_set",
"io_setup",
"io_submit",
"io_uring_enter",
"io_uring_register",
"io_uring_setup",
"ipc",
"kill",
"landlock_add_rule",
"landlock_create_ruleset",
"landlock_restrict_self",
"lchown",
"lchown32",
"lgetxattr",
"link",
"linkat",
"listen",
"listxattr",
"llistxattr",
"_llseek",
"lremovexattr",
"lseek",
"lsetxattr",
"lstat",
"lstat64",
"madvise",
"membarrier",
"memfd_create",
"memfd_secret",
"mincore",
"mkdir",
"mkdirat",
"mknod",
"mknodat",
"mlock",
"mlock2",
"mlockall",
"mmap",
"mmap2",
"mprotect",
"mq_getsetattr",
"mq_notify",
"mq_open",
"mq_timedreceive",
"mq_timedreceive_time64",
"mq_timedsend",
"mq_timedsend_time64",
"mq_unlink",
"mremap",
"msgctl",
"msgget",
"msgrcv",
"msgsnd",
"msync",
"munlock",
"munlockall",
"munmap",
"nanosleep",
"newfstatat",
"_newselect",
"open",
"openat",
"openat2",
"pause",
"pidfd_open",
"pidfd_send_signal",
"pipe",
"pipe2",
"pkey_alloc",
"pkey_free",
"pkey_mprotect",
"poll",
"ppoll",
"ppoll_time64",
"prctl",
"pread64",
"preadv",
"preadv2",
"prlimit64",
"process_mrelease",
"pselect6",
"pselect6_time64",
"pwrite64",
"pwritev",
"pwritev2",
"read",
"readahead",
"readlink",
"readlinkat",
"readv",
"recv",
"recvfrom",
"recvmmsg",
"recvmmsg_time64",
"recvmsg",
"remap_file_pages",
"removexattr",
"rename",
"renameat",
"renameat2",
"restart_syscall",
"rmdir",
"rseq",
"rt_sigaction",
"rt_sigpending",
"rt_sigprocmask",
"rt_sigqueueinfo",
"rt_sigreturn",
"rt_sigsuspend",
"rt_sigtimedwait",
"rt_sigtimedwait_time64",
"rt_tgsigqueueinfo",
"sched_getaffinity",
"sched_getattr",
"sched_getparam",
"sched_get_priority_max",
"sched_get_priority_min",
"sched_getscheduler",
"sched_rr_get_interval",
"sched_rr_get_interval_time64",
"sched_setaffinity",
"sched_setattr",
"sched_setparam",
"sched_setscheduler",
"sched_yield",
"seccomp",
"select",
"semctl",
"semget",
"semop",
"semtimedop",
"semtimedop_time64",
"send",
"sendfile",
"sendfile64",
"sendmmsg",
"sendmsg",
"sendto",
"setfsgid",
"setfsgid32",
"setfsuid",
"setfsuid32",
"setgid",
"setgid32",
"setgroups",
"setgroups32",
"setitimer",
"setpgid",
"setpriority",
"setregid",
"setregid32",
"setresgid",
"setresgid32",
"setresuid",
"setresuid32",
"setreuid",
"setreuid32",
"setrlimit",
"set_robust_list",
"setsid",
"setsockopt",
"set_thread_area",
"set_tid_address",
"setuid",
"setuid32",
"setxattr",
"shmat",
"shmctl",
"shmdt",
"shmget",
"shutdown",
"sigaltstack",
"signalfd",
"signalfd4",
"sigprocmask",
"sigreturn",
"socketcall",
"socketpair",
"splice",
"stat",
"stat64",
"statfs",
"statfs64",
"statx",
"symlink",
"symlinkat",
"sync",
"sync_file_range",
"syncfs",
"sysinfo",
"tee",
"tgkill",
"time",
"timer_create",
"timer_delete",
"timer_getoverrun",
"timer_gettime",
"timer_gettime64",
"timer_settime",
"timer_settime64",
"timerfd_create",
"timerfd_gettime",
"timerfd_gettime64",
"timerfd_settime",
"timerfd_settime64",
"times",
"tkill",
"truncate",
"truncate64",
"ugetrlimit",
"umask",
"uname",
"unlink",
"unlinkat",
"utime",
"utimensat",
"utimensat_time64",
"utimes",
"vfork",
"vmsplice",
"wait4",
"waitid",
"waitpid",
"write",
"writev"
],
"action": "SCMP_ACT_ALLOW"
},
{
"names": [
"process_vm_readv",
"process_vm_writev",
"ptrace"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"minKernel": "4.8"
}
},
{
"names": [
"socket"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 40,
"op": "SCMP_CMP_NE"
}
]
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 0,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 8,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 131072,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 131080,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"personality"
],
"action": "SCMP_ACT_ALLOW",
"args": [
{
"index": 0,
"value": 4294967295,
"op": "SCMP_CMP_EQ"
}
]
},
{
"names": [
"sync_file_range2",
"swapcontext"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"ppc64le"
]
}
},
{
"names": [
"arm_fadvise64_64",
"arm_sync_file_range",
"sync_file_range2",
"breakpoint",
"cacheflush",
"set_tls"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"arm",
"arm64"
]
}
},
{
"names": [
"arch_prctl"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"amd64",
"x32"
]
}
},
{
"names": [
"modify_ldt"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"amd64",
"x32",
"x86"
]
}
},
{
"names": [
"s390_pci_mmio_read",
"s390_pci_mmio_write",
"s390_runtime_instr"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"s390",
"s390x"
]
}
},
{
"names": [
"riscv_flush_icache"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"arches": [
"riscv64"
]
}
},
{
"names": [
"open_by_handle_at"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_DAC_READ_SEARCH"
]
}
},
{
"names": [
"fanotify_init",
"fsconfig",
"fsmount",
"fsopen",
"fspick",
"lookup_dcookie",
"name_to_handle_at",
"open_tree",
"perf_event_open",
"quotactl",
"quotactl_fd",
"setdomainname",
"sethostname",
"setns",
"syslog"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_ADMIN"
]
}
},
{
"names": [
"clone",
"clone3",
"unshare",
"bpf",
"mount",
"umount",
"umount2",
"pivot_root"
],
"action": "SCMP_ACT_ALLOW",
"comment": "Syscalls required by containers inside our containers (e.g. Steam Runtime); DO NOT USE unless the app is trusted to perform sandboxing correctly"
},
{
"names": [
"reboot"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_BOOT"
]
}
},
{
"names": [
"chroot"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_CHROOT"
]
}
},
{
"names": [
"delete_module",
"init_module",
"finit_module"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_MODULE"
]
}
},
{
"names": [
"acct"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_PACCT"
]
}
},
{
"names": [
"kcmp",
"pidfd_getfd",
"process_madvise",
"process_vm_readv",
"process_vm_writev",
"ptrace"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_PTRACE"
]
}
},
{
"names": [
"iopl",
"ioperm"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_RAWIO"
]
}
},
{
"names": [
"settimeofday",
"stime",
"clock_settime",
"clock_settime64"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_TIME"
]
}
},
{
"names": [
"vhangup"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_TTY_CONFIG"
]
}
},
{
"names": [
"get_mempolicy",
"mbind",
"set_mempolicy"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYS_NICE"
]
}
},
{
"names": [
"syslog"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_SYSLOG"
]
}
},
{
"names": [
"bpf"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_BPF"
]
}
},
{
"names": [
"perf_event_open"
],
"action": "SCMP_ACT_ALLOW",
"includes": {
"caps": [
"CAP_PERFMON"
]
}
}
]
}

View file

@ -24,6 +24,17 @@ if container_exists "$container_name"; then
exit 0
fi
# Load app control file because some apps require run-time customization
# TODO: Maybe these things should really be container labels?
[ -f "$script_path/apps/$1/control" ] && . "$script_path/apps/$1/control"
if [ "$UNSAFE_I_KNOW_WHAT_I_AM_DOING_ALLOW_NAMESPACES" == "true" ]; then
log "Enabling sub-namespaces support inside this container"
log "This is considered UNSAFE; DO NOT USE if the app inside container does not do its own sandboxing"
log "DO NOT USE if you don't trust sandboxing done by the app inside"
update_podman_security_args "seccomp_unsafe.json"
fi
if [[ -n $PULSE_SERVER ]]; then # remove prefix
host_pulse=${PULSE_SERVER#unix:}
else # default guess

View file

@ -9,16 +9,23 @@ DOBU_TMP=/tmp/dobu
. "$script_path/config-default.sh"
[ -f "$script_path/config.sh" ] && . "$script_path/config.sh"
# Default security-related arguments ALWAYS passed to podman
# Install a seccomp filter that disallows sub-namespaces which could lead to exploits
# and use the keep-id mode of userns, such that the user 1100 is mapped to the
# current host user, and the root user inside the namespace is mapped to an unrelated
# large uid on the host.
podman_security_args=(
--security-opt
seccomp="$script_path/assets/seccomp.json"
--userns=keep-id:uid=1100,gid=1100
)
# This function exists so that we can update the seccomp profile used
update_podman_security_args() {
local seccomp_profile="$1"
[ -z "$seccomp_profile" ] && seccomp_profile="seccomp.json"
# Default security-related arguments ALWAYS passed to podman
# Install a seccomp filter that disallows sub-namespaces which could lead to exploits
# and use the keep-id mode of userns, such that the user 1100 is mapped to the
# current host user, and the root user inside the namespace is mapped to an unrelated
# large uid on the host.
podman_security_args=(
--security-opt
seccomp="$script_path/assets/$seccomp_profile"
--userns=keep-id:uid=1100,gid=1100
)
}
update_podman_security_args
assert_prerequisites() {
command -v podman >/dev/null 2>&1 || die "Podman is required"