Skip to content

Commit 0212048

Browse files
authored
Merge pull request from GHSA-xr7r-f8xq-vfvv
fix GHSA-xr7r-f8xq-vfvv and harden fd leaks
2 parents 8454bbb + d8edada commit 0212048

File tree

7 files changed

+166
-38
lines changed

7 files changed

+166
-38
lines changed

libcontainer/cgroups/file.go

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -66,16 +66,16 @@ var (
6666
// TestMode is set to true by unit tests that need "fake" cgroupfs.
6767
TestMode bool
6868

69-
cgroupFd int = -1
70-
prepOnce sync.Once
71-
prepErr error
72-
resolveFlags uint64
69+
cgroupRootHandle *os.File
70+
prepOnce sync.Once
71+
prepErr error
72+
resolveFlags uint64
7373
)
7474

7575
func prepareOpenat2() error {
7676
prepOnce.Do(func() {
7777
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
78-
Flags: unix.O_DIRECTORY | unix.O_PATH,
78+
Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
7979
})
8080
if err != nil {
8181
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
@@ -86,15 +86,16 @@ func prepareOpenat2() error {
8686
}
8787
return
8888
}
89+
file := os.NewFile(uintptr(fd), cgroupfsDir)
90+
8991
var st unix.Statfs_t
90-
if err = unix.Fstatfs(fd, &st); err != nil {
92+
if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
9193
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
9294
logrus.Warnf("falling back to securejoin: %s", prepErr)
9395
return
9496
}
9597

96-
cgroupFd = fd
97-
98+
cgroupRootHandle = file
9899
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
99100
if st.Type == unix.CGROUP2_SUPER_MAGIC {
100101
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
@@ -121,29 +122,29 @@ func openFile(dir, file string, flags int) (*os.File, error) {
121122
return openFallback(path, flags, mode)
122123
}
123124

124-
fd, err := unix.Openat2(cgroupFd, relPath,
125+
fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
125126
&unix.OpenHow{
126127
Resolve: resolveFlags,
127128
Flags: uint64(flags) | unix.O_CLOEXEC,
128129
Mode: uint64(mode),
129130
})
130131
if err != nil {
131132
err = &os.PathError{Op: "openat2", Path: path, Err: err}
132-
// Check if cgroupFd is still opened to cgroupfsDir
133+
// Check if cgroupRootHandle is still opened to cgroupfsDir
133134
// (happens when this package is incorrectly used
134135
// across the chroot/pivot_root/mntns boundary, or
135136
// when /sys/fs/cgroup is remounted).
136137
//
137138
// TODO: if such usage will ever be common, amend this
138-
// to reopen cgroupFd and retry openat2.
139-
fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(cgroupFd))
139+
// to reopen cgroupRootHandle and retry openat2.
140+
fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(int(cgroupRootHandle.Fd())))
140141
defer closer()
141142
fdDest, _ := os.Readlink(fdPath)
142143
if fdDest != cgroupfsDir {
143-
// Wrap the error so it is clear that cgroupFd
144+
// Wrap the error so it is clear that cgroupRootHandle
144145
// is opened to an unexpected/wrong directory.
145-
err = fmt.Errorf("cgroupFd %d unexpectedly opened to %s != %s: %w",
146-
cgroupFd, fdDest, cgroupfsDir, err)
146+
err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
147+
cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
147148
}
148149
return nil, err
149150
}

libcontainer/container_linux.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,15 @@ func (c *Container) start(process *Process) (retErr error) {
332332
}()
333333
}
334334

335+
// Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
336+
// to make sure we don't leak any files into "runc init". Any files to be
337+
// passed to "runc init" through ExtraFiles will get dup2'd by the Go
338+
// runtime and thus their O_CLOEXEC flag will be cleared. This is some
339+
// additional protection against attacks like CVE-2024-21626, by making
340+
// sure we never leak files to "runc init" we didn't intend to.
341+
if err := utils.CloseExecFrom(3); err != nil {
342+
return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
343+
}
335344
if err := parent.start(); err != nil {
336345
return fmt.Errorf("unable to start container process: %w", err)
337346
}

libcontainer/init_linux.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"fmt"
88
"net"
99
"os"
10+
"path/filepath"
1011
"runtime"
1112
"runtime/debug"
1213
"strconv"
@@ -89,7 +90,7 @@ func Init() {
8990
}
9091
// Normally, StartInitialization() never returns, meaning
9192
// if we are here, it had failed.
92-
os.Exit(1)
93+
os.Exit(255)
9394
}
9495

9596
// Normally, this function does not return. If it returns, with or without an
@@ -274,6 +275,32 @@ func populateProcessEnvironment(env []string) error {
274275
return nil
275276
}
276277

278+
// verifyCwd ensures that the current directory is actually inside the mount
279+
// namespace root of the current process.
280+
func verifyCwd() error {
281+
// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
282+
// current mount namespace root, and in that case prefixes "(unreachable)"
283+
// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
284+
// when this happens and return ENOENT rather than returning a non-absolute
285+
// path. In both cases we can therefore easily detect if we have an invalid
286+
// cwd by checking the return value of getcwd(3). See getcwd(3) for more
287+
// details, and CVE-2024-21626 for the security issue that motivated this
288+
// check.
289+
//
290+
// We have to use unix.Getwd() here because os.Getwd() has a workaround for
291+
// $PWD which involves doing stat(.), which can fail if the current
292+
// directory is inaccessible to the container process.
293+
if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
294+
return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
295+
} else if err != nil {
296+
return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
297+
} else if !filepath.IsAbs(wd) {
298+
// We shouldn't ever hit this, but check just in case.
299+
return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
300+
}
301+
return nil
302+
}
303+
277304
// finalizeNamespace drops the caps, sets the correct user
278305
// and working dir, and closes any leaked file descriptors
279306
// before executing the command inside the namespace
@@ -332,6 +359,10 @@ func finalizeNamespace(config *initConfig) error {
332359
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
333360
}
334361
}
362+
// Make sure our final working directory is inside the container.
363+
if err := verifyCwd(); err != nil {
364+
return err
365+
}
335366
if err := system.ClearKeepCaps(); err != nil {
336367
return fmt.Errorf("unable to clear keep caps: %w", err)
337368
}

libcontainer/integration/seccomp_test.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import (
1313
libseccomp "github.com/seccomp/libseccomp-golang"
1414
)
1515

16-
func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
16+
func TestSeccompDenySyslogWithErrno(t *testing.T) {
1717
if testing.Short() {
1818
return
1919
}
@@ -25,7 +25,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
2525
DefaultAction: configs.Allow,
2626
Syscalls: []*configs.Syscall{
2727
{
28-
Name: "getcwd",
28+
Name: "syslog",
2929
Action: configs.Errno,
3030
ErrnoRet: &errnoRet,
3131
},
@@ -39,7 +39,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
3939
buffers := newStdBuffers()
4040
pwd := &libcontainer.Process{
4141
Cwd: "/",
42-
Args: []string{"pwd"},
42+
Args: []string{"dmesg"},
4343
Env: standardEnvironment,
4444
Stdin: buffers.Stdin,
4545
Stdout: buffers.Stdout,
@@ -65,17 +65,17 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
6565
}
6666

6767
if exitCode == 0 {
68-
t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
68+
t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
6969
}
7070

71-
expected := "pwd: getcwd: No such process"
71+
expected := "dmesg: klogctl: No such process"
7272
actual := strings.Trim(buffers.Stderr.String(), "\n")
7373
if actual != expected {
7474
t.Fatalf("Expected output %s but got %s\n", expected, actual)
7575
}
7676
}
7777

78-
func TestSeccompDenyGetcwd(t *testing.T) {
78+
func TestSeccompDenySyslog(t *testing.T) {
7979
if testing.Short() {
8080
return
8181
}
@@ -85,7 +85,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
8585
DefaultAction: configs.Allow,
8686
Syscalls: []*configs.Syscall{
8787
{
88-
Name: "getcwd",
88+
Name: "syslog",
8989
Action: configs.Errno,
9090
},
9191
},
@@ -98,7 +98,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
9898
buffers := newStdBuffers()
9999
pwd := &libcontainer.Process{
100100
Cwd: "/",
101-
Args: []string{"pwd"},
101+
Args: []string{"dmesg"},
102102
Env: standardEnvironment,
103103
Stdin: buffers.Stdin,
104104
Stdout: buffers.Stdout,
@@ -124,10 +124,10 @@ func TestSeccompDenyGetcwd(t *testing.T) {
124124
}
125125

126126
if exitCode == 0 {
127-
t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
127+
t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
128128
}
129129

130-
expected := "pwd: getcwd: Operation not permitted"
130+
expected := "dmesg: klogctl: Operation not permitted"
131131
actual := strings.Trim(buffers.Stderr.String(), "\n")
132132
if actual != expected {
133133
t.Fatalf("Expected output %s but got %s\n", expected, actual)

libcontainer/setns_init_linux.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"github.com/opencontainers/runc/libcontainer/keys"
1515
"github.com/opencontainers/runc/libcontainer/seccomp"
1616
"github.com/opencontainers/runc/libcontainer/system"
17+
"github.com/opencontainers/runc/libcontainer/utils"
1718
)
1819

1920
// linuxSetnsInit performs the container's initialization for running a new process
@@ -138,5 +139,23 @@ func (l *linuxSetnsInit) Init() error {
138139
l.config.Args[0] = name
139140
return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
140141
}
142+
// Close all file descriptors we are not passing to the container. This is
143+
// necessary because the execve target could use internal runc fds as the
144+
// execve path, potentially giving access to binary files from the host
145+
// (which can then be opened by container processes, leading to container
146+
// escapes). Note that because this operation will close any open file
147+
// descriptors that are referenced by (*os.File) handles from underneath
148+
// the Go runtime, we must not do any file operations after this point
149+
// (otherwise the (*os.File) finaliser could close the wrong file). See
150+
// CVE-2024-21626 for more information as to why this protection is
151+
// necessary.
152+
//
153+
// This is not needed for runc-dmz, because the extra execve(2) step means
154+
// that all O_CLOEXEC file descriptors have already been closed and thus
155+
// the second execve(2) from runc-dmz cannot access internal file
156+
// descriptors from runc.
157+
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
158+
return err
159+
}
141160
return system.Exec(name, l.config.Args, os.Environ())
142161
}

libcontainer/standard_init_linux.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,5 +281,23 @@ func (l *linuxStandardInit) Init() error {
281281
l.config.Args[0] = name
282282
return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ())
283283
}
284+
// Close all file descriptors we are not passing to the container. This is
285+
// necessary because the execve target could use internal runc fds as the
286+
// execve path, potentially giving access to binary files from the host
287+
// (which can then be opened by container processes, leading to container
288+
// escapes). Note that because this operation will close any open file
289+
// descriptors that are referenced by (*os.File) handles from underneath
290+
// the Go runtime, we must not do any file operations after this point
291+
// (otherwise the (*os.File) finaliser could close the wrong file). See
292+
// CVE-2024-21626 for more information as to why this protection is
293+
// necessary.
294+
//
295+
// This is not needed for runc-dmz, because the extra execve(2) step means
296+
// that all O_CLOEXEC file descriptors have already been closed and thus
297+
// the second execve(2) from runc-dmz cannot access internal file
298+
// descriptors from runc.
299+
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
300+
return err
301+
}
284302
return system.Exec(name, l.config.Args, os.Environ())
285303
}

libcontainer/utils/utils_unix.go

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"runtime"
1212
"strconv"
1313
"sync"
14+
_ "unsafe" // for go:linkname
1415

1516
securejoin "github.com/cyphar/filepath-securejoin"
1617
"github.com/sirupsen/logrus"
@@ -53,14 +54,11 @@ func haveCloseRangeCloexec() bool {
5354
return haveCloseRangeCloexecBool
5455
}
5556

56-
// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for
57-
// the process (except for those below the given fd value).
58-
func CloseExecFrom(minFd int) error {
59-
if haveCloseRangeCloexec() {
60-
err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
61-
return os.NewSyscallError("close_range", err)
62-
}
57+
type fdFunc func(fd int)
6358

59+
// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
60+
// the current process.
61+
func fdRangeFrom(minFd int, fn fdFunc) error {
6462
procSelfFd, closer := ProcThreadSelf("fd")
6563
defer closer()
6664

@@ -88,15 +86,67 @@ func CloseExecFrom(minFd int) error {
8886
if fd < minFd {
8987
continue
9088
}
91-
// Intentionally ignore errors from unix.CloseOnExec -- the cases where
92-
// this might fail are basically file descriptors that have already
93-
// been closed (including and especially the one that was created when
94-
// os.ReadDir did the "opendir" syscall).
95-
unix.CloseOnExec(fd)
89+
// Ignore the file descriptor we used for readdir, as it will be closed
90+
// when we return.
91+
if uintptr(fd) == fdDir.Fd() {
92+
continue
93+
}
94+
// Run the closure.
95+
fn(fd)
9696
}
9797
return nil
9898
}
9999

100+
// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
101+
// equal to minFd in the current process.
102+
func CloseExecFrom(minFd int) error {
103+
// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
104+
if haveCloseRangeCloexec() {
105+
err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
106+
return os.NewSyscallError("close_range", err)
107+
}
108+
// Otherwise, fall back to the standard loop.
109+
return fdRangeFrom(minFd, unix.CloseOnExec)
110+
}
111+
112+
//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
113+
114+
// In order to make sure we do not close the internal epoll descriptors the Go
115+
// runtime uses, we need to ensure that we skip descriptors that match
116+
// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
117+
// unfortunately there's no other way to be sure we're only keeping the file
118+
// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
119+
func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
120+
121+
// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
122+
// current process, except for those critical to Go's runtime (such as the
123+
// netpoll management descriptors).
124+
//
125+
// NOTE: That this function is incredibly dangerous to use in most Go code, as
126+
// closing file descriptors from underneath *os.File handles can lead to very
127+
// bad behaviour (the closed file descriptor can be re-used and then any
128+
// *os.File operations would apply to the wrong file). This function is only
129+
// intended to be called from the last stage of runc init.
130+
func UnsafeCloseFrom(minFd int) error {
131+
// We cannot use close_range(2) even if it is available, because we must
132+
// not close some file descriptors.
133+
return fdRangeFrom(minFd, func(fd int) {
134+
if runtime_IsPollDescriptor(uintptr(fd)) {
135+
// These are the Go runtimes internal netpoll file descriptors.
136+
// These file descriptors are operated on deep in the Go scheduler,
137+
// and closing those files from underneath Go can result in panics.
138+
// There is no issue with keeping them because they are not
139+
// executable and are not useful to an attacker anyway. Also we
140+
// don't have any choice.
141+
return
142+
}
143+
// There's nothing we can do about errors from close(2), and the
144+
// only likely error to be seen is EBADF which indicates the fd was
145+
// already closed (in which case, we got what we wanted).
146+
_ = unix.Close(fd)
147+
})
148+
}
149+
100150
// NewSockPair returns a new SOCK_STREAM unix socket pair.
101151
func NewSockPair(name string) (parent, child *os.File, err error) {
102152
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)

0 commit comments

Comments
 (0)