overhaul posix_spawn to use CLONE_VM instead of vfork

the proposed change was described in detail in detail previously on
the mailing list. in short, vfork is unsafe because:

1. the compiler could make optimizations that cause the child to
clobber the parent's local vars.

2. strace is buggy and allows the vforking parent to run before the
child execs when run under strace.

the new design uses a close-on-exec pipe instead of vfork semantics to
synchronize the parent and child so that the parent does not return
before the child has finished using its arguments (and now, also its
stack). this also allows reporting exec failures to the caller instead
of giving the caller a child that mysteriously exits with status 127
on exec error.

basic testing has been performed on both the success and failure code
paths. further testing should be done.
This commit is contained in:
Rich Felker
2013-02-03 16:42:40 -05:00
parent 89d3df6e54
commit fb6b159d9e
3 changed files with 122 additions and 52 deletions

View File

@ -4,7 +4,7 @@
struct fdop { struct fdop {
struct fdop *next, *prev; struct fdop *next, *prev;
int cmd, fd, newfd, oflag; int cmd, fd, srcfd, oflag;
mode_t mode; mode_t mode;
char path[]; char path[];
}; };

View File

@ -1,51 +1,47 @@
#define _GNU_SOURCE
#include <spawn.h> #include <spawn.h>
#include <sched.h>
#include <unistd.h> #include <unistd.h>
#include <signal.h> #include <signal.h>
#include <stdint.h>
#include <fcntl.h> #include <fcntl.h>
#include <sys/wait.h>
#include "syscall.h" #include "syscall.h"
#include "pthread_impl.h" #include "pthread_impl.h"
#include "fdop.h" #include "fdop.h"
#include "libc.h" #include "libc.h"
extern char **environ;
static void dummy_0() static void dummy_0()
{ {
} }
weak_alias(dummy_0, __acquire_ptc); weak_alias(dummy_0, __acquire_ptc);
weak_alias(dummy_0, __release_ptc); weak_alias(dummy_0, __release_ptc);
pid_t __vfork(void); struct args {
int p[2];
int __posix_spawnx(pid_t *restrict res, const char *restrict path,
int (*exec)(const char *, char *const *, char *const *),
const posix_spawn_file_actions_t *fa,
const posix_spawnattr_t *restrict attr,
char *const argv[restrict], char *const envp[restrict])
{
pid_t pid;
sigset_t oldmask; sigset_t oldmask;
int i; const char *path;
posix_spawnattr_t dummy_attr = { 0 }; int (*exec)(const char *, char *const *, char *const *);
const posix_spawn_file_actions_t *fa;
const posix_spawnattr_t *restrict attr;
char *const *argv, *const *envp;
};
if (!attr) attr = &dummy_attr; static int child(void *args_vp)
{
int i, ret;
struct sigaction sa;
struct args *args = args_vp;
int p = args->p[1];
const posix_spawn_file_actions_t *fa = args->fa;
const posix_spawnattr_t *restrict attr = args->attr;
sigprocmask(SIG_BLOCK, SIGALL_SET, &oldmask); close(args->p[0]);
__acquire_ptc(); /* All signal dispositions must be either SIG_DFL or SIG_IGN
pid = __vfork(); * before signals are unblocked. Otherwise a signal handler
* from the parent might get run in the child while sharing
if (pid) { * memory, with unpredictable and dangerous results. */
__release_ptc(); for (i=1; i<_NSIG; i++) {
sigprocmask(SIG_SETMASK, &oldmask, 0);
if (pid < 0) return -pid;
*res = pid;
return 0;
}
for (i=1; i<=8*__SYSCALL_SSLEN; i++) {
struct sigaction sa;
__libc_sigaction(i, 0, &sa); __libc_sigaction(i, 0, &sa);
if (sa.sa_handler!=SIG_DFL && (sa.sa_handler!=SIG_IGN || if (sa.sa_handler!=SIG_DFL && (sa.sa_handler!=SIG_IGN ||
((attr->__flags & POSIX_SPAWN_SETSIGDEF) ((attr->__flags & POSIX_SPAWN_SETSIGDEF)
@ -55,50 +51,124 @@ int __posix_spawnx(pid_t *restrict res, const char *restrict path,
} }
} }
if ((attr->__flags&POSIX_SPAWN_SETPGROUP) && setpgid(0, attr->__pgrp)) if (attr->__flags & POSIX_SPAWN_SETPGROUP)
_exit(127); if ((ret=__syscall(SYS_setpgid, 0, attr->__pgrp)))
goto fail;
/* Use syscalls directly because pthread state is not consistent /* Use syscalls directly because pthread state because the
* for making calls to the library wrappers... */ * library functions attempt to do a multi-threaded synchronized
if ((attr->__flags&POSIX_SPAWN_RESETIDS) && ( * id-change, which would trash the parent's state. */
__syscall(SYS_setgid, __syscall(SYS_getgid)) || if (attr->__flags & POSIX_SPAWN_RESETIDS)
__syscall(SYS_setuid, __syscall(SYS_getuid)) )) if ((ret=__syscall(SYS_setgid, __syscall(SYS_getgid))) ||
_exit(127); (ret=__syscall(SYS_setuid, __syscall(SYS_getuid))) )
goto fail;
if (fa && fa->__actions) { if (fa && fa->__actions) {
struct fdop *op; struct fdop *op;
int ret, fd; int fd;
for (op = fa->__actions; op->next; op = op->next); for (op = fa->__actions; op->next; op = op->next);
for (; op; op = op->prev) { for (; op; op = op->prev) {
/* It's possible that a file operation would clobber
* the pipe fd used for synchronizing with the
* parent. To avoid that, we dup the pipe onto
* an unoccupied fd. */
if (op->fd == p) {
ret = __syscall(SYS_dup, p);
if (ret < 0) goto fail;
__syscall(SYS_close, p);
p = ret;
}
switch(op->cmd) { switch(op->cmd) {
case FDOP_CLOSE: case FDOP_CLOSE:
ret = __syscall(SYS_close, op->fd); if ((ret=__syscall(SYS_close, op->fd)))
goto fail;
break; break;
case FDOP_DUP2: case FDOP_DUP2:
ret = __syscall(SYS_dup2, op->fd, op->newfd)<0; if ((ret=__syscall(SYS_dup2, op->srcfd, op->fd))<0)
goto fail;
break; break;
case FDOP_OPEN: case FDOP_OPEN:
fd = __syscall(SYS_open, op->path, fd = __syscall(SYS_open, op->path,
op->oflag | O_LARGEFILE, op->mode); op->oflag | O_LARGEFILE, op->mode);
if (fd == op->fd) { if ((ret=fd) < 0) goto fail;
ret = 0; if (fd != op->fd) {
} else { if ((ret=__syscall(SYS_dup2, fd, op->fd))<0)
ret = __syscall(SYS_dup2, fd, op->fd)<0; goto fail;
__syscall(SYS_close, fd); __syscall(SYS_close, fd);
} }
break; break;
} }
if (ret) _exit(127);
} }
} }
sigprocmask(SIG_SETMASK, (attr->__flags & POSIX_SPAWN_SETSIGMASK) /* Close-on-exec flag may have been lost if we moved the pipe
? &attr->__mask : &oldmask, 0); * to a different fd. We don't use F_DUPFD_CLOEXEC above because
* it would fail on older kernels and atomicity is not needed --
* in this process there are no threads or signal handlers. */
__syscall(SYS_fcntl, p, F_SETFD, FD_CLOEXEC);
exec(path, argv, envp ? envp : environ); pthread_sigmask(SIG_SETMASK, (attr->__flags & POSIX_SPAWN_SETSIGMASK)
? &attr->__mask : &args->oldmask, 0);
args->exec(args->path, args->argv, args->envp);
fail:
/* Since sizeof errno < PIPE_BUF, the write is atomic. */
ret = -ret;
if (ret) while (write(p, &ret, sizeof ret) < 0);
_exit(127); _exit(127);
}
return 0;
int __posix_spawnx(pid_t *restrict res, const char *restrict path,
int (*exec)(const char *, char *const *, char *const *),
const posix_spawn_file_actions_t *fa,
const posix_spawnattr_t *restrict attr,
char *const argv[restrict], char *const envp[restrict])
{
pid_t pid;
char stack[1024];
int ec=0, cs;
struct args args;
if (pipe2(args.p, O_CLOEXEC))
return errno;
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
args.path = path;
args.exec = exec;
args.fa = fa;
args.attr = attr ? attr : &(const posix_spawnattr_t){0};
args.argv = argv;
args.envp = envp;
pthread_sigmask(SIG_BLOCK, SIGALL_SET, &args.oldmask);
/* This lock prevents setuid/setgid operations while the parent
* is sharing memory with the child. Situations where processes
* with different permissions share VM are fundamentally unsafe. */
__acquire_ptc();
pid = __clone(child, stack+sizeof stack, CLONE_VM|SIGCHLD, &args);
close(args.p[1]);
if (pid > 0) {
if (read(args.p[0], &ec, sizeof ec) < sizeof ec) ec = 0;
else waitpid(pid, &(int){0}, 0);
} else {
ec = -pid;
}
/* At this point, the child has either exited or successfully
* performed exec, so the lock may be released. */
__release_ptc();
close(args.p[0]);
if (!ec) *res = pid;
pthread_sigmask(SIG_SETMASK, &args.oldmask, 0);
pthread_setcancelstate(cs, 0);
return ec;
} }
int posix_spawn(pid_t *restrict res, const char *restrict path, int posix_spawn(pid_t *restrict res, const char *restrict path,

View File

@ -3,13 +3,13 @@
#include <errno.h> #include <errno.h>
#include "fdop.h" #include "fdop.h"
int posix_spawn_file_actions_adddup2(posix_spawn_file_actions_t *fa, int fd, int newfd) int posix_spawn_file_actions_adddup2(posix_spawn_file_actions_t *fa, int srcfd, int fd)
{ {
struct fdop *op = malloc(sizeof *op); struct fdop *op = malloc(sizeof *op);
if (!op) return ENOMEM; if (!op) return ENOMEM;
op->cmd = FDOP_DUP2; op->cmd = FDOP_DUP2;
op->srcfd = srcfd;
op->fd = fd; op->fd = fd;
op->newfd = newfd;
if ((op->next = fa->__actions)) op->next->prev = op; if ((op->next = fa->__actions)) op->next->prev = op;
op->prev = 0; op->prev = 0;
fa->__actions = op; fa->__actions = op;