syscall_numbering.c (11538B)
1/* SPDX-License-Identifier: GPL-2.0 */ 2/* 3 * syscall_numbering.c - test calling the x86-64 kernel with various 4 * valid and invalid system call numbers. 5 * 6 * Copyright (c) 2018 Andrew Lutomirski 7 */ 8 9#define _GNU_SOURCE 10 11#include <stdlib.h> 12#include <stdio.h> 13#include <stdbool.h> 14#include <errno.h> 15#include <unistd.h> 16#include <string.h> 17#include <fcntl.h> 18#include <limits.h> 19#include <signal.h> 20#include <sysexits.h> 21 22#include <sys/ptrace.h> 23#include <sys/user.h> 24#include <sys/wait.h> 25#include <sys/mman.h> 26 27#include <linux/ptrace.h> 28 29/* Common system call numbers */ 30#define SYS_READ 0 31#define SYS_WRITE 1 32#define SYS_GETPID 39 33/* x64-only system call numbers */ 34#define X64_IOCTL 16 35#define X64_READV 19 36#define X64_WRITEV 20 37/* x32-only system call numbers (without X32_BIT) */ 38#define X32_IOCTL 514 39#define X32_READV 515 40#define X32_WRITEV 516 41 42#define X32_BIT 0x40000000 43 44static int nullfd = -1; /* File descriptor for /dev/null */ 45static bool with_x32; /* x32 supported on this kernel? */ 46 47enum ptrace_pass { 48 PTP_NOTHING, 49 PTP_GETREGS, 50 PTP_WRITEBACK, 51 PTP_FUZZRET, 52 PTP_FUZZHIGH, 53 PTP_INTNUM, 54 PTP_DONE 55}; 56 57static const char * const ptrace_pass_name[] = 58{ 59 [PTP_NOTHING] = "just stop, no data read", 60 [PTP_GETREGS] = "only getregs", 61 [PTP_WRITEBACK] = "getregs, unmodified setregs", 62 [PTP_FUZZRET] = "modifying the default return", 63 [PTP_FUZZHIGH] = "clobbering the top 32 bits", 64 [PTP_INTNUM] = "sign-extending the syscall number", 65}; 66 67/* 68 * Shared memory block between tracer and test 69 */ 70struct shared { 71 unsigned int nerr; /* Total error count */ 72 unsigned int indent; /* Message indentation level */ 73 enum ptrace_pass ptrace_pass; 74 bool probing_syscall; /* In probe_syscall() */ 75}; 76static volatile struct shared *sh; 77 78static inline unsigned int offset(void) 79{ 80 unsigned int level = sh ? sh->indent : 0; 81 82 return 8 + level * 4; 83} 84 85#define msg(lvl, fmt, ...) printf("%-*s" fmt, offset(), "[" #lvl "]", \ 86 ## __VA_ARGS__) 87 88#define run(fmt, ...) msg(RUN, fmt, ## __VA_ARGS__) 89#define info(fmt, ...) msg(INFO, fmt, ## __VA_ARGS__) 90#define ok(fmt, ...) msg(OK, fmt, ## __VA_ARGS__) 91 92#define fail(fmt, ...) \ 93 do { \ 94 msg(FAIL, fmt, ## __VA_ARGS__); \ 95 sh->nerr++; \ 96 } while (0) 97 98#define crit(fmt, ...) \ 99 do { \ 100 sh->indent = 0; \ 101 msg(FAIL, fmt, ## __VA_ARGS__); \ 102 msg(SKIP, "Unable to run test\n"); \ 103 exit(EX_OSERR); \ 104 } while (0) 105 106/* Sentinel for ptrace-modified return value */ 107#define MODIFIED_BY_PTRACE -9999 108 109/* 110 * Directly invokes the given syscall with nullfd as the first argument 111 * and the rest zero. Avoids involving glibc wrappers in case they ever 112 * end up intercepting some system calls for some reason, or modify 113 * the system call number itself. 114 */ 115static long long probe_syscall(int msb, int lsb) 116{ 117 register long long arg1 asm("rdi") = nullfd; 118 register long long arg2 asm("rsi") = 0; 119 register long long arg3 asm("rdx") = 0; 120 register long long arg4 asm("r10") = 0; 121 register long long arg5 asm("r8") = 0; 122 register long long arg6 asm("r9") = 0; 123 long long nr = ((long long)msb << 32) | (unsigned int)lsb; 124 long long ret; 125 126 /* 127 * We pass in an extra copy of the extended system call number 128 * in %rbx, so we can examine it from the ptrace handler without 129 * worrying about it being possibly modified. This is to test 130 * the validity of struct user regs.orig_rax a.k.a. 131 * struct pt_regs.orig_ax. 132 */ 133 sh->probing_syscall = true; 134 asm volatile("syscall" 135 : "=a" (ret) 136 : "a" (nr), "b" (nr), 137 "r" (arg1), "r" (arg2), "r" (arg3), 138 "r" (arg4), "r" (arg5), "r" (arg6) 139 : "rcx", "r11", "memory", "cc"); 140 sh->probing_syscall = false; 141 142 return ret; 143} 144 145static const char *syscall_str(int msb, int start, int end) 146{ 147 static char buf[64]; 148 const char * const type = (start & X32_BIT) ? "x32" : "x64"; 149 int lsb = start; 150 151 /* 152 * Improve readability by stripping the x32 bit, but round 153 * toward zero so we don't display -1 as -1073741825. 154 */ 155 if (lsb < 0) 156 lsb |= X32_BIT; 157 else 158 lsb &= ~X32_BIT; 159 160 if (start == end) 161 snprintf(buf, sizeof buf, "%s syscall %d:%d", 162 type, msb, lsb); 163 else 164 snprintf(buf, sizeof buf, "%s syscalls %d:%d..%d", 165 type, msb, lsb, lsb + (end-start)); 166 167 return buf; 168} 169 170static unsigned int _check_for(int msb, int start, int end, long long expect, 171 const char *expect_str) 172{ 173 unsigned int err = 0; 174 175 sh->indent++; 176 if (start != end) 177 sh->indent++; 178 179 for (int nr = start; nr <= end; nr++) { 180 long long ret = probe_syscall(msb, nr); 181 182 if (ret != expect) { 183 fail("%s returned %lld, but it should have returned %s\n", 184 syscall_str(msb, nr, nr), 185 ret, expect_str); 186 err++; 187 } 188 } 189 190 if (start != end) 191 sh->indent--; 192 193 if (err) { 194 if (start != end) 195 fail("%s had %u failure%s\n", 196 syscall_str(msb, start, end), 197 err, err == 1 ? "s" : ""); 198 } else { 199 ok("%s returned %s as expected\n", 200 syscall_str(msb, start, end), expect_str); 201 } 202 203 sh->indent--; 204 205 return err; 206} 207 208#define check_for(msb,start,end,expect) \ 209 _check_for(msb,start,end,expect,#expect) 210 211static bool check_zero(int msb, int nr) 212{ 213 return check_for(msb, nr, nr, 0); 214} 215 216static bool check_enosys(int msb, int nr) 217{ 218 return check_for(msb, nr, nr, -ENOSYS); 219} 220 221/* 222 * Anyone diagnosing a failure will want to know whether the kernel 223 * supports x32. Tell them. This can also be used to conditionalize 224 * tests based on existence or nonexistence of x32. 225 */ 226static bool test_x32(void) 227{ 228 long long ret; 229 pid_t mypid = getpid(); 230 231 run("Checking for x32 by calling x32 getpid()\n"); 232 ret = probe_syscall(0, SYS_GETPID | X32_BIT); 233 234 sh->indent++; 235 if (ret == mypid) { 236 info("x32 is supported\n"); 237 with_x32 = true; 238 } else if (ret == -ENOSYS) { 239 info("x32 is not supported\n"); 240 with_x32 = false; 241 } else { 242 fail("x32 getpid() returned %lld, but it should have returned either %lld or -ENOSYS\n", ret, (long long)mypid); 243 with_x32 = false; 244 } 245 sh->indent--; 246 return with_x32; 247} 248 249static void test_syscalls_common(int msb) 250{ 251 enum ptrace_pass pass = sh->ptrace_pass; 252 253 run("Checking some common syscalls as 64 bit\n"); 254 check_zero(msb, SYS_READ); 255 check_zero(msb, SYS_WRITE); 256 257 run("Checking some 64-bit only syscalls as 64 bit\n"); 258 check_zero(msb, X64_READV); 259 check_zero(msb, X64_WRITEV); 260 261 run("Checking out of range system calls\n"); 262 check_for(msb, -64, -2, -ENOSYS); 263 if (pass >= PTP_FUZZRET) 264 check_for(msb, -1, -1, MODIFIED_BY_PTRACE); 265 else 266 check_for(msb, -1, -1, -ENOSYS); 267 check_for(msb, X32_BIT-64, X32_BIT-1, -ENOSYS); 268 check_for(msb, -64-X32_BIT, -1-X32_BIT, -ENOSYS); 269 check_for(msb, INT_MAX-64, INT_MAX-1, -ENOSYS); 270} 271 272static void test_syscalls_with_x32(int msb) 273{ 274 /* 275 * Syscalls 512-547 are "x32" syscalls. They are 276 * intended to be called with the x32 (0x40000000) bit 277 * set. Calling them without the x32 bit set is 278 * nonsense and should not work. 279 */ 280 run("Checking x32 syscalls as 64 bit\n"); 281 check_for(msb, 512, 547, -ENOSYS); 282 283 run("Checking some common syscalls as x32\n"); 284 check_zero(msb, SYS_READ | X32_BIT); 285 check_zero(msb, SYS_WRITE | X32_BIT); 286 287 run("Checking some x32 syscalls as x32\n"); 288 check_zero(msb, X32_READV | X32_BIT); 289 check_zero(msb, X32_WRITEV | X32_BIT); 290 291 run("Checking some 64-bit syscalls as x32\n"); 292 check_enosys(msb, X64_IOCTL | X32_BIT); 293 check_enosys(msb, X64_READV | X32_BIT); 294 check_enosys(msb, X64_WRITEV | X32_BIT); 295} 296 297static void test_syscalls_without_x32(int msb) 298{ 299 run("Checking for absence of x32 system calls\n"); 300 check_for(msb, 0 | X32_BIT, 999 | X32_BIT, -ENOSYS); 301} 302 303static void test_syscall_numbering(void) 304{ 305 static const int msbs[] = { 306 0, 1, -1, X32_BIT-1, X32_BIT, X32_BIT-1, -X32_BIT, INT_MAX, 307 INT_MIN, INT_MIN+1 308 }; 309 310 sh->indent++; 311 312 /* 313 * The MSB is supposed to be ignored, so we loop over a few 314 * to test that out. 315 */ 316 for (size_t i = 0; i < sizeof(msbs)/sizeof(msbs[0]); i++) { 317 int msb = msbs[i]; 318 run("Checking system calls with msb = %d (0x%x)\n", 319 msb, msb); 320 321 sh->indent++; 322 323 test_syscalls_common(msb); 324 if (with_x32) 325 test_syscalls_with_x32(msb); 326 else 327 test_syscalls_without_x32(msb); 328 329 sh->indent--; 330 } 331 332 sh->indent--; 333} 334 335static void syscall_numbering_tracee(void) 336{ 337 enum ptrace_pass pass; 338 339 if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { 340 crit("Failed to request tracing\n"); 341 return; 342 } 343 raise(SIGSTOP); 344 345 for (sh->ptrace_pass = pass = PTP_NOTHING; pass < PTP_DONE; 346 sh->ptrace_pass = ++pass) { 347 run("Running tests under ptrace: %s\n", ptrace_pass_name[pass]); 348 test_syscall_numbering(); 349 } 350} 351 352static void mess_with_syscall(pid_t testpid, enum ptrace_pass pass) 353{ 354 struct user_regs_struct regs; 355 356 sh->probing_syscall = false; /* Do this on entry only */ 357 358 /* For these, don't even getregs */ 359 if (pass == PTP_NOTHING || pass == PTP_DONE) 360 return; 361 362 ptrace(PTRACE_GETREGS, testpid, NULL, ®s); 363 364 if (regs.orig_rax != regs.rbx) { 365 fail("orig_rax %#llx doesn't match syscall number %#llx\n", 366 (unsigned long long)regs.orig_rax, 367 (unsigned long long)regs.rbx); 368 } 369 370 switch (pass) { 371 case PTP_GETREGS: 372 /* Just read, no writeback */ 373 return; 374 case PTP_WRITEBACK: 375 /* Write back the same register state verbatim */ 376 break; 377 case PTP_FUZZRET: 378 regs.rax = MODIFIED_BY_PTRACE; 379 break; 380 case PTP_FUZZHIGH: 381 regs.rax = MODIFIED_BY_PTRACE; 382 regs.orig_rax = regs.orig_rax | 0xffffffff00000000ULL; 383 break; 384 case PTP_INTNUM: 385 regs.rax = MODIFIED_BY_PTRACE; 386 regs.orig_rax = (int)regs.orig_rax; 387 break; 388 default: 389 crit("invalid ptrace_pass\n"); 390 break; 391 } 392 393 ptrace(PTRACE_SETREGS, testpid, NULL, ®s); 394} 395 396static void syscall_numbering_tracer(pid_t testpid) 397{ 398 int wstatus; 399 400 do { 401 pid_t wpid = waitpid(testpid, &wstatus, 0); 402 if (wpid < 0 && errno != EINTR) 403 break; 404 if (wpid != testpid) 405 continue; 406 if (!WIFSTOPPED(wstatus)) 407 break; /* Thread exited? */ 408 409 if (sh->probing_syscall && WSTOPSIG(wstatus) == SIGTRAP) 410 mess_with_syscall(testpid, sh->ptrace_pass); 411 } while (sh->ptrace_pass != PTP_DONE && 412 !ptrace(PTRACE_SYSCALL, testpid, NULL, NULL)); 413 414 ptrace(PTRACE_DETACH, testpid, NULL, NULL); 415 416 /* Wait for the child process to terminate */ 417 while (waitpid(testpid, &wstatus, 0) != testpid || !WIFEXITED(wstatus)) 418 /* wait some more */; 419} 420 421static void test_traced_syscall_numbering(void) 422{ 423 pid_t testpid; 424 425 /* Launch the test thread; this thread continues as the tracer thread */ 426 testpid = fork(); 427 428 if (testpid < 0) { 429 crit("Unable to launch tracer process\n"); 430 } else if (testpid == 0) { 431 syscall_numbering_tracee(); 432 _exit(0); 433 } else { 434 syscall_numbering_tracer(testpid); 435 } 436} 437 438int main(void) 439{ 440 unsigned int nerr; 441 442 /* 443 * It is quite likely to get a segfault on a failure, so make 444 * sure the message gets out by setting stdout to nonbuffered. 445 */ 446 setvbuf(stdout, NULL, _IONBF, 0); 447 448 /* 449 * Harmless file descriptor to work on... 450 */ 451 nullfd = open("/dev/null", O_RDWR); 452 if (nullfd < 0) { 453 crit("Unable to open /dev/null: %s\n", strerror(errno)); 454 } 455 456 /* 457 * Set up a block of shared memory... 458 */ 459 sh = mmap(NULL, sysconf(_SC_PAGE_SIZE), PROT_READ|PROT_WRITE, 460 MAP_ANONYMOUS|MAP_SHARED, 0, 0); 461 if (sh == MAP_FAILED) { 462 crit("Unable to allocated shared memory block: %s\n", 463 strerror(errno)); 464 } 465 466 with_x32 = test_x32(); 467 468 run("Running tests without ptrace...\n"); 469 test_syscall_numbering(); 470 471 test_traced_syscall_numbering(); 472 473 nerr = sh->nerr; 474 if (!nerr) { 475 ok("All system calls succeeded or failed as expected\n"); 476 return 0; 477 } else { 478 fail("A total of %u system call%s had incorrect behavior\n", 479 nerr, nerr != 1 ? "s" : ""); 480 return 1; 481 } 482}