common.c (12232B)
// SPDX-License-Identifier: GPL-2.0

#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>

#include "common.h"

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* See comment for enter_from_user_mode() in entry-common.h */
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
	arch_enter_from_user_mode(regs);
	lockdep_hardirqs_off(CALLER_ADDR0);

	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit_irqoff();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	instrumentation_end();
}

void noinstr enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}

static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long work)
{
	long ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing(NULL);
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}
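
/*
 * Shared entry work helper for syscall_enter_from_user_mode() and
 * syscall_enter_from_user_mode_work(): returns the (possibly rewritten)
 * syscall number, or -1 when the syscall should be skipped.
 */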
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, syscall, work);

	return syscall;
}

long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}

noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	__enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = __syscall_enter_from_user_work(regs, syscall);
	instrumentation_end();

	return ret;
}

noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
	instrumentation_begin();
	local_irq_enable();
	instrumentation_end();
}

/* See comment for exit_to_user_mode() in entry-common.h */
static __always_inline void __exit_to_user_mode(void)
{
	instrumentation_begin();
	trace_hardirqs_on_prepare();
	lockdep_hardirqs_on_prepare();
	instrumentation_end();

	user_enter_irqoff();
	arch_exit_to_user_mode();
	lockdep_hardirqs_on(CALLER_ADDR0);
}

void noinstr exit_to_user_mode(void)
{
	__exit_to_user_mode();
}

/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }

static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					    unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs);

		if (ti_work & _TIF_NOTIFY_RESUME)
			resume_user_mode_work(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();

		ti_work = read_thread_flags();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}
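
/*
 * Final preparation for the return to user space: handle any pending TIF
 * work and come back with interrupts disabled, so no new work can be
 * scheduled between the last check and the actual return.
 */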
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long ti_work = read_thread_flags();

	lockdep_assert_irqs_disabled();

	/* Flush pending rcuog wakeup before the last need_resched() check */
	tick_nohz_user_enter_prepare();

	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
		ti_work = exit_to_user_mode_loop(regs, ti_work);

	arch_exit_to_user_mode_prepare(regs, ti_work);

	/* Ensure that the address limit is intact and no locks are held */
	addr_limit_user_check();
	kmap_assert_nomap();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		ptrace_report_syscall_exit(regs, step);
}

/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	syscall_exit_to_user_mode_prepare(regs);
	local_irq_disable_exit_to_user();
	exit_to_user_mode_prepare(regs);
}

void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}

__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	__syscall_exit_to_user_mode_work(regs);
	instrumentation_end();
	__exit_to_user_mode();
}

noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}

noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	exit_to_user_mode_prepare(regs);
	instrumentation_end();
	__exit_to_user_mode();
}
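
/*
 * Common interrupt/exception entry for both user and kernel mode entries.
 * The returned state must be passed to the matching irqentry_exit();
 * exit_rcu records whether this entry invoked rcu_irq_enter() itself
 * (entry hit the idle task) so that the exit path can undo it.
 */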
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task invoke rcu_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt from invoking rcu_irq_enter(). If that nested interrupt
	 * is the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke rcu_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		rcu_irq_enter();
		instrumentation_begin();
		trace_hardirqs_off_finish();
		instrumentation_end();

		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}

void raw_irqentry_exit_cond_resched(void)
{
	if (!preempt_count()) {
		/* Sanity check RCU and thread stack */
		rcu_irq_exit_check_preempt();
		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			WARN_ON_ONCE(!on_thread_stack());
		if (need_resched())
			preempt_schedule_irq();
	}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		return;
	raw_irqentry_exit_cond_resched();
}
#endif
#endif

noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare();
			instrumentation_end();
			rcu_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();

		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			rcu_irq_exit();
	}
}
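
/*
 * NMI entry/exit pair: the lockdep hardirq state is captured in
 * irq_state.lockdep because an NMI can hit with hardirqs either enabled
 * or disabled, and irqentry_nmi_exit() restores exactly that state.
 */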
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	rcu_nmi_enter();

	instrumentation_begin();
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}

void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare();
	}
	instrumentation_end();

	rcu_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}
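
For context, below is a minimal, illustrative sketch of how an architecture that selects CONFIG_GENERIC_ENTRY typically wires these helpers into its syscall and interrupt entry points. It is not part of common.c; arch_syscall_handler(), arch_dispatch_syscall(), arch_irq_handler() and arch_handle_irq() are hypothetical placeholders for the architecture specific glue.

__visible noinstr void arch_syscall_handler(struct pt_regs *regs)
{
	long nr = syscall_get_nr(current, regs);

	/* Switches context tracking/lockdep state and runs the entry work */
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();
	if (nr >= 0)
		arch_dispatch_syscall(regs, nr);	/* arch specific dispatch */
	instrumentation_end();

	/* Runs the exit work and switches back to user mode context tracking */
	syscall_exit_to_user_mode(regs);
}

__visible noinstr void arch_irq_handler(struct pt_regs *regs)
{
	irqentry_state_t state = irqentry_enter(regs);

	instrumentation_begin();
	arch_handle_irq(regs);			/* arch specific handler */
	instrumentation_end();

	irqentry_exit(regs, state);
}

The noinstr/instrumentation_begin() pairing in the sketch mirrors the pattern used throughout this file: only the window between the enter and exit helpers may run instrumentable code.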