hyperv_timer.c (15411B)
1// SPDX-License-Identifier: GPL-2.0 2 3/* 4 * Clocksource driver for the synthetic counter and timers 5 * provided by the Hyper-V hypervisor to guest VMs, as described 6 * in the Hyper-V Top Level Functional Spec (TLFS). This driver 7 * is instruction set architecture independent. 8 * 9 * Copyright (C) 2019, Microsoft, Inc. 10 * 11 * Author: Michael Kelley <mikelley@microsoft.com> 12 */ 13 14#include <linux/percpu.h> 15#include <linux/cpumask.h> 16#include <linux/clockchips.h> 17#include <linux/clocksource.h> 18#include <linux/sched_clock.h> 19#include <linux/mm.h> 20#include <linux/cpuhotplug.h> 21#include <linux/interrupt.h> 22#include <linux/irq.h> 23#include <linux/acpi.h> 24#include <clocksource/hyperv_timer.h> 25#include <asm/hyperv-tlfs.h> 26#include <asm/mshyperv.h> 27 28static struct clock_event_device __percpu *hv_clock_event; 29static u64 hv_sched_clock_offset __ro_after_init; 30 31/* 32 * If false, we're using the old mechanism for stimer0 interrupts 33 * where it sends a VMbus message when it expires. The old 34 * mechanism is used when running on older versions of Hyper-V 35 * that don't support Direct Mode. While Hyper-V provides 36 * four stimer's per CPU, Linux uses only stimer0. 37 * 38 * Because Direct Mode does not require processing a VMbus 39 * message, stimer interrupts can be enabled earlier in the 40 * process of booting a CPU, and consistent with when timer 41 * interrupts are enabled for other clocksource drivers. 42 * However, for legacy versions of Hyper-V when Direct Mode 43 * is not enabled, setting up stimer interrupts must be 44 * delayed until VMbus is initialized and can process the 45 * interrupt message. 46 */ 47static bool direct_mode_enabled; 48 49static int stimer0_irq = -1; 50static int stimer0_message_sint; 51static DEFINE_PER_CPU(long, stimer0_evt); 52 53/* 54 * Common code for stimer0 interrupts coming via Direct Mode or 55 * as a VMbus message. 56 */ 57void hv_stimer0_isr(void) 58{ 59 struct clock_event_device *ce; 60 61 ce = this_cpu_ptr(hv_clock_event); 62 ce->event_handler(ce); 63} 64EXPORT_SYMBOL_GPL(hv_stimer0_isr); 65 66/* 67 * stimer0 interrupt handler for architectures that support 68 * per-cpu interrupts, which also implies Direct Mode. 69 */ 70static irqreturn_t hv_stimer0_percpu_isr(int irq, void *dev_id) 71{ 72 hv_stimer0_isr(); 73 return IRQ_HANDLED; 74} 75 76static int hv_ce_set_next_event(unsigned long delta, 77 struct clock_event_device *evt) 78{ 79 u64 current_tick; 80 81 current_tick = hv_read_reference_counter(); 82 current_tick += delta; 83 hv_set_register(HV_REGISTER_STIMER0_COUNT, current_tick); 84 return 0; 85} 86 87static int hv_ce_shutdown(struct clock_event_device *evt) 88{ 89 hv_set_register(HV_REGISTER_STIMER0_COUNT, 0); 90 hv_set_register(HV_REGISTER_STIMER0_CONFIG, 0); 91 if (direct_mode_enabled && stimer0_irq >= 0) 92 disable_percpu_irq(stimer0_irq); 93 94 return 0; 95} 96 97static int hv_ce_set_oneshot(struct clock_event_device *evt) 98{ 99 union hv_stimer_config timer_cfg; 100 101 timer_cfg.as_uint64 = 0; 102 timer_cfg.enable = 1; 103 timer_cfg.auto_enable = 1; 104 if (direct_mode_enabled) { 105 /* 106 * When it expires, the timer will directly interrupt 107 * on the specified hardware vector/IRQ. 108 */ 109 timer_cfg.direct_mode = 1; 110 timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR; 111 if (stimer0_irq >= 0) 112 enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE); 113 } else { 114 /* 115 * When it expires, the timer will generate a VMbus message, 116 * to be handled by the normal VMbus interrupt handler. 117 */ 118 timer_cfg.direct_mode = 0; 119 timer_cfg.sintx = stimer0_message_sint; 120 } 121 hv_set_register(HV_REGISTER_STIMER0_CONFIG, timer_cfg.as_uint64); 122 return 0; 123} 124 125/* 126 * hv_stimer_init - Per-cpu initialization of the clockevent 127 */ 128static int hv_stimer_init(unsigned int cpu) 129{ 130 struct clock_event_device *ce; 131 132 if (!hv_clock_event) 133 return 0; 134 135 ce = per_cpu_ptr(hv_clock_event, cpu); 136 ce->name = "Hyper-V clockevent"; 137 ce->features = CLOCK_EVT_FEAT_ONESHOT; 138 ce->cpumask = cpumask_of(cpu); 139 ce->rating = 1000; 140 ce->set_state_shutdown = hv_ce_shutdown; 141 ce->set_state_oneshot = hv_ce_set_oneshot; 142 ce->set_next_event = hv_ce_set_next_event; 143 144 clockevents_config_and_register(ce, 145 HV_CLOCK_HZ, 146 HV_MIN_DELTA_TICKS, 147 HV_MAX_MAX_DELTA_TICKS); 148 return 0; 149} 150 151/* 152 * hv_stimer_cleanup - Per-cpu cleanup of the clockevent 153 */ 154int hv_stimer_cleanup(unsigned int cpu) 155{ 156 struct clock_event_device *ce; 157 158 if (!hv_clock_event) 159 return 0; 160 161 /* 162 * In the legacy case where Direct Mode is not enabled 163 * (which can only be on x86/64), stimer cleanup happens 164 * relatively early in the CPU offlining process. We 165 * must unbind the stimer-based clockevent device so 166 * that the LAPIC timer can take over until clockevents 167 * are no longer needed in the offlining process. Note 168 * that clockevents_unbind_device() eventually calls 169 * hv_ce_shutdown(). 170 * 171 * The unbind should not be done when Direct Mode is 172 * enabled because we may be on an architecture where 173 * there are no other clockevent devices to fallback to. 174 */ 175 ce = per_cpu_ptr(hv_clock_event, cpu); 176 if (direct_mode_enabled) 177 hv_ce_shutdown(ce); 178 else 179 clockevents_unbind_device(ce, cpu); 180 181 return 0; 182} 183EXPORT_SYMBOL_GPL(hv_stimer_cleanup); 184 185/* 186 * These placeholders are overridden by arch specific code on 187 * architectures that need special setup of the stimer0 IRQ because 188 * they don't support per-cpu IRQs (such as x86/x64). 189 */ 190void __weak hv_setup_stimer0_handler(void (*handler)(void)) 191{ 192}; 193 194void __weak hv_remove_stimer0_handler(void) 195{ 196}; 197 198/* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */ 199static int hv_setup_stimer0_irq(void) 200{ 201 int ret; 202 203 ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR, 204 ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH); 205 if (ret < 0) { 206 pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret); 207 return ret; 208 } 209 stimer0_irq = ret; 210 211 ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr, 212 "Hyper-V stimer0", &stimer0_evt); 213 if (ret) { 214 pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d", 215 stimer0_irq, ret); 216 acpi_unregister_gsi(stimer0_irq); 217 stimer0_irq = -1; 218 } 219 return ret; 220} 221 222static void hv_remove_stimer0_irq(void) 223{ 224 if (stimer0_irq == -1) { 225 hv_remove_stimer0_handler(); 226 } else { 227 free_percpu_irq(stimer0_irq, &stimer0_evt); 228 acpi_unregister_gsi(stimer0_irq); 229 stimer0_irq = -1; 230 } 231} 232 233/* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ 234int hv_stimer_alloc(bool have_percpu_irqs) 235{ 236 int ret; 237 238 /* 239 * Synthetic timers are always available except on old versions of 240 * Hyper-V on x86. In that case, return as error as Linux will use a 241 * clockevent based on emulated LAPIC timer hardware. 242 */ 243 if (!(ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE)) 244 return -EINVAL; 245 246 hv_clock_event = alloc_percpu(struct clock_event_device); 247 if (!hv_clock_event) 248 return -ENOMEM; 249 250 direct_mode_enabled = ms_hyperv.misc_features & 251 HV_STIMER_DIRECT_MODE_AVAILABLE; 252 253 /* 254 * If Direct Mode isn't enabled, the remainder of the initialization 255 * is done later by hv_stimer_legacy_init() 256 */ 257 if (!direct_mode_enabled) 258 return 0; 259 260 if (have_percpu_irqs) { 261 ret = hv_setup_stimer0_irq(); 262 if (ret) 263 goto free_clock_event; 264 } else { 265 hv_setup_stimer0_handler(hv_stimer0_isr); 266 } 267 268 /* 269 * Since we are in Direct Mode, stimer initialization 270 * can be done now with a CPUHP value in the same range 271 * as other clockevent devices. 272 */ 273 ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, 274 "clockevents/hyperv/stimer:starting", 275 hv_stimer_init, hv_stimer_cleanup); 276 if (ret < 0) { 277 hv_remove_stimer0_irq(); 278 goto free_clock_event; 279 } 280 return ret; 281 282free_clock_event: 283 free_percpu(hv_clock_event); 284 hv_clock_event = NULL; 285 return ret; 286} 287EXPORT_SYMBOL_GPL(hv_stimer_alloc); 288 289/* 290 * hv_stimer_legacy_init -- Called from the VMbus driver to handle 291 * the case when Direct Mode is not enabled, and the stimer 292 * must be initialized late in the CPU onlining process. 293 * 294 */ 295void hv_stimer_legacy_init(unsigned int cpu, int sint) 296{ 297 if (direct_mode_enabled) 298 return; 299 300 /* 301 * This function gets called by each vCPU, so setting the 302 * global stimer_message_sint value each time is conceptually 303 * not ideal, but the value passed in is always the same and 304 * it avoids introducing yet another interface into this 305 * clocksource driver just to set the sint in the legacy case. 306 */ 307 stimer0_message_sint = sint; 308 (void)hv_stimer_init(cpu); 309} 310EXPORT_SYMBOL_GPL(hv_stimer_legacy_init); 311 312/* 313 * hv_stimer_legacy_cleanup -- Called from the VMbus driver to 314 * handle the case when Direct Mode is not enabled, and the 315 * stimer must be cleaned up early in the CPU offlining 316 * process. 317 */ 318void hv_stimer_legacy_cleanup(unsigned int cpu) 319{ 320 if (direct_mode_enabled) 321 return; 322 (void)hv_stimer_cleanup(cpu); 323} 324EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); 325 326/* 327 * Do a global cleanup of clockevents for the cases of kexec and 328 * vmbus exit 329 */ 330void hv_stimer_global_cleanup(void) 331{ 332 int cpu; 333 334 /* 335 * hv_stime_legacy_cleanup() will stop the stimer if Direct 336 * Mode is not enabled, and fallback to the LAPIC timer. 337 */ 338 for_each_present_cpu(cpu) { 339 hv_stimer_legacy_cleanup(cpu); 340 } 341 342 if (!hv_clock_event) 343 return; 344 345 if (direct_mode_enabled) { 346 cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); 347 hv_remove_stimer0_irq(); 348 stimer0_irq = -1; 349 } 350 free_percpu(hv_clock_event); 351 hv_clock_event = NULL; 352 353} 354EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); 355 356/* 357 * Code and definitions for the Hyper-V clocksources. Two 358 * clocksources are defined: one that reads the Hyper-V defined MSR, and 359 * the other that uses the TSC reference page feature as defined in the 360 * TLFS. The MSR version is for compatibility with old versions of 361 * Hyper-V and 32-bit x86. The TSC reference page version is preferred. 362 */ 363 364static union { 365 struct ms_hyperv_tsc_page page; 366 u8 reserved[PAGE_SIZE]; 367} tsc_pg __aligned(PAGE_SIZE); 368 369struct ms_hyperv_tsc_page *hv_get_tsc_page(void) 370{ 371 return &tsc_pg.page; 372} 373EXPORT_SYMBOL_GPL(hv_get_tsc_page); 374 375static u64 notrace read_hv_clock_tsc(void) 376{ 377 u64 current_tick = hv_read_tsc_page(hv_get_tsc_page()); 378 379 if (current_tick == U64_MAX) 380 current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT); 381 382 return current_tick; 383} 384 385static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg) 386{ 387 return read_hv_clock_tsc(); 388} 389 390static u64 notrace read_hv_sched_clock_tsc(void) 391{ 392 return (read_hv_clock_tsc() - hv_sched_clock_offset) * 393 (NSEC_PER_SEC / HV_CLOCK_HZ); 394} 395 396static void suspend_hv_clock_tsc(struct clocksource *arg) 397{ 398 u64 tsc_msr; 399 400 /* Disable the TSC page */ 401 tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC); 402 tsc_msr &= ~BIT_ULL(0); 403 hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr); 404} 405 406 407static void resume_hv_clock_tsc(struct clocksource *arg) 408{ 409 phys_addr_t phys_addr = virt_to_phys(&tsc_pg); 410 u64 tsc_msr; 411 412 /* Re-enable the TSC page */ 413 tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC); 414 tsc_msr &= GENMASK_ULL(11, 0); 415 tsc_msr |= BIT_ULL(0) | (u64)phys_addr; 416 hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr); 417} 418 419#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 420static int hv_cs_enable(struct clocksource *cs) 421{ 422 vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); 423 return 0; 424} 425#endif 426 427static struct clocksource hyperv_cs_tsc = { 428 .name = "hyperv_clocksource_tsc_page", 429 .rating = 500, 430 .read = read_hv_clock_tsc_cs, 431 .mask = CLOCKSOURCE_MASK(64), 432 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 433 .suspend= suspend_hv_clock_tsc, 434 .resume = resume_hv_clock_tsc, 435#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK 436 .enable = hv_cs_enable, 437 .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, 438#else 439 .vdso_clock_mode = VDSO_CLOCKMODE_NONE, 440#endif 441}; 442 443static u64 notrace read_hv_clock_msr(void) 444{ 445 /* 446 * Read the partition counter to get the current tick count. This count 447 * is set to 0 when the partition is created and is incremented in 448 * 100 nanosecond units. 449 */ 450 return hv_get_register(HV_REGISTER_TIME_REF_COUNT); 451} 452 453static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) 454{ 455 return read_hv_clock_msr(); 456} 457 458static u64 notrace read_hv_sched_clock_msr(void) 459{ 460 return (read_hv_clock_msr() - hv_sched_clock_offset) * 461 (NSEC_PER_SEC / HV_CLOCK_HZ); 462} 463 464static struct clocksource hyperv_cs_msr = { 465 .name = "hyperv_clocksource_msr", 466 .rating = 500, 467 .read = read_hv_clock_msr_cs, 468 .mask = CLOCKSOURCE_MASK(64), 469 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 470}; 471 472/* 473 * Reference to pv_ops must be inline so objtool 474 * detection of noinstr violations can work correctly. 475 */ 476#ifdef CONFIG_GENERIC_SCHED_CLOCK 477static __always_inline void hv_setup_sched_clock(void *sched_clock) 478{ 479 /* 480 * We're on an architecture with generic sched clock (not x86/x64). 481 * The Hyper-V sched clock read function returns nanoseconds, not 482 * the normal 100ns units of the Hyper-V synthetic clock. 483 */ 484 sched_clock_register(sched_clock, 64, NSEC_PER_SEC); 485} 486#elif defined CONFIG_PARAVIRT 487static __always_inline void hv_setup_sched_clock(void *sched_clock) 488{ 489 /* We're on x86/x64 *and* using PV ops */ 490 paravirt_set_sched_clock(sched_clock); 491} 492#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */ 493static __always_inline void hv_setup_sched_clock(void *sched_clock) {} 494#endif /* CONFIG_GENERIC_SCHED_CLOCK */ 495 496static bool __init hv_init_tsc_clocksource(void) 497{ 498 u64 tsc_msr; 499 phys_addr_t phys_addr; 500 501 if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) 502 return false; 503 504 if (hv_root_partition) 505 return false; 506 507 /* 508 * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly 509 * handles frequency and offset changes due to live migration, 510 * pause/resume, and other VM management operations. So lower the 511 * Hyper-V Reference TSC rating, causing the generic TSC to be used. 512 * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference 513 * TSC will be preferred over the virtualized ARM64 arch counter. 514 * While the Hyper-V MSR clocksource won't be used since the 515 * Reference TSC clocksource is present, change its rating as 516 * well for consistency. 517 */ 518 if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { 519 hyperv_cs_tsc.rating = 250; 520 hyperv_cs_msr.rating = 250; 521 } 522 523 hv_read_reference_counter = read_hv_clock_tsc; 524 phys_addr = virt_to_phys(hv_get_tsc_page()); 525 526 /* 527 * The Hyper-V TLFS specifies to preserve the value of reserved 528 * bits in registers. So read the existing value, preserve the 529 * low order 12 bits, and add in the guest physical address 530 * (which already has at least the low 12 bits set to zero since 531 * it is page aligned). Also set the "enable" bit, which is bit 0. 532 */ 533 tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC); 534 tsc_msr &= GENMASK_ULL(11, 0); 535 tsc_msr = tsc_msr | 0x1 | (u64)phys_addr; 536 hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr); 537 538 clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); 539 540 hv_sched_clock_offset = hv_read_reference_counter(); 541 hv_setup_sched_clock(read_hv_sched_clock_tsc); 542 543 return true; 544} 545 546void __init hv_init_clocksource(void) 547{ 548 /* 549 * Try to set up the TSC page clocksource. If it succeeds, we're 550 * done. Otherwise, set up the MSR clocksource. At least one of 551 * these will always be available except on very old versions of 552 * Hyper-V on x86. In that case we won't have a Hyper-V 553 * clocksource, but Linux will still run with a clocksource based 554 * on the emulated PIT or LAPIC timer. 555 */ 556 if (hv_init_tsc_clocksource()) 557 return; 558 559 if (!(ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE)) 560 return; 561 562 hv_read_reference_counter = read_hv_clock_msr; 563 clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); 564 565 hv_sched_clock_offset = hv_read_reference_counter(); 566 hv_setup_sched_clock(read_hv_sched_clock_msr); 567}