perf_cpum_cf.c (45306B)
1// SPDX-License-Identifier: GPL-2.0 2/* 3 * Performance event support for s390x - CPU-measurement Counter Facility 4 * 5 * Copyright IBM Corp. 2012, 2021 6 * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> 7 * Thomas Richter <tmricht@linux.ibm.com> 8 */ 9#define KMSG_COMPONENT "cpum_cf" 10#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 11 12#include <linux/kernel.h> 13#include <linux/kernel_stat.h> 14#include <linux/percpu.h> 15#include <linux/notifier.h> 16#include <linux/init.h> 17#include <linux/export.h> 18#include <linux/miscdevice.h> 19 20#include <asm/cpu_mcf.h> 21#include <asm/hwctrset.h> 22#include <asm/debug.h> 23 24static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ 25static debug_info_t *cf_dbg; 26 27#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ 28 /* interval in seconds */ 29 30/* Counter sets are stored as data stream in a page sized memory buffer and 31 * exported to user space via raw data attached to the event sample data. 32 * Each counter set starts with an eight byte header consisting of: 33 * - a two byte eye catcher (0xfeef) 34 * - a one byte counter set number 35 * - a two byte counter set size (indicates the number of counters in this set) 36 * - a three byte reserved value (must be zero) to make the header the same 37 * size as a counter value. 38 * All counter values are eight byte in size. 39 * 40 * All counter sets are followed by a 64 byte trailer. 41 * The trailer consists of a: 42 * - flag field indicating valid fields when corresponding bit set 43 * - the counter facility first and second version number 44 * - the CPU speed if nonzero 45 * - the time stamp the counter sets have been collected 46 * - the time of day (TOD) base value 47 * - the machine type. 48 * 49 * The counter sets are saved when the process is prepared to be executed on a 50 * CPU and saved again when the process is going to be removed from a CPU. 
51 * The difference of both counter sets are calculated and stored in the event 52 * sample data area. 53 */ 54struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ 55 unsigned int def:16; /* 0-15 Data Entry Format */ 56 unsigned int set:16; /* 16-31 Counter set identifier */ 57 unsigned int ctr:16; /* 32-47 Number of stored counters */ 58 unsigned int res1:16; /* 48-63 Reserved */ 59}; 60 61struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ 62 /* 0 - 7 */ 63 union { 64 struct { 65 unsigned int clock_base:1; /* TOD clock base set */ 66 unsigned int speed:1; /* CPU speed set */ 67 /* Measurement alerts */ 68 unsigned int mtda:1; /* Loss of MT ctr. data alert */ 69 unsigned int caca:1; /* Counter auth. change alert */ 70 unsigned int lcda:1; /* Loss of counter data alert */ 71 }; 72 unsigned long flags; /* 0-63 All indicators */ 73 }; 74 /* 8 - 15 */ 75 unsigned int cfvn:16; /* 64-79 Ctr First Version */ 76 unsigned int csvn:16; /* 80-95 Ctr Second Version */ 77 unsigned int cpu_speed:32; /* 96-127 CPU speed */ 78 /* 16 - 23 */ 79 unsigned long timestamp; /* 128-191 Timestamp (TOD) */ 80 /* 24 - 55 */ 81 union { 82 struct { 83 unsigned long progusage1; 84 unsigned long progusage2; 85 unsigned long progusage3; 86 unsigned long tod_base; 87 }; 88 unsigned long progusage[4]; 89 }; 90 /* 56 - 63 */ 91 unsigned int mach_type:16; /* Machine type */ 92 unsigned int res1:16; /* Reserved */ 93 unsigned int res2:32; /* Reserved */ 94}; 95 96/* Create the trailer data at the end of a page. 
*/
/* Fill in the trailer entry @te from the state of the current CPU.
 *
 * Only the fields listed below are written; everything else in @te is left
 * as the caller provided it.
 */
static void cfdiag_trailer(struct cf_trailer_entry *te)
{
	struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events);
	struct cpuid id;

	/* Machine type */
	get_cpu_id(&id);
	te->mach_type = id.machine;

	/* Counter facility first and second version number */
	te->cfvn = cpuhw->info.cfvn;
	te->csvn = cpuhw->info.csvn;

	/* CPU speed, flagged as valid only when known (nonzero) */
	te->cpu_speed = cfdiag_cpu_speed;
	if (te->cpu_speed)
		te->speed = 1;

	/* TOD clock base, always flagged as valid */
	te->clock_base = 1;
	te->tod_base = tod_clock_base.tod;

	/* Time of collection */
	te->timestamp = get_tod_clock_fast();
}

/* Read a counter set. The counter set number determines the counter set and
 * the CPUM-CF first and second version number determine the number of
 * available counters in each counter set.
 * Each counter set starts with header containing the counter set number and
 * the number of eight byte counters.
 *
 * The function returns the number of bytes occupied by this counter set
 * including the header.
 * If there is no counter in the counter set, this counter set is useless and
 * zero is returned in this case.
 *
 * Note that the counter sets may not be enabled or active and the stcctm
 * instruction might return error 3. Depending on error_ok value this is ok,
 * for example when called from cpumf_pmu_start() call back function.
129 */ 130static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, 131 size_t room, bool error_ok) 132{ 133 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 134 size_t ctrset_size, need = 0; 135 int rc = 3; /* Assume write failure */ 136 137 ctrdata->def = CF_DIAG_CTRSET_DEF; 138 ctrdata->set = ctrset; 139 ctrdata->res1 = 0; 140 ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); 141 142 if (ctrset_size) { /* Save data */ 143 need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); 144 if (need <= room) { 145 rc = ctr_stcctm(ctrset, ctrset_size, 146 (u64 *)(ctrdata + 1)); 147 } 148 if (rc != 3 || error_ok) 149 ctrdata->ctr = ctrset_size; 150 else 151 need = 0; 152 } 153 154 debug_sprintf_event(cf_dbg, 3, 155 "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" 156 " need %zd rc %d\n", __func__, ctrset, ctrset_size, 157 cpuhw->info.cfvn, cpuhw->info.csvn, need, rc); 158 return need; 159} 160 161static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { 162 [CPUMF_CTR_SET_BASIC] = 0x02, 163 [CPUMF_CTR_SET_USER] = 0x04, 164 [CPUMF_CTR_SET_CRYPTO] = 0x08, 165 [CPUMF_CTR_SET_EXT] = 0x01, 166 [CPUMF_CTR_SET_MT_DIAG] = 0x20, 167}; 168 169/* Read out all counter sets and save them in the provided data buffer. 170 * The last 64 byte host an artificial trailer entry. 
171 */ 172static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth, 173 bool error_ok) 174{ 175 struct cf_trailer_entry *trailer; 176 size_t offset = 0, done; 177 int i; 178 179 memset(data, 0, sz); 180 sz -= sizeof(*trailer); /* Always room for trailer */ 181 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 182 struct cf_ctrset_entry *ctrdata = data + offset; 183 184 if (!(auth & cpumf_ctr_ctl[i])) 185 continue; /* Counter set not authorized */ 186 187 done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok); 188 offset += done; 189 } 190 trailer = data + offset; 191 cfdiag_trailer(trailer); 192 return offset + sizeof(*trailer); 193} 194 195/* Calculate the difference for each counter in a counter set. */ 196static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters) 197{ 198 for (; --counters >= 0; ++pstart, ++pstop) 199 if (*pstop >= *pstart) 200 *pstop -= *pstart; 201 else 202 *pstop = *pstart - *pstop + 1; 203} 204 205/* Scan the counter sets and calculate the difference of each counter 206 * in each set. The result is the increment of each counter during the 207 * period the counter set has been activated. 208 * 209 * Return true on success. 
210 */ 211static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) 212{ 213 struct cf_trailer_entry *trailer_start, *trailer_stop; 214 struct cf_ctrset_entry *ctrstart, *ctrstop; 215 size_t offset = 0; 216 217 auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; 218 do { 219 ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); 220 ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); 221 222 if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { 223 pr_err_once("cpum_cf_diag counter set compare error " 224 "in set %i\n", ctrstart->set); 225 return 0; 226 } 227 auth &= ~cpumf_ctr_ctl[ctrstart->set]; 228 if (ctrstart->def == CF_DIAG_CTRSET_DEF) { 229 cfdiag_diffctrset((u64 *)(ctrstart + 1), 230 (u64 *)(ctrstop + 1), ctrstart->ctr); 231 offset += ctrstart->ctr * sizeof(u64) + 232 sizeof(*ctrstart); 233 } 234 } while (ctrstart->def && auth); 235 236 /* Save time_stamp from start of event in stop's trailer */ 237 trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); 238 trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset); 239 trailer_stop->progusage[0] = trailer_start->timestamp; 240 241 return 1; 242} 243 244static enum cpumf_ctr_set get_counter_set(u64 event) 245{ 246 int set = CPUMF_CTR_SET_MAX; 247 248 if (event < 32) 249 set = CPUMF_CTR_SET_BASIC; 250 else if (event < 64) 251 set = CPUMF_CTR_SET_USER; 252 else if (event < 128) 253 set = CPUMF_CTR_SET_CRYPTO; 254 else if (event < 288) 255 set = CPUMF_CTR_SET_EXT; 256 else if (event >= 448 && event < 496) 257 set = CPUMF_CTR_SET_MT_DIAG; 258 259 return set; 260} 261 262static int validate_ctr_version(const struct hw_perf_event *hwc, 263 enum cpumf_ctr_set set) 264{ 265 struct cpu_cf_events *cpuhw; 266 int err = 0; 267 u16 mtdiag_ctl; 268 269 cpuhw = &get_cpu_var(cpu_cf_events); 270 271 /* check required version for counter sets */ 272 switch (set) { 273 case CPUMF_CTR_SET_BASIC: 274 case CPUMF_CTR_SET_USER: 275 if (cpuhw->info.cfvn < 1) 276 err = -EOPNOTSUPP; 277 break; 
278 case CPUMF_CTR_SET_CRYPTO: 279 if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 && 280 hwc->config > 79) || 281 (cpuhw->info.csvn >= 6 && hwc->config > 83)) 282 err = -EOPNOTSUPP; 283 break; 284 case CPUMF_CTR_SET_EXT: 285 if (cpuhw->info.csvn < 1) 286 err = -EOPNOTSUPP; 287 if ((cpuhw->info.csvn == 1 && hwc->config > 159) || 288 (cpuhw->info.csvn == 2 && hwc->config > 175) || 289 (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5 290 && hwc->config > 255) || 291 (cpuhw->info.csvn >= 6 && hwc->config > 287)) 292 err = -EOPNOTSUPP; 293 break; 294 case CPUMF_CTR_SET_MT_DIAG: 295 if (cpuhw->info.csvn <= 3) 296 err = -EOPNOTSUPP; 297 /* 298 * MT-diagnostic counters are read-only. The counter set 299 * is automatically enabled and activated on all CPUs with 300 * multithreading (SMT). Deactivation of multithreading 301 * also disables the counter set. State changes are ignored 302 * by lcctl(). Because Linux controls SMT enablement through 303 * a kernel parameter only, the counter set is either disabled 304 * or enabled and active. 305 * 306 * Thus, the counters can only be used if SMT is on and the 307 * counter set is enabled and active. 308 */ 309 mtdiag_ctl = cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG]; 310 if (!((cpuhw->info.auth_ctl & mtdiag_ctl) && 311 (cpuhw->info.enable_ctl & mtdiag_ctl) && 312 (cpuhw->info.act_ctl & mtdiag_ctl))) 313 err = -EOPNOTSUPP; 314 break; 315 case CPUMF_CTR_SET_MAX: 316 err = -EOPNOTSUPP; 317 } 318 319 put_cpu_var(cpu_cf_events); 320 return err; 321} 322 323static int validate_ctr_auth(const struct hw_perf_event *hwc) 324{ 325 struct cpu_cf_events *cpuhw; 326 int err = 0; 327 328 cpuhw = &get_cpu_var(cpu_cf_events); 329 330 /* Check authorization for cpu counter sets. 331 * If the particular CPU counter set is not authorized, 332 * return with -ENOENT in order to fall back to other 333 * PMUs that might suffice the event request. 
334 */ 335 if (!(hwc->config_base & cpuhw->info.auth_ctl)) 336 err = -ENOENT; 337 338 put_cpu_var(cpu_cf_events); 339 return err; 340} 341 342/* 343 * Change the CPUMF state to active. 344 * Enable and activate the CPU-counter sets according 345 * to the per-cpu control state. 346 */ 347static void cpumf_pmu_enable(struct pmu *pmu) 348{ 349 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 350 int err; 351 352 if (cpuhw->flags & PMU_F_ENABLED) 353 return; 354 355 err = lcctl(cpuhw->state | cpuhw->dev_state); 356 if (err) { 357 pr_err("Enabling the performance measuring unit " 358 "failed with rc=%x\n", err); 359 return; 360 } 361 362 cpuhw->flags |= PMU_F_ENABLED; 363} 364 365/* 366 * Change the CPUMF state to inactive. 367 * Disable and enable (inactive) the CPU-counter sets according 368 * to the per-cpu control state. 369 */ 370static void cpumf_pmu_disable(struct pmu *pmu) 371{ 372 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 373 int err; 374 u64 inactive; 375 376 if (!(cpuhw->flags & PMU_F_ENABLED)) 377 return; 378 379 inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); 380 inactive |= cpuhw->dev_state; 381 err = lcctl(inactive); 382 if (err) { 383 pr_err("Disabling the performance measuring unit " 384 "failed with rc=%x\n", err); 385 return; 386 } 387 388 cpuhw->flags &= ~PMU_F_ENABLED; 389} 390 391 392/* Number of perf events counting hardware events */ 393static atomic_t num_events = ATOMIC_INIT(0); 394/* Used to avoid races in calling reserve/release_cpumf_hardware */ 395static DEFINE_MUTEX(pmc_reserve_mutex); 396 397/* Release the PMU if event is the last perf event */ 398static void hw_perf_event_destroy(struct perf_event *event) 399{ 400 if (!atomic_add_unless(&num_events, -1, 1)) { 401 mutex_lock(&pmc_reserve_mutex); 402 if (atomic_dec_return(&num_events) == 0) 403 __kernel_cpumcf_end(); 404 mutex_unlock(&pmc_reserve_mutex); 405 } 406} 407 408/* CPUMF <-> perf event mappings for kernel+userspace (basic set) */ 
409static const int cpumf_generic_events_basic[] = { 410 [PERF_COUNT_HW_CPU_CYCLES] = 0, 411 [PERF_COUNT_HW_INSTRUCTIONS] = 1, 412 [PERF_COUNT_HW_CACHE_REFERENCES] = -1, 413 [PERF_COUNT_HW_CACHE_MISSES] = -1, 414 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, 415 [PERF_COUNT_HW_BRANCH_MISSES] = -1, 416 [PERF_COUNT_HW_BUS_CYCLES] = -1, 417}; 418/* CPUMF <-> perf event mappings for userspace (problem-state set) */ 419static const int cpumf_generic_events_user[] = { 420 [PERF_COUNT_HW_CPU_CYCLES] = 32, 421 [PERF_COUNT_HW_INSTRUCTIONS] = 33, 422 [PERF_COUNT_HW_CACHE_REFERENCES] = -1, 423 [PERF_COUNT_HW_CACHE_MISSES] = -1, 424 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, 425 [PERF_COUNT_HW_BRANCH_MISSES] = -1, 426 [PERF_COUNT_HW_BUS_CYCLES] = -1, 427}; 428 429static void cpumf_hw_inuse(void) 430{ 431 mutex_lock(&pmc_reserve_mutex); 432 if (atomic_inc_return(&num_events) == 1) 433 __kernel_cpumcf_begin(); 434 mutex_unlock(&pmc_reserve_mutex); 435} 436 437static int __hw_perf_event_init(struct perf_event *event, unsigned int type) 438{ 439 struct perf_event_attr *attr = &event->attr; 440 struct hw_perf_event *hwc = &event->hw; 441 enum cpumf_ctr_set set; 442 int err = 0; 443 u64 ev; 444 445 switch (type) { 446 case PERF_TYPE_RAW: 447 /* Raw events are used to access counters directly, 448 * hence do not permit excludes */ 449 if (attr->exclude_kernel || attr->exclude_user || 450 attr->exclude_hv) 451 return -EOPNOTSUPP; 452 ev = attr->config; 453 break; 454 455 case PERF_TYPE_HARDWARE: 456 if (is_sampling_event(event)) /* No sampling support */ 457 return -ENOENT; 458 ev = attr->config; 459 /* Count user space (problem-state) only */ 460 if (!attr->exclude_user && attr->exclude_kernel) { 461 if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) 462 return -EOPNOTSUPP; 463 ev = cpumf_generic_events_user[ev]; 464 465 /* No support for kernel space counters only */ 466 } else if (!attr->exclude_kernel && attr->exclude_user) { 467 return -EOPNOTSUPP; 468 } else { /* Count user and kernel 
space */ 469 if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) 470 return -EOPNOTSUPP; 471 ev = cpumf_generic_events_basic[ev]; 472 } 473 break; 474 475 default: 476 return -ENOENT; 477 } 478 479 if (ev == -1) 480 return -ENOENT; 481 482 if (ev > PERF_CPUM_CF_MAX_CTR) 483 return -ENOENT; 484 485 /* Obtain the counter set to which the specified counter belongs */ 486 set = get_counter_set(ev); 487 switch (set) { 488 case CPUMF_CTR_SET_BASIC: 489 case CPUMF_CTR_SET_USER: 490 case CPUMF_CTR_SET_CRYPTO: 491 case CPUMF_CTR_SET_EXT: 492 case CPUMF_CTR_SET_MT_DIAG: 493 /* 494 * Use the hardware perf event structure to store the 495 * counter number in the 'config' member and the counter 496 * set number in the 'config_base' as bit mask. 497 * It is later used to enable/disable the counter(s). 498 */ 499 hwc->config = ev; 500 hwc->config_base = cpumf_ctr_ctl[set]; 501 break; 502 case CPUMF_CTR_SET_MAX: 503 /* The counter could not be associated to a counter set */ 504 return -EINVAL; 505 } 506 507 /* Initialize for using the CPU-measurement counter facility */ 508 cpumf_hw_inuse(); 509 event->destroy = hw_perf_event_destroy; 510 511 /* Finally, validate version and authorization of the counter set */ 512 err = validate_ctr_auth(hwc); 513 if (!err) 514 err = validate_ctr_version(hwc, set); 515 516 return err; 517} 518 519/* Events CPU_CYLCES and INSTRUCTIONS can be submitted with two different 520 * attribute::type values: 521 * - PERF_TYPE_HARDWARE: 522 * - pmu->type: 523 * Handle both type of invocations identical. They address the same hardware. 524 * The result is different when event modifiers exclude_kernel and/or 525 * exclude_user are also set. 
526 */ 527static int cpumf_pmu_event_type(struct perf_event *event) 528{ 529 u64 ev = event->attr.config; 530 531 if (cpumf_generic_events_basic[PERF_COUNT_HW_CPU_CYCLES] == ev || 532 cpumf_generic_events_basic[PERF_COUNT_HW_INSTRUCTIONS] == ev || 533 cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || 534 cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev) 535 return PERF_TYPE_HARDWARE; 536 return PERF_TYPE_RAW; 537} 538 539static int cpumf_pmu_event_init(struct perf_event *event) 540{ 541 unsigned int type = event->attr.type; 542 int err; 543 544 if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_RAW) 545 err = __hw_perf_event_init(event, type); 546 else if (event->pmu->type == type) 547 /* Registered as unknown PMU */ 548 err = __hw_perf_event_init(event, cpumf_pmu_event_type(event)); 549 else 550 return -ENOENT; 551 552 if (unlikely(err) && event->destroy) 553 event->destroy(event); 554 555 return err; 556} 557 558static int hw_perf_event_reset(struct perf_event *event) 559{ 560 u64 prev, new; 561 int err; 562 563 do { 564 prev = local64_read(&event->hw.prev_count); 565 err = ecctr(event->hw.config, &new); 566 if (err) { 567 if (err != 3) 568 break; 569 /* The counter is not (yet) available. This 570 * might happen if the counter set to which 571 * this counter belongs is in the disabled 572 * state. 573 */ 574 new = 0; 575 } 576 } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); 577 578 return err; 579} 580 581static void hw_perf_event_update(struct perf_event *event) 582{ 583 u64 prev, new, delta; 584 int err; 585 586 do { 587 prev = local64_read(&event->hw.prev_count); 588 err = ecctr(event->hw.config, &new); 589 if (err) 590 return; 591 } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); 592 593 delta = (prev <= new) ? 
new - prev 594 : (-1ULL - prev) + new + 1; /* overflow */ 595 local64_add(delta, &event->count); 596} 597 598static void cpumf_pmu_read(struct perf_event *event) 599{ 600 if (event->hw.state & PERF_HES_STOPPED) 601 return; 602 603 hw_perf_event_update(event); 604} 605 606static void cpumf_pmu_start(struct perf_event *event, int flags) 607{ 608 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 609 struct hw_perf_event *hwc = &event->hw; 610 int i; 611 612 if (!(hwc->state & PERF_HES_STOPPED)) 613 return; 614 615 hwc->state = 0; 616 617 /* (Re-)enable and activate the counter set */ 618 ctr_set_enable(&cpuhw->state, hwc->config_base); 619 ctr_set_start(&cpuhw->state, hwc->config_base); 620 621 /* The counter set to which this counter belongs can be already active. 622 * Because all counters in a set are active, the event->hw.prev_count 623 * needs to be synchronized. At this point, the counter set can be in 624 * the inactive or disabled state. 625 */ 626 if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { 627 cpuhw->usedss = cfdiag_getctr(cpuhw->start, 628 sizeof(cpuhw->start), 629 hwc->config_base, true); 630 } else { 631 hw_perf_event_reset(event); 632 } 633 634 /* Increment refcount for counter sets */ 635 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) 636 if ((hwc->config_base & cpumf_ctr_ctl[i])) 637 atomic_inc(&cpuhw->ctr_set[i]); 638} 639 640/* Create perf event sample with the counter sets as raw data. The sample 641 * is then pushed to the event subsystem and the function checks for 642 * possible event overflows. If an event overflow occurs, the PMU is 643 * stopped. 644 * 645 * Return non-zero if an event overflow occurred. 
646 */ 647static int cfdiag_push_sample(struct perf_event *event, 648 struct cpu_cf_events *cpuhw) 649{ 650 struct perf_sample_data data; 651 struct perf_raw_record raw; 652 struct pt_regs regs; 653 int overflow; 654 655 /* Setup perf sample */ 656 perf_sample_data_init(&data, 0, event->hw.last_period); 657 memset(®s, 0, sizeof(regs)); 658 memset(&raw, 0, sizeof(raw)); 659 660 if (event->attr.sample_type & PERF_SAMPLE_CPU) 661 data.cpu_entry.cpu = event->cpu; 662 if (event->attr.sample_type & PERF_SAMPLE_RAW) { 663 raw.frag.size = cpuhw->usedss; 664 raw.frag.data = cpuhw->stop; 665 raw.size = raw.frag.size; 666 data.raw = &raw; 667 } 668 669 overflow = perf_event_overflow(event, &data, ®s); 670 debug_sprintf_event(cf_dbg, 3, 671 "%s event %#llx sample_type %#llx raw %d ov %d\n", 672 __func__, event->hw.config, 673 event->attr.sample_type, raw.size, overflow); 674 if (overflow) 675 event->pmu->stop(event, 0); 676 677 perf_event_update_userpage(event); 678 return overflow; 679} 680 681static void cpumf_pmu_stop(struct perf_event *event, int flags) 682{ 683 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 684 struct hw_perf_event *hwc = &event->hw; 685 int i; 686 687 if (!(hwc->state & PERF_HES_STOPPED)) { 688 /* Decrement reference count for this counter set and if this 689 * is the last used counter in the set, clear activation 690 * control and set the counter set state to inactive. 
691 */ 692 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 693 if (!(hwc->config_base & cpumf_ctr_ctl[i])) 694 continue; 695 if (!atomic_dec_return(&cpuhw->ctr_set[i])) 696 ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]); 697 } 698 hwc->state |= PERF_HES_STOPPED; 699 } 700 701 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { 702 if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { 703 local64_inc(&event->count); 704 cpuhw->usedss = cfdiag_getctr(cpuhw->stop, 705 sizeof(cpuhw->stop), 706 event->hw.config_base, 707 false); 708 if (cfdiag_diffctr(cpuhw, event->hw.config_base)) 709 cfdiag_push_sample(event, cpuhw); 710 } else if (cpuhw->flags & PMU_F_RESERVED) { 711 /* Only update when PMU not hotplugged off */ 712 hw_perf_event_update(event); 713 } 714 hwc->state |= PERF_HES_UPTODATE; 715 } 716} 717 718static int cpumf_pmu_add(struct perf_event *event, int flags) 719{ 720 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 721 722 ctr_set_enable(&cpuhw->state, event->hw.config_base); 723 event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; 724 725 if (flags & PERF_EF_START) 726 cpumf_pmu_start(event, PERF_EF_RELOAD); 727 728 return 0; 729} 730 731static void cpumf_pmu_del(struct perf_event *event, int flags) 732{ 733 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 734 int i; 735 736 cpumf_pmu_stop(event, PERF_EF_UPDATE); 737 738 /* Check if any counter in the counter set is still used. If not used, 739 * change the counter set to the disabled state. This also clears the 740 * content of all counters in the set. 741 * 742 * When a new perf event has been added but not yet started, this can 743 * clear enable control and resets all counters in a set. Therefore, 744 * cpumf_pmu_start() always has to reenable a counter set. 
745 */ 746 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) 747 if (!atomic_read(&cpuhw->ctr_set[i])) 748 ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]); 749} 750 751/* Performance monitoring unit for s390x */ 752static struct pmu cpumf_pmu = { 753 .task_ctx_nr = perf_sw_context, 754 .capabilities = PERF_PMU_CAP_NO_INTERRUPT, 755 .pmu_enable = cpumf_pmu_enable, 756 .pmu_disable = cpumf_pmu_disable, 757 .event_init = cpumf_pmu_event_init, 758 .add = cpumf_pmu_add, 759 .del = cpumf_pmu_del, 760 .start = cpumf_pmu_start, 761 .stop = cpumf_pmu_stop, 762 .read = cpumf_pmu_read, 763}; 764 765static int cfset_init(void); 766static int __init cpumf_pmu_init(void) 767{ 768 int rc; 769 770 if (!kernel_cpumcf_avail()) 771 return -ENODEV; 772 773 /* Setup s390dbf facility */ 774 cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); 775 if (!cf_dbg) { 776 pr_err("Registration of s390dbf(cpum_cf) failed\n"); 777 return -ENOMEM; 778 } 779 debug_register_view(cf_dbg, &debug_sprintf_view); 780 781 cpumf_pmu.attr_groups = cpumf_cf_event_group(); 782 rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); 783 if (rc) { 784 debug_unregister_view(cf_dbg, &debug_sprintf_view); 785 debug_unregister(cf_dbg); 786 pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); 787 } else if (stccm_avail()) { /* Setup counter set device */ 788 cfset_init(); 789 } 790 return rc; 791} 792 793/* Support for the CPU Measurement Facility counter set extraction using 794 * device /dev/hwctr. This allows user space programs to extract complete 795 * counter set via normal file operations. 
796 */ 797 798static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Access count */ 799static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */ 800struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ 801 unsigned int sets; /* Counter set bit mask */ 802 atomic_t cpus_ack; /* # CPUs successfully executed func */ 803}; 804 805static struct cfset_session { /* CPUs and counter set bit mask */ 806 struct list_head head; /* Head of list of active processes */ 807} cfset_session = { 808 .head = LIST_HEAD_INIT(cfset_session.head) 809}; 810 811struct cfset_request { /* CPUs and counter set bit mask */ 812 unsigned long ctrset; /* Bit mask of counter set to read */ 813 cpumask_t mask; /* CPU mask to read from */ 814 struct list_head node; /* Chain to cfset_session.head */ 815}; 816 817static void cfset_session_init(void) 818{ 819 INIT_LIST_HEAD(&cfset_session.head); 820} 821 822/* Remove current request from global bookkeeping. Maintain a counter set bit 823 * mask on a per CPU basis. 824 * Done in process context under mutex protection. 825 */ 826static void cfset_session_del(struct cfset_request *p) 827{ 828 list_del(&p->node); 829} 830 831/* Add current request to global bookkeeping. Maintain a counter set bit mask 832 * on a per CPU basis. 833 * Done in process context under mutex protection. 834 */ 835static void cfset_session_add(struct cfset_request *p) 836{ 837 list_add(&p->node, &cfset_session.head); 838} 839 840/* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access 841 * path is currently used. 842 * The cpu_cf_events::dev_state is used to denote counter sets in use by this 843 * interface. It is always or'ed in. If this interface is not active, its 844 * value is zero and no additional counter sets will be included. 845 * 846 * The cpu_cf_events::state is used by the perf_event_open SVC and remains 847 * unchanged. 
 *
 * perf_pmu_enable() and perf_pmu_disable() and their call backs
 * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the
 * performance measurement subsystem to enable per process
 * CPU Measurement counter facility.
 * The XXX_enable() and XXX_disable() functions are used to turn off
 * x86 performance monitoring interrupt (PMI) during scheduling.
 * s390 uses these calls to temporarily stop and resume the active CPU
 * counter sets during scheduling.
 *
 * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr
 * device access.  The perf_event_open() SVC interface makes a lot of effort
 * to only run the counters while the calling process is actively scheduled
 * to run.
 * When /dev/hwctr interface is also used at the same time, the counter sets
 * will keep running, even when the process is scheduled off a CPU.
 * However this is not a problem and does not lead to wrong counter values
 * for the perf_event_open() SVC.  The current counter value will be recorded
 * during schedule-in. At schedule-out time the current counter value is
 * extracted again and the delta is calculated and added to the event.
868 */ 869/* Stop all counter sets via ioctl interface */ 870static void cfset_ioctl_off(void *parm) 871{ 872 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 873 struct cfset_call_on_cpu_parm *p = parm; 874 int rc; 875 876 /* Check if any counter set used by /dev/hwc */ 877 for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) 878 if ((p->sets & cpumf_ctr_ctl[rc])) { 879 if (!atomic_dec_return(&cpuhw->ctr_set[rc])) { 880 ctr_set_disable(&cpuhw->dev_state, 881 cpumf_ctr_ctl[rc]); 882 ctr_set_stop(&cpuhw->dev_state, 883 cpumf_ctr_ctl[rc]); 884 } 885 } 886 /* Keep perf_event_open counter sets */ 887 rc = lcctl(cpuhw->dev_state | cpuhw->state); 888 if (rc) 889 pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", 890 cpuhw->state, S390_HWCTR_DEVICE, rc); 891 if (!cpuhw->dev_state) 892 cpuhw->flags &= ~PMU_F_IN_USE; 893 debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", 894 __func__, rc, cpuhw->state, cpuhw->dev_state); 895} 896 897/* Start counter sets on particular CPU */ 898static void cfset_ioctl_on(void *parm) 899{ 900 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 901 struct cfset_call_on_cpu_parm *p = parm; 902 int rc; 903 904 cpuhw->flags |= PMU_F_IN_USE; 905 ctr_set_enable(&cpuhw->dev_state, p->sets); 906 ctr_set_start(&cpuhw->dev_state, p->sets); 907 for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) 908 if ((p->sets & cpumf_ctr_ctl[rc])) 909 atomic_inc(&cpuhw->ctr_set[rc]); 910 rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */ 911 if (!rc) 912 atomic_inc(&p->cpus_ack); 913 else 914 pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", 915 cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); 916 debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", 917 __func__, rc, cpuhw->state, cpuhw->dev_state); 918} 919 920static void cfset_release_cpu(void *p) 921{ 922 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 923 int rc; 924 925 
debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n", 926 __func__, cpuhw->state, cpuhw->dev_state); 927 cpuhw->dev_state = 0; 928 rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ 929 if (rc) 930 pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", 931 cpuhw->state, S390_HWCTR_DEVICE, rc); 932} 933 934/* This modifies the process CPU mask to adopt it to the currently online 935 * CPUs. Offline CPUs can not be addresses. This call terminates the access 936 * and is usually followed by close() or a new iotcl(..., START, ...) which 937 * creates a new request structure. 938 */ 939static void cfset_all_stop(struct cfset_request *req) 940{ 941 struct cfset_call_on_cpu_parm p = { 942 .sets = req->ctrset, 943 }; 944 945 cpumask_and(&req->mask, &req->mask, cpu_online_mask); 946 on_each_cpu_mask(&req->mask, cfset_ioctl_off, &p, 1); 947} 948 949/* Release function is also called when application gets terminated without 950 * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. 
951 */ 952static int cfset_release(struct inode *inode, struct file *file) 953{ 954 mutex_lock(&cfset_ctrset_mutex); 955 /* Open followed by close/exit has no private_data */ 956 if (file->private_data) { 957 cfset_all_stop(file->private_data); 958 cfset_session_del(file->private_data); 959 kfree(file->private_data); 960 file->private_data = NULL; 961 } 962 if (!atomic_dec_return(&cfset_opencnt)) 963 on_each_cpu(cfset_release_cpu, NULL, 1); 964 mutex_unlock(&cfset_ctrset_mutex); 965 966 hw_perf_event_destroy(NULL); 967 return 0; 968} 969 970static int cfset_open(struct inode *inode, struct file *file) 971{ 972 if (!capable(CAP_SYS_ADMIN)) 973 return -EPERM; 974 mutex_lock(&cfset_ctrset_mutex); 975 if (atomic_inc_return(&cfset_opencnt) == 1) 976 cfset_session_init(); 977 mutex_unlock(&cfset_ctrset_mutex); 978 979 cpumf_hw_inuse(); 980 file->private_data = NULL; 981 /* nonseekable_open() never fails */ 982 return nonseekable_open(inode, file); 983} 984 985static int cfset_all_start(struct cfset_request *req) 986{ 987 struct cfset_call_on_cpu_parm p = { 988 .sets = req->ctrset, 989 .cpus_ack = ATOMIC_INIT(0), 990 }; 991 cpumask_var_t mask; 992 int rc = 0; 993 994 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 995 return -ENOMEM; 996 cpumask_and(mask, &req->mask, cpu_online_mask); 997 on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); 998 if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { 999 on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); 1000 rc = -EIO; 1001 debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__); 1002 } 1003 free_cpumask_var(mask); 1004 return rc; 1005} 1006 1007 1008/* Return the maximum required space for all possible CPUs in case one 1009 * CPU will be onlined during the START, READ, STOP cycles. 1010 * To find out the size of the counter sets, any one CPU will do. They 1011 * all have the same counter sets. 
1012 */ 1013static size_t cfset_needspace(unsigned int sets) 1014{ 1015 struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); 1016 size_t bytes = 0; 1017 int i; 1018 1019 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 1020 if (!(sets & cpumf_ctr_ctl[i])) 1021 continue; 1022 bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + 1023 sizeof(((struct s390_ctrset_setdata *)0)->set) + 1024 sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); 1025 } 1026 bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * 1027 (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + 1028 sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); 1029 put_cpu_ptr(&cpu_cf_events); 1030 return bytes; 1031} 1032 1033static int cfset_all_copy(unsigned long arg, cpumask_t *mask) 1034{ 1035 struct s390_ctrset_read __user *ctrset_read; 1036 unsigned int cpu, cpus, rc; 1037 void __user *uptr; 1038 1039 ctrset_read = (struct s390_ctrset_read __user *)arg; 1040 uptr = ctrset_read->data; 1041 for_each_cpu(cpu, mask) { 1042 struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu); 1043 struct s390_ctrset_cpudata __user *ctrset_cpudata; 1044 1045 ctrset_cpudata = uptr; 1046 rc = put_user(cpu, &ctrset_cpudata->cpu_nr); 1047 rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); 1048 rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, 1049 cpuhw->used); 1050 if (rc) 1051 return -EFAULT; 1052 uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; 1053 cond_resched(); 1054 } 1055 cpus = cpumask_weight(mask); 1056 if (put_user(cpus, &ctrset_read->no_cpus)) 1057 return -EFAULT; 1058 debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__, 1059 uptr - (void __user *)ctrset_read->data); 1060 return 0; 1061} 1062 1063static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, 1064 int ctrset_size, size_t room) 1065{ 1066 size_t need = 0; 1067 int rc = -1; 1068 1069 need = sizeof(*p) + sizeof(u64) * ctrset_size; 1070 if (need <= room) { 1071 
p->set = cpumf_ctr_ctl[ctrset]; 1072 p->no_cnts = ctrset_size; 1073 rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); 1074 if (rc == 3) /* Nothing stored */ 1075 need = 0; 1076 } 1077 return need; 1078} 1079 1080/* Read all counter sets. */ 1081static void cfset_cpu_read(void *parm) 1082{ 1083 struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); 1084 struct cfset_call_on_cpu_parm *p = parm; 1085 int set, set_size; 1086 size_t space; 1087 1088 /* No data saved yet */ 1089 cpuhw->used = 0; 1090 cpuhw->sets = 0; 1091 memset(cpuhw->data, 0, sizeof(cpuhw->data)); 1092 1093 /* Scan the counter sets */ 1094 for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { 1095 struct s390_ctrset_setdata *sp = (void *)cpuhw->data + 1096 cpuhw->used; 1097 1098 if (!(p->sets & cpumf_ctr_ctl[set])) 1099 continue; /* Counter set not in list */ 1100 set_size = cpum_cf_ctrset_size(set, &cpuhw->info); 1101 space = sizeof(cpuhw->data) - cpuhw->used; 1102 space = cfset_cpuset_read(sp, set, set_size, space); 1103 if (space) { 1104 cpuhw->used += space; 1105 cpuhw->sets += 1; 1106 } 1107 } 1108 debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__, 1109 cpuhw->sets, cpuhw->used); 1110} 1111 1112static int cfset_all_read(unsigned long arg, struct cfset_request *req) 1113{ 1114 struct cfset_call_on_cpu_parm p; 1115 cpumask_var_t mask; 1116 int rc; 1117 1118 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 1119 return -ENOMEM; 1120 1121 p.sets = req->ctrset; 1122 cpumask_and(mask, &req->mask, cpu_online_mask); 1123 on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); 1124 rc = cfset_all_copy(arg, mask); 1125 free_cpumask_var(mask); 1126 return rc; 1127} 1128 1129static long cfset_ioctl_read(unsigned long arg, struct cfset_request *req) 1130{ 1131 struct s390_ctrset_read read; 1132 int ret = -ENODATA; 1133 1134 if (req && req->ctrset) { 1135 if (copy_from_user(&read, (char __user *)arg, sizeof(read))) 1136 return -EFAULT; 1137 ret = cfset_all_read(arg, req); 1138 } 1139 return 
ret; 1140} 1141 1142static long cfset_ioctl_stop(struct file *file) 1143{ 1144 struct cfset_request *req = file->private_data; 1145 int ret = -ENXIO; 1146 1147 if (req) { 1148 cfset_all_stop(req); 1149 cfset_session_del(req); 1150 kfree(req); 1151 file->private_data = NULL; 1152 ret = 0; 1153 } 1154 return ret; 1155} 1156 1157static long cfset_ioctl_start(unsigned long arg, struct file *file) 1158{ 1159 struct s390_ctrset_start __user *ustart; 1160 struct s390_ctrset_start start; 1161 struct cfset_request *preq; 1162 void __user *umask; 1163 unsigned int len; 1164 int ret = 0; 1165 size_t need; 1166 1167 if (file->private_data) 1168 return -EBUSY; 1169 ustart = (struct s390_ctrset_start __user *)arg; 1170 if (copy_from_user(&start, ustart, sizeof(start))) 1171 return -EFAULT; 1172 if (start.version != S390_HWCTR_START_VERSION) 1173 return -EINVAL; 1174 if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | 1175 cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | 1176 cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | 1177 cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | 1178 cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) 1179 return -EINVAL; /* Invalid counter set */ 1180 if (!start.counter_sets) 1181 return -EINVAL; /* No counter set at all? 
*/ 1182 1183 preq = kzalloc(sizeof(*preq), GFP_KERNEL); 1184 if (!preq) 1185 return -ENOMEM; 1186 cpumask_clear(&preq->mask); 1187 len = min_t(u64, start.cpumask_len, cpumask_size()); 1188 umask = (void __user *)start.cpumask; 1189 if (copy_from_user(&preq->mask, umask, len)) { 1190 kfree(preq); 1191 return -EFAULT; 1192 } 1193 if (cpumask_empty(&preq->mask)) { 1194 kfree(preq); 1195 return -EINVAL; 1196 } 1197 need = cfset_needspace(start.counter_sets); 1198 if (put_user(need, &ustart->data_bytes)) { 1199 kfree(preq); 1200 return -EFAULT; 1201 } 1202 preq->ctrset = start.counter_sets; 1203 ret = cfset_all_start(preq); 1204 if (!ret) { 1205 cfset_session_add(preq); 1206 file->private_data = preq; 1207 debug_sprintf_event(cf_dbg, 4, "%s set %#lx need %ld ret %d\n", 1208 __func__, preq->ctrset, need, ret); 1209 } else { 1210 kfree(preq); 1211 } 1212 return ret; 1213} 1214 1215/* Entry point to the /dev/hwctr device interface. 1216 * The ioctl system call supports three subcommands: 1217 * S390_HWCTR_START: Start the specified counter sets on a CPU list. The 1218 * counter set keeps running until explicitly stopped. Returns the number 1219 * of bytes needed to store the counter values. If another S390_HWCTR_START 1220 * ioctl subcommand is called without a previous S390_HWCTR_STOP stop 1221 * command on the same file descriptor, -EBUSY is returned. 1222 * S390_HWCTR_READ: Read the counter set values from specified CPU list given 1223 * with the S390_HWCTR_START command. 1224 * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the 1225 * previous S390_HWCTR_START subcommand. 
1226 */ 1227static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1228{ 1229 int ret; 1230 1231 cpus_read_lock(); 1232 mutex_lock(&cfset_ctrset_mutex); 1233 switch (cmd) { 1234 case S390_HWCTR_START: 1235 ret = cfset_ioctl_start(arg, file); 1236 break; 1237 case S390_HWCTR_STOP: 1238 ret = cfset_ioctl_stop(file); 1239 break; 1240 case S390_HWCTR_READ: 1241 ret = cfset_ioctl_read(arg, file->private_data); 1242 break; 1243 default: 1244 ret = -ENOTTY; 1245 break; 1246 } 1247 mutex_unlock(&cfset_ctrset_mutex); 1248 cpus_read_unlock(); 1249 return ret; 1250} 1251 1252static const struct file_operations cfset_fops = { 1253 .owner = THIS_MODULE, 1254 .open = cfset_open, 1255 .release = cfset_release, 1256 .unlocked_ioctl = cfset_ioctl, 1257 .compat_ioctl = cfset_ioctl, 1258 .llseek = no_llseek 1259}; 1260 1261static struct miscdevice cfset_dev = { 1262 .name = S390_HWCTR_DEVICE, 1263 .minor = MISC_DYNAMIC_MINOR, 1264 .fops = &cfset_fops, 1265}; 1266 1267/* Hotplug add of a CPU. Scan through all active processes and add 1268 * that CPU to the list of CPUs supplied with ioctl(..., START, ...). 1269 */ 1270int cfset_online_cpu(unsigned int cpu) 1271{ 1272 struct cfset_call_on_cpu_parm p; 1273 struct cfset_request *rp; 1274 1275 mutex_lock(&cfset_ctrset_mutex); 1276 if (!list_empty(&cfset_session.head)) { 1277 list_for_each_entry(rp, &cfset_session.head, node) { 1278 p.sets = rp->ctrset; 1279 cfset_ioctl_on(&p); 1280 cpumask_set_cpu(cpu, &rp->mask); 1281 } 1282 } 1283 mutex_unlock(&cfset_ctrset_mutex); 1284 return 0; 1285} 1286 1287/* Hotplug remove of a CPU. Scan through all active processes and clear 1288 * that CPU from the list of CPUs supplied with ioctl(..., START, ...). 
1289 */ 1290int cfset_offline_cpu(unsigned int cpu) 1291{ 1292 struct cfset_call_on_cpu_parm p; 1293 struct cfset_request *rp; 1294 1295 mutex_lock(&cfset_ctrset_mutex); 1296 if (!list_empty(&cfset_session.head)) { 1297 list_for_each_entry(rp, &cfset_session.head, node) { 1298 p.sets = rp->ctrset; 1299 cfset_ioctl_off(&p); 1300 cpumask_clear_cpu(cpu, &rp->mask); 1301 } 1302 } 1303 mutex_unlock(&cfset_ctrset_mutex); 1304 return 0; 1305} 1306 1307static void cfdiag_read(struct perf_event *event) 1308{ 1309 debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__, 1310 event->attr.config, local64_read(&event->count)); 1311} 1312 1313static int get_authctrsets(void) 1314{ 1315 struct cpu_cf_events *cpuhw; 1316 unsigned long auth = 0; 1317 enum cpumf_ctr_set i; 1318 1319 cpuhw = &get_cpu_var(cpu_cf_events); 1320 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 1321 if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) 1322 auth |= cpumf_ctr_ctl[i]; 1323 } 1324 put_cpu_var(cpu_cf_events); 1325 return auth; 1326} 1327 1328/* Setup the event. Test for authorized counter sets and only include counter 1329 * sets which are authorized at the time of the setup. Including unauthorized 1330 * counter sets result in specification exception (and panic). 1331 */ 1332static int cfdiag_event_init2(struct perf_event *event) 1333{ 1334 struct perf_event_attr *attr = &event->attr; 1335 int err = 0; 1336 1337 /* Set sample_period to indicate sampling */ 1338 event->hw.config = attr->config; 1339 event->hw.sample_period = attr->sample_period; 1340 local64_set(&event->hw.period_left, event->hw.sample_period); 1341 local64_set(&event->count, 0); 1342 event->hw.last_period = event->hw.sample_period; 1343 1344 /* Add all authorized counter sets to config_base. The 1345 * the hardware init function is either called per-cpu or just once 1346 * for all CPUS (event->cpu == -1). 
This depends on the whether 1347 * counting is started for all CPUs or on a per workload base where 1348 * the perf event moves from one CPU to another CPU. 1349 * Checking the authorization on any CPU is fine as the hardware 1350 * applies the same authorization settings to all CPUs. 1351 */ 1352 event->hw.config_base = get_authctrsets(); 1353 1354 /* No authorized counter sets, nothing to count/sample */ 1355 if (!event->hw.config_base) 1356 err = -EINVAL; 1357 1358 debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n", 1359 __func__, err, event->hw.config_base); 1360 return err; 1361} 1362 1363static int cfdiag_event_init(struct perf_event *event) 1364{ 1365 struct perf_event_attr *attr = &event->attr; 1366 int err = -ENOENT; 1367 1368 if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || 1369 event->attr.type != event->pmu->type) 1370 goto out; 1371 1372 /* Raw events are used to access counters directly, 1373 * hence do not permit excludes. 1374 * This event is useless without PERF_SAMPLE_RAW to return counter set 1375 * values as raw data. 1376 */ 1377 if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || 1378 !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { 1379 err = -EOPNOTSUPP; 1380 goto out; 1381 } 1382 1383 /* Initialize for using the CPU-measurement counter facility */ 1384 cpumf_hw_inuse(); 1385 event->destroy = hw_perf_event_destroy; 1386 1387 err = cfdiag_event_init2(event); 1388 if (unlikely(err)) 1389 event->destroy(event); 1390out: 1391 return err; 1392} 1393 1394/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used 1395 * to collect the complete counter sets for a scheduled process. Target 1396 * are complete counter sets attached as raw data to the artificial event. 1397 * This results in complete counter sets available when a process is 1398 * scheduled. Contains the delta of every counter while the process was 1399 * running. 
1400 */ 1401CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); 1402 1403static struct attribute *cfdiag_events_attr[] = { 1404 CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), 1405 NULL, 1406}; 1407 1408PMU_FORMAT_ATTR(event, "config:0-63"); 1409 1410static struct attribute *cfdiag_format_attr[] = { 1411 &format_attr_event.attr, 1412 NULL, 1413}; 1414 1415static struct attribute_group cfdiag_events_group = { 1416 .name = "events", 1417 .attrs = cfdiag_events_attr, 1418}; 1419static struct attribute_group cfdiag_format_group = { 1420 .name = "format", 1421 .attrs = cfdiag_format_attr, 1422}; 1423static const struct attribute_group *cfdiag_attr_groups[] = { 1424 &cfdiag_events_group, 1425 &cfdiag_format_group, 1426 NULL, 1427}; 1428 1429/* Performance monitoring unit for event CF_DIAG. Since this event 1430 * is also started and stopped via the perf_event_open() system call, use 1431 * the same event enable/disable call back functions. They do not 1432 * have a pointer to the perf_event strcture as first parameter. 1433 * 1434 * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. 1435 * Reuse them and distinguish the event (always first parameter) via 1436 * 'config' member. 1437 */ 1438static struct pmu cf_diag = { 1439 .task_ctx_nr = perf_sw_context, 1440 .event_init = cfdiag_event_init, 1441 .pmu_enable = cpumf_pmu_enable, 1442 .pmu_disable = cpumf_pmu_disable, 1443 .add = cpumf_pmu_add, 1444 .del = cpumf_pmu_del, 1445 .start = cpumf_pmu_start, 1446 .stop = cpumf_pmu_stop, 1447 .read = cfdiag_read, 1448 1449 .attr_groups = cfdiag_attr_groups 1450}; 1451 1452/* Calculate memory needed to store all counter sets together with header and 1453 * trailer data. This is independent of the counter set authorization which 1454 * can vary depending on the configuration. 
1455 */ 1456static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) 1457{ 1458 size_t max_size = sizeof(struct cf_trailer_entry); 1459 enum cpumf_ctr_set i; 1460 1461 for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { 1462 size_t size = cpum_cf_ctrset_size(i, info); 1463 1464 if (size) 1465 max_size += size * sizeof(u64) + 1466 sizeof(struct cf_ctrset_entry); 1467 } 1468 return max_size; 1469} 1470 1471/* Get the CPU speed, try sampling facility first and CPU attributes second. */ 1472static void cfdiag_get_cpu_speed(void) 1473{ 1474 unsigned long mhz; 1475 1476 if (cpum_sf_avail()) { /* Sampling facility first */ 1477 struct hws_qsi_info_block si; 1478 1479 memset(&si, 0, sizeof(si)); 1480 if (!qsi(&si)) { 1481 cfdiag_cpu_speed = si.cpu_speed; 1482 return; 1483 } 1484 } 1485 1486 /* Fallback: CPU speed extract static part. Used in case 1487 * CPU Measurement Sampling Facility is turned off. 1488 */ 1489 mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); 1490 if (mhz != -1UL) 1491 cfdiag_cpu_speed = mhz & 0xffffffff; 1492} 1493 1494static int cfset_init(void) 1495{ 1496 struct cpumf_ctr_info info; 1497 size_t need; 1498 int rc; 1499 1500 if (qctri(&info)) 1501 return -ENODEV; 1502 1503 cfdiag_get_cpu_speed(); 1504 /* Make sure the counter set data fits into predefined buffer. */ 1505 need = cfdiag_maxsize(&info); 1506 if (need > sizeof(((struct cpu_cf_events *)0)->start)) { 1507 pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", 1508 need); 1509 return -ENOMEM; 1510 } 1511 1512 rc = misc_register(&cfset_dev); 1513 if (rc) { 1514 pr_err("Registration of /dev/%s failed rc=%i\n", 1515 cfset_dev.name, rc); 1516 goto out; 1517 } 1518 1519 rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); 1520 if (rc) { 1521 misc_deregister(&cfset_dev); 1522 pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", 1523 rc); 1524 } 1525out: 1526 return rc; 1527} 1528 1529device_initcall(cpumf_pmu_init);