/* $Id: x86.c,v 1.165 2007/10/06 13:02:07 mikpe Exp $ * x86/x86_64 performance-monitoring counters driver. * * Copyright (C) 1999-2007 Mikael Pettersson */ #include #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) #include #endif #include #include #include #include #include #include #undef MSR_P6_PERFCTR0 #undef MSR_IA32_MISC_ENABLE #include #include struct hw_interrupt_type; #include #include /* cpu_khz */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) && defined(CONFIG_X86_LOCAL_APIC) #include #endif #include "cpumask.h" #include "x86_tests.h" /* Support for lazy evntsel and perfctr MSR updates. */ struct per_cpu_cache { /* roughly a subset of perfctr_cpu_state */ unsigned int id; /* cache owner id */ #ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT unsigned int interrupts_masked; #endif struct { /* NOTE: these caches have physical indices, not virtual */ unsigned int evntsel[18]; unsigned int escr[0x3E2-0x3A0]; unsigned int pebs_enable; unsigned int pebs_matrix_vert; } control; }; static DEFINE_PER_CPU(struct per_cpu_cache, per_cpu_cache); #define __get_cpu_cache(cpu) (&per_cpu(per_cpu_cache, cpu)) #define get_cpu_cache() (&__get_cpu_var(per_cpu_cache)) /* Structure for counter snapshots, as 32-bit values. */ struct perfctr_low_ctrs { unsigned int tsc; unsigned int pmc[18]; }; /* Structures for describing the set of PMU MSRs. */ struct perfctr_msr_range { unsigned int first_msr; unsigned int nr_msrs; }; struct perfctr_pmu_msrs { const struct perfctr_msr_range *perfctrs; /* for {reserve,release}_perfctr_nmi() */ const struct perfctr_msr_range *evntsels; /* for {reserve,release}_evntsel_nmi() */ const struct perfctr_msr_range *extras; }; /* Intel P5, Cyrix 6x86MX/MII/III, Centaur WinChip C6/2/3 */ #define MSR_P5_CESR 0x11 #define MSR_P5_CTR0 0x12 /* .. 0x13 */ #define P5_CESR_CPL 0x00C0 #define P5_CESR_RESERVED (~0x01FF) #define MII_CESR_RESERVED (~0x05FF) #define C6_CESR_RESERVED (~0x00FF) /* Intel P6, VIA C3 */ #define MSR_P6_PERFCTR0 0xC1 /* .. 0xC2 */ #define MSR_P6_EVNTSEL0 0x186 /* .. 0x187 */ #define P6_EVNTSEL_ENABLE 0x00400000 #define P6_EVNTSEL_INT 0x00100000 #define P6_EVNTSEL_CPL 0x00030000 #define P6_EVNTSEL_RESERVED 0x00280000 #define VC3_EVNTSEL1_RESERVED (~0x1FF) /* AMD K7 */ #define MSR_K7_EVNTSEL0 0xC0010000 /* .. 0xC0010003 */ #define MSR_K7_PERFCTR0 0xC0010004 /* .. 0xC0010007 */ /* AMD K8 */ #define IS_K8_NB_EVENT(EVNTSEL) ((((EVNTSEL) >> 5) & 0x7) == 0x7) /* Intel P4, Intel Pentium M, Intel Core */ #define MSR_IA32_MISC_ENABLE 0x1A0 #define MSR_IA32_MISC_ENABLE_PERF_AVAIL (1<<7) /* read-only status bit */ #define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1<<12) /* read-only status bit */ /* Intel P4 */ #define MSR_P4_PERFCTR0 0x300 /* .. 0x311 */ #define MSR_P4_CCCR0 0x360 /* .. 0x371 */ #define MSR_P4_ESCR0 0x3A0 /* .. 
0x3E1, with some gaps */ #define MSR_P4_PEBS_ENABLE 0x3F1 #define P4_PE_REPLAY_TAG_BITS 0x00000607 #define P4_PE_UOP_TAG 0x01000000 #define P4_PE_RESERVED 0xFEFFF9F8 /* only allow ReplayTagging */ #define MSR_P4_PEBS_MATRIX_VERT 0x3F2 #define P4_PMV_REPLAY_TAG_BITS 0x00000003 #define P4_PMV_RESERVED 0xFFFFFFFC #define P4_CCCR_OVF 0x80000000 #define P4_CCCR_CASCADE 0x40000000 #define P4_CCCR_OVF_PMI_T1 0x08000000 #define P4_CCCR_OVF_PMI_T0 0x04000000 #define P4_CCCR_FORCE_OVF 0x02000000 #define P4_CCCR_ACTIVE_THREAD 0x00030000 #define P4_CCCR_ENABLE 0x00001000 #define P4_CCCR_ESCR_SELECT(X) (((X) >> 13) & 0x7) #define P4_CCCR_EXTENDED_CASCADE 0x00000800 #define P4_CCCR_RESERVED (0x300007FF|P4_CCCR_OVF|P4_CCCR_OVF_PMI_T1) #define P4_ESCR_CPL_T1 0x00000003 #define P4_ESCR_CPL_T0 0x0000000C #define P4_ESCR_TAG_ENABLE 0x00000010 #define P4_ESCR_RESERVED (0x80000000) #define P4_FAST_RDPMC 0x80000000 #define P4_MASK_FAST_RDPMC 0x0000001F /* we only need low 5 bits */ /* missing from */ #define cpu_has_msr boot_cpu_has(X86_FEATURE_MSR) #define rdmsr_low(msr,low) \ __asm__ __volatile__("rdmsr" : "=a"(low) : "c"(msr) : "edx") #define rdpmc_low(ctr,low) \ __asm__ __volatile__("rdpmc" : "=a"(low) : "c"(ctr) : "edx") static void clear_msr_range(unsigned int base, unsigned int n) { unsigned int i; for(i = 0; i < n; ++i) wrmsr(base+i, 0, 0); } static inline void set_in_cr4_local(unsigned int mask) { write_cr4(read_cr4() | mask); } static inline void clear_in_cr4_local(unsigned int mask) { write_cr4(read_cr4() & ~mask); } static unsigned int new_id(void) { static DEFINE_SPINLOCK(lock); static unsigned int counter; int id; spin_lock(&lock); id = ++counter; spin_unlock(&lock); return id; } #if defined(CONFIG_X86_LOCAL_APIC) static void perfctr_default_ihandler(unsigned long pc) { } static perfctr_ihandler_t perfctr_ihandler = perfctr_default_ihandler; asmlinkage void smp_perfctr_interrupt(struct pt_regs *regs) { /* PREEMPT note: invoked via an interrupt gate, which masks interrupts. We're still on the originating CPU. */ /* XXX: recursive interrupts? delay the ACK, mask LVTPC, or queue? */ ack_APIC_irq(); if (get_cpu_cache()->interrupts_masked) return; irq_enter(); (*perfctr_ihandler)(instruction_pointer(regs)); irq_exit(); } void perfctr_cpu_set_ihandler(perfctr_ihandler_t ihandler) { perfctr_ihandler = ihandler ? 
ihandler : perfctr_default_ihandler; } static inline void perfctr_cpu_mask_interrupts(struct per_cpu_cache *cache) { cache->interrupts_masked = 1; } static inline void perfctr_cpu_unmask_interrupts(struct per_cpu_cache *cache) { cache->interrupts_masked = 0; } #else /* CONFIG_X86_LOCAL_APIC */ #define perfctr_cstatus_has_ictrs(cstatus) 0 #undef cpu_has_apic #define cpu_has_apic 0 #undef apic_write #define apic_write(reg,vector) do{}while(0) #endif /* CONFIG_X86_LOCAL_APIC */ #if defined(CONFIG_SMP) static inline void set_isuspend_cpu(struct perfctr_cpu_state *state, int cpu) { state->isuspend_cpu = cpu; } static inline int is_isuspend_cpu(const struct perfctr_cpu_state *state, int cpu) { return state->isuspend_cpu == cpu; } static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state) { state->isuspend_cpu = NR_CPUS; } #else /* CONFIG_SMP */ static inline void set_isuspend_cpu(struct perfctr_cpu_state *state, int cpu) { } static inline int is_isuspend_cpu(const struct perfctr_cpu_state *state, int cpu) { return 1; } static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state) { } #endif /* CONFIG_SMP */ /**************************************************************** * * * Driver procedures. * * * ****************************************************************/ /* * Intel P5 family (Pentium, family code 5). * - One TSC and two 40-bit PMCs. * - A single 32-bit CESR (MSR 0x11) controls both PMCs. * CESR has two halves, each controlling one PMC. * - Overflow interrupts are not available. * - Pentium MMX added the RDPMC instruction. RDPMC has lower * overhead than RDMSR and it can be used in user-mode code. * - The MMX events are not symmetric: some events are only available * for some PMC, and some event codes denote different events * depending on which PMCs they control. */ /* shared with MII and C6 */ static int p5_like_check_control(struct perfctr_cpu_state *state, unsigned int reserved_bits, int is_c6) { unsigned short cesr_half[2]; unsigned int pmc, evntsel, i; if (state->control.header.nrictrs != 0 || state->control.header.nractrs > 2) return -EINVAL; cesr_half[0] = 0; cesr_half[1] = 0; for(i = 0; i < state->control.header.nractrs; ++i) { pmc = state->control.pmc_map[i]; if (pmc > 1 || cesr_half[pmc] != 0) return -EINVAL; evntsel = state->control.evntsel[0]; if (pmc == 0) evntsel &= 0xffff; else evntsel >>= 16; /* protect reserved bits */ if ((evntsel & reserved_bits) != 0) return -EPERM; /* the CPL field (if defined) must be non-zero */ if (!is_c6 && !(evntsel & P5_CESR_CPL)) return -EINVAL; cesr_half[pmc] = evntsel; } state->id = (cesr_half[1] << 16) | cesr_half[0]; return 0; } static int p5_check_control(struct perfctr_cpu_state *state, int is_global) { return p5_like_check_control(state, P5_CESR_RESERVED, 0); } /* shared with MII but not C6 */ static void p5_write_control(const struct perfctr_cpu_state *state) { struct per_cpu_cache *cache; unsigned int cesr; cesr = state->id; if (!cesr) /* no PMC is on (this test doesn't work on C6) */ return; cache = get_cpu_cache(); if (cache->id != cesr) { cache->id = cesr; wrmsr(MSR_P5_CESR, cesr, 0); } } static void p5_read_counters(const struct perfctr_cpu_state *state, struct perfctr_low_ctrs *ctrs) { unsigned int cstatus, nrctrs, i; /* The P5 doesn't allocate a cache line on a write miss, so do a dummy read to avoid a write miss here _and_ a read miss later in our caller. 
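   (The empty asm() below is that dummy read: it only forces the compiler
   to load ctrs->tsc into a register, touching the cache line without
   emitting any instructions of its own.)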
*/ asm("" : : "r"(ctrs->tsc)); cstatus = state->user.cstatus; if (perfctr_cstatus_has_tsc(cstatus)) rdtscl(ctrs->tsc); nrctrs = perfctr_cstatus_nractrs(cstatus); for(i = 0; i < nrctrs; ++i) { unsigned int pmc = state->control.pmc_map[i]; rdmsr_low(MSR_P5_CTR0+pmc, ctrs->pmc[i]); } } /* used by all except pre-MMX P5 */ static void rdpmc_read_counters(const struct perfctr_cpu_state *state, struct perfctr_low_ctrs *ctrs) { unsigned int cstatus, nrctrs, i; cstatus = state->user.cstatus; if (perfctr_cstatus_has_tsc(cstatus)) rdtscl(ctrs->tsc); nrctrs = perfctr_cstatus_nractrs(cstatus); for(i = 0; i < nrctrs; ++i) { unsigned int pmc = state->control.pmc_map[i]; rdpmc_low(pmc, ctrs->pmc[i]); } } /* shared with MII and C6 */ static const struct perfctr_msr_range p5_extras[] = { { MSR_P5_CESR, 1+2 }, { 0, 0 }, }; static const struct perfctr_pmu_msrs p5_pmu_msrs = { .extras = p5_extras, }; /* * Cyrix 6x86/MII/III. * - Same MSR assignments as P5 MMX. Has RDPMC and two 48-bit PMCs. * - Event codes and CESR formatting as in the plain P5 subset. * - Many but not all P5 MMX event codes are implemented. * - Cyrix adds a few more event codes. The event code is widened * to 7 bits, and Cyrix puts the high bit in CESR bit 10 * (and CESR bit 26 for PMC1). */ static int mii_check_control(struct perfctr_cpu_state *state, int is_global) { return p5_like_check_control(state, MII_CESR_RESERVED, 0); } /* * Centaur WinChip C6/2/3. * - Same MSR assignments as P5 MMX. Has RDPMC and two 40-bit PMCs. * - CESR is formatted with two halves, like P5. However, there * are no defined control fields for e.g. CPL selection, and * there is no defined method for stopping the counters. * - Only a few event codes are defined. * - The 64-bit TSC is synthesised from the low 32 bits of the * two PMCs, and CESR has to be set up appropriately. * Reprogramming CESR causes RDTSC to yield invalid results. * (The C6 may also hang in this case, due to C6 erratum I-13.) * Therefore, using the PMCs on any of these processors requires * that the TSC is not accessed at all: * 1. The kernel must be configured or a TSC-less processor, i.e. * generic 586 or less. * 2. The "notsc" boot parameter must be passed to the kernel. * 3. User-space libraries and code must also be configured and * compiled for a generic 586 or less. */ #if !defined(CONFIG_X86_TSC) static int c6_check_control(struct perfctr_cpu_state *state, int is_global) { if (state->control.header.tsc_on) return -EINVAL; return p5_like_check_control(state, C6_CESR_RESERVED, 1); } static void c6_write_control(const struct perfctr_cpu_state *state) { struct per_cpu_cache *cache; unsigned int cesr; if (perfctr_cstatus_nractrs(state->user.cstatus) == 0) /* no PMC is on */ return; cache = get_cpu_cache(); cesr = state->id; if (cache->id != cesr) { cache->id = cesr; wrmsr(MSR_P5_CESR, cesr, 0); } } #endif /* !CONFIG_X86_TSC */ /* * Intel P6 family (Pentium Pro, Pentium II, Pentium III, Pentium M, and * Intel Core, including Xeon and Celeron versions of Pentium II and III). * - One TSC and two 40-bit PMCs. * - One 32-bit EVNTSEL MSR for each PMC. * - EVNTSEL0 contains a global enable/disable bit. * That bit is reserved in EVNTSEL1. * - Each EVNTSEL contains a CPL field. * - Overflow interrupts are possible, but requires that the * local APIC is available. Some Mobile P6s have no local APIC. * - The PMCs cannot be initialised with arbitrary values, since * wrmsr fills the high bits by sign-extending from bit 31. * - Most events are symmetric, but a few are not. 
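 * Illustrative example (hypothetical values): counting retired instructions
 * (event code 0xC0) in both user and kernel mode would use an EVNTSEL0 of
 * roughly 0x004300C0, i.e. the event code together with P6_EVNTSEL_CPL and
 * P6_EVNTSEL_ENABLE.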
*/

static int k8_is_multicore;	/* affects northbridge events */
static int p6_is_core2;		/* affects P6_EVNTSEL_ENABLE usage */

/* shared with K7 */
static int p6_like_check_control(struct perfctr_cpu_state *state,
				 int is_k7, int is_global)
{
	unsigned int evntsel, i, nractrs, nrctrs, pmc_mask, pmc;

	nractrs = state->control.header.nractrs;
	nrctrs = nractrs + state->control.header.nrictrs;
	if (nrctrs < nractrs || nrctrs > (is_k7 ? 4 : 2))
		return -EINVAL;

	pmc_mask = 0;
	for(i = 0; i < nrctrs; ++i) {
		pmc = state->control.pmc_map[i];
		if (pmc >= (is_k7 ? 4 : 2) || (pmc_mask & (1<<pmc)))
			return -EINVAL;
		pmc_mask |= (1<<pmc);
		evntsel = state->control.evntsel[pmc];
		/* prevent the K8 multicore NB event clobber erratum */
		if (!is_global && k8_is_multicore && IS_K8_NB_EVENT(evntsel))
			return -EPERM;
		/* protect reserved bits */
		if (evntsel & P6_EVNTSEL_RESERVED)
			return -EPERM;
		/* check ENable bit */
		if (is_k7 || p6_is_core2) {
			/* ENable bit must be set in each evntsel */
			if (!(evntsel & P6_EVNTSEL_ENABLE))
				return -EINVAL;
		} else {
			/* only evntsel[0] has the ENable bit */
			if (evntsel & P6_EVNTSEL_ENABLE) {
				if (pmc > 0)
					return -EPERM;
			} else {
				if (pmc == 0)
					return -EINVAL;
			}
		}
		/* the CPL field must be non-zero */
		if (!(evntsel & P6_EVNTSEL_CPL))
			return -EINVAL;
		/* INT bit must be off for a-mode and on for i-mode counters */
		if (evntsel & P6_EVNTSEL_INT) {
			if (i < nractrs)
				return -EINVAL;
		} else {
			if (i >= nractrs)
				return -EINVAL;
		}
	}
	state->id = new_id();
	return 0;
}

static int p6_check_control(struct perfctr_cpu_state *state, int is_global)
{
	return p6_like_check_control(state, 0, is_global);
}

#ifdef CONFIG_X86_LOCAL_APIC
/* PRE: perfctr_cstatus_has_ictrs(state->cstatus) != 0 */
/* shared with K7 and P4 */
static void p6_like_isuspend(struct perfctr_cpu_state *state,
			     unsigned int msr_evntsel0)
{
	struct per_cpu_cache *cache;
	unsigned int cstatus, nrctrs, i;
	int cpu;
	unsigned int pending = 0;

	cpu = smp_processor_id();
	set_isuspend_cpu(state, cpu); /* early to limit cpu's live range */
	cache = __get_cpu_cache(cpu);
	perfctr_cpu_mask_interrupts(cache);
	cstatus = state->user.cstatus;
	nrctrs = perfctr_cstatus_nrctrs(cstatus);
	for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) {
		unsigned int pmc_raw, pmc_idx, now;
		pmc_raw = state->control.pmc_map[i];
		/* Note: P4_MASK_FAST_RDPMC is a no-op for P6 and K7.
		   We don't need to make it into a parameter. */
		pmc_idx = pmc_raw & P4_MASK_FAST_RDPMC;
		cache->control.evntsel[pmc_idx] = 0;
		/* On P4 this intentionally also clears the CCCR.OVF flag. */
		wrmsr(msr_evntsel0+pmc_idx, 0, 0);
		/* P4 erratum N17 does not apply since we read only low 32 bits. */
		rdpmc_low(pmc_raw, now);
		state->user.pmc[i].sum += now - state->user.pmc[i].start;
		state->user.pmc[i].start = now;
		if ((int)now >= 0)
			++pending;
	}
	state->pending_interrupt = pending;
	/* cache->id is still == state->id */
}

/* PRE: perfctr_cstatus_has_ictrs(state->cstatus) != 0 */
/* shared with K7 and P4 */
static void p6_like_iresume(const struct perfctr_cpu_state *state,
			    unsigned int msr_evntsel0,
			    unsigned int msr_perfctr0)
{
	struct per_cpu_cache *cache;
	unsigned int cstatus, nrctrs, i;
	int cpu;

	cpu = smp_processor_id();
	cache = __get_cpu_cache(cpu);
	perfctr_cpu_unmask_interrupts(cache);
	if (cache->id == state->id) {
		cache->id = 0; /* force reload of cleared EVNTSELs */
		if (is_isuspend_cpu(state, cpu))
			return; /* skip reload of PERFCTRs */
	}
	cstatus = state->user.cstatus;
	nrctrs = perfctr_cstatus_nrctrs(cstatus);
	for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) {
		/* Note: P4_MASK_FAST_RDPMC is a no-op for P6 and K7.
		   We don't need to make it into a parameter.
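		   (It is a no-op here because P6/K7 pmc_map[] entries are
		   plain counter indices in the 0-3 range, so masking with
		   the low five bits leaves them unchanged.)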
*/ unsigned int pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC; /* If the control wasn't ours we must disable the evntsels before reinitialising the counters, to prevent unexpected counter increments and missed overflow interrupts. */ if (cache->control.evntsel[pmc]) { cache->control.evntsel[pmc] = 0; wrmsr(msr_evntsel0+pmc, 0, 0); } /* P4 erratum N15 does not apply since the CCCR is disabled. */ wrmsr(msr_perfctr0+pmc, (unsigned int)state->user.pmc[i].start, -1); } /* cache->id remains != state->id */ } static void p6_isuspend(struct perfctr_cpu_state *state) { p6_like_isuspend(state, MSR_P6_EVNTSEL0); } static void p6_iresume(const struct perfctr_cpu_state *state) { p6_like_iresume(state, MSR_P6_EVNTSEL0, MSR_P6_PERFCTR0); } #endif /* CONFIG_X86_LOCAL_APIC */ /* shared with K7 and VC3 */ static void p6_like_write_control(const struct perfctr_cpu_state *state, unsigned int msr_evntsel0) { struct per_cpu_cache *cache; unsigned int nrctrs, i; cache = get_cpu_cache(); if (cache->id == state->id) return; nrctrs = perfctr_cstatus_nrctrs(state->user.cstatus); for(i = 0; i < nrctrs; ++i) { unsigned int pmc = state->control.pmc_map[i]; unsigned int evntsel = state->control.evntsel[pmc]; if (evntsel != cache->control.evntsel[pmc]) { cache->control.evntsel[pmc] = evntsel; wrmsr(msr_evntsel0+pmc, evntsel, 0); } } cache->id = state->id; } /* shared with VC3, Generic*/ static void p6_write_control(const struct perfctr_cpu_state *state) { p6_like_write_control(state, MSR_P6_EVNTSEL0); } static const struct perfctr_msr_range p6_perfctrs[] = { { MSR_P6_PERFCTR0, 2 }, { 0, 0 }, }; static const struct perfctr_msr_range p6_evntsels[] = { { MSR_P6_EVNTSEL0, 2 }, { 0, 0 }, }; static const struct perfctr_pmu_msrs p6_pmu_msrs = { .perfctrs = p6_perfctrs, .evntsels = p6_evntsels, }; /* * AMD K7 family (Athlon, Duron). * - Somewhat similar to the Intel P6 family. * - Four 48-bit PMCs. * - Four 32-bit EVNTSEL MSRs with similar layout as in P6. * - Completely different MSR assignments :-( * - Fewer countable events defined :-( * - The events appear to be completely symmetric. * - The EVNTSEL MSRs are symmetric since each has its own enable bit. * - Publicly available documentation is incomplete. * - K7 model 1 does not have a local APIC. AMD Document #22007 * Revision J hints that it may use debug interrupts instead. * * The K8 has the same hardware layout as the K7. It also has * better documentation and a different set of available events. */ static int k7_check_control(struct perfctr_cpu_state *state, int is_global) { return p6_like_check_control(state, 1, is_global); } #ifdef CONFIG_X86_LOCAL_APIC static void k7_isuspend(struct perfctr_cpu_state *state) { p6_like_isuspend(state, MSR_K7_EVNTSEL0); } static void k7_iresume(const struct perfctr_cpu_state *state) { p6_like_iresume(state, MSR_K7_EVNTSEL0, MSR_K7_PERFCTR0); } #endif /* CONFIG_X86_LOCAL_APIC */ static void k7_write_control(const struct perfctr_cpu_state *state) { p6_like_write_control(state, MSR_K7_EVNTSEL0); } static const struct perfctr_msr_range k7_perfctrs[] = { { MSR_K7_PERFCTR0, 4 }, { 0, 0 }, }; static const struct perfctr_msr_range k7_evntsels[] = { { MSR_K7_EVNTSEL0, 4 }, { 0, 0 }, }; static const struct perfctr_pmu_msrs k7_pmu_msrs = { .perfctrs = k7_perfctrs, .evntsels = k7_evntsels, }; /* * VIA C3 family. * - A Centaur design somewhat similar to the P6/Celeron. * - PERFCTR0 is an alias for the TSC, and EVNTSEL0 is read-only. * - PERFCTR1 is 32 bits wide. 
* - EVNTSEL1 has no defined control fields, and there is no * defined method for stopping the counter. * - According to testing, the reserved fields in EVNTSEL1 have * no function. We always fill them with zeroes. * - Only a few event codes are defined. * - No local APIC or interrupt-mode support. * - pmc_map[0] must be 1, if nractrs == 1. */ static int vc3_check_control(struct perfctr_cpu_state *state, int is_global) { if (state->control.header.nrictrs || state->control.header.nractrs > 1) return -EINVAL; if (state->control.header.nractrs == 1) { if (state->control.pmc_map[0] != 1) return -EINVAL; if (state->control.evntsel[1] & VC3_EVNTSEL1_RESERVED) return -EPERM; state->id = state->control.evntsel[1]; } else state->id = 0; return 0; } static void vc3_clear_counters(void) { /* Not documented, but seems to be default after boot. */ wrmsr(MSR_P6_EVNTSEL0+1, 0x00070079, 0); } /* * Intel Pentium 4. * Current implementation restrictions: * - No DS/PEBS support. * * Known quirks: * - OVF_PMI+FORCE_OVF counters must have an ireset value of -1. * This allows the regular overflow check to also handle FORCE_OVF * counters. Not having this restriction would lead to MAJOR * complications in the driver's "detect overflow counters" code. * There is no loss of functionality since the ireset value doesn't * affect the counter's PMI rate for FORCE_OVF counters. * - In experiments with FORCE_OVF counters, and regular OVF_PMI * counters with small ireset values between -8 and -1, it appears * that the faulting instruction is subjected to a new PMI before * it can complete, ad infinitum. This occurs even though the driver * clears the CCCR (and in testing also the ESCR) and invokes a * user-space signal handler before restoring the CCCR and resuming * the instruction. */ /* * Table 15-4 in the IA32 Volume 3 manual contains a 18x8 entry mapping * from counter/CCCR number (0-17) and ESCR SELECT value (0-7) to the * actual ESCR MSR number. This mapping contains some repeated patterns, * so we can compact it to a 4x8 table of MSR offsets: * * 1. CCCRs 16 and 17 are mapped just like CCCRs 13 and 14, respectively. * Thus, we only consider the 16 CCCRs 0-15. * 2. The CCCRs are organised in pairs, and both CCCRs in a pair use the * same mapping. Thus, we only consider the 8 pairs 0-7. * 3. In each pair of pairs, the second odd-numbered pair has the same domain * as the first even-numbered pair, and the range is 1+ the range of the * the first even-numbered pair. For example, CCCR(0) and (1) map ESCR * SELECT(7) to 0x3A0, and CCCR(2) and (3) map it to 0x3A1. * The only exception is that pair (7) [CCCRs 14 and 15] does not have * ESCR SELECT(3) in its domain, like pair (6) [CCCRs 12 and 13] has. * NOTE: Revisions of IA32 Volume 3 older than #245472-007 had an error * in this table: CCCRs 12, 13, and 16 had their mappings for ESCR SELECT * values 2 and 3 swapped. * 4. All MSR numbers are on the form 0x3??. Instead of storing these as * 16-bit numbers, the table only stores the 8-bit offsets from 0x300. 
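 * For example: CCCR 5 with ESCR SELECT 2 falls in pair 2, so the table row
 * is [0x04/4], the entry is 0xC2, and the ESCR is MSR 0x3C2; CCCR 6 with the
 * same SELECT falls in the odd pair 3 and therefore gets 0x3C2 + 1 = 0x3C3.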
*/
static const unsigned char p4_cccr_escr_map[4][8] = {
	/* 0x00 and 0x01 as is, 0x02 and 0x03 are +1 */
	[0x00/4] {	[7] 0xA0,
			[6] 0xA2,
			[2] 0xAA,
			[4] 0xAC,
			[0] 0xB2,
			[1] 0xB4,
			[3] 0xB6,
			[5] 0xC8, },
	/* 0x04 and 0x05 as is, 0x06 and 0x07 are +1 */
	[0x04/4] {	[0] 0xC0,
			[2] 0xC2,
			[1] 0xC4, },
	/* 0x08 and 0x09 as is, 0x0A and 0x0B are +1 */
	[0x08/4] {	[1] 0xA4,
			[0] 0xA6,
			[5] 0xA8,
			[2] 0xAE,
			[3] 0xB0, },
	/* 0x0C, 0x0D, and 0x10 as is,
	   0x0E, 0x0F, and 0x11 are +1 except [3] is not in the domain */
	[0x0C/4] {	[4] 0xB8,
			[5] 0xCC,
			[6] 0xE0,
			[0] 0xBA,
			[2] 0xBC,
			[3] 0xBE,
			[1] 0xCA, },
};

static unsigned int p4_escr_addr(unsigned int pmc, unsigned int cccr_val)
{
	unsigned int escr_select, pair, escr_offset;

	escr_select = P4_CCCR_ESCR_SELECT(cccr_val);
	if (pmc > 0x11)
		return 0;	/* pmc range error */
	if (pmc > 0x0F)
		pmc -= 3;	/* 0 <= pmc <= 0x0F */
	pair = pmc / 2;		/* 0 <= pair <= 7 */
	escr_offset = p4_cccr_escr_map[pair / 2][escr_select];
	if (!escr_offset || (pair == 7 && escr_select == 3))
		return 0;	/* ESCR SELECT range error */
	return escr_offset + (pair & 1) + 0x300;
}

static int p4_IQ_ESCR_ok;	/* only models <= 2 can use IQ_ESCR{0,1} */
static int p4_is_ht;		/* affects several CCCR & ESCR fields */
static int p4_extended_cascade_ok;	/* only models >= 2 can use extended cascading */

static int p4_check_control(struct perfctr_cpu_state *state, int is_global)
{
	unsigned int i, nractrs, nrctrs, pmc_mask;

	nractrs = state->control.header.nractrs;
	nrctrs = nractrs + state->control.header.nrictrs;
	if (nrctrs < nractrs || nrctrs > 18)
		return -EINVAL;

	pmc_mask = 0;
	for(i = 0; i < nrctrs; ++i) {
		unsigned int pmc, cccr_val, escr_val, escr_addr;
		/* check that pmc_map[] is well-defined;
		   pmc_map[i] is what we pass to RDPMC, the PMC itself
		   is extracted by masking off the FAST_RDPMC flag */
		pmc = state->control.pmc_map[i] & ~P4_FAST_RDPMC;
		if (pmc >= 18 || (pmc_mask & (1<<pmc)))
			return -EINVAL;
		pmc_mask |= (1<<pmc);
		cccr_val = state->control.evntsel[pmc];
		if (cccr_val & P4_CCCR_RESERVED)
			return -EPERM;
		if (cccr_val & P4_CCCR_EXTENDED_CASCADE) {
			if (!p4_extended_cascade_ok)
				return -EPERM;
			if (!(pmc == 12 || pmc >= 15))
				return -EPERM;
		}
		if ((cccr_val & P4_CCCR_ACTIVE_THREAD) != P4_CCCR_ACTIVE_THREAD && !p4_is_ht)
			return -EINVAL;
		if (!(cccr_val & (P4_CCCR_ENABLE | P4_CCCR_CASCADE | P4_CCCR_EXTENDED_CASCADE)))
			return -EINVAL;
		if (cccr_val & P4_CCCR_OVF_PMI_T0) {
			if (i < nractrs)
				return -EINVAL;
			if ((cccr_val & P4_CCCR_FORCE_OVF) &&
			    state->control.ireset[pmc] != -1)
				return -EINVAL;
		} else {
			if (i >= nractrs)
				return -EINVAL;
		}
		/* compute and cache ESCR address */
		escr_addr = p4_escr_addr(pmc, cccr_val);
		if (!escr_addr)
			return -EINVAL;	/* ESCR SELECT range error */
		/* IQ_ESCR0 and IQ_ESCR1 only exist in models <= 2 */
		if ((escr_addr & ~0x001) == 0x3BA && !p4_IQ_ESCR_ok)
			return -EINVAL;
		/* XXX: Two counters could map to the same ESCR.
		   Should we check that they use the same ESCR value?
*/ state->p4_escr_map[i] = escr_addr - MSR_P4_ESCR0; /* check ESCR contents */ escr_val = state->control.p4.escr[escr_addr - MSR_P4_ESCR0]; if (escr_val & P4_ESCR_RESERVED) return -EPERM; if ((escr_val & P4_ESCR_CPL_T1) && (!p4_is_ht || !is_global)) return -EINVAL; } /* check ReplayTagging control (PEBS_ENABLE and PEBS_MATRIX_VERT) */ if (state->control.p4.pebs_enable) { if (!nrctrs) return -EPERM; if (state->control.p4.pebs_enable & P4_PE_RESERVED) return -EPERM; if (!(state->control.p4.pebs_enable & P4_PE_UOP_TAG)) return -EINVAL; if (!(state->control.p4.pebs_enable & P4_PE_REPLAY_TAG_BITS)) return -EINVAL; if (state->control.p4.pebs_matrix_vert & P4_PMV_RESERVED) return -EPERM; if (!(state->control.p4.pebs_matrix_vert & P4_PMV_REPLAY_TAG_BITS)) return -EINVAL; } else if (state->control.p4.pebs_matrix_vert) return -EPERM; state->id = new_id(); return 0; } #ifdef CONFIG_X86_LOCAL_APIC static void p4_isuspend(struct perfctr_cpu_state *state) { return p6_like_isuspend(state, MSR_P4_CCCR0); } static void p4_iresume(const struct perfctr_cpu_state *state) { return p6_like_iresume(state, MSR_P4_CCCR0, MSR_P4_PERFCTR0); } #endif /* CONFIG_X86_LOCAL_APIC */ static void p4_write_control(const struct perfctr_cpu_state *state) { struct per_cpu_cache *cache; unsigned int nrctrs, i; cache = get_cpu_cache(); if (cache->id == state->id) return; nrctrs = perfctr_cstatus_nrctrs(state->user.cstatus); for(i = 0; i < nrctrs; ++i) { unsigned int escr_val, escr_off, cccr_val, pmc; escr_off = state->p4_escr_map[i]; escr_val = state->control.p4.escr[escr_off]; if (escr_val != cache->control.escr[escr_off]) { cache->control.escr[escr_off] = escr_val; wrmsr(MSR_P4_ESCR0+escr_off, escr_val, 0); } pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC; cccr_val = state->control.evntsel[pmc]; if (cccr_val != cache->control.evntsel[pmc]) { cache->control.evntsel[pmc] = cccr_val; wrmsr(MSR_P4_CCCR0+pmc, cccr_val, 0); } } if (state->control.p4.pebs_enable != cache->control.pebs_enable) { cache->control.pebs_enable = state->control.p4.pebs_enable; wrmsr(MSR_P4_PEBS_ENABLE, state->control.p4.pebs_enable, 0); } if (state->control.p4.pebs_matrix_vert != cache->control.pebs_matrix_vert) { cache->control.pebs_matrix_vert = state->control.p4.pebs_matrix_vert; wrmsr(MSR_P4_PEBS_MATRIX_VERT, state->control.p4.pebs_matrix_vert, 0); } cache->id = state->id; } static const struct perfctr_msr_range p4_perfctrs[] = { { MSR_P4_PERFCTR0, 18 }, { 0, 0 }, }; static const struct perfctr_msr_range p4_evntsels[] = { { 0x3BA, 2 }, /* IQ_ESCR{0,1}: only models <= 2 have them */ { 0x3A0, 26 }, { 0x3BC, 3 }, { 0x3C0, 6 }, { 0x3C8, 6 }, { 0x3E0, 2 }, { 0, 0 }, }; static const struct perfctr_msr_range p4_extras[] = { /* MSR 0x3F0 seems to have a default value of 0xFC00, but current docs doesn't fully define it, so leave it alone for now. */ /* PEBS_ENABLE and PEBS_MATRIX_VERT handle both PEBS and ReplayTagging, and should exist even if PEBS is disabled */ { 0x3F1, 2 }, { MSR_P4_CCCR0, 18 }, { 0, 0 }, }; static const struct perfctr_pmu_msrs p4_pmu_msrs_models_0to2 = { .perfctrs = p4_perfctrs, .evntsels = p4_evntsels, .extras = p4_extras, }; static const struct perfctr_pmu_msrs p4_pmu_msrs_models_3up = { .perfctrs = p4_perfctrs, .evntsels = p4_evntsels+1, .extras = p4_extras, }; /* * Generic driver for any x86 with a working TSC. 
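 * Only TSC sampling is supported: generic_check_control() below rejects any
 * configuration that asks for a-mode or i-mode counters, and generic_init()
 * turns off the RDPMC feature flag.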
*/ static int generic_check_control(struct perfctr_cpu_state *state, int is_global) { if (state->control.header.nractrs || state->control.header.nrictrs) return -EINVAL; return 0; } /* * Driver methods, internal and exported. * * Frequently called functions (write_control, read_counters, * isuspend and iresume) are back-patched to invoke the correct * processor-specific methods directly, thereby saving the * overheads of indirect function calls. * * Backpatchable call sites must have been "finalised" after * initialisation. The reason for this is that unsynchronised code * modification doesn't work in multiprocessor systems, due to * Intel P6 errata. Consequently, all backpatchable call sites * must be known and local to this file. * * Backpatchable calls must initially be to 'noinline' stubs. * Otherwise the compiler may inline the stubs, which breaks * redirect_call() and finalise_backpatching(). */ static int redirect_call_disable; static noinline void redirect_call(void *ra, void *to) { /* XXX: make this function __init later */ if (redirect_call_disable) printk(KERN_ERR __FILE__ ":%s: unresolved call to %p at %p\n", __FUNCTION__, to, ra); /* we can only redirect `call near relative' instructions */ if (*((unsigned char*)ra - 5) != 0xE8) { printk(KERN_WARNING __FILE__ ":%s: unable to redirect caller %p to %p\n", __FUNCTION__, ra, to); return; } *(int*)((char*)ra - 4) = (char*)to - (char*)ra; } static void (*write_control)(const struct perfctr_cpu_state*); static noinline void perfctr_cpu_write_control(const struct perfctr_cpu_state *state) { redirect_call(__builtin_return_address(0), write_control); return write_control(state); } static void (*read_counters)(const struct perfctr_cpu_state*, struct perfctr_low_ctrs*); static noinline void perfctr_cpu_read_counters(const struct perfctr_cpu_state *state, struct perfctr_low_ctrs *ctrs) { redirect_call(__builtin_return_address(0), read_counters); return read_counters(state, ctrs); } #ifdef CONFIG_X86_LOCAL_APIC static void (*cpu_isuspend)(struct perfctr_cpu_state*); static noinline void perfctr_cpu_isuspend(struct perfctr_cpu_state *state) { redirect_call(__builtin_return_address(0), cpu_isuspend); return cpu_isuspend(state); } static void (*cpu_iresume)(const struct perfctr_cpu_state*); static noinline void perfctr_cpu_iresume(const struct perfctr_cpu_state *state) { redirect_call(__builtin_return_address(0), cpu_iresume); return cpu_iresume(state); } /* Call perfctr_cpu_ireload() just before perfctr_cpu_resume() to bypass internal caching and force a reload if the I-mode PMCs. */ void perfctr_cpu_ireload(struct perfctr_cpu_state *state) { #ifdef CONFIG_SMP clear_isuspend_cpu(state); #else get_cpu_cache()->id = 0; #endif } /* PRE: the counters have been suspended and sampled by perfctr_cpu_suspend() */ static int lvtpc_reinit_needed; unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state *state) { unsigned int cstatus, nrctrs, i, pmc_mask; cstatus = state->user.cstatus; nrctrs = perfctr_cstatus_nrctrs(cstatus); state->pending_interrupt = 0; pmc_mask = 0; for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { if ((int)state->user.pmc[i].start >= 0) { /* XXX: ">" ? */ unsigned int pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC; /* XXX: "+=" to correct for overshots */ state->user.pmc[i].start = state->control.ireset[pmc]; pmc_mask |= (1 << i); /* On a P4 we should now clear the OVF flag in the counter's CCCR. 
However, p4_isuspend() already did that as a side-effect of clearing the CCCR in order to stop the i-mode counters. */ } } if (lvtpc_reinit_needed) apic_write(APIC_LVTPC, LOCAL_PERFCTR_VECTOR); return pmc_mask; } static inline int check_ireset(struct perfctr_cpu_state *state) { unsigned int nrctrs, i; i = state->control.header.nractrs; nrctrs = i + state->control.header.nrictrs; for(; i < nrctrs; ++i) { unsigned int pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC; if ((int)state->control.ireset[pmc] >= 0) return -EINVAL; state->user.pmc[i].start = state->control.ireset[pmc]; } return 0; } #else /* CONFIG_X86_LOCAL_APIC */ static inline void perfctr_cpu_isuspend(struct perfctr_cpu_state *state) { } static inline void perfctr_cpu_iresume(const struct perfctr_cpu_state *state) { } static inline int check_ireset(struct perfctr_cpu_state *state) { return 0; } #endif /* CONFIG_X86_LOCAL_APIC */ static int (*check_control)(struct perfctr_cpu_state*, int); int perfctr_cpu_update_control(struct perfctr_cpu_state *state, int is_global) { int err; clear_isuspend_cpu(state); state->user.cstatus = 0; /* disallow i-mode counters if we cannot catch the interrupts */ if (!(perfctr_info.cpu_features & PERFCTR_FEATURE_PCINT) && state->control.header.nrictrs) return -EPERM; err = check_control(state, is_global); if (err < 0) return err; err = check_ireset(state); if (err < 0) return err; state->user.cstatus = perfctr_mk_cstatus(state->control.header.tsc_on, state->control.header.nractrs, state->control.header.nrictrs); return 0; } /* * get_reg_offset() maps MSR numbers to offsets into struct perfctr_cpu_control, * suitable for accessing control data of type unsigned int. */ static int p5_reg_offset(unsigned int msr) { if (msr == MSR_P5_CESR) return offsetof(struct perfctr_cpu_control, evntsel[0]); return -1; } static int p6_reg_offset(unsigned int msr) { if (msr - MSR_P6_EVNTSEL0 < 2) return offsetof(struct perfctr_cpu_control, evntsel[msr - MSR_P6_EVNTSEL0]); if (msr - MSR_P6_PERFCTR0 < 2) return offsetof(struct perfctr_cpu_control, ireset[msr - MSR_P6_PERFCTR0]); return -1; } static int k7_reg_offset(unsigned int msr) { if (msr - MSR_K7_EVNTSEL0 < 4) return offsetof(struct perfctr_cpu_control, evntsel[msr - MSR_K7_EVNTSEL0]); if (msr - MSR_K7_PERFCTR0 < 4) return offsetof(struct perfctr_cpu_control, ireset[msr - MSR_K7_PERFCTR0]); return -1; } static int p4_reg_offset(unsigned int msr) { if (msr - MSR_P4_CCCR0 < 18) return offsetof(struct perfctr_cpu_control, evntsel[msr - MSR_P4_CCCR0]); if (msr - MSR_P4_PERFCTR0 < 18) return offsetof(struct perfctr_cpu_control, ireset[msr - MSR_P4_PERFCTR0]); if (msr - MSR_P4_ESCR0 < 0x3E2 - 0x3A0) return offsetof(struct perfctr_cpu_control, p4.escr[msr - MSR_P4_ESCR0]); if (msr == MSR_P4_PEBS_ENABLE) return offsetof(struct perfctr_cpu_control, p4.pebs_enable); if (msr == MSR_P4_PEBS_MATRIX_VERT) return offsetof(struct perfctr_cpu_control, p4.pebs_matrix_vert); return -1; } static int generic_reg_offset(unsigned int msr) { return -1; } static int (*get_reg_offset)(unsigned int); static int access_regs(struct perfctr_cpu_control *control, void *argp, unsigned int argbytes, int do_write) { struct perfctr_cpu_reg *regs; unsigned int i, nr_regs, *where; int offset; nr_regs = argbytes / sizeof(struct perfctr_cpu_reg); if (nr_regs * sizeof(struct perfctr_cpu_reg) != argbytes) return -EINVAL; regs = (struct perfctr_cpu_reg*)argp; for(i = 0; i < nr_regs; ++i) { offset = get_reg_offset(regs[i].nr); if (offset < 0) return -EINVAL; where = (unsigned int*)((char*)control + 
offset); if (do_write) *where = regs[i].value; else regs[i].value = *where; } return argbytes; } int perfctr_cpu_control_write(struct perfctr_cpu_control *control, unsigned int domain, const void *srcp, unsigned int srcbytes) { if (domain != PERFCTR_DOMAIN_CPU_REGS) return -EINVAL; return access_regs(control, (void*)srcp, srcbytes, 1); } int perfctr_cpu_control_read(const struct perfctr_cpu_control *control, unsigned int domain, void *dstp, unsigned int dstbytes) { if (domain != PERFCTR_DOMAIN_CPU_REGS) return -EINVAL; return access_regs((struct perfctr_cpu_control*)control, dstp, dstbytes, 0); } void perfctr_cpu_suspend(struct perfctr_cpu_state *state) { unsigned int i, cstatus, nractrs; struct perfctr_low_ctrs now; write_perfseq_begin(&state->user.sequence); if (perfctr_cstatus_has_ictrs(state->user.cstatus)) perfctr_cpu_isuspend(state); perfctr_cpu_read_counters(state, &now); cstatus = state->user.cstatus; if (perfctr_cstatus_has_tsc(cstatus)) state->user.tsc_sum += now.tsc - state->user.tsc_start; nractrs = perfctr_cstatus_nractrs(cstatus); for(i = 0; i < nractrs; ++i) state->user.pmc[i].sum += now.pmc[i] - state->user.pmc[i].start; /* perfctr_cpu_disable_rdpmc(); */ /* not for x86 */ write_perfseq_end(&state->user.sequence); } void perfctr_cpu_resume(struct perfctr_cpu_state *state) { write_perfseq_begin(&state->user.sequence); if (perfctr_cstatus_has_ictrs(state->user.cstatus)) perfctr_cpu_iresume(state); /* perfctr_cpu_enable_rdpmc(); */ /* not for x86 or global-mode */ perfctr_cpu_write_control(state); //perfctr_cpu_read_counters(state, &state->start); { struct perfctr_low_ctrs now; unsigned int i, cstatus, nrctrs; perfctr_cpu_read_counters(state, &now); cstatus = state->user.cstatus; if (perfctr_cstatus_has_tsc(cstatus)) state->user.tsc_start = now.tsc; nrctrs = perfctr_cstatus_nractrs(cstatus); for(i = 0; i < nrctrs; ++i) state->user.pmc[i].start = now.pmc[i]; } write_perfseq_end(&state->user.sequence); } void perfctr_cpu_sample(struct perfctr_cpu_state *state) { unsigned int i, cstatus, nractrs; struct perfctr_low_ctrs now; write_perfseq_begin(&state->user.sequence); perfctr_cpu_read_counters(state, &now); cstatus = state->user.cstatus; if (perfctr_cstatus_has_tsc(cstatus)) { state->user.tsc_sum += now.tsc - state->user.tsc_start; state->user.tsc_start = now.tsc; } nractrs = perfctr_cstatus_nractrs(cstatus); for(i = 0; i < nractrs; ++i) { state->user.pmc[i].sum += now.pmc[i] - state->user.pmc[i].start; state->user.pmc[i].start = now.pmc[i]; } write_perfseq_end(&state->user.sequence); } static void (*clear_counters)(void); /* VIA C3 needs non-standard initialisation */ static const struct perfctr_pmu_msrs *pmu_msrs; static void perfctr_cpu_clear_counters(void) { const struct perfctr_pmu_msrs *pmu; const struct perfctr_msr_range *msrs; int i; if (clear_counters) { clear_counters(); return; } pmu = pmu_msrs; if (!pmu) return; /* The order below is significant: evntsels must be cleared before the perfctrs. */ msrs = pmu->evntsels; if (msrs) for(i = 0; msrs[i].first_msr; ++i) clear_msr_range(msrs[i].first_msr, msrs[i].nr_msrs); msrs = pmu->extras; if (msrs) for(i = 0; msrs[i].first_msr; ++i) clear_msr_range(msrs[i].first_msr, msrs[i].nr_msrs); msrs = pmu->perfctrs; if (msrs) for(i = 0; msrs[i].first_msr; ++i) clear_msr_range(msrs[i].first_msr, msrs[i].nr_msrs); } /**************************************************************** * * * Processor detection and initialisation procedures. 
* * * ****************************************************************/ static inline void clear_perfctr_cpus_forbidden_mask(void) { #if !defined(perfctr_cpus_forbidden_mask) cpus_clear(perfctr_cpus_forbidden_mask); #endif } static inline void set_perfctr_cpus_forbidden_mask(cpumask_t mask) { #if !defined(perfctr_cpus_forbidden_mask) perfctr_cpus_forbidden_mask = mask; #endif } /* see comment above at redirect_call() */ static void __init finalise_backpatching(void) { struct per_cpu_cache *cache; struct perfctr_cpu_state state; cpumask_t old_mask; old_mask = perfctr_cpus_forbidden_mask; clear_perfctr_cpus_forbidden_mask(); cache = get_cpu_cache(); memset(cache, 0, sizeof *cache); memset(&state, 0, sizeof state); if (perfctr_info.cpu_features & PERFCTR_FEATURE_PCINT) { state.user.cstatus = __perfctr_mk_cstatus(0, 1, 0, 0); perfctr_cpu_sample(&state); perfctr_cpu_resume(&state); perfctr_cpu_suspend(&state); } state.user.cstatus = 0; perfctr_cpu_sample(&state); perfctr_cpu_resume(&state); perfctr_cpu_suspend(&state); set_perfctr_cpus_forbidden_mask(old_mask); redirect_call_disable = 1; } #ifdef CONFIG_SMP cpumask_t perfctr_cpus_forbidden_mask; static inline unsigned int find_mask(unsigned int nrvals) { unsigned int tmp = nrvals; unsigned int index_msb = 31; if (!tmp) return 0; while (!(tmp & (1<<31))) { tmp <<= 1; --index_msb; } if (nrvals & (nrvals - 1)) ++index_msb; return ~(~0 << index_msb); } static void __init p4_ht_mask_setup_cpu(void *forbidden) { int cpu = smp_processor_id(); unsigned int cpuid_maxlev; unsigned int cpuid1_ebx, cpuid1_edx, cpuid4_eax; unsigned int initial_APIC_ID; unsigned int max_cores_per_package; unsigned int max_lp_per_package; unsigned int max_lp_per_core; unsigned int smt_id; /* * The following big chunk of code detects the current logical processor's * SMT ID (thread number). This is quite complicated, see AP-485 and Volume 3 * of Intel's IA-32 Manual (especially section 7.10) for details. */ /* Ensure that CPUID reports all levels. */ if (cpu_data[cpu].x86_model == 3) { /* >= 3? */ unsigned int low, high; rdmsr(MSR_IA32_MISC_ENABLE, low, high); if (low & (1<<22)) { /* LIMIT_CPUID_MAXVAL */ low &= ~(1<<22); wrmsr(MSR_IA32_MISC_ENABLE, low, high); printk(KERN_INFO "perfctr/x86.c: CPU %d: removed CPUID level limitation\n", cpu); } } /* Find the highest standard CPUID level. */ cpuid_maxlev = cpuid_eax(0); if (cpuid_maxlev < 1) { printk(KERN_INFO "perfctr/x86: CPU %d: impossibly low # of CPUID levels: %u\n", cpu, cpuid_maxlev); return; } cpuid1_ebx = cpuid_ebx(1); cpuid1_edx = cpuid_edx(1); /* Find the initial (HW-assigned) APIC ID of this logical processor. */ initial_APIC_ID = cpuid1_ebx >> 24; /* Find the max number of logical processors per physical processor package. */ if (cpuid1_edx & (1 << 28)) /* HT is supported */ max_lp_per_package = (cpuid1_ebx >> 16) & 0xFF; else /* HT is not supported */ max_lp_per_package = 1; /* Find the max number of processor cores per physical processor package. */ if (cpuid_maxlev >= 4) { /* For CPUID level 4 we need a zero in ecx as input to CPUID, but cpuid_eax() doesn't do that. So we resort to using cpuid_count() with reference parameters and dummy outputs. 
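	   (CPUID leaf 4 with ECX = 0 reports the maximum number of processor
	   cores per physical package in EAX bits 31:26, encoded as the count
	   minus one; hence the + 1 applied to cpuid4_eax >> 26 below.)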
*/ unsigned int dummy; cpuid_count(4, 0, &cpuid4_eax, &dummy, &dummy, &dummy); max_cores_per_package = (cpuid4_eax >> 26) + 1; } else { cpuid4_eax = 0; max_cores_per_package = 1; } max_lp_per_core = max_lp_per_package / max_cores_per_package; smt_id = initial_APIC_ID & find_mask(max_lp_per_core); printk(KERN_INFO "perfctr/x86.c: CPU %d: cpuid_ebx(1) 0x%08x, cpuid_edx(1) 0x%08x, cpuid_eax(4) 0x%08x, cpuid_maxlev %u, max_cores_per_package %u, SMT_ID %u\n", cpu, cpuid1_ebx, cpuid1_edx, cpuid4_eax, cpuid_maxlev, max_cores_per_package, smt_id); /* * Now (finally!) check the SMT ID. The CPU numbers for non-zero SMT ID * threads are recorded in the forbidden set, to allow performance counter * hardware resource conflicts between sibling threads to be prevented. */ if (smt_id != 0) /* We rely on cpu_set() being atomic! */ cpu_set(cpu, *(cpumask_t*)forbidden); } static int __init p4_ht_smp_init(void) { cpumask_t forbidden; unsigned int cpu; cpus_clear(forbidden); smp_call_function(p4_ht_mask_setup_cpu, &forbidden, 1, 1); p4_ht_mask_setup_cpu(&forbidden); if (cpus_empty(forbidden)) return 0; perfctr_cpus_forbidden_mask = forbidden; printk(KERN_INFO "perfctr/x86.c: hyper-threaded P4s detected:" " restricting access for CPUs"); for(cpu = 0; cpu < NR_CPUS; ++cpu) if (cpu_isset(cpu, forbidden)) printk(" %u", cpu); printk("\n"); return 0; } #else /* SMP */ #define p4_ht_smp_init() (0) #endif /* SMP */ static int __init p4_ht_init(void) { unsigned int nr_siblings; if (!cpu_has_ht) return 0; nr_siblings = (cpuid_ebx(1) >> 16) & 0xFF; if (nr_siblings < 2) return 0; p4_is_ht = 1; /* needed even in a UP kernel */ return p4_ht_smp_init(); } static int __init intel_init(void) { static char p5_name[] __initdata = "Intel P5"; static char p6_name[] __initdata = "Intel P6"; static char p4_name[] __initdata = "Intel P4"; unsigned int misc_enable; if (!cpu_has_tsc) return -ENODEV; switch (current_cpu_data.x86) { case 5: if (cpu_has_mmx) { read_counters = rdpmc_read_counters; /* Avoid Pentium Erratum 74. */ if (current_cpu_data.x86_model == 4 && (current_cpu_data.x86_mask == 4 || (current_cpu_data.x86_mask == 3 && ((cpuid_eax(1) >> 12) & 0x3) == 1))) perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDPMC; } else { perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDPMC; read_counters = p5_read_counters; } perfctr_set_tests_type(PTT_P5); perfctr_cpu_name = p5_name; write_control = p5_write_control; check_control = p5_check_control; pmu_msrs = &p5_pmu_msrs; get_reg_offset = p5_reg_offset; return 0; case 6: /* Check MSR_IA32_MISC_ENABLE_PERF_AVAIL on relevant models. */ if (current_cpu_data.x86_model == 9 || /* Pentium M */ current_cpu_data.x86_model == 13 || /* Pentium M */ current_cpu_data.x86_model == 14 || /* Intel Core */ current_cpu_data.x86_model == 15) { /* Intel Core 2 */ rdmsr_low(MSR_IA32_MISC_ENABLE, misc_enable); if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL)) break; } if (current_cpu_data.x86_model == 15) { /* Intel Core 2 */ p6_is_core2 = 1; } else if (current_cpu_data.x86_model < 3) { /* Pentium Pro */ /* Avoid Pentium Pro Erratum 26. 
*/ if (current_cpu_data.x86_mask < 9) perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDPMC; } perfctr_set_tests_type(PTT_P6); perfctr_cpu_name = p6_name; read_counters = rdpmc_read_counters; write_control = p6_write_control; check_control = p6_check_control; pmu_msrs = &p6_pmu_msrs; get_reg_offset = p6_reg_offset; #ifdef CONFIG_X86_LOCAL_APIC if (cpu_has_apic) { perfctr_info.cpu_features |= PERFCTR_FEATURE_PCINT; cpu_isuspend = p6_isuspend; cpu_iresume = p6_iresume; /* P-M apparently inherited P4's LVTPC auto-masking :-( */ if (current_cpu_data.x86_model == 9 || current_cpu_data.x86_model == 13 || current_cpu_data.x86_model == 14 || current_cpu_data.x86_model == 15) lvtpc_reinit_needed = 1; } #endif return 0; case 15: /* Pentium 4 */ rdmsr_low(MSR_IA32_MISC_ENABLE, misc_enable); if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL)) break; if (p4_ht_init() != 0) break; if (current_cpu_data.x86_model <= 2) p4_IQ_ESCR_ok = 1; if (current_cpu_data.x86_model >= 2) p4_extended_cascade_ok = 1; perfctr_set_tests_type(PTT_P4); perfctr_cpu_name = p4_name; read_counters = rdpmc_read_counters; write_control = p4_write_control; check_control = p4_check_control; if (current_cpu_data.x86_model <= 2) pmu_msrs = &p4_pmu_msrs_models_0to2; else pmu_msrs = &p4_pmu_msrs_models_3up; get_reg_offset = p4_reg_offset; #ifdef CONFIG_X86_LOCAL_APIC if (cpu_has_apic) { perfctr_info.cpu_features |= PERFCTR_FEATURE_PCINT; cpu_isuspend = p4_isuspend; cpu_iresume = p4_iresume; lvtpc_reinit_needed = 1; } #endif return 0; } return -ENODEV; } /* * Multicore K8s have issues with northbridge events: * 1. The NB is shared between the cores, so two different cores * in the same node cannot count NB events simultaneously. * This can be handled by using perfctr_cpus_forbidden_mask to * restrict NB-using threads to core0 of all nodes. * 2. The initial multicore chips (Revision E) have an erratum * which causes the NB counters to be reset when either core * reprograms its evntsels (even for non-NB events). * This is only an issue because of scheduling of threads, so * we restrict NB events to the non thread-centric API. * * For now we only implement the workaround for issue 2, as this * also handles issue 1. * * TODO: Detect post Revision E chips and implement a weaker * workaround for them. */ #ifdef CONFIG_SMP static void __init k8_multicore_init(void) { cpumask_t non0cores; int i; cpus_clear(non0cores); for(i = 0; i < NR_CPUS; ++i) { cpumask_t cores = cpu_core_map[i]; int core0 = first_cpu(cores); if (core0 >= NR_CPUS) continue; cpu_clear(core0, cores); cpus_or(non0cores, non0cores, cores); } if (cpus_empty(non0cores)) return; k8_is_multicore = 1; printk(KERN_INFO "perfctr/x86.c: multi-core K8s detected:" " restricting access to northbridge events\n"); } #else /* CONFIG_SMP */ #define k8_multicore_init() do{}while(0) #endif /* CONFIG_SMP */ static int __init amd_init(void) { static char amd_name[] __initdata = "AMD K7/K8"; if (!cpu_has_tsc) return -ENODEV; switch (current_cpu_data.x86) { case 6: /* K7 */ break; case 15: /* K8. Like a K7 with a different event set. 
*/ k8_multicore_init(); break; default: return -ENODEV; } perfctr_set_tests_type(PTT_AMD); perfctr_cpu_name = amd_name; read_counters = rdpmc_read_counters; write_control = k7_write_control; check_control = k7_check_control; pmu_msrs = &k7_pmu_msrs; get_reg_offset = k7_reg_offset; #ifdef CONFIG_X86_LOCAL_APIC if (cpu_has_apic) { perfctr_info.cpu_features |= PERFCTR_FEATURE_PCINT; cpu_isuspend = k7_isuspend; cpu_iresume = k7_iresume; } #endif return 0; } static int __init cyrix_init(void) { static char mii_name[] __initdata = "Cyrix 6x86MX/MII/III"; if (!cpu_has_tsc) return -ENODEV; switch (current_cpu_data.x86) { case 6: /* 6x86MX, MII, or III */ perfctr_set_tests_type(PTT_P5); perfctr_cpu_name = mii_name; read_counters = rdpmc_read_counters; write_control = p5_write_control; check_control = mii_check_control; pmu_msrs = &p5_pmu_msrs; get_reg_offset = p5_reg_offset; return 0; } return -ENODEV; } static int __init centaur_init(void) { #if !defined(CONFIG_X86_TSC) static char winchip_name[] __initdata = "WinChip C6/2/3"; #endif static char vc3_name[] __initdata = "VIA C3"; switch (current_cpu_data.x86) { #if !defined(CONFIG_X86_TSC) case 5: switch (current_cpu_data.x86_model) { case 4: /* WinChip C6 */ case 8: /* WinChip 2, 2A, or 2B */ case 9: /* WinChip 3, a 2A with larger cache and lower voltage */ break; default: return -ENODEV; } perfctr_set_tests_type(PTT_WINCHIP); perfctr_cpu_name = winchip_name; /* * TSC must be inaccessible for perfctrs to work. */ if (!(read_cr4() & X86_CR4_TSD) || cpu_has_tsc) return -ENODEV; perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDTSC; read_counters = rdpmc_read_counters; write_control = c6_write_control; check_control = c6_check_control; pmu_msrs = &p5_pmu_msrs; get_reg_offset = p5_reg_offset; return 0; #endif /* !CONFIG_X86_TSC */ case 6: /* VIA C3 */ if (!cpu_has_tsc) return -ENODEV; switch (current_cpu_data.x86_model) { case 6: /* Cyrix III */ case 7: /* Samuel 2, Ezra (steppings >= 8) */ case 8: /* Ezra-T */ case 9: /* Antaur/Nehemiah */ break; default: return -ENODEV; } perfctr_set_tests_type(PTT_VC3); perfctr_cpu_name = vc3_name; read_counters = rdpmc_read_counters; write_control = p6_write_control; check_control = vc3_check_control; clear_counters = vc3_clear_counters; pmu_msrs = NULL; get_reg_offset = p6_reg_offset; return 0; } return -ENODEV; } static int __init generic_init(void) { static char generic_name[] __initdata = "Generic x86 with TSC"; if (!cpu_has_tsc) return -ENODEV; perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDPMC; perfctr_set_tests_type(PTT_GENERIC); perfctr_cpu_name = generic_name; check_control = generic_check_control; write_control = p6_write_control; read_counters = rdpmc_read_counters; pmu_msrs = NULL; get_reg_offset = generic_reg_offset; return 0; } static void perfctr_cpu_invalidate_cache(void) { /* * per_cpu_cache[] is initialised to contain "impossible" * evntsel values guaranteed to differ from anything accepted * by perfctr_cpu_update_control(). * All-bits-one works for all currently supported processors. * The memset also sets the ids to -1, which is intentional. */ memset(get_cpu_cache(), ~0, sizeof(struct per_cpu_cache)); } static void perfctr_cpu_init_one(void *ignore) { /* PREEMPT note: when called via smp_call_function(), this is in IRQ context with preemption disabled. 
*/ perfctr_cpu_clear_counters(); perfctr_cpu_invalidate_cache(); if (cpu_has_apic) apic_write(APIC_LVTPC, LOCAL_PERFCTR_VECTOR); if (perfctr_info.cpu_features & PERFCTR_FEATURE_RDPMC) set_in_cr4_local(X86_CR4_PCE); } static void perfctr_cpu_exit_one(void *ignore) { /* PREEMPT note: when called via smp_call_function(), this is in IRQ context with preemption disabled. */ perfctr_cpu_clear_counters(); perfctr_cpu_invalidate_cache(); if (cpu_has_apic) apic_write(APIC_LVTPC, APIC_DM_NMI | APIC_LVT_MASKED); if (perfctr_info.cpu_features & PERFCTR_FEATURE_RDPMC) clear_in_cr4_local(X86_CR4_PCE); } #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PM) static void perfctr_pm_suspend(void) { /* XXX: clear control registers */ printk("perfctr/x86: PM suspend\n"); } static void perfctr_pm_resume(void) { /* XXX: reload control registers */ printk("perfctr/x86: PM resume\n"); } #include static int perfctr_device_suspend(struct sys_device *dev, pm_message_t state) { perfctr_pm_suspend(); return 0; } static int perfctr_device_resume(struct sys_device *dev) { perfctr_pm_resume(); return 0; } static struct sysdev_class perfctr_sysclass = { set_kset_name("perfctr"), .resume = perfctr_device_resume, .suspend = perfctr_device_suspend, }; static struct sys_device device_perfctr = { .id = 0, .cls = &perfctr_sysclass, }; static void x86_pm_init(void) { if (sysdev_class_register(&perfctr_sysclass) == 0) sysdev_register(&device_perfctr); } static void x86_pm_exit(void) { sysdev_unregister(&device_perfctr); sysdev_class_unregister(&perfctr_sysclass); } #else /* CONFIG_X86_LOCAL_APIC && CONFIG_PM */ static inline void x86_pm_init(void) { } static inline void x86_pm_exit(void) { } #endif /* CONFIG_X86_LOCAL_APIC && CONFIG_PM */ #ifdef CONFIG_X86_LOCAL_APIC #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19) static void perfctr_release_perfctr_range(unsigned int first_msr, unsigned int nr_msrs) { unsigned int i; for(i = 0; i < nr_msrs; ++i) release_perfctr_nmi(first_msr + i); } static int perfctr_reserve_perfctr_range(unsigned int first_msr, unsigned int nr_msrs) { unsigned int i; for(i = 0; i < nr_msrs; ++i) if (!reserve_perfctr_nmi(first_msr + i)) { printk(KERN_ERR "perfctr/x86.c: failed to reserve perfctr MSR %#x\n", first_msr + i); perfctr_release_perfctr_range(first_msr, i); return -1; } return 0; } static void perfctr_release_evntsel_range(unsigned int first_msr, unsigned int nr_msrs) { unsigned int i; for(i = 0; i < nr_msrs; ++i) release_evntsel_nmi(first_msr + i); } static int perfctr_reserve_evntsel_range(unsigned int first_msr, unsigned int nr_msrs) { unsigned int i; for(i = 0; i < nr_msrs; ++i) if (!reserve_evntsel_nmi(first_msr + i)) { printk(KERN_ERR "perfctr/x86.c: failed to reserve evntsel MSR %#x\n", first_msr + i); perfctr_release_evntsel_range(first_msr, i); return -1; } return 0; } static void perfctr_release_counters_cpu(void *ignore) { const struct perfctr_pmu_msrs *pmu; const struct perfctr_msr_range *msrs; int i; pmu = pmu_msrs; if (!pmu) return; msrs = pmu->perfctrs; if (msrs) for(i = 0; msrs[i].first_msr; ++i) perfctr_release_perfctr_range(msrs[i].first_msr, msrs[i].nr_msrs); msrs = pmu->evntsels; if (msrs) for(i = 0; msrs[i].first_msr; ++i) perfctr_release_evntsel_range(msrs[i].first_msr, msrs[i].nr_msrs); } static void perfctr_release_counters(void) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21) perfctr_release_counters_cpu(NULL); #else on_each_cpu(perfctr_release_counters_cpu, NULL, 1, 1); #endif } static void perfctr_reserve_counters_cpu(void *error) { const struct perfctr_pmu_msrs *pmu; 
const struct perfctr_msr_range *msrs; int i; pmu = pmu_msrs; if (!pmu) return; msrs = pmu->perfctrs; if (msrs) { for(i = 0; msrs[i].first_msr; ++i) if (perfctr_reserve_perfctr_range(msrs[i].first_msr, msrs[i].nr_msrs)) goto err_perfctrs; } msrs = pmu->evntsels; if (msrs) { for(i = 0; msrs[i].first_msr; ++i) if (perfctr_reserve_evntsel_range(msrs[i].first_msr, msrs[i].nr_msrs)) goto err_evntsels; } return; err_evntsels: while (--i >= 0) perfctr_release_evntsel_range(msrs[i].first_msr, msrs[i].nr_msrs); msrs = pmu->perfctrs; if (!msrs) goto err; for(i = 0; msrs[i].first_msr; ++i) ; err_perfctrs: while (--i >= 0) perfctr_release_perfctr_range(msrs[i].first_msr, msrs[i].nr_msrs); err: atomic_set((atomic_t*)error, -1); } static int perfctr_reserve_counters(void) { atomic_t error = ATOMIC_INIT(0); #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21) perfctr_reserve_counters_cpu(&error); #else on_each_cpu(perfctr_reserve_counters_cpu, &error, 1, 1); #endif return atomic_read(&error); } static int reserve_lapic_nmi(void) { if (nmi_watchdog != NMI_LOCAL_APIC) return 0; if (atomic_read(&nmi_active) <= 0) return 0; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) disable_lapic_nmi_watchdog(); #else on_each_cpu(stop_apic_nmi_watchdog, NULL, 1, 1); #endif return perfctr_reserve_counters(); } static void release_lapic_nmi(void) { perfctr_release_counters(); if (nmi_watchdog != NMI_LOCAL_APIC) return; if (atomic_read(&nmi_active) != 0) return; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) enable_lapic_nmi_watchdog(); #else on_each_cpu(setup_apic_nmi_watchdog, NULL, 1, 1); #endif } #endif #else /* CONFIG_X86_LOCAL_APIC */ static inline int reserve_lapic_nmi(void) { return 0; } static inline void release_lapic_nmi(void) { } #endif /* CONFIG_X86_LOCAL_APIC */ static void do_init_tests(void) { #ifdef CONFIG_PERFCTR_INIT_TESTS if (reserve_lapic_nmi() >= 0) { perfctr_x86_init_tests(); release_lapic_nmi(); } #endif } static int init_done; int __init perfctr_cpu_init(void) { int err = -ENODEV; preempt_disable(); /* RDPMC and RDTSC are on by default. They will be disabled by the init procedures if necessary. */ perfctr_info.cpu_features = PERFCTR_FEATURE_RDPMC | PERFCTR_FEATURE_RDTSC; if (cpu_has_msr) { switch (current_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_init(); break; case X86_VENDOR_AMD: err = amd_init(); break; case X86_VENDOR_CYRIX: err = cyrix_init(); break; case X86_VENDOR_CENTAUR: err = centaur_init(); } } if (err) { err = generic_init(); /* last resort */ if (err) goto out; } do_init_tests(); finalise_backpatching(); perfctr_info.cpu_khz = cpu_khz; perfctr_info.tsc_to_cpu_mult = 1; init_done = 1; out: preempt_enable(); return err; } void __exit perfctr_cpu_exit(void) { } /**************************************************************** * * * Hardware reservation. 
* * * ****************************************************************/ static DEFINE_MUTEX(mutex); static const char *current_service = 0; const char *perfctr_cpu_reserve(const char *service) { const char *ret; if (!init_done) return "unsupported hardware"; mutex_lock(&mutex); ret = current_service; if (ret) goto out_unlock; ret = "unknown driver (oprofile?)"; if (reserve_lapic_nmi() < 0) goto out_unlock; current_service = service; if (perfctr_info.cpu_features & PERFCTR_FEATURE_RDPMC) mmu_cr4_features |= X86_CR4_PCE; on_each_cpu(perfctr_cpu_init_one, NULL, 1, 1); perfctr_cpu_set_ihandler(NULL); x86_pm_init(); ret = NULL; out_unlock: mutex_unlock(&mutex); return ret; } void perfctr_cpu_release(const char *service) { mutex_lock(&mutex); if (service != current_service) { printk(KERN_ERR "%s: attempt by %s to release while reserved by %s\n", __FUNCTION__, service, current_service); goto out_unlock; } /* power down the counters */ if (perfctr_info.cpu_features & PERFCTR_FEATURE_RDPMC) mmu_cr4_features &= ~X86_CR4_PCE; on_each_cpu(perfctr_cpu_exit_one, NULL, 1, 1); perfctr_cpu_set_ihandler(NULL); x86_pm_exit(); current_service = 0; release_lapic_nmi(); out_unlock: mutex_unlock(&mutex); }
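
/*
 * Illustrative usage sketch (disabled; not part of the driver).
 * It shows how an in-kernel client might use the entry points defined
 * above: reserve the hardware, program one a-mode counter plus the TSC,
 * and read back the accumulated sum.  Details are hedged guesswork:
 * the EVNTSEL value 0x004300C0 is a hypothetical P6-style "retired
 * instructions, all CPLs, enabled" encoding, the "my-client" service
 * name is made up, and the perfctr_cpu_state object would normally live
 * in a per-thread or per-CPU structure rather than on the stack.
 */
#if 0
static void perfctr_usage_sketch(void)
{
	struct perfctr_cpu_state state;
	const char *err;

	err = perfctr_cpu_reserve("my-client");	/* NULL on success */
	if (err)
		return;

	memset(&state, 0, sizeof state);
	state.control.header.tsc_on = 1;
	state.control.header.nractrs = 1;	/* one a-mode counter */
	state.control.pmc_map[0] = 0;
	state.control.evntsel[0] = 0x004300C0;	/* hypothetical P6 encoding */

	if (perfctr_cpu_update_control(&state, 0) == 0) {
		preempt_disable();
		perfctr_cpu_resume(&state);	/* start counting */
		/* ... run the code to be measured ... */
		perfctr_cpu_suspend(&state);	/* stop and accumulate sums */
		preempt_enable();
		/* state.user.pmc[0].sum now holds the event count */
	}

	perfctr_cpu_release("my-client");
}
#endif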