/*
 * pfmlib_intel_nhm.c : Intel Nehalem PMU
 *
 * Copyright (c) 2008 Google, Inc
 * Contributed by Stephane Eranian <eranian@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
 * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Nehalem PMU = architectural perfmon v3 + OFFCORE + PEBS v2 + uncore PMU + LBR
 */
#include <sys/types.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>

/* public headers */
#include <perfmon/pfmlib_intel_nhm.h>

/* private headers */
#include "pfmlib_priv.h"
#include "pfmlib_intel_nhm_priv.h"

/* Intel Westmere event tables */
#include "intel_wsm_events.h"
#include "intel_wsm_unc_events.h"

/* Intel Core i7 event tables */
#include "intel_corei7_events.h"
#include "intel_corei7_unc_events.h"

/* let's define some handy shortcuts! */
#define usel_event	unc_perfevtsel.usel_event
#define usel_umask	unc_perfevtsel.usel_umask
#define usel_occ	unc_perfevtsel.usel_occ
#define usel_edge	unc_perfevtsel.usel_edge
#define usel_int	unc_perfevtsel.usel_int
#define usel_en		unc_perfevtsel.usel_en
#define usel_inv	unc_perfevtsel.usel_inv
#define usel_cnt_mask	unc_perfevtsel.usel_cnt_mask

#define sel_event	perfevtsel.sel_event
#define sel_umask	perfevtsel.sel_umask
#define sel_usr		perfevtsel.sel_usr
#define sel_os		perfevtsel.sel_os
#define sel_edge	perfevtsel.sel_edge
#define sel_pc		perfevtsel.sel_pc
#define sel_int		perfevtsel.sel_int
#define sel_en		perfevtsel.sel_en
#define sel_inv		perfevtsel.sel_inv
#define sel_anythr	perfevtsel.sel_anythr
#define sel_cnt_mask	perfevtsel.sel_cnt_mask


/*
 * Description of the PMC registers mappings:
 *
 * 0  -> PMC0  -> PERFEVTSEL0
 * 1  -> PMC1  -> PERFEVTSEL1 
 * 2  -> PMC2  -> PERFEVTSEL2 
 * 3  -> PMC3  -> PERFEVTSEL3 
 * 16 -> PMC16 -> FIXED_CTR_CTRL
 * 17 -> PMC17 -> PEBS_ENABLED
 * 18 -> PMC18 -> PEBS_LD_LATENCY_THRESHOLD
 * 19 -> PMC19 -> OFFCORE_RSP0
 * 20 -> PMC20 -> UNCORE_FIXED_CTRL
 * 21 -> PMC21 -> UNCORE_EVNTSEL0
 * 22 -> PMC22 -> UNCORE_EVNTSEL1
 * 23 -> PMC23 -> UNCORE_EVNTSEL2
 * 24 -> PMC24 -> UNCORE_EVNTSEL3
 * 25 -> PMC25 -> UNCORE_EVNTSEL4
 * 26 -> PMC26 -> UNCORE_EVNTSEL5
 * 27 -> PMC27 -> UNCORE_EVNTSEL6
 * 28 -> PMC28 -> UNCORE_EVNTSEL7
 * 29 -> PMC29 -> UNCORE_ADDROP_MATCH
 * 30 -> PMC30 -> LBR_SELECT
 * 31 -> PMC31 -> OFFCORE_RSP1 (Westmere only)
 *
 * Description of the PMD registers mapping:
 *
 * 0  -> PMD0  -> PMC0
 * 1  -> PMD1  -> PMC1
 * 2  -> PMD2  -> PMC2
 * 3  -> PMD3  -> PMC3
 * 16 -> PMD16 -> FIXED_CTR0
 * 17 -> PMD17 -> FIXED_CTR1
 * 18 -> PMD18 -> FIXED_CTR2
 * 19 not used
 * 20 -> PMD20 -> UNCORE_FIXED_CTR0
 * 21 -> PMD21 -> UNCORE_PMC0
 * 22 -> PMD22 -> UNCORE_PMC1
 * 23 -> PMD23 -> UNCORE_PMC2
 * 24 -> PMD24 -> UNCORE_PMC3
 * 25 -> PMD25 -> UNCORE_PMC4
 * 26 -> PMD26 -> UNCORE_PMC5
 * 27 -> PMD27 -> UNCORE_PMC6
 * 28 -> PMD28 -> UNCORE_PMC7
 *
 * 31 -> PMD31 -> LBR_TOS
 * 32-63 -> PMD32-PMD63 -> LBR_FROM_0/LBR_TO_0 - LBR_FROM_15/LBR_TO_15
 */
#define NHM_SEL_BASE		0x186
#define NHM_CTR_BASE		0xc1
#define NHM_FIXED_CTR_BASE	0x309

#define UNC_NHM_SEL_BASE	0x3c0
#define UNC_NHM_CTR_BASE	0x3b0
#define UNC_NHM_FIXED_CTR_BASE	0x394
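/*
 * example (derived from the bases above): PERFEVTSEL2 = NHM_SEL_BASE + 2 = 0x188,
 * PMC2 = NHM_CTR_BASE + 2 = 0xc3, FIXED_CTR1 = NHM_FIXED_CTR_BASE + 1 = 0x30a
 */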

#define MAX_COUNTERS	28 /* highest implemented counter */

#define PFMLIB_NHM_ALL_FLAGS \
	(PFM_NHM_SEL_INV|PFM_NHM_SEL_EDGE|PFM_NHM_SEL_ANYTHR)

#define NHM_NUM_GEN_COUNTERS	4
#define NHM_NUM_FIXED_COUNTERS	3

pfm_pmu_support_t intel_nhm_support;
pfm_pmu_support_t intel_wsm_support;

static pfmlib_regmask_t nhm_impl_pmcs, nhm_impl_pmds;
static pfmlib_regmask_t nhm_impl_unc_pmcs, nhm_impl_unc_pmds;
static pme_nhm_entry_t *pe, *unc_pe;
static unsigned int num_pe, num_unc_pe;
static int cpu_model, aaj80;
static int pme_cycles, pme_instr;

#ifdef __i386__
static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
			 unsigned int *ecx, unsigned int *edx)
{
	/*
	 * because ebx is used in PIC mode, we need to save/restore it since
	 * cpuid clobbers it. I could not figure out a way to get ebx out in
	 * one cpuid instruction. To extract ebx, we need to move it to
	 * another register (here eax)
	 */
	__asm__("pushl %%ebx;cpuid; popl %%ebx"
			:"=a" (*eax)
			: "a" (op)
			: "ecx", "edx");

	__asm__("pushl %%ebx;cpuid; movl %%ebx, %%eax;popl %%ebx"
			:"=a" (*ebx)
			: "a" (op)
			: "ecx", "edx");
}
#else
static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx,
			 unsigned int *ecx, unsigned int *edx)
{
        __asm__("cpuid"
                        : "=a" (*eax),
                        "=b" (*ebx),
                        "=c" (*ecx),
                        "=d" (*edx)
                        : "0" (op), "c"(0));
}
#endif

static inline pme_nhm_entry_t *
get_nhm_entry(unsigned int i)
{
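	/*
	 * core and uncore events share a single global index space:
	 * core events occupy [0, num_pe), uncore events follow at
	 * [num_pe, num_pe + num_unc_pe)
	 */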
        return i < num_pe ? pe+i : unc_pe+(i-num_pe);
}

static int
pfm_nhm_midx2uidx(unsigned int ev, unsigned int midx)
{
	int i, num = 0;
	pme_nhm_entry_t *ne;
	int model;

	ne = get_nhm_entry(ev);

	for (i=0; i < ne->pme_numasks; i++) {
		model = ne->pme_umasks[i].pme_umodel;
		if (!model || model == cpu_model) {
			if (midx == num) 
				return i;
			num++;
		}
	}
	DPRINT("cannot find umask %d for event %s\n", midx, ne->pme_name);
	return -1;
}

static int
pfm_nhm_detect_common(void)
{
	int ret;
	int family;
	char buffer[128];

	ret = __pfm_getcpuinfo_attr("vendor_id", buffer, sizeof(buffer));
	if (ret == -1)
		return PFMLIB_ERR_NOTSUPP;

	if (strcmp(buffer, "GenuineIntel"))
		return PFMLIB_ERR_NOTSUPP;

	ret = __pfm_getcpuinfo_attr("cpu family", buffer, sizeof(buffer));
	if (ret == -1)
		return PFMLIB_ERR_NOTSUPP;

	family = atoi(buffer);

	ret = __pfm_getcpuinfo_attr("model", buffer, sizeof(buffer));
	if (ret == -1)
		return PFMLIB_ERR_NOTSUPP;

	cpu_model = atoi(buffer);

	if (family != 6)
		return PFMLIB_ERR_NOTSUPP;

	return PFMLIB_SUCCESS;
}

static int
pfm_nhm_detect(void)
{
#define INTEL_ARCH_MISP_BR_RETIRED	(1 << 6)
	unsigned int eax, ebx, ecx, edx;
	int ret;

	ret = pfm_nhm_detect_common();
	if (ret != PFMLIB_SUCCESS)
		return ret;

	switch(cpu_model) {
		case 26: /* Nehalem */
		case 30:
		case 31:
		case 46:
			/*
	 		 * check for erratum AAJ80
	 		 *
	 		 * MISPREDICTED_BRANCH_RETIRED may be broken
	 		 * in which case it appears in the list of
	 		 * unavailable architected events
	 		 */
			cpuid(0xa, &eax, &ebx, &ecx, &edx);
			if (ebx & INTEL_ARCH_MISP_BR_RETIRED)
				aaj80 = 1;
			break;
		default:
			return PFMLIB_ERR_NOTSUPP;
	}
	return PFMLIB_SUCCESS;
}

static int
pfm_wsm_detect(void)
{
	switch(cpu_model) {
		case 37: /* Westmere */
		case 44:
			break;
		default:
			return PFMLIB_ERR_NOTSUPP;
	}
	return PFMLIB_SUCCESS;
}

static inline void setup_nhm_impl_unc_regs(void)
{
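	/*
	 * uncore data registers:
	 * pmd20 = UNCORE_FIXED_CTR0, pmd21-28 = UNCORE_PMC0-7
	 * (see the PMC/PMD mapping at the top of this file)
	 */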
	pfm_regmask_set(&nhm_impl_unc_pmds, 20);
	pfm_regmask_set(&nhm_impl_unc_pmds, 21);
	pfm_regmask_set(&nhm_impl_unc_pmds, 22);
	pfm_regmask_set(&nhm_impl_unc_pmds, 23);
	pfm_regmask_set(&nhm_impl_unc_pmds, 24);
	pfm_regmask_set(&nhm_impl_unc_pmds, 25);
	pfm_regmask_set(&nhm_impl_unc_pmds, 26);
	pfm_regmask_set(&nhm_impl_unc_pmds, 27);
	pfm_regmask_set(&nhm_impl_unc_pmds, 28);

	/* uncore */
	pfm_regmask_set(&nhm_impl_unc_pmcs, 20);
	pfm_regmask_set(&nhm_impl_unc_pmcs, 21);
	pfm_regmask_set(&nhm_impl_unc_pmcs, 22);
	pfm_regmask_set(&nhm_impl_unc_pmcs, 23);
	pfm_regmask_set(&nhm_impl_unc_pmcs, 24);
	pfm_regmask_set(&nhm_impl_unc_pmcs, 25);
	pfm_regmask_set(&nhm_impl_unc_pmcs, 26);
	pfm_regmask_set(&nhm_impl_unc_pmcs, 27);
	pfm_regmask_set(&nhm_impl_unc_pmcs, 28);
	/* UNCORE_ADDROP_MATCH */
	pfm_regmask_set(&nhm_impl_unc_pmcs, 29);

}

static void
fixup_mem_uncore_retired(void)
{
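	/*
	 * MEM_UNCORE_RETIRED (event code 0xf) does not support the same
	 * unit masks on all Core i7 models; for model 46 the event is
	 * restricted to the first 6 unit masks in the table
	 */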
	size_t i;

	for(i=0; i < PME_COREI7_EVENT_COUNT; i++) {
		if (corei7_pe[i].pme_code != 0xf)
			continue;
		
		/*
 		 * assume model46 umasks are at the end
 		 */
		corei7_pe[i].pme_numasks = 6;
		break;
	}
}

static int
pfm_nhm_init(void)
{
	pfm_pmu_support_t *supp;
	int i;
	int num_unc_cnt = 0;

	if (forced_pmu != PFMLIB_NO_PMU) {
		if (forced_pmu == PFMLIB_INTEL_NHM_PMU)
			cpu_model = 26;
		else
			cpu_model = 37;
	}

	/* core */
	pfm_regmask_set(&nhm_impl_pmcs, 0);
	pfm_regmask_set(&nhm_impl_pmcs, 1);
	pfm_regmask_set(&nhm_impl_pmcs, 2);
	pfm_regmask_set(&nhm_impl_pmcs, 3);
	pfm_regmask_set(&nhm_impl_pmcs, 16);
	pfm_regmask_set(&nhm_impl_pmcs, 17);
	pfm_regmask_set(&nhm_impl_pmcs, 18);
	pfm_regmask_set(&nhm_impl_pmcs, 19);

	pfm_regmask_set(&nhm_impl_pmds, 0);
	pfm_regmask_set(&nhm_impl_pmds, 1);
	pfm_regmask_set(&nhm_impl_pmds, 2);
	pfm_regmask_set(&nhm_impl_pmds, 3);
	pfm_regmask_set(&nhm_impl_pmds, 16);
	pfm_regmask_set(&nhm_impl_pmds, 17);
	pfm_regmask_set(&nhm_impl_pmds, 18);

	/* lbr */
	pfm_regmask_set(&nhm_impl_pmcs, 30);
	for(i=31; i < 64; i++)
		pfm_regmask_set(&nhm_impl_pmds, i);

	switch(cpu_model) {
	case 46:
		num_pe = PME_COREI7_EVENT_COUNT;
		num_unc_pe = 0;
		pe = corei7_pe;
		unc_pe = NULL;
		pme_cycles = PME_COREI7_UNHALTED_CORE_CYCLES;
		pme_instr = PME_COREI7_INSTRUCTIONS_RETIRED;
		num_unc_cnt = 0;
		fixup_mem_uncore_retired();
		supp = &intel_nhm_support;
		break;
	case 26: /* Nehalem */
	case 30: /* Lynnfield */
		num_pe = PME_COREI7_EVENT_COUNT;
		num_unc_pe = PME_COREI7_UNC_EVENT_COUNT;
		pe = corei7_pe;
		unc_pe = corei7_unc_pe;
		pme_cycles = PME_COREI7_UNHALTED_CORE_CYCLES;
		pme_instr = PME_COREI7_INSTRUCTIONS_RETIRED;
		setup_nhm_impl_unc_regs();
		num_unc_cnt = 9; /* one fixed + 8 generic */
		supp = &intel_nhm_support;
		break;
	case 37: /* Westmere */
	case 44:
		num_pe = PME_WSM_EVENT_COUNT;
		num_unc_pe = PME_WSM_UNC_EVENT_COUNT;
		pe = wsm_pe;
		unc_pe = intel_wsm_unc_pe;
		pme_cycles = PME_WSM_UNHALTED_CORE_CYCLES;
		pme_instr = PME_WSM_INSTRUCTIONS_RETIRED;
		setup_nhm_impl_unc_regs();
		num_unc_cnt = 9; /* one fixed + 8 generic */

		/* OFFCORE_RESPONSE_1 */
		pfm_regmask_set(&nhm_impl_pmcs, 31);
		supp = &intel_wsm_support;
		break;
	default:
		return PFMLIB_ERR_NOTSUPP;
	}
	
	supp->pme_count = num_pe + num_unc_pe;
	supp->num_cnt = NHM_NUM_GEN_COUNTERS
		      + NHM_NUM_FIXED_COUNTERS
		      + num_unc_cnt;
	/*
	 * propagate uncore registers to impl bitmaps
	 */
	pfm_regmask_or(&nhm_impl_pmds, &nhm_impl_pmds, &nhm_impl_unc_pmds);
	pfm_regmask_or(&nhm_impl_pmcs, &nhm_impl_pmcs, &nhm_impl_unc_pmcs);

	/*
	 * compute number of registers available
	 * not all CPUs may have uncore
	 */
	pfm_regmask_weight(&nhm_impl_pmds, &supp->pmd_count);
	pfm_regmask_weight(&nhm_impl_pmcs, &supp->pmc_count);

	return PFMLIB_SUCCESS;
}

static int
pfm_nhm_is_fixed(pfmlib_event_t *e, unsigned int f)
{
	pme_nhm_entry_t *ne;
	unsigned int fl, flc, i;
	unsigned int mask = 0;

	ne = get_nhm_entry(e->event);
	fl = ne->pme_flags;

	/*
	 * first pass: check if event as a whole supports fixed counters
	 */
	switch(f) {
		case 0:
			mask = PFMLIB_NHM_FIXED0;
			break;
		case 1:
			mask = PFMLIB_NHM_FIXED1;
			break;
		case 2:
			mask = PFMLIB_NHM_FIXED2_ONLY;
			break;
		default:
			return 0;
	}
	if (fl & mask)
		return 1;
	/*
	 * second pass: check if unit mask supports fixed counter
	 *
	 * reject if mask not found OR if not all unit masks have
	 * same fixed counter mask
	 */
	flc = 0;
	for(i=0; i < e->num_masks; i++) {
		int midx = pfm_nhm_midx2uidx(e->event, e->unit_masks[i]);
		fl = ne->pme_umasks[midx].pme_uflags;
		if (fl & mask)
			flc++;
	}
	return flc > 0 && flc == e->num_masks ? 1 : 0;
}

/*
 * Allow combination of unit masks for an event when cnt_mask > 0 AND the
 * unit mask codes do not overlap (otherwise we would not know what is
 * actually being measured)
 */
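/*
 * illustration (hypothetical unit mask codes): with cnt_mask > 0,
 * umasks 0x01 and 0x02 may be combined (no common bits), whereas
 * 0x01 and 0x03 may not (bit 0 overlaps), since the count could not
 * be attributed to either umask
 */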
static int
pfm_nhm_check_cmask(pfmlib_event_t *e, pme_nhm_entry_t *ne, pfmlib_nhm_counter_t *cntr)
{
	unsigned int ref, ucode;
	int i, j;

	if (!cntr)
		return -1;

	if (cntr->cnt_mask == 0)
		return -1;

	for(i=0; i < e->num_masks; i++) {
		int midx = pfm_nhm_midx2uidx(e->event, e->unit_masks[i]);
		ref =  ne->pme_umasks[midx].pme_ucode;
		for(j=i+1; j < e->num_masks; j++) {
			midx = pfm_nhm_midx2uidx(e->event, e->unit_masks[j]);
			ucode = ne->pme_umasks[midx].pme_ucode;
			if (ref & ucode)
				return -1;
		}
	}
	return 0;
}
 
/*
 * IMPORTANT: the interface guarantees that pfp_pmds[] elements are returned in the order the events
 *	      were submitted.
 */
static int
pfm_nhm_dispatch_counters(pfmlib_input_param_t *inp, pfmlib_nhm_input_param_t *param, pfmlib_output_param_t *outp)
{
#define HAS_OPTIONS(x)	(cntrs && (cntrs[x].flags || cntrs[x].cnt_mask))
#define is_fixed_pmc(a) (a == 16 || a == 17 || a == 18)
#define is_uncore(a) (a > 19)

	pme_nhm_entry_t *ne;
	pfmlib_nhm_counter_t *cntrs;
	pfm_nhm_sel_reg_t reg;
	pfmlib_event_t *e;
	pfmlib_reg_t *pc, *pd;
	pfmlib_regmask_t *r_pmcs;
	uint64_t val, unc_global_ctrl;
	uint64_t pebs_mask, ld_mask;
	unsigned long long fixed_ctr;
	unsigned int plm;
	unsigned int npc, npmc0, npmc01, nf2, nuf;
	unsigned int i, n, k, j, umask, use_pebs = 0;
	unsigned int assign_pc[PMU_NHM_NUM_COUNTERS];
	unsigned int next_gen, last_gen, u_flags;
	unsigned int next_unc_gen, last_unc_gen, lat;
	unsigned int offcore_rsp0_value = 0;
	unsigned int offcore_rsp1_value = 0;

	npc = npmc01 = npmc0 = nf2 = nuf = 0;
	unc_global_ctrl = 0;

	e      = inp->pfp_events;
	pc     = outp->pfp_pmcs;
	pd     = outp->pfp_pmds;
	n      = inp->pfp_event_count;
	r_pmcs = &inp->pfp_unavail_pmcs;
	cntrs  = param ? param->pfp_nhm_counters : NULL;
	pebs_mask = ld_mask = 0;
	use_pebs = param ? param->pfp_nhm_pebs.pebs_used : 0;
	lat = param ? param->pfp_nhm_pebs.ld_lat_thres : 0;

	if (n > PMU_NHM_NUM_COUNTERS)
		return PFMLIB_ERR_TOOMANY;

	/*
	 * error checking
	 */
	for(i=0; i < n; i++) {
		/*
		 * only supports two priv levels for perf counters
		 */
		if (e[i].plm & (PFM_PLM1|PFM_PLM2))
			return PFMLIB_ERR_INVAL;

		ne = get_nhm_entry(e[i].event);

		/* check for erratum AAJ80 */
		if (aaj80 && (ne->pme_code & 0xff) == 0xc5) {
			DPRINT("MISPREDICTED_BRANCH_RETIRED broken on this Nehalem processor, see eeratum AAJ80\n");
			return PFMLIB_ERR_NOTSUPP;
		}

		/*
		 * check for valid flags
		 */
		if (e[i].flags & ~PFMLIB_NHM_ALL_FLAGS)
			return PFMLIB_ERR_INVAL;

		if (ne->pme_flags & PFMLIB_NHM_UMASK_NCOMBO
		    && e[i].num_masks > 1 && pfm_nhm_check_cmask(e, ne, cntrs ? cntrs+i : NULL)) {
			DPRINT("events does not support unit mask combination\n");
				return PFMLIB_ERR_NOASSIGN;
		}
		/*
		 * check event-level single register constraint for uncore fixed
		 */
		if (ne->pme_flags & PFMLIB_NHM_UNC_FIXED) {
			if (++nuf > 1) {
				DPRINT("two events compete for a UNCORE_FIXED_CTR0\n");
				return PFMLIB_ERR_NOASSIGN;
			}
			if (HAS_OPTIONS(i)) {
				DPRINT("uncore fixed counter does not support options\n");
				return PFMLIB_ERR_NOASSIGN;
			}
		}
		if (ne->pme_flags & PFMLIB_NHM_PMC0) {
			if (++npmc0 > 1) {
				DPRINT("two events compete for a PMC0\n");
				return PFMLIB_ERR_NOASSIGN;
			}
		}
		/*
		 * check event-level single register constraint (PMC0/1 only)
		 * fail if more than two events requested for the same counter pair
		 */
		if (ne->pme_flags & PFMLIB_NHM_PMC01) {
			if (++npmc01 > 2) {
				DPRINT("two events compete for a PMC0\n");
				return PFMLIB_ERR_NOASSIGN;
			}
		}
		/*
 		 * UNHALTED_REFERENCE_CYCLES (CPU_CLK_UNHALTED:BUS)
 		 * can only be measured on FIXED_CTR2
 		 */
		if (ne->pme_flags & PFMLIB_NHM_FIXED2_ONLY) {
			if (++nf2 > 1) {
				DPRINT("two events compete for FIXED_CTR2\n");
				return PFMLIB_ERR_NOASSIGN;
			}
			if (cntrs && ((cntrs[i].flags & (PFM_NHM_SEL_INV|PFM_NHM_SEL_EDGE)) || cntrs[i].cnt_mask)) {
				DPRINT("UNHALTED_REFERENCE_CYCLES only accepts anythr filter\n");
				return PFMLIB_ERR_NOASSIGN;
			}
		}
		/*
 		 * OFFCORE_RSP0 is shared, unit masks for all offcore_response events
 		 * must be identical
 		 */
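		/*
		 * for OFFCORE_RSP*, the low byte of the combined umask encodes
		 * the request type and the high byte the response type; the
		 * checks below require at least one bit set in each
		 */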
		umask = 0;
                for(j=0; j < e[i].num_masks; j++) {
			int midx = pfm_nhm_midx2uidx(e[i].event, e[i].unit_masks[j]);
			umask |= ne->pme_umasks[midx].pme_ucode;
		}

		if (ne->pme_flags & PFMLIB_NHM_OFFCORE_RSP0) {
			if (offcore_rsp0_value && offcore_rsp0_value != umask) {
				DPRINT("all OFFCORE_RSP0 events must have the same unit mask\n");
				return PFMLIB_ERR_NOASSIGN;
			}
			if (pfm_regmask_isset(r_pmcs, 19)) {
				DPRINT("OFFCORE_RSP0 register not available\n");
				return PFMLIB_ERR_NOASSIGN;
			}
			if (!((umask & 0xff) && (umask & 0xff00))) {
				DPRINT("OFFCORE_RSP0 must have at least one request and response unit mask set\n");
				return PFMLIB_ERR_INVAL;
			}
			/* lock-in offcore_value */
			offcore_rsp0_value = umask;
		}
		if (ne->pme_flags & PFMLIB_NHM_OFFCORE_RSP1) {
			if (offcore_rsp1_value && offcore_rsp1_value != umask) {
				DPRINT("all OFFCORE_RSP1 events must have the same unit mask\n");
				return PFMLIB_ERR_NOASSIGN;
			}
			if (pfm_regmask_isset(r_pmcs, 31)) {
				DPRINT("OFFCORE_RSP1 register not available\n");
				return PFMLIB_ERR_NOASSIGN;
			}
			if (!((umask & 0xff) && (umask & 0xff00))) {
				DPRINT("OFFCORE_RSP1 must have at least one request and response unit mask set\n");
				return PFMLIB_ERR_INVAL;
			}
			/* lock-in offcore_value */
			offcore_rsp1_value = umask;
		}

		/*
		 * enforce PLM0|PLM3 for uncore events given they have no
		 * priv level filter. This is to ensure users understand what
		 * they are doing
		 */
		if (ne->pme_flags & (PFMLIB_NHM_UNC|PFMLIB_NHM_UNC_FIXED)) {
			if (inp->pfp_dfl_plm != (PFM_PLM0|PFM_PLM3)
			    && e[i].plm != (PFM_PLM0|PFM_PLM3)) {
				DPRINT("uncore events must have PLM0|PLM3\n");
				return PFMLIB_ERR_NOASSIGN;
			}
		}
	}

	/*
	 * initialize to empty
	 */
	for(i=0; i < PMU_NHM_NUM_COUNTERS; i++)
		assign_pc[i] = -1;


	next_gen = 0; /* first generic counter */
	last_gen = 3; /* last generic counter */
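	/*
	 * counter assignment proceeds from the most constrained events to
	 * the least constrained ones:
	 *   1. events restricted to PMC0 or to UNCORE_FIXED_CTR0
	 *   2. events restricted to PMC0/PMC1
	 *   3. events eligible for the core fixed counters
	 *   4. uncore events on the generic uncore counters
	 *   5. everything else on the remaining core generic counters
	 */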

	/*
	 * strongest constraint first: events restricted to UNCORE_FIXED_CTR0
	 * or to PMC0
	 */
	if (nuf || npmc0) {
		for(i=0; i < n; i++) {
			ne = get_nhm_entry(e[i].event);
			if (ne->pme_flags & PFMLIB_NHM_PMC0) {
				if (pfm_regmask_isset(r_pmcs, 0))
					return PFMLIB_ERR_NOASSIGN;
				assign_pc[i] = 0;
				next_gen = 1;
			}
			if (ne->pme_flags & PFMLIB_NHM_UNC_FIXED) {
				if (pfm_regmask_isset(r_pmcs, 20))
					return PFMLIB_ERR_NOASSIGN;
				assign_pc[i] = 20;
			}
		}
	}
	/*
	 * next strongest constraint: events which work only on PMC0 or PMC1.
	 * On Nehalem, this constraint applies at the event level
	 * (not at the unit mask level, fortunately)
	 *
	 * PEBS works on all 4 generic counters
	 *
	 * Because of the sanity check above, we know we can find
	 * at most 2 events with this constraint
	 */
	if (npmc01) {
		for(i=0; i < n; i++) {
			ne = get_nhm_entry(e[i].event);
			if (ne->pme_flags & PFMLIB_NHM_PMC01) {
				while (next_gen < 2 && pfm_regmask_isset(r_pmcs, next_gen))
					next_gen++;
				if (next_gen == 2)
					return PFMLIB_ERR_NOASSIGN;
				assign_pc[i] = next_gen++;
			}
		}
	}

	/*
	 * next constraint: fixed counters
	 *
	 * We abuse the mapping here for assign_pc to make it easier
	 * to provide the correct values for pd[].
	 * We use:
	 * 	- 16 : fixed counter 0 (pmc16, pmd16)
	 * 	- 17 : fixed counter 1 (pmc16, pmd17)
	 * 	- 18 : fixed counter 2 (pmc16, pmd18)
	 */
	fixed_ctr = pfm_regmask_isset(r_pmcs, 16) ? 0 : 0x7;
	if (fixed_ctr) {
		for(i=0; i < n; i++) {
			/*
			 * Nehalem fixed counters (as in architectural perfmon v3)
			 * do support the anythr filter
			 */
			if (HAS_OPTIONS(i)) {
				if (use_pebs && pfm_nhm_is_pebs(e+i))
					continue;

				if (cntrs[i].flags != PFM_NHM_SEL_ANYTHR)
					continue;
			}
			if ((fixed_ctr & 0x1) && pfm_nhm_is_fixed(e+i, 0)) {
				assign_pc[i] = 16;
				fixed_ctr &= ~1;
			}
			if ((fixed_ctr & 0x2) && pfm_nhm_is_fixed(e+i, 1)) {
				assign_pc[i] = 17;
				fixed_ctr &= ~2;
			}
			if ((fixed_ctr & 0x4) && pfm_nhm_is_fixed(e+i, 2)) {
				assign_pc[i] = 18;
				fixed_ctr &= ~4;
			}
		}
	}
	/*
 	 * uncore events on any of the 8 counters
 	 */
	next_unc_gen = 21; /* first generic uncore counter config */
	last_unc_gen = 28; /* last generic uncore counter config */
	for(i=0; i < n; i++) {
		ne = get_nhm_entry(e[i].event);
		if (ne->pme_flags & PFMLIB_NHM_UNC) {
			for(; next_unc_gen <= last_unc_gen; next_unc_gen++) {
				if (!pfm_regmask_isset(r_pmcs, next_unc_gen))
					break;
			}
			if (next_unc_gen <= last_unc_gen)
				assign_pc[i] = next_unc_gen++;
			else {
				DPRINT("cannot assign generic uncore event\n");
				return PFMLIB_ERR_NOASSIGN;
			}
		}
	}

	/*
	 * assign what is left of the generic events
	 */
	for(i=0; i < n; i++) {
		if (assign_pc[i] == -1) {
			for(; next_gen <= last_gen; next_gen++) {
				DPRINT("i=%d next_gen=%d last=%d isset=%d\n", i, next_gen, last_gen, pfm_regmask_isset(r_pmcs, next_gen));
				if (!pfm_regmask_isset(r_pmcs, next_gen))
					break;
			}
			if (next_gen <= last_gen) {
				assign_pc[i] = next_gen++;
			} else {
				DPRINT("cannot assign generic event\n");
				return PFMLIB_ERR_NOASSIGN;
			}
		}
	}

	/*
 	 * setup core fixed counters
 	 */
	reg.val = 0;
	for (i=0; i < n ; i++ ) {
		if (!is_fixed_pmc(assign_pc[i]))
			continue;
		val = 0;
		/* if plm is 0, then assume not specified per-event and use default */
		plm = e[i].plm ? e[i].plm : inp->pfp_dfl_plm;
		if (plm & PFM_PLM0)
			val |= 1ULL;
		if (plm & PFM_PLM3)
			val |= 2ULL;
		if (cntrs && cntrs[i].flags & PFM_NHM_SEL_ANYTHR)
			val |= 4ULL;
		val |= 1ULL << 3;	 /* force APIC int (kernel may force it anyway) */
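		/*
		 * FIXED_CTR_CTRL uses one 4-bit field per fixed counter:
		 * bit 0 = OS enable, bit 1 = USR enable, bit 2 = anythread,
		 * bit 3 = PMI; the field for fixed counter N starts at bit 4*N
		 */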

		reg.val |= val << ((assign_pc[i]-16)<<2);
	}

	if (reg.val) {
		pc[npc].reg_num   = 16;
		pc[npc].reg_value = reg.val;
		pc[npc].reg_addr  = 0x38D;
		pc[npc].reg_alt_addr  = 0x38D;

		__pfm_vbprintf("[FIXED_CTRL(pmc%u)=0x%"PRIx64" pmi0=1 en0=0x%"PRIx64" any0=%d pmi1=1 en1=0x%"PRIx64" any1=%d pmi2=1 en2=0x%"PRIx64" any2=%d] ",
				pc[npc].reg_num,
				reg.val,
				reg.val & 0x3ULL,
				!!(reg.val & 0x4ULL),
				(reg.val>>4) & 0x3ULL,
				!!((reg.val>>4) & 0x4ULL),
				(reg.val>>8) & 0x3ULL,
				!!((reg.val>>8) & 0x4ULL));

		if ((fixed_ctr & 0x1) == 0)
			__pfm_vbprintf("INSTRUCTIONS_RETIRED ");
		if ((fixed_ctr & 0x2) == 0)
			__pfm_vbprintf("UNHALTED_CORE_CYCLES ");
		if ((fixed_ctr & 0x4) == 0)
			__pfm_vbprintf("UNHALTED_REFERENCE_CYCLES ");
		__pfm_vbprintf("\n");

		npc++;

		if ((fixed_ctr & 0x1) == 0)
			__pfm_vbprintf("[FIXED_CTR0(pmd16)]\n");
		if ((fixed_ctr & 0x2) == 0)
			__pfm_vbprintf("[FIXED_CTR1(pmd17)]\n");
		if ((fixed_ctr & 0x4) == 0)
			__pfm_vbprintf("[FIXED_CTR2(pmd18)]\n");
	}

	/*
	 * setup core counter config
	 */
	for (i=0; i < n ; i++ ) {
		/* skip fixed counters */
		if (is_fixed_pmc(assign_pc[i]) || is_uncore(assign_pc[i]))
			continue;

		reg.val = 0; /* assume reserved bits are zeroed */

		/* if plm is 0, then assume not specified per-event and use default */
		plm = e[i].plm ? e[i].plm : inp->pfp_dfl_plm;

		ne = get_nhm_entry(e[i].event);
		val = ne->pme_code;

		reg.sel_event = val & 0xff;

		umask = (val >> 8) & 0xff;

		u_flags = 0;

		/*
		 * for OFFCORE_RSP, the unit masks are all in the
		 * dedicated OFFCORE_RSP MSRs and event unit mask must be
		 * 0x1 (extracted from pme_code)
		 */
		if (!(ne->pme_flags & (PFMLIB_NHM_OFFCORE_RSP0|PFMLIB_NHM_OFFCORE_RSP1)))
			for(k=0; k < e[i].num_masks; k++) {
				int midx = pfm_nhm_midx2uidx(e[i].event, e[i].unit_masks[k]);
				umask |= ne->pme_umasks[midx].pme_ucode;
				u_flags |= ne->pme_umasks[midx].pme_uflags;
			}
		val |= umask << 8;

		reg.sel_umask  = umask;
		reg.sel_usr    = plm & PFM_PLM3 ? 1 : 0;
		reg.sel_os     = plm & PFM_PLM0 ? 1 : 0;
		reg.sel_en     = 1; /* force enable bit to 1 */
		reg.sel_int    = 1; /* force APIC int to 1 */

		reg.sel_cnt_mask = val >>24;
		reg.sel_inv = val >> 23;
		reg.sel_anythr = val >> 21;
		reg.sel_edge = val >> 18;

		if (cntrs) {
			/*
 			 * occupancy reset flag is for uncore counters only
 			 */
			if (cntrs[i].flags & PFM_NHM_SEL_OCC_RST)
				return PFMLIB_ERR_INVAL;

			if (!reg.sel_cnt_mask) {
				/*
				 * counter mask is 8-bit wide, do not silently
				 * wrap-around
				 */
				if (cntrs[i].cnt_mask > 255)
					return PFMLIB_ERR_INVAL;

				reg.sel_cnt_mask = cntrs[i].cnt_mask;
			}

			if (!reg.sel_edge)
				reg.sel_edge = cntrs[i].flags & PFM_NHM_SEL_EDGE ? 1 : 0;
			if (!reg.sel_inv)
				reg.sel_inv = cntrs[i].flags & PFM_NHM_SEL_INV ? 1 : 0;
			if (!reg.sel_anythr)
				reg.sel_anythr = cntrs[i].flags & PFM_NHM_SEL_ANYTHR ? 1 : 0;
		}

		if (u_flags || (ne->pme_flags & PFMLIB_NHM_PEBS))
				pebs_mask |= 1ULL << assign_pc[i];

		/*
		 * check for MEM_INST_RETIRED:LATENCY_ABOVE_THRESHOLD_0 to enable load latency filtering
		 * when PEBS is used. There is only one threshold possible, yet multiple counters may be
		 * programmed with this event/umask. That means they all share the same threshold.
		 */
		if (reg.sel_event == 0xb && (umask & 0x10))
			ld_mask |= 1ULL << assign_pc[i];

		pc[npc].reg_num     = assign_pc[i];
		pc[npc].reg_value   = reg.val;
		pc[npc].reg_addr    = NHM_SEL_BASE+assign_pc[i];
		pc[npc].reg_alt_addr= NHM_SEL_BASE+assign_pc[i];

		__pfm_vbprintf("[PERFEVTSEL%u(pmc%u)=0x%"PRIx64" event_sel=0x%x umask=0x%x os=%d usr=%d anythr=%d en=%d int=%d inv=%d edge=%d cnt_mask=%d] %s\n",
				pc[npc].reg_num,
				pc[npc].reg_num,
				reg.val,
				reg.sel_event,
				reg.sel_umask,
				reg.sel_os,
				reg.sel_usr,
				reg.sel_anythr,
				reg.sel_en,
				reg.sel_int,
				reg.sel_inv,
				reg.sel_edge,
				reg.sel_cnt_mask,
				ne->pme_name);

		__pfm_vbprintf("[PMC%u(pmd%u)]\n",
				pc[npc].reg_num,
				pc[npc].reg_num);

		npc++;
	}
	/*
	 * setup uncore fixed counter config
	 */
	if (nuf) {
		pc[npc].reg_num   = 20;
		pc[npc].reg_value = 0x5ULL; /* ena=1, PMI determined by kernel */
		pc[npc].reg_addr  = 0x395;
		pc[npc].reg_alt_addr  = 0x395;
		__pfm_vbprintf("[UNC_FIXED_CTRL(pmc20)=0x%"PRIx64" pmi=1 ena=1] UNC_CLK_UNHALTED\n", pc[npc].reg_value);
		__pfm_vbprintf("[UNC_FIXED_CTR0(pmd20)]\n");
		unc_global_ctrl |= 1ULL<< 32;
		npc++;
	}
	/*
 	 * setup uncore counter config
 	 */
	for (i=0; i < n ; i++ ) {

		/* skip core counters, uncore fixed */
		if (!is_uncore(assign_pc[i]) || assign_pc[i] == 20)
			continue;

		reg.val = 0; /* assume reserved bits are zeroed */

		ne = get_nhm_entry(e[i].event);
		val = ne->pme_code;

		reg.usel_event = val & 0xff;

		umask = (val >> 8) & 0xff;

		for(k=0; k < e[i].num_masks; k++) {
			int midx = pfm_nhm_midx2uidx(e[i].event, e[i].unit_masks[k]);
			umask |= ne->pme_umasks[midx].pme_ucode;
		}

		val |= umask << 8;

		reg.usel_umask = umask;
		reg.usel_en    = 1; /* force enable bit to 1 */
		reg.usel_int   = 1; /* force APIC int to 1 */

		/*
 		 * allow hardcoded filters in event table
 		 */
		reg.usel_cnt_mask = val >>24;
		reg.usel_inv = val >> 23;
		reg.usel_edge = val >> 18;
		reg.usel_occ = val >> 17;

		if (cntrs) {
			/*
			 * anythread is for core counters only
			 */
			if (cntrs[i].flags & PFM_NHM_SEL_ANYTHR)
				return PFMLIB_ERR_INVAL;

			if (!reg.usel_cnt_mask) {
				/*
				 * counter mask is 8-bit wide, do not silently
				 * wrap-around
				 */
				if (cntrs[i].cnt_mask > 255)
					return PFMLIB_ERR_INVAL;

				reg.usel_cnt_mask = cntrs[i].cnt_mask;
			}
			if (!reg.usel_edge)
				reg.usel_edge = cntrs[i].flags & PFM_NHM_SEL_EDGE ? 1 : 0;
			
			if (!reg.usel_inv)
				reg.usel_inv = cntrs[i].flags & PFM_NHM_SEL_INV ? 1 : 0;

			if (!reg.usel_occ)
				reg.usel_occ = cntrs[i].flags & PFM_NHM_SEL_OCC_RST ? 1 : 0;
		}

		unc_global_ctrl |= 1ULL<< (assign_pc[i] - 21);
		pc[npc].reg_num     = assign_pc[i];
		pc[npc].reg_value   = reg.val;
		pc[npc].reg_addr    = UNC_NHM_SEL_BASE+assign_pc[i] - 21;
		pc[npc].reg_alt_addr= UNC_NHM_SEL_BASE+assign_pc[i] - 21;

		__pfm_vbprintf("[UNC_PERFEVTSEL%u(pmc%u)=0x%"PRIx64" event=0x%x umask=0x%x en=%d int=%d inv=%d edge=%d occ=%d cnt_msk=%d] %s\n",
				pc[npc].reg_num - 21,
				pc[npc].reg_num,
				reg.val,
				reg.usel_event,
				reg.usel_umask,
				reg.usel_en,
				reg.usel_int,
				reg.usel_inv,
				reg.usel_edge,
				reg.usel_occ,
				reg.usel_cnt_mask,
				ne->pme_name);

		__pfm_vbprintf("[UNC_PMC%u(pmd%u)]\n",
				pc[npc].reg_num - 21,
				pc[npc].reg_num);
		npc++;
	}

	/*
	 * setup pmds: must be in the same order as the events
	 */
	for (i=0; i < n ; i++) {
		switch (assign_pc[i]) {
			case 0 ... 3:
				pd[i].reg_num  = assign_pc[i];
				pd[i].reg_addr = NHM_CTR_BASE+assign_pc[i];
				/* index to use with RDPMC */
				pd[i].reg_alt_addr  = assign_pc[i];
				break;
			case 16 ... 18:
				/* setup pd array */
				pd[i].reg_num = assign_pc[i];
				pd[i].reg_addr = NHM_FIXED_CTR_BASE+assign_pc[i]-16;
				pd[i].reg_alt_addr = 0x40000000+assign_pc[i]-16;
				break;
			case 20:
				pd[i].reg_num = 20;
				pd[i].reg_addr = UNC_NHM_FIXED_CTR_BASE;
				pd[i].reg_alt_addr = UNC_NHM_FIXED_CTR_BASE;
				break;
			case 21 ... 28:
				pd[i].reg_num = assign_pc[i];
				pd[i].reg_addr = UNC_NHM_CTR_BASE + assign_pc[i] - 21;
				pd[i].reg_alt_addr = UNC_NHM_CTR_BASE + assign_pc[i] - 21;
				break;
		}
	}
	outp->pfp_pmd_count = i;

	/*
	 * setup PEBS_ENABLE
	 */
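	/*
	 * IA32_PEBS_ENABLE layout on Nehalem: bits 0-3 enable PEBS on
	 * PMC0-3, bits 32-35 enable load latency filtering on the
	 * corresponding counter (hence the ld_mask << 32 below)
	 */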
	if (use_pebs && pebs_mask) {
		if (!lat)
			ld_mask = 0;
		/*
		 * check that PEBS_ENABLE is available
		 */
		if (pfm_regmask_isset(r_pmcs, 17))
			return PFMLIB_ERR_NOASSIGN;

		pc[npc].reg_num   = 17;
		pc[npc].reg_value = pebs_mask | (ld_mask <<32);
		pc[npc].reg_addr  = 0x3f1; /* IA32_PEBS_ENABLE */
		pc[npc].reg_alt_addr  = 0x3f1; /* IA32_PEBS_ENABLE */

		__pfm_vbprintf("[PEBS_ENABLE(pmc%u)=0x%"PRIx64" ena0=%d ena1=%d ena2=%d ena3=%d ll0=%d ll1=%d ll2=%d ll3=%d]\n",
				pc[npc].reg_num,
				pc[npc].reg_value,
				pc[npc].reg_value & 0x1,
				(pc[npc].reg_value >> 1)  & 0x1,
				(pc[npc].reg_value >> 2)  & 0x1,
				(pc[npc].reg_value >> 3)  & 0x1,
				(pc[npc].reg_value >> 32)  & 0x1,
				(pc[npc].reg_value >> 33)  & 0x1,
				(pc[npc].reg_value >> 34)  & 0x1,
				(pc[npc].reg_value >> 35)  & 0x1);

		npc++;

		if (ld_mask) {
			if (lat < 3 || lat > 0xffff) {
				DPRINT("invalid load latency threshold %u (must be in [3:0xffff])\n", lat);
				return PFMLIB_ERR_INVAL;
			}

			if (pfm_regmask_isset(r_pmcs, 18))
				return PFMLIB_ERR_NOASSIGN;

			pc[npc].reg_num   = 18;
			pc[npc].reg_value = lat;
			pc[npc].reg_addr  = 0x3f6; /* MSR_PEBS_LD_LAT_THRESHOLD */
			pc[npc].reg_alt_addr  = 0x3f6; /* MSR_PEBS_LD_LAT_THRESHOLD */
			__pfm_vbprintf("[LOAD_LATENCY_THRESHOLD(pmc%u)=0x%"PRIx64"]\n",
					pc[npc].reg_num,
					pc[npc].reg_value);

			npc++;
		}
	}

	/*
 	 * setup OFFCORE_RSP0
 	 */
	if (offcore_rsp0_value) {
		pc[npc].reg_num     = 19;
		pc[npc].reg_value   = offcore_rsp0_value;
		pc[npc].reg_addr    = 0x1a6;
		pc[npc].reg_alt_addr = 0x1a6;
		__pfm_vbprintf("[OFFCORE_RSP0(pmc%u)=0x%"PRIx64"]\n",
				pc[npc].reg_num,
				pc[npc].reg_value);
		npc++;
	}
	/*
 	 * setup OFFCORE_RSP1
 	 */
	if (offcore_rsp1_value) {
		pc[npc].reg_num     = 31;
		pc[npc].reg_value   = offcore_rsp1_value;
		pc[npc].reg_addr    = 0x1a7;
		pc[npc].reg_alt_addr = 0x1a7;
		__pfm_vbprintf("[OFFCORE_RSP1(pmc%u)=0x%"PRIx64"]\n",
				pc[npc].reg_num,
				pc[npc].reg_value);
		npc++;
	}

	outp->pfp_pmc_count = npc;

	return PFMLIB_SUCCESS;
}

static int
pfm_nhm_dispatch_lbr(pfmlib_input_param_t *inp, pfmlib_nhm_input_param_t *param, pfmlib_output_param_t *outp)
{
	static int lbr_plm_map[4]={
		0x3,	/* PLM0=0 PLM3=0 neq0=1 eq0=1 */
		0x1,	/* PLM0=0 PLM3=1 neq0=0 eq0=1 */
		0x2,	/* PLM0=1 PLM3=0 neq0=1 eq0=0 */
		0x0	/* PLM0=1 PLM3=1 neq0=0 eq0=0 */
	};
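	/*
	 * the LBR_SELECT CPL bits are suppression filters: cpl_eq0 set
	 * means do not capture branches executing in ring 0, cpl_neq0 set
	 * means do not capture branches executing above ring 0; this is
	 * why the map above is inverted relative to the requested plm
	 */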
	pfm_nhm_sel_reg_t reg;
	unsigned int filter, i, c;
	unsigned int plm;

	/*
	 * check LBR_SELECT is available
	 */
	if (pfm_regmask_isset(&inp->pfp_unavail_pmcs, 30))
		return PFMLIB_ERR_NOASSIGN;

	reg.val = 0; /* capture everything */

	plm = param->pfp_nhm_lbr.lbr_plm;
	if (!plm)
		plm = inp->pfp_dfl_plm;

	/*
 	 * LBR does not distinguish PLM1, PLM2 from PLM3
 	 */

	i  = plm & PFM_PLM0 ? 0x2 : 0;
	i |= plm & PFM_PLM3 ? 0x1 : 0;

	if (lbr_plm_map[i] & 0x1)
		reg.lbr_select.cpl_eq0 = 1;

	if (lbr_plm_map[i] & 0x2)
		reg.lbr_select.cpl_neq0 = 1;

	filter = param->pfp_nhm_lbr.lbr_filter;

	if (filter & PFM_NHM_LBR_JCC)
		reg.lbr_select.jcc = 1;

	if (filter & PFM_NHM_LBR_NEAR_REL_CALL)
		reg.lbr_select.near_rel_call = 1;

	if (filter & PFM_NHM_LBR_NEAR_IND_CALL)
		reg.lbr_select.near_ind_call = 1;

	if (filter & PFM_NHM_LBR_NEAR_RET)
		reg.lbr_select.near_ret = 1;

	if (filter & PFM_NHM_LBR_NEAR_IND_JMP)
		reg.lbr_select.near_ind_jmp = 1;

	if (filter & PFM_NHM_LBR_NEAR_REL_JMP)
		reg.lbr_select.near_rel_jmp = 1;

	if (filter & PFM_NHM_LBR_FAR_BRANCH)
		reg.lbr_select.far_branch = 1;

	__pfm_vbprintf("[LBR_SELECT(PMC30)=0x%"PRIx64" eq0=%d neq0=%d jcc=%d rel=%d ind=%d ret=%d ind_jmp=%d rel_jmp=%d far=%d ]\n",
			reg.val,
			reg.lbr_select.cpl_eq0,
			reg.lbr_select.cpl_neq0,
			reg.lbr_select.jcc,
			reg.lbr_select.near_rel_call,
			reg.lbr_select.near_ind_call,
			reg.lbr_select.near_ret,
			reg.lbr_select.near_ind_jmp,
			reg.lbr_select.near_rel_jmp,
			reg.lbr_select.far_branch);

	__pfm_vbprintf("[LBR_TOS(PMD31)]\n");

	__pfm_vbprintf("[LBR_FROM-LBR_TO(PMD32..PMD63)]\n");

	c = outp->pfp_pmc_count;

	outp->pfp_pmcs[c].reg_num = 30;
	outp->pfp_pmcs[c].reg_value = reg.val;
	outp->pfp_pmcs[c].reg_addr = 0x1c8;
	outp->pfp_pmcs[c].reg_alt_addr = 0x1c8;
	c++;
	outp->pfp_pmc_count = c;

	c = outp->pfp_pmd_count;

	outp->pfp_pmds[c].reg_num = 31;
	outp->pfp_pmds[c].reg_value = 0;
	outp->pfp_pmds[c].reg_addr = 0x1c9;
	outp->pfp_pmds[c].reg_alt_addr = 0x1c9;
	c++;
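	/*
	 * 32 LBR data registers are exposed as 16 FROM/TO pairs:
	 * even i maps to MSR_LASTBRANCH_k_FROM_IP (0x680 + k) and odd i
	 * to MSR_LASTBRANCH_k_TO_IP (0x6c0 + k), with k = i/2
	 */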

	for(i=0; i < 32; i++, c++) {
		outp->pfp_pmds[c].reg_num = 32 + i;
		outp->pfp_pmds[c].reg_value = 0;
		outp->pfp_pmds[c].reg_addr = (i>>1) + ((i & 0x1) ? 0x6c0 : 0x680);
		outp->pfp_pmds[c].reg_alt_addr = (i>>1) + ((i & 0x1) ? 0x6c0 : 0x680);
	}
	outp->pfp_pmd_count = c;
	return PFMLIB_SUCCESS;
}

static int
pfm_nhm_dispatch_events(pfmlib_input_param_t *inp, void *model_in, pfmlib_output_param_t *outp, void *model_out)
{
	pfmlib_nhm_input_param_t *mod_in  = (pfmlib_nhm_input_param_t *)model_in;
	int ret;

	if (inp->pfp_dfl_plm & (PFM_PLM1|PFM_PLM2)) {
		DPRINT("invalid plm=%x\n", inp->pfp_dfl_plm);
		return PFMLIB_ERR_INVAL;
	}
	ret = pfm_nhm_dispatch_counters(inp, mod_in, outp);
	if (ret != PFMLIB_SUCCESS)
		return ret;

	if (mod_in && mod_in->pfp_nhm_lbr.lbr_used)
		ret = pfm_nhm_dispatch_lbr(inp, mod_in, outp);

	return ret;
}

static int
pfm_nhm_get_event_code(unsigned int i, unsigned int cnt, int *code)
{
	pfmlib_regmask_t cnts;

	pfm_get_impl_counters(&cnts);

	if (cnt != PFMLIB_CNT_FIRST
	    && (cnt > MAX_COUNTERS ||
		!pfm_regmask_isset(&cnts, cnt)))
		return PFMLIB_ERR_INVAL;

	*code = get_nhm_entry(i)->pme_code;

	return PFMLIB_SUCCESS;
}

static void
pfm_nhm_get_event_counters(unsigned int j, pfmlib_regmask_t *counters)
{
	pme_nhm_entry_t *ne;
	unsigned int i;

	memset(counters, 0, sizeof(*counters));

	ne = get_nhm_entry(j);

	if (ne->pme_flags & PFMLIB_NHM_UNC_FIXED) {
		pfm_regmask_set(counters, 20);
		return;
	}

	if (ne->pme_flags & PFMLIB_NHM_UNC) {
		pfm_regmask_set(counters, 20);
		pfm_regmask_set(counters, 21);
		pfm_regmask_set(counters, 22);
		pfm_regmask_set(counters, 23);
		pfm_regmask_set(counters, 24);
		pfm_regmask_set(counters, 25);
		pfm_regmask_set(counters, 26);
		pfm_regmask_set(counters, 27);
		return;
	}
	/*
	 * fixed counter events have no unit mask
	 */
	if (ne->pme_flags & PFMLIB_NHM_FIXED0)
		pfm_regmask_set(counters, 16);

	if (ne->pme_flags & PFMLIB_NHM_FIXED1)
		pfm_regmask_set(counters, 17);

	if (ne->pme_flags & PFMLIB_NHM_FIXED2_ONLY)
		pfm_regmask_set(counters, 18);

	/*
	 * extract from unit mask level
	 */
	for (i=0; i < ne->pme_numasks; i++) {
		if (ne->pme_umasks[i].pme_uflags & PFMLIB_NHM_FIXED0)
			pfm_regmask_set(counters, 16);
		if (ne->pme_umasks[i].pme_uflags & PFMLIB_NHM_FIXED1)
			pfm_regmask_set(counters, 17);
		if (ne->pme_umasks[i].pme_uflags & PFMLIB_NHM_FIXED2_ONLY)
			pfm_regmask_set(counters, 18);
	}

	/*
	 * FIXED_CTR2 is exclusively for CPU_CLK_UNHALTED:REF.
	 * PMC0-only and PMC0/PMC1-only constraints apply at the event level.
	 */
	if (!pfm_regmask_isset(counters, 18)) {
		pfm_regmask_set(counters, 0);
		if (!(ne->pme_flags & PFMLIB_NHM_PMC0))
			pfm_regmask_set(counters, 1);
		if (!(ne->pme_flags & (PFMLIB_NHM_PMC01|PFMLIB_NHM_PMC0))) {
			pfm_regmask_set(counters, 2);
			pfm_regmask_set(counters, 3);
		}
	}
}

static void
pfm_nhm_get_impl_pmcs(pfmlib_regmask_t *impl_pmcs)
{
	*impl_pmcs = nhm_impl_pmcs;
}

static void
pfm_nhm_get_impl_pmds(pfmlib_regmask_t *impl_pmds)
{
	*impl_pmds = nhm_impl_pmds;
}

static void
pfm_nhm_get_impl_counters(pfmlib_regmask_t *impl_counters)
{
	/* core generic */
	pfm_regmask_set(impl_counters, 0);
	pfm_regmask_set(impl_counters, 1);
	pfm_regmask_set(impl_counters, 2);
	pfm_regmask_set(impl_counters, 3);
	/* core fixed */
	pfm_regmask_set(impl_counters, 16);
	pfm_regmask_set(impl_counters, 17);
	pfm_regmask_set(impl_counters, 18);

	/* all uncore PMD registers are counters */
	pfm_regmask_or(impl_counters, impl_counters, &nhm_impl_unc_pmds);
}

/*
 * Even though CPUID leaf 0xa returns the actual counter width in EAX,
 * the architecture specifies that writes are limited to the lower
 * 32 bits. As such, only the lower 32 bits have a full degree of
 * freedom. That is the "usable" counter width.
 */
#define PMU_NHM_COUNTER_WIDTH       32

static void
pfm_nhm_get_hw_counter_width(unsigned int *width)
{
	/* see the comment above PMU_NHM_COUNTER_WIDTH */
	*width = PMU_NHM_COUNTER_WIDTH;
}

static char *
pfm_nhm_get_event_name(unsigned int i)
{
	return get_nhm_entry(i)->pme_name;
}

static int
pfm_nhm_get_event_description(unsigned int ev, char **str)
{
	char *s;
	s = get_nhm_entry(ev)->pme_desc;
	if (s) {
		*str = strdup(s);
	} else {
		*str = NULL;
	}
	return PFMLIB_SUCCESS;
}
static char *
pfm_nhm_get_event_mask_name(unsigned int ev, unsigned int midx)
{
	midx = pfm_nhm_midx2uidx(ev, midx);
	return get_nhm_entry(ev)->pme_umasks[midx].pme_uname;
}

static int
pfm_nhm_get_event_mask_desc(unsigned int ev, unsigned int midx, char **str)
{
	char *s;

	midx = pfm_nhm_midx2uidx(ev, midx);
	s = get_nhm_entry(ev)->pme_umasks[midx].pme_udesc;
	if (s) {
		*str = strdup(s);
	} else {
		*str = NULL;
	}
	return PFMLIB_SUCCESS;
}

static unsigned int
pfm_nhm_get_num_event_masks(unsigned int ev)
{
	int i, num = 0;
	pme_nhm_entry_t *ne;
	int model;

	ne = get_nhm_entry(ev);

	for (i=0; i < ne->pme_numasks; i++) {
		model = ne->pme_umasks[i].pme_umodel;
		if (!model || model == cpu_model)
			num++;
	}
	DPRINT("event %s numasks=%d\n", ne->pme_name, num);
	return num;
}

static int
pfm_nhm_get_event_mask_code(unsigned int ev, unsigned int midx, unsigned int *code)
{
	midx = pfm_nhm_midx2uidx(ev, midx);
	*code = get_nhm_entry(ev)->pme_umasks[midx].pme_ucode;
	return PFMLIB_SUCCESS;
}

static int
pfm_nhm_get_cycle_event(pfmlib_event_t *e)
{
	e->event = pme_cycles;
	return PFMLIB_SUCCESS;
}

static int
pfm_nhm_get_inst_retired(pfmlib_event_t *e)
{
	e->event = pme_instr;
	return PFMLIB_SUCCESS;
}

/*
 * the following functions implement the model-specific API
 * directly available to the user
 */
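/*
 * Illustrative usage sketch only (not part of this file's logic),
 * assuming the generic libpfm entry point pfm_dispatch_events() and an
 * already-populated pfmlib_input_param_t inp with one event:
 *
 *	pfmlib_nhm_input_param_t mod_in;
 *	pfmlib_output_param_t outp;
 *
 *	memset(&mod_in, 0, sizeof(mod_in));
 *	memset(&outp, 0, sizeof(outp));
 *	if (pfm_nhm_is_pebs(&inp.pfp_events[0]) == 1)
 *		mod_in.pfp_nhm_pebs.pebs_used = 1;
 *	ret = pfm_dispatch_events(&inp, &mod_in, &outp, NULL);
 */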

/*
 *  Check if event and all provided unit masks support PEBS
 *
 *  return:
 *  	PFMLIB_ERR_INVAL: invalid event e
 *  	1 event supports PEBS
 *  	0 event does not support PEBS
 *
 */
int
pfm_nhm_is_pebs(pfmlib_event_t *e)
{
	pme_nhm_entry_t *ne;
	unsigned int i, n=0;

	if (e == NULL || e->event >= intel_nhm_support.pme_count)
		return PFMLIB_ERR_INVAL;

	ne = get_nhm_entry(e->event);
	if (ne->pme_flags & PFMLIB_NHM_PEBS)
		return 1;

	/*
	 * ALL unit masks must support PEBS for this test to return true
	 */
	for(i=0; i < e->num_masks; i++) {
		int midx;
		/* check for valid unit mask */
		if (e->unit_masks[i] >= ne->pme_numasks)
			return PFMLIB_ERR_INVAL;
 		midx = pfm_nhm_midx2uidx(e->event, e->unit_masks[i]);
		if (ne->pme_umasks[midx].pme_uflags & PFMLIB_NHM_PEBS)
			n++;
	}
	return n > 0 && n == e->num_masks;
}

/*
 * Check if event is uncore
 * return:
 *  	PFMLIB_ERR_INVAL: invalid event e
 *  	1 event is uncore
 *  	0 event is not uncore
 */
int
pfm_nhm_is_uncore(pfmlib_event_t *e)
{
	if (PFMLIB_INITIALIZED() == 0)
		return 0;

	if (e == NULL || e->event >= num_pe)
		return PFMLIB_ERR_INVAL;

	return !!(get_nhm_entry(e->event)->pme_flags & (PFMLIB_NHM_UNC|PFMLIB_NHM_UNC_FIXED));
}

static const char *data_src_encodings[]={
/*  0 */	"unknown L3 cache miss",
/*  1 */	"minimal latency core cache hit. Request was satisfied by L1 data cache",
/*  2 */	"pending core cache HIT. Outstanding core cache miss to same cacheline address already underway",
/*  3 */	"data request satisfied by the L2",
/*  4 */	"L3 HIT. Local or remote home request that hit L3 in the uncore with no coherency actions required (snooping)",
/*  5 */	"L3 HIT. Local or remote home request that hit L3 and was serviced by another core with a cross core snoop where no modified copy was found (clean)",
/*  6 */	"L3 HIT. Local or remote home request that hit L3 and was serviced by another core with a cross core snoop where modified copies were found (HITM)",
/*  7 */	"reserved",
/*  8 */	"L3 MISS. Local homed request that missed L3 and was serviced by forwarded data following a cross package snoop where no modified copy was found (remote home requests are not counted)",
/*  9 */	"reserved",
/* 10 */	"L3 MISS. Local homed request that missed L3 and was serviced by local DRAM (go to shared state)",
/* 11 */	"L3 MISS. Remote homed request that missed L3 and was serviced by remote DRAM (go to shared state)",
/* 12 */	"L3 MISS. Local homed request that missed L3 and was serviced by local DRAM (go to exclusive state)",
/* 13 */	"L3 MISS. Remote homed request that missed L3 and was serviced by remote DRAM (go to exclusive state)",
/* 14 */	"reserved",
/* 15 */	"request to uncacheable memory"
};

/*
 * return data source encoding based on index in val
 * To be used with PEBS load latency filtering to decode
 * source of the load miss
 */
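/*
 * illustrative example, assuming dse holds the 4-bit data-source field
 * from a PEBS load latency record:
 *
 *	char *desc;
 *	if (pfm_nhm_data_src_desc(dse & 0xf, &desc) == PFMLIB_SUCCESS) {
 *		puts(desc);
 *		free(desc);	// caller owns the strdup'd string
 *	}
 */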
int pfm_nhm_data_src_desc(unsigned int val, char **desc)
{
	if (val > 15 || !desc)
		return PFMLIB_ERR_INVAL;

	*desc = strdup(data_src_encodings[val]);
	if (!*desc)
		return PFMLIB_ERR_NOMEM;

	return PFMLIB_SUCCESS;	
}

pfm_pmu_support_t intel_nhm_support={
	.pmu_name		= "Intel Nehalem",
	.pmu_type		= PFMLIB_INTEL_NHM_PMU,
	.pme_count		= 0,/* patched at runtime */
	.pmc_count		= 0,/* patched at runtime */
	.pmd_count		= 0,/* patched at runtime */
	.num_cnt		= 0,/* patched at runtime */
	.get_event_code		= pfm_nhm_get_event_code,
	.get_event_name		= pfm_nhm_get_event_name,
	.get_event_counters	= pfm_nhm_get_event_counters,
	.dispatch_events	= pfm_nhm_dispatch_events,
	.pmu_detect		= pfm_nhm_detect,
	.pmu_init		= pfm_nhm_init,
	.get_impl_pmcs		= pfm_nhm_get_impl_pmcs,
	.get_impl_pmds		= pfm_nhm_get_impl_pmds,
	.get_impl_counters	= pfm_nhm_get_impl_counters,
	.get_hw_counter_width	= pfm_nhm_get_hw_counter_width,
	.get_event_desc         = pfm_nhm_get_event_description,
	.get_num_event_masks	= pfm_nhm_get_num_event_masks,
	.get_event_mask_name	= pfm_nhm_get_event_mask_name,
	.get_event_mask_code	= pfm_nhm_get_event_mask_code,
	.get_event_mask_desc	= pfm_nhm_get_event_mask_desc,
	.get_cycle_event	= pfm_nhm_get_cycle_event,
	.get_inst_retired_event = pfm_nhm_get_inst_retired
};

pfm_pmu_support_t intel_wsm_support={
	.pmu_name		= "Intel Westmere",
	.pmu_type		= PFMLIB_INTEL_WSM_PMU,
	.pme_count		= 0,/* patched at runtime */
	.pmc_count		= 0,/* patched at runtime */
	.pmd_count		= 0,/* patched at runtime */
	.num_cnt		= 0,/* patched at runtime */
	.get_event_code		= pfm_nhm_get_event_code,
	.get_event_name		= pfm_nhm_get_event_name,
	.get_event_counters	= pfm_nhm_get_event_counters,
	.dispatch_events	= pfm_nhm_dispatch_events,
	.pmu_detect		= pfm_wsm_detect,
	.pmu_init		= pfm_nhm_init,
	.get_impl_pmcs		= pfm_nhm_get_impl_pmcs,
	.get_impl_pmds		= pfm_nhm_get_impl_pmds,
	.get_impl_counters	= pfm_nhm_get_impl_counters,
	.get_hw_counter_width	= pfm_nhm_get_hw_counter_width,
	.get_event_desc         = pfm_nhm_get_event_description,
	.get_num_event_masks	= pfm_nhm_get_num_event_masks,
	.get_event_mask_name	= pfm_nhm_get_event_mask_name,
	.get_event_mask_code	= pfm_nhm_get_event_mask_code,
	.get_event_mask_desc	= pfm_nhm_get_event_mask_desc,
	.get_cycle_event	= pfm_nhm_get_cycle_event,
	.get_inst_retired_event = pfm_nhm_get_inst_retired
};