/*
* File:    perf_event.c
*
* Author:  Corey Ashford
*          cjashfor@us.ibm.com
*          - based upon perfmon.c written by -
*          Philip Mucci
*          mucci@cs.utk.edu
* Mods:    Gary Mohr
*          gary.mohr@bull.com
* Mods:    Vince Weaver
*          vweaver1@eecs.utk.edu
* Mods:	   Philip Mucci
*	   mucci@eecs.utk.edu
* Mods:    Gary Mohr
*          gary.mohr@bull.com
 *          Modified the perf_event component to use PFM_OS_PERF_EVENT_EXT mode in libpfm4.
 *          This adds several new event masks, including cpu=, u=, and k=, which give the user
 *          the ability to set the cpu number to use or to control the domain (user, kernel, or both)
 *          in which the counter should be incremented.  These are event masks, so it is now
 *          possible to have multiple events in the same event set that count activity from
 *          different cpus or count activity in different domains.
*/
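/* Illustrative example (hedged; the exact modifier syntax is defined by      */
/* libpfm4's PFM_OS_PERF_EVENT_EXT parser, not by this file): an event name   */
/* such as "PERF_COUNT_HW_INSTRUCTIONS:u=1:k=0:cpu=2" would count user-mode   */
/* instructions on cpu 2, independent of the event-set-level domain set via   */
/* PAPI_set_domain().                                                         */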


#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <syscall.h>
#include <sys/utsname.h>
#include <sys/mman.h>
#include <sys/ioctl.h>

/* PAPI-specific includes */
#include "papi.h"
#include "papi_memory.h"
#include "papi_internal.h"
#include "papi_vector.h"
#include "extras.h"

/* libpfm4 includes */
#include "papi_libpfm4_events.h"
#include "pe_libpfm4_events.h"
#include "perfmon/pfmlib.h"
#include PEINCLUDE

/* Linux-specific includes */
#include "mb.h"
#include "linux-memory.h"
#include "linux-timer.h"
#include "linux-common.h"
#include "linux-context.h"

#include "perf_event_lib.h"
#include "perf_helpers.h"

/* Set to enable pre-Linux 2.6.34 perf_event workarounds.  */
/* If disabling them gets no complaints then we can remove */
/* these in a future version of PAPI.                      */
#define OBSOLETE_WORKAROUNDS 0

/* Defines for ctx->state */
#define PERF_EVENTS_OPENED  0x01
#define PERF_EVENTS_RUNNING 0x02

/* Forward declaration */
papi_vector_t _perf_event_vector;

/* Globals */
struct native_event_table_t perf_native_event_table;
static int our_cidx;
static int exclude_guest_unsupported;

/* The kernel developers say to never use a refresh value of 0        */
/* See https://lkml.org/lkml/2011/5/24/172                            */
/* However, on some platforms (like Power) a value of 1 does not work */
/* We're still tracking down why this happens.                        */

#if defined(__powerpc__)
#define PAPI_REFRESH_VALUE 0
#else
#define PAPI_REFRESH_VALUE 1
#endif

static int _pe_set_domain( hwd_control_state_t *ctl, int domain);

#if (OBSOLETE_WORKAROUNDS==1)

/* Check for processor support */
/* Can be used for generic checking, though in general we only     */
/* check for pentium4 here because support was broken for multiple */
/* kernel releases and the usual standard detections did not       */
/* handle this.  So we check for pentium 4 explicitly.             */
static int
processor_supported(int vendor, int family) {

   /* Error out if kernel too early to support p4 */
   if (( vendor == PAPI_VENDOR_INTEL ) && (family == 15)) {
      if (_papi_os_info.os_version < LINUX_VERSION(2,6,35)) {
         PAPIERROR("Pentium 4 not supported on kernels before 2.6.35");
         return PAPI_ENOSUPP;
      }
   }
   return PAPI_OK;
}

#endif

/* Fix up the config based on what CPU/Vendor we are running on */
static int
pe_vendor_fixups(papi_vector_t *vector)
{
     /* powerpc */
     /* On IBM machines the kernel and supervisor domains are available; */
     /* on POWER6 the default domain should also include them.           */
  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_IBM ) {
     vector->cmp_info.available_domains |=
                  PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
     if (strcmp(_papi_hwi_system_info.hw_info.model_string, "POWER6" ) == 0 ) {
        vector->cmp_info.default_domain =
                  PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
     }
  }

  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_MIPS ) {
     vector->cmp_info.available_domains |= PAPI_DOM_KERNEL;
  }

  if ((_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_INTEL) ||
      (_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_AMD)) {
     vector->cmp_info.fast_real_timer = 1;
  }

	/* ARM */
	if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_ARM) {

		/* Some ARMv7 and earlier could not measure	*/
		/* KERNEL and USER separately.			*/

		/* Whitelist CortexA7 and CortexA15		*/
		/* There might be more				*/

		if ((_papi_hwi_system_info.hw_info.cpuid_family < 8) &&
			(_papi_hwi_system_info.hw_info.cpuid_model!=0xc07) &&
			(_papi_hwi_system_info.hw_info.cpuid_model!=0xc0f)) {

			vector->cmp_info.available_domains |=
				PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
			vector->cmp_info.default_domain =
				PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
		}
	}

	/* CRAY */
	if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_CRAY ) {
		vector->cmp_info.available_domains |= PAPI_DOM_OTHER;
	}

	return PAPI_OK;
}



/******************************************************************/
/******** Kernel Version Dependent Routines  **********************/
/******************************************************************/


/* PERF_FORMAT_GROUP allows reading an entire group's counts at once   */
/* before 2.6.34 PERF_FORMAT_GROUP did not work when reading results   */
/*  from attached processes.  We are lazy and disable it for all cases */
/*  commit was:  050735b08ca8a016bbace4445fa025b88fee770b              */

static int
bug_format_group(void) {


#if (OBSOLETE_WORKAROUNDS==1)
	if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) return 1;
#endif

	/* MIPS, as of version 3.1, does not support this properly */
	/* FIXME: is this still true? */

#if defined(__mips__)
  return 1;
#endif

  return 0;

}

#if (OBSOLETE_WORKAROUNDS==1)


/* There's a bug prior to Linux 2.6.33 where if you are using */
/* PERF_FORMAT_GROUP, the TOTAL_TIME_ENABLED and              */
/* TOTAL_TIME_RUNNING fields will be zero unless you disable  */
/* the counters first                                         */
static int
bug_sync_read(void) {

  if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;

  return 0;

}

#endif

/* Set the F_SETOWN_EX flag on the fd.                          */
/* This affects which thread an overflow signal gets sent to    */
/* Handled in a subroutine to handle the fact that the behavior */
/* is dependent on kernel version.                              */
static int
fcntl_setown_fd(int fd) {

	int ret;
	struct f_owner_ex fown_ex;

	/* F_SETOWN_EX is not available until 2.6.32 */
	/* but PAPI perf_event support didn't work on 2.6.31 anyway */

	/* set ownership of the descriptor */
	fown_ex.type = F_OWNER_TID;
	fown_ex.pid  = mygettid();
	ret = fcntl(fd, F_SETOWN_EX, (unsigned long)&fown_ex );

	if ( ret == -1 ) {
		PAPIERROR( "cannot fcntl(F_SETOWN_EX) on %d: %s",
			fd, strerror( errno ) );
		return PAPI_ESYS;
	}
	return PAPI_OK;
}

/* The read format on perf_event varies based on various flags that */
/* are passed into it.  This helper avoids copying this logic       */
/* multiple places.                                                 */
static unsigned int
get_read_format( unsigned int multiplex,
		 unsigned int inherit,
		 int format_group )
{
   unsigned int format = 0;

   /* if we need read format options for multiplexing, add them now */
   if (multiplex) {
      format |= PERF_FORMAT_TOTAL_TIME_ENABLED;
      format |= PERF_FORMAT_TOTAL_TIME_RUNNING;
   }

   /* if our kernel supports it and we are not using inherit, */
   /* add the group read options                              */
   if ( (!bug_format_group()) && !inherit) {
      if (format_group) {
	 format |= PERF_FORMAT_GROUP;
      }
   }

   SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: %#x\n",
	  multiplex, inherit, format_group, format);

   return format;
}
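
/* For reference, some read_format values produced by the logic above:      */
/*   multiplex=1, inherit=0, format_group=0 ->                               */
/*     PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING       */
/*   multiplex=0, inherit=0, format_group=1 ->                               */
/*     PERF_FORMAT_GROUP (unless bug_format_group() forces it off)           */
/*   multiplex=0, inherit=1, format_group=1 -> 0 (group reads not possible)  */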


/* attr.exclude_guest is enabled by default in recent libpfm4 */
/* however older kernels will reject events with it set */
/* because the reserved field is not all zeros */
static int
check_exclude_guest( void )
{
	int ev_fd;
	struct perf_event_attr attr;

	exclude_guest_unsupported=0;

	/* First check that we can open a plain instructions event */
	memset(&attr, 0 , sizeof(attr));
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;

	ev_fd = sys_perf_event_open( &attr, 0, -1, -1, 0 );
	if ( ev_fd == -1 ) {
		PAPIERROR("Couldn't open hw_instructions in exclude_guest=0 test");
		return -1;
	}
	close(ev_fd);

	/* Now try again with exclude_guest set */
	memset(&attr, 0 , sizeof(attr));
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.exclude_guest=1;

	ev_fd = sys_perf_event_open( &attr, 0, -1, -1, 0 );
	if ( ev_fd == -1 ) {
		if (errno==EINVAL) {
			exclude_guest_unsupported=1;
		}
		else {
		  PAPIERROR("Couldn't open hw_instructions in exclude_guest=1 test");
		}
	} else {
		exclude_guest_unsupported=0;
		close(ev_fd);
	}

	return PAPI_OK;
}

/*****************************************************************/
/********* End Kernel-version Dependent Routines  ****************/
/*****************************************************************/

/*****************************************************************/
/********* Begin perf_event low-level code ***********************/
/*****************************************************************/

static void perf_event_dump_attr( struct perf_event_attr *hw_event,
	pid_t pid, int cpu, int group_fd, unsigned long int flags) {

	/* Mark parameters as not used.                  */
	/* In the common case (no SUBDBG) this function  */
	/* compiles to an empty body, and the compiler   */
	/* would otherwise warn about unused parameters. */
	(void)hw_event;
	(void)pid;
	(void)cpu;
	(void)group_fd;
	(void)flags;

	SUBDBG("sys_perf_event_open(hw_event: %p, pid: %d, cpu: %d, "
		"group_fd: %d, flags: %lx\n",
		hw_event, pid, cpu, group_fd, flags);
	SUBDBG("   type: %d\n",hw_event->type);
	SUBDBG("   size: %d\n",hw_event->size);
	SUBDBG("   config: %"PRIx64" (%"PRIu64")\n",
		hw_event->config, hw_event->config);
	SUBDBG("   sample_period: %"PRIu64"\n",hw_event->sample_period);
	SUBDBG("   sample_type: %"PRIu64"\n",hw_event->sample_type);
	SUBDBG("   read_format: %"PRIu64"\n",hw_event->read_format);
	SUBDBG("   disabled: %d\n",hw_event->disabled);
	SUBDBG("   inherit: %d\n",hw_event->inherit);
	SUBDBG("   pinned: %d\n",hw_event->pinned);
	SUBDBG("   exclusive: %d\n",hw_event->exclusive);
	SUBDBG("   exclude_user: %d\n",hw_event->exclude_user);
	SUBDBG("   exclude_kernel: %d\n",hw_event->exclude_kernel);
	SUBDBG("   exclude_hv: %d\n",hw_event->exclude_hv);
	SUBDBG("   exclude_idle: %d\n",hw_event->exclude_idle);
	SUBDBG("   mmap: %d\n",hw_event->mmap);
	SUBDBG("   comm: %d\n",hw_event->comm);
	SUBDBG("   freq: %d\n",hw_event->freq);
	SUBDBG("   inherit_stat: %d\n",hw_event->inherit_stat);
	SUBDBG("   enable_on_exec: %d\n",hw_event->enable_on_exec);
	SUBDBG("   task: %d\n",hw_event->task);
	SUBDBG("   watermark: %d\n",hw_event->watermark);
	SUBDBG("   precise_ip: %d\n",hw_event->precise_ip);
	SUBDBG("   mmap_data: %d\n",hw_event->mmap_data);
	SUBDBG("   sample_id_all: %d\n",hw_event->sample_id_all);
	SUBDBG("   exclude_host: %d\n",hw_event->exclude_host);
	SUBDBG("   exclude_guest: %d\n",hw_event->exclude_guest);
	SUBDBG("   exclude_callchain_kernel: %d\n",
		hw_event->exclude_callchain_kernel);
	SUBDBG("   exclude_callchain_user: %d\n",
		hw_event->exclude_callchain_user);
	SUBDBG("   wakeup_events: %"PRIx32" (%"PRIu32")\n",
		hw_event->wakeup_events, hw_event->wakeup_events);
	SUBDBG("   bp_type: %"PRIx32" (%"PRIu32")\n",
		hw_event->bp_type, hw_event->bp_type);
	SUBDBG("   config1: %"PRIx64" (%"PRIu64")\n",
		hw_event->config1, hw_event->config1);
	SUBDBG("   config2: %"PRIx64" (%"PRIu64")\n",
		hw_event->config2, hw_event->config2);
	SUBDBG("   branch_sample_type: %"PRIx64" (%"PRIu64")\n",
		hw_event->branch_sample_type, hw_event->branch_sample_type);
	SUBDBG("   sample_regs_user: %"PRIx64" (%"PRIu64")\n",
		hw_event->sample_regs_user, hw_event->sample_regs_user);
	SUBDBG("   sample_stack_user: %"PRIx32" (%"PRIu32")\n",
		hw_event->sample_stack_user, hw_event->sample_stack_user);
}


static int map_perf_event_errors_to_papi(int perf_event_error) {

   int ret;

   /* These mappings are approximate.
      EINVAL in particular can mean lots of different things */
   switch(perf_event_error) {
      case EPERM:
      case EACCES:
           ret = PAPI_EPERM;
	   break;
      case ENODEV:
      case EOPNOTSUPP:
	   ret = PAPI_ENOSUPP;
           break;
      case ENOENT:
	   ret = PAPI_ENOEVNT;
           break;
      case ENOSYS:
      case EAGAIN:
      case EBUSY:
      case E2BIG:	/* Only happens if attr is the wrong size somehow */
      case EBADF:	/* We are attempting to group with an invalid file descriptor */
	   ret = PAPI_ESYS;
	   break;
      case ENOMEM:
	   ret = PAPI_ENOMEM;
	   break;
      case EMFILE:	/* Out of file descriptors.  Typically max out at 1024 */
           ret = PAPI_ECOUNT;
           break;
      case EINVAL:
      default:
	   ret = PAPI_EINVAL;
           break;
   }
   return ret;
}


/** Check if the current set of options is supported by  */
/*  perf_events.                                         */
/*  We do this by temporarily opening an event with the  */
/*  desired options then closing it again.  We use the   */
/*  PERF_COUNT_HW_INSTRUCTIONS event as a dummy event    */
/*  on the assumption it is available on all             */
/*  platforms.                                           */

static int
check_permissions( unsigned long tid,
		   unsigned int cpu_num,
		   unsigned int domain,
		   unsigned int granularity,
		   unsigned int multiplex,
		   unsigned int inherit )
{
   int ev_fd;
   struct perf_event_attr attr;

   long pid;

   /* clearing this sets the type to hardware and counts all domains */
   memset(&attr, '\0', sizeof(attr));
   attr.read_format = get_read_format(multiplex, inherit, 1);

   /* set the event id (config field) to instructions */
   /* (an event that should always exist)            */
   /* This was cycles but that is missing on Niagara */
   attr.config = PERF_COUNT_HW_INSTRUCTIONS;

   /* now set up domains this event set will be counting */
   if (!(domain & PAPI_DOM_SUPERVISOR)) {
      attr.exclude_hv = 1;
   }
   if (!(domain & PAPI_DOM_USER)) {
      attr.exclude_user = 1;
   }
   if (!(domain & PAPI_DOM_KERNEL)) {
      attr.exclude_kernel = 1;
   }

   if (granularity==PAPI_GRN_SYS) {
      pid = -1;
   } else {
      pid = tid;
   }

   SUBDBG("Calling sys_perf_event_open() from check_permissions\n");

	perf_event_dump_attr( &attr, pid, cpu_num, -1, 0 );

   ev_fd = sys_perf_event_open( &attr, pid, cpu_num, -1, 0 );
   if ( ev_fd == -1 ) {
      SUBDBG("sys_perf_event_open returned error.  Linux says, %s", 
	     strerror( errno ) );
      return map_perf_event_errors_to_papi(errno);
   }

   /* now close it, this was just to make sure we have permissions */
   /* to set these options                                         */
   close(ev_fd);
   return PAPI_OK;
}

/* Maximum size we ever expect to read from a perf_event fd   */
/*  (this is the number of 64-bit values)                     */
/* We use this to size the read buffers                       */
/* The 3 covers the number of events, time_enabled, and       */
/*  time_running; the per-counter term covers the count value */
/*  and count id for each possible counter.                   */
#define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS))
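
/* i.e. sized for the largest read we might see: a PERF_FORMAT_GROUP read   */
/* that (conservatively) also carries a count id per value:                 */
/*   u64 nr;  u64 time_enabled;  u64 time_running;                          */
/*   struct { u64 value; u64 id; } cnt[PERF_EVENT_MAX_MPX_COUNTERS];        */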



/* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */
/* implementations (e.g. x86 before 2.6.33) which don't do a static event */
/* scheduability check in sys_perf_event_open.  It is also needed if the  */
/* kernel is stealing an event, such as when NMI watchdog is enabled.     */

static int
check_scheduability( pe_context_t *ctx, pe_control_t *ctl, int idx )
{
   int retval = 0, cnt = -1;
   ( void ) ctx;			 /*unused */
   long long papi_pe_buffer[READ_BUFFER_SIZE];
   int i,group_leader_fd;

   /* If the kernel isn't tracking scheduability right,      */
   /* then we need to start/stop/read to force the event     */
   /* to be scheduled and see if an error condition happens. */

   /* get the proper fd to start */
   group_leader_fd=ctl->events[idx].group_leader_fd;
   if (group_leader_fd==-1) group_leader_fd=ctl->events[idx].event_fd;

   /* start the event */
   retval = ioctl( group_leader_fd, PERF_EVENT_IOC_ENABLE, NULL );
   if (retval == -1) {
      PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed");
      return PAPI_ESYS;
   }

   /* stop the event */
   retval = ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, NULL );
   if (retval == -1) {
      PAPIERROR( "ioctl(PERF_EVENT_IOC_DISABLE) failed" );
      return PAPI_ESYS;
   }

   /* See if a read returns any results */
   cnt = read( group_leader_fd, papi_pe_buffer, sizeof(papi_pe_buffer));
   if ( cnt == -1 ) {
      SUBDBG( "read returned an error!  Should never happen.\n" );
      return PAPI_ESYS;
   }

   if ( cnt == 0 ) {
      /* We read 0 bytes if we could not schedule the event */
      /* The kernel should have detected this at open       */
      /* but various bugs (including NMI watchdog)          */
      /* result in this behavior                            */

      return PAPI_ECNFLCT;

   } else {

      /* Reset all of the counters (opened so far) back to zero      */
      /* from the above brief enable/disable call pair.              */

      /* We have to reset all events because reset of group leader      */
      /* does not reset all.                                            */
      /* we assume that the events are being added one by one and that  */
      /* we do not need to reset higher events (doing so may reset ones */
      /* that have not been initialized yet).                           */

      /* Note... PERF_EVENT_IOC_RESET does not reset time running       */
      /* info if multiplexing, so we should avoid coming here if        */
      /* we are multiplexing the event.                                 */
      for( i = 0; i < idx; i++) {
	 retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
	 if (retval == -1) {
	    PAPIERROR( "ioctl(PERF_EVENT_IOC_RESET) #%d/%d %d "
		       "(fd %d)failed",
		       i,ctl->num_events,idx,ctl->events[i].event_fd);
	    return PAPI_ESYS;
	 }
      }
   }
   return PAPI_OK;
}


/* Do some extra work on a perf_event fd if we're doing sampling  */
/* This mostly means setting up the mmap buffer.                  */
static int
configure_fd_for_sampling( pe_control_t *ctl, int evt_idx )
{
   int ret;
   int fd = ctl->events[evt_idx].event_fd;

   /* Register that we would like a SIGIO notification when a mmap'd page */
   /* becomes full.                                                       */
   ret = fcntl( fd, F_SETFL, O_ASYNC | O_NONBLOCK );
   if ( ret ) {
      PAPIERROR ( "fcntl(%d, F_SETFL, O_ASYNC | O_NONBLOCK) "
		  "returned error: %s", fd, strerror( errno ) );
      return PAPI_ESYS;
   }

   /* Set the F_SETOWN_EX flag on the fd.                          */
   /* This affects which thread an overflow signal gets sent to.   */
   ret=fcntl_setown_fd(fd);
   if (ret!=PAPI_OK) return ret;

   /* Set FD_CLOEXEC.  Otherwise if we do an exec with an overflow */
   /* running, the overflow handler will continue into the exec()'d*/
   /* process and kill it because no signal handler is set up.     */
   ret=fcntl(fd, F_SETFD, FD_CLOEXEC);
   if (ret) {
      return PAPI_ESYS;
   }

   /* when you explicitly declare that you want a particular signal,   */
   /* even when you use the default signal, the kernel will send more  */
   /* information concerning the event to the signal handler.          */
   /*                                                                  */
   /* In particular, it will send the file descriptor from which the   */
   /* event is originating which can be quite useful when monitoring   */
   /* multiple tasks from a single thread.                             */
   ret = fcntl( fd, F_SETSIG, ctl->overflow_signal );
   if ( ret == -1 ) {
      PAPIERROR( "cannot fcntl(F_SETSIG,%d) on %d: %s",
		 ctl->overflow_signal, fd,
		 strerror( errno ) );
      return PAPI_ESYS;
   }

	return PAPI_OK;
}

static int
set_up_mmap( pe_control_t *ctl, int evt_idx)
{

	void *buf_addr;
	int fd = ctl->events[evt_idx].event_fd;

	/* mmap() the sample buffer */
	buf_addr = mmap( NULL,
			ctl->events[evt_idx].nr_mmap_pages * getpagesize(),
			PROT_READ | PROT_WRITE,
			MAP_SHARED,
			fd, 0 );

	/* This may happen if we go over the limit in	*/
	/* /proc/sys/kernel/perf_event_mlock_kb		*/
	/* which defaults to 516k			*/
	/* with regular rdpmc events on 4k page archs	*/
	/* this is roughly 128 events			*/

	/* We shouldn't fail, just fall back to non-rdpmc	*/
	/* Although not sure what happens if it's a sample	*/
	/* event that fails to mmap.				*/
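
	/* A quick way to check the current limit (debugging sketch only):	*/
	/*   FILE *f = fopen("/proc/sys/kernel/perf_event_mlock_kb", "r");	*/
	/*   if (f) { int kb=0; fscanf(f, "%d", &kb); fclose(f);		*/
	/*            SUBDBG("mlock limit: %d kb\n", kb); }			*/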

	if ( buf_addr == MAP_FAILED ) {
		SUBDBG( "mmap(NULL,%d,%d,%d,%d,0): %s",
			ctl->events[evt_idx].nr_mmap_pages * getpagesize(),
			PROT_READ | PROT_WRITE,
			MAP_SHARED,
			fd, strerror( errno ) );

		ctl->events[evt_idx].mmap_buf = NULL;

		/* Easier to just globally disable this, as it should	*/
		/* be a fairly uncommon case hopefully.			*/
		if (_perf_event_vector.cmp_info.fast_counter_read) {
			PAPIERROR("Can't mmap, disabling fast_counter_read\n");
			_perf_event_vector.cmp_info.fast_counter_read=0;
		}
		return PAPI_ESYS;
	}

	SUBDBG( "Sample buffer for fd %d is located at %p\n", fd, buf_addr );

	/* Set up the mmap buffer and its associated helpers */
	ctl->events[evt_idx].mmap_buf = (struct perf_counter_mmap_page *) buf_addr;
	ctl->events[evt_idx].tail = 0;
	ctl->events[evt_idx].mask =
		( ctl->events[evt_idx].nr_mmap_pages - 1 ) * getpagesize() - 1;

	return PAPI_OK;
}



/* Open all events in the control state */
static int
open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
{

	int i, ret = PAPI_OK;
	long pid;

	if (ctl->granularity==PAPI_GRN_SYS) {
		pid = -1;
	}
	else {
		pid = ctl->tid;
	}

	for( i = 0; i < ctl->num_events; i++ ) {

		ctl->events[i].event_opened=0;

		/* set up the attr structure.			*/
		/* We don't set up all fields here		*/
		/* as some have already been set up previously.	*/

		/* Handle the broken exclude_guest problem */
		/* libpfm4 sets this by default (PEBS events depend on it) */
		/* but on older kernels that dont know about exclude_guest */
		/* perf_event_open() will error out as a "reserved"        */
		/* unknown bit is set to 1.                                */
		/* Do we need to also watch for exclude_host, exclude_idle */
		/* exclude_callchain*?					   */
		if ((ctl->events[i].attr.exclude_guest) &&
			(exclude_guest_unsupported)) {
			SUBDBG("Disabling exclude_guest in event %d\n",i);
			ctl->events[i].attr.exclude_guest=0;
		}

		/* group leader (event 0) is special                */
		/* If we're multiplexed, everyone is a group leader */
		if (( i == 0 ) || (ctl->multiplexed)) {
			ctl->events[i].attr.pinned = !ctl->multiplexed;
			ctl->events[i].attr.disabled = 1;
			ctl->events[i].group_leader_fd=-1;
			ctl->events[i].attr.read_format = get_read_format(
							ctl->multiplexed,
							ctl->inherit,
							!ctl->multiplexed );
		} else {
			ctl->events[i].attr.pinned=0;
			ctl->events[i].attr.disabled = 0;
			ctl->events[i].group_leader_fd=ctl->events[0].event_fd;
			ctl->events[i].attr.read_format = get_read_format(
							ctl->multiplexed,
							ctl->inherit,
							0 );
		}

		/* try to open */
		perf_event_dump_attr(
				&ctl->events[i].attr,
				pid,
				ctl->events[i].cpu,
				ctl->events[i].group_leader_fd,
				0 /* flags */ );

		ctl->events[i].event_fd = sys_perf_event_open(
				&ctl->events[i].attr,
				pid,
				ctl->events[i].cpu,
				ctl->events[i].group_leader_fd,
				0 /* flags */ );

		/* Try to match Linux errors to PAPI errors */
		if ( ctl->events[i].event_fd == -1 ) {
			SUBDBG("sys_perf_event_open returned error "
				"on event #%d.  Error: %s\n",
				i, strerror( errno ) );
			ret=map_perf_event_errors_to_papi(errno);

			goto open_pe_cleanup;
		}

		SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d,"
			" group_leader/fd: %d, event_fd: %d,"
			" read_format: %"PRIu64"\n",
			pid, ctl->events[i].cpu,
			ctl->events[i].group_leader_fd,
			ctl->events[i].event_fd,
			ctl->events[i].attr.read_format);


		/* in many situations the kernel will indicate we opened fine */
		/* yet things will fail later.  So we need to double check    */
		/* we actually can use the events we've set up.               */

		/* This is not necessary if we are multiplexing, and in fact */
		/* we cannot do this properly if multiplexed because         */
		/* PERF_EVENT_IOC_RESET does not reset the time running info */
		if (!ctl->multiplexed) {
			ret = check_scheduability( ctx, ctl, i );

			if ( ret != PAPI_OK ) {
				/* the last event did open, so we need to    */
				/* bump the counter before doing the cleanup */
				i++;
				goto open_pe_cleanup;
			}
		}
		ctl->events[i].event_opened=1;
	}

	/* Now that we've successfully opened all of the events, do whatever  */
	/* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */
	/* and so on.                                                         */


	/* Make things easier and give each event a mmap() buffer. */
	/* Keeping separate tracking for rdpmc vs regular events   */
	/* would be a pain.  Also perf always gives every event a  */
	/* mmap buffer.						    */

	for ( i = 0; i < ctl->num_events; i++ ) {

		/* Can't mmap() inherited events :( */
		if (ctl->inherit) {
			ctl->events[i].nr_mmap_pages = 0;
			ctl->events[i].mmap_buf = NULL;
		}
		else {
			/* Just a guess at how many pages would make this   */
			/* relatively efficient.                            */
			/* Note that it's "1 +" because of the need for a   */
			/* control page, and the number following the "+"   */
			/* must be a power of 2 (1, 2, 4, 8, 16, etc.) or zero. */
			/* This is required to optimize dealing with        */
			/* circular buffer wrapping of the mapped pages.    */
			if (ctl->events[i].sampling) {
				ctl->events[i].nr_mmap_pages = 1 + 2;
			}
			else if (_perf_event_vector.cmp_info.fast_counter_read) {
				ctl->events[i].nr_mmap_pages = 1;
			}
			else {
				ctl->events[i].nr_mmap_pages = 0;
			}

			/* Set up the MMAP sample pages */
			if (ctl->events[i].nr_mmap_pages) {
				set_up_mmap(ctl,i);
			} else {
				ctl->events[i].mmap_buf = NULL;
			}
		}
	}

	for ( i = 0; i < ctl->num_events; i++ ) {

		/* If sampling is enabled, hook up signal handler */
		if (ctl->events[i].attr.sample_period) {

			ret = configure_fd_for_sampling( ctl, i );
			if ( ret != PAPI_OK ) {
				/* We failed, and all of the fds are open */
				/* so we need to clean up all of them */
				i = ctl->num_events;
				goto open_pe_cleanup;
			}
		}
	}

	/* Mark the context as opened only if completely successful */
	ctx->state |= PERF_EVENTS_OPENED;

	return PAPI_OK;

open_pe_cleanup:
	/* We encountered an error, close up the fds we successfully opened.  */
	/* We go backward in an attempt to close group leaders last, although */
	/* that's probably not strictly necessary.                            */
	while ( i > 0 ) {
		i--;
		if (ctl->events[i].event_fd>=0) {
			close( ctl->events[i].event_fd );
			ctl->events[i].event_opened=0;
		}
	}

	return ret;
}

/* TODO: make code clearer -- vmw */
static int
close_event( pe_event_info_t *event )
{
	int munmap_error=0,close_error=0;

	if ( event->mmap_buf ) {
		if (event->nr_mmap_pages==0) {
			PAPIERROR("munmap and num pages is zero");
		}
		if ( munmap ( event->mmap_buf,
				event->nr_mmap_pages * getpagesize() ) ) {
			PAPIERROR( "munmap of fd = %d returned error: %s",
							event->event_fd,
							strerror( errno ) );
			event->mmap_buf=NULL;
			munmap_error=1;
		}
	}
	if ( close( event->event_fd ) ) {
		PAPIERROR( "close of fd = %d returned error: %s",
			event->event_fd, strerror( errno ) );
		close_error=1;
	}

	event->event_opened=0;

	if ((close_error || munmap_error)) {
		return PAPI_ESYS;
	}

	return 0;
}

/* Close all of the opened events */
static int
close_pe_events( pe_context_t *ctx, pe_control_t *ctl )
{
	int i,result;
	int num_closed=0;
	int events_not_opened=0;

	/* should this be a more serious error? */
	if ( ctx->state & PERF_EVENTS_RUNNING ) {
		SUBDBG("Closing without stopping first\n");
	}

	/* Close child events first */
	/* Is that necessary? -- vmw */
	for( i=0; i<ctl->num_events; i++ ) {
		if (ctl->events[i].event_opened) {
			if (ctl->events[i].group_leader_fd!=-1) {
				result=close_event(&ctl->events[i]);
				if (result!=0) return result;
				else num_closed++;
			}
		}
		else {
			events_not_opened++;
		}
	}

	/* Close the group leaders last */
	for( i=0; i<ctl->num_events; i++ ) {
		if (ctl->events[i].event_opened) {
			if (ctl->events[i].group_leader_fd==-1) {
				result=close_event(&ctl->events[i]);
				if (result!=0) return result;
				else num_closed++;
			}
		}
	}

	if (ctl->num_events!=num_closed) {
		if (ctl->num_events!=(num_closed+events_not_opened)) {
			PAPIERROR("Didn't close all events: "
				"Closed %d Not Opened: %d Expected %d",
				num_closed,events_not_opened,ctl->num_events);
			return PAPI_EBUG;
		}
	}

	ctl->num_events=0;

	ctx->state &= ~PERF_EVENTS_OPENED;

	return PAPI_OK;
}


/********************************************************************/
/********************************************************************/
/*     Functions that are exported via the component interface      */
/********************************************************************/
/********************************************************************/

/********************* DOMAIN RELATED *******************************/


/* set the domain. */
/* perf_events allows per-event control of this, */
/* papi allows it to be set at the event level or at the event set level. */
/* this will set the event set level domain values */
/* but they only get used if no event level domain mask (u= or k=) */
/* was specified. */
static int
_pe_set_domain( hwd_control_state_t *ctl, int domain)
{
	pe_control_t *pe_ctl = ( pe_control_t *) ctl;

	SUBDBG("old control domain %d, new domain %d\n", pe_ctl->domain,domain);
	pe_ctl->domain = domain;
	return PAPI_OK;
}


/********************* THREAD RELATED *******************************/


/* Shutdown a thread */
static int
_pe_shutdown_thread( hwd_context_t *ctx )
{
	pe_context_t *pe_ctx = ( pe_context_t *) ctx;

	pe_ctx->initialized=0;

	return PAPI_OK;
}

/* Initialize a thread */
static int
_pe_init_thread( hwd_context_t *hwd_ctx )
{

	pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx;

	/* clear the context structure and mark as initialized */
	memset( pe_ctx, 0, sizeof ( pe_context_t ) );
	pe_ctx->initialized=1;
	pe_ctx->event_table=&perf_native_event_table;
	pe_ctx->cidx=our_cidx;

	return PAPI_OK;
}



/**************************** COUNTER RELATED *******************/


/* reset the hardware counters */
/* Note: PAPI_reset() does not necessarily call this */
/* unless the events are actually running.           */
static int
_pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl )
{
	int i, ret;
	pe_control_t *pe_ctl = ( pe_control_t *) ctl;

	( void ) ctx;			 /*unused */

	/* We need to reset all of the events, not just the group leaders */
	for( i = 0; i < pe_ctl->num_events; i++ ) {
		ret = ioctl( pe_ctl->events[i].event_fd,
				PERF_EVENT_IOC_RESET, NULL );
		if ( ret == -1 ) {
			PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) "
					"returned error, Linux says: %s",
					pe_ctl->events[i].event_fd,
					strerror( errno ) );
			return PAPI_ESYS;
		}
	}

	return PAPI_OK;
}


/* write (set) the hardware counters */
/* Currently we do not support this.   */
static int
_pe_write( hwd_context_t *ctx, hwd_control_state_t *ctl,
		long long *from )
{
	( void ) ctx;			 /*unused */
	( void ) ctl;			 /*unused */
	( void ) from;			 /*unused */
	/*
	 * Counters cannot be written.  Do we need to virtualize the
	 * counters so that they can be written, or perhaps modify code so that
	 * they can be written? FIXME ?
	 */

	return PAPI_ENOSUPP;
}

/*
 * perf_event provides a complicated read interface.
 *  the info returned by read() varies depending on whether
 *  you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED,
 *  PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set
 *
 * To simplify things we just always ask for everything.  This might
 * lead to overhead when reading more than we need, but it makes the
 * read code a lot simpler than the original implementation we had here.
 *
 * For more info on the layout see include/uapi/linux/perf_event.h
 *
 */
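
/* Concretely, the read paths below expect the following layouts             */
/* (see include/uapi/linux/perf_event.h):                                     */
/*                                                                             */
/*   multiplexed (no FORMAT_GROUP, time fields requested):                    */
/*       u64 value; u64 time_enabled; u64 time_running;                       */
/*                                                                             */
/*   no-group / inherit case (no extra format bits):                          */
/*       u64 value;                                                           */
/*                                                                             */
/*   FORMAT_GROUP case (single read on the group leader):                     */
/*       u64 nr;  u64 values[nr];                                             */
/*                                                                             */
/* (PERF_FORMAT_ID would add an id per value, but we never request it.)       */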


/* When we read with rdpmc, we must read each counter individually */
/* Because of this we don't need separate multiplexing support */
/* This is all handled by mmap_read_self() */
static int
_pe_rdpmc_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
		long long **events, int flags )
{
	SUBDBG("ENTER: ctx: %p, ctl: %p, events: %p, flags: %#x\n",
		ctx, ctl, events, flags);

	( void ) flags;			/*unused */
	( void ) ctx;			/*unused */
	int i;
	pe_control_t *pe_ctl = ( pe_control_t *) ctl;
	unsigned long long count, enabled, running, adjusted;

	/* we must read each counter individually */
	for ( i = 0; i < pe_ctl->num_events; i++ ) {

		count = mmap_read_self(pe_ctl->events[i].mmap_buf,
						&enabled,&running);

		/* TODO: error checking? */

		/* Handle multiplexing case */
		if (enabled!=running) {
			adjusted = (enabled * 128LL) / running;
			adjusted = adjusted * count;
			adjusted = adjusted / 128LL;
			count = adjusted;
		}

		pe_ctl->counts[i] = count;
	}
	/* point PAPI to the values we read */
	*events = pe_ctl->counts;

	SUBDBG("EXIT: *events: %p\n", *events);

	return PAPI_OK;
}


static int
_pe_read_multiplexed( pe_control_t *pe_ctl )
{
	int i,ret=-1;
	long long papi_pe_buffer[READ_BUFFER_SIZE];
	long long tot_time_running, tot_time_enabled, scale;

	/* perf_event does not support FORMAT_GROUP on multiplex */
	/* so we have to handle separate events when multiplexing */

	for ( i = 0; i < pe_ctl->num_events; i++ ) {

		ret = read( pe_ctl->events[i].event_fd,
				papi_pe_buffer,
				sizeof ( papi_pe_buffer ) );
		if ( ret == -1 ) {
			PAPIERROR("read returned an error: ",
					strerror( errno ));
			return PAPI_ESYS;
		}

		/* We should read 3 64-bit values from the counter */
		if (ret<(signed)(3*sizeof(long long))) {
			PAPIERROR("Error!  short read");
			return PAPI_ESYS;
		}

		SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
				pe_ctl->events[i].event_fd,
				(long)pe_ctl->tid, pe_ctl->events[i].cpu, ret);
		SUBDBG("read: %lld %lld %lld\n",
				papi_pe_buffer[0],
				papi_pe_buffer[1],
				papi_pe_buffer[2]);

		tot_time_enabled = papi_pe_buffer[1];
		tot_time_running = papi_pe_buffer[2];

		SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * "
				"tot_time_enabled %lld) / "
				"tot_time_running %lld\n",
				i, 0,papi_pe_buffer[0],
				tot_time_enabled,tot_time_running);

		if (tot_time_running == tot_time_enabled) {
			/* No scaling needed */
			pe_ctl->counts[i] = papi_pe_buffer[0];
		} else if (tot_time_running && tot_time_enabled) {
			/* Scale to give better results */
			/* avoid truncation.            */
			/* Why use 100?  Would 128 be faster? */
			scale = (tot_time_enabled * 100LL) / tot_time_running;
			scale = scale * papi_pe_buffer[0];
			scale = scale / 100LL;
			pe_ctl->counts[i] = scale;
		} else {
			/* This should not happen, but Phil reports it sometimes does. */
			SUBDBG("perf_event kernel bug(?) count, enabled, "
				"running: %lld, %lld, %lld\n",
				papi_pe_buffer[0],tot_time_enabled,
				tot_time_running);

			pe_ctl->counts[i] = papi_pe_buffer[0];
		}
	}
	return PAPI_OK;
}

/* For cases where we can't group counters together */
/* But must read them out individually */
/* This includes when INHERIT is set, as well as various bugs */

static int
_pe_read_nogroup( pe_control_t *pe_ctl ) {

	int i,ret=-1;
	long long papi_pe_buffer[READ_BUFFER_SIZE];

	/* we must read each counter individually */
	for ( i = 0; i < pe_ctl->num_events; i++ ) {
		ret = read( pe_ctl->events[i].event_fd,
				papi_pe_buffer,
				sizeof ( papi_pe_buffer ) );
		if ( ret == -1 ) {
			PAPIERROR("read returned an error: ",
				strerror( errno ));
			return PAPI_ESYS;
		}

		/* we should read one 64-bit value from each counter */
		if (ret!=sizeof(long long)) {
			PAPIERROR("Error!  short read");
			PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d",
				pe_ctl->events[i].event_fd,
				(long)pe_ctl->tid, pe_ctl->events[i].cpu, ret);
			return PAPI_ESYS;
		}

		SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
			pe_ctl->events[i].event_fd, (long)pe_ctl->tid,
			pe_ctl->events[i].cpu, ret);
		SUBDBG("read: %lld\n",papi_pe_buffer[0]);

		pe_ctl->counts[i] = papi_pe_buffer[0];
	}

	return PAPI_OK;

}

static int
_pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
	       long long **events, int flags )
{
	SUBDBG("ENTER: ctx: %p, ctl: %p, events: %p, flags: %#x\n",
		ctx, ctl, events, flags);

	( void ) flags;			 /*unused */
	( void ) ctx;			 /*unused */
	int i, j, ret = -1;
	pe_control_t *pe_ctl = ( pe_control_t *) ctl;
	long long papi_pe_buffer[READ_BUFFER_SIZE];

	/* Handle fast case */
	if ((_perf_event_vector.cmp_info.fast_counter_read) && (!pe_ctl->inherit)) {
		return _pe_rdpmc_read( ctx, ctl, events, flags);
	}

	/* Handle case where we are multiplexing */
	if (pe_ctl->multiplexed) {
		_pe_read_multiplexed(pe_ctl);
	}

	/* Handle cases where we cannot use FORMAT GROUP */
	else if (bug_format_group() || pe_ctl->inherit) {
		_pe_read_nogroup(pe_ctl);
	}

	/* Handle common case where we are using FORMAT_GROUP	*/
	/* We assume only one group leader, in position 0	*/

	/* By reading the leader file descriptor, we get a series */
	/* of 64-bit values.  The first is the total number of    */
	/* events, followed by the counts for them.               */

	else {
		if (pe_ctl->events[0].group_leader_fd!=-1) {
			PAPIERROR("Was expecting group leader");
		}

		ret = read( pe_ctl->events[0].event_fd,
			papi_pe_buffer,
			sizeof ( papi_pe_buffer ) );

		if ( ret == -1 ) {
			PAPIERROR("read returned an error: ",
				strerror( errno ));
			return PAPI_ESYS;
		}

		/* we read 1 64-bit value (number of events) then     */
		/* num_events more 64-bit values that hold the counts */
		if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) {
			PAPIERROR("Error! short read");
			return PAPI_ESYS;
		}

		SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
			pe_ctl->events[0].event_fd,
			(long)pe_ctl->tid, pe_ctl->events[0].cpu, ret);

		for(j=0;j<ret/8;j++) {
			SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]);
		}

		/* Make sure the kernel agrees with how many events we have */
		if (papi_pe_buffer[0]!=pe_ctl->num_events) {
			PAPIERROR("Error!  Wrong number of events");
			return PAPI_ESYS;
		}

		/* put the count values in their proper location */
		for(i=0;i<pe_ctl->num_events;i++) {
			pe_ctl->counts[i] = papi_pe_buffer[1+i];
		}
	}

	/* point PAPI to the values we read */
	*events = pe_ctl->counts;

	SUBDBG("EXIT: *events: %p\n", *events);

	return PAPI_OK;
}

#if (OBSOLETE_WORKAROUNDS==1)
/* On kernels before 2.6.33 the TOTAL_TIME_ENABLED and TOTAL_TIME_RUNNING */
/* fields are always 0 unless the counter is disabled.  So if we are on   */
/* one of these kernels, then we must disable events before reading.      */
/* Elsewhere though we disable multiplexing on kernels before 2.6.34 */
/* so maybe this isn't even necessary.                               */
static int
_pe_read_bug_sync( hwd_context_t *ctx, hwd_control_state_t *ctl,
	       long long **events, int flags )
{

	( void ) flags;			 /*unused */
	int i, ret = -1;
	pe_context_t *pe_ctx = ( pe_context_t *) ctx;
	pe_control_t *pe_ctl = ( pe_control_t *) ctl;
	int result;

	if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
		 for ( i = 0; i < pe_ctl->num_events; i++ ) {
			/* disable only the group leaders */
			if ( pe_ctl->events[i].group_leader_fd == -1 ) {
				ret = ioctl( pe_ctl->events[i].event_fd,
					PERF_EVENT_IOC_DISABLE, NULL );
				if ( ret == -1 ) {
					PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) "
						"returned an error: ", strerror( errno ));
					return PAPI_ESYS;
				}
			}
		}
	}

	result=_pe_read( ctx, ctl, events, flags );

	/* If we disabled the counters due to the sync_read_bug(), */
	/* then we need to re-enable them now.                     */

	if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
		for ( i = 0; i < pe_ctl->num_events; i++ ) {
			if ( pe_ctl->events[i].group_leader_fd == -1 ) {
				/* this should refresh any overflow counters too */
				ret = ioctl( pe_ctl->events[i].event_fd,
					PERF_EVENT_IOC_ENABLE, NULL );
				if ( ret == -1 ) {
					/* Should never happen */
					PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) returned an error: ",
						strerror( errno ));
					return PAPI_ESYS;
				}
			}
		}
	}

	return result;
}

#endif

/* Start counting events */
static int
_pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl )
{
	int ret;
	int i;
	int did_something = 0;
	pe_context_t *pe_ctx = ( pe_context_t *) ctx;
	pe_control_t *pe_ctl = ( pe_control_t *) ctl;

	/* Reset the counters first.  Is this necessary? */
	ret = _pe_reset( pe_ctx, pe_ctl );
	if ( ret ) {
		return ret;
	}

	/* Enable all of the group leaders                */
	/* All group leaders have a group_leader_fd of -1 */
	for( i = 0; i < pe_ctl->num_events; i++ ) {
		if (pe_ctl->events[i].group_leader_fd == -1) {
			SUBDBG("ioctl(enable): fd: %d\n",
				pe_ctl->events[i].event_fd);
			ret=ioctl( pe_ctl->events[i].event_fd,
				PERF_EVENT_IOC_ENABLE, NULL) ;

			/* ioctls always return -1 on failure */
			if (ret == -1) {
				PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed");
				return PAPI_ESYS;
			}

			did_something++;
		}
	}

	if (!did_something) {
		PAPIERROR("Did not enable any counters");
		return PAPI_EBUG;
	}

	pe_ctx->state |= PERF_EVENTS_RUNNING;

	return PAPI_OK;

}

/* Stop all of the counters */
static int
_pe_stop( hwd_context_t *ctx, hwd_control_state_t *ctl )
{
	SUBDBG( "ENTER: ctx: %p, ctl: %p\n", ctx, ctl);

	int ret;
	int i;
	pe_context_t *pe_ctx = ( pe_context_t *) ctx;
	pe_control_t *pe_ctl = ( pe_control_t *) ctl;

	/* Just disable the group leaders */
	for ( i = 0; i < pe_ctl->num_events; i++ ) {
		if ( pe_ctl->events[i].group_leader_fd == -1 ) {
			ret=ioctl( pe_ctl->events[i].event_fd,
				PERF_EVENT_IOC_DISABLE, NULL);
			if ( ret == -1 ) {
				PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) "
					"returned error, Linux says: %s",
					pe_ctl->events[i].event_fd, strerror( errno ) );
				return PAPI_EBUG;
			}
		}
	}

	pe_ctx->state &= ~PERF_EVENTS_RUNNING;

	SUBDBG( "EXIT:\n");

	return PAPI_OK;
}





/*********************** CONTROL STATE RELATED *******************/


/* This function clears the current contents of the control structure and
   updates it with whatever resources are allocated for all the native events
   in the native info structure array. */

static int
_pe_update_control_state( hwd_control_state_t *ctl,
			       NativeInfo_t *native,
			       int count, hwd_context_t *ctx )
{
	SUBDBG( "ENTER: ctl: %p, native: %p, count: %d, ctx: %p\n",
		ctl, native, count, ctx);
	int i;
	int j;
	int ret;
	int skipped_events=0;
	struct native_event_t *ntv_evt;
	pe_context_t *pe_ctx = ( pe_context_t *) ctx;
	pe_control_t *pe_ctl = ( pe_control_t *) ctl;

	/* close all of the existing fds and start over again */
	/* In theory we could have finer-grained control and know if             */
	/* things were changed, but it's easier to tear things down and rebuild. */
	close_pe_events( pe_ctx, pe_ctl );

	/* Calling with count==0 should be OK, it's how things are deallocated */
	/* when an eventset is destroyed.                                      */
	if ( count == 0 ) {
		SUBDBG( "EXIT: Called with count == 0\n" );
		return PAPI_OK;
	}

	/* set up all the events */
	for( i = 0; i < count; i++ ) {
		if ( native ) {
			/* get the native event pointer used for this papi event */
			int ntv_idx = _papi_hwi_get_ntv_idx((unsigned)(native[i].ni_papi_code));
			if (ntv_idx < -1) {
				SUBDBG("papi_event_code: %#x known by papi but not by the component\n", native[i].ni_papi_code);
				continue;
			}
			/* if native index is -1, then we have an event without a mask and need to find the right native index to use */
			if (ntv_idx == -1) {
				/* find the native event index we want by matching for the right papi event code */
				for (j=0 ; j<pe_ctx->event_table->num_native_events ; j++) {
					if (pe_ctx->event_table->native_events[j].papi_event_code == native[i].ni_papi_code) {
						ntv_idx = j;
					}
				}
			}

			/* if native index is still negative, we did not find event we wanted so just return error */
			if (ntv_idx < 0) {
				SUBDBG("papi_event_code: %#x not found in native event tables\n", native[i].ni_papi_code);
				continue;
			}

			/* this native index is positive so there was a mask with the event, the ntv_idx identifies which native event to use */
			ntv_evt = (struct native_event_t *)(&(pe_ctx->event_table->native_events[ntv_idx]));
			SUBDBG("ntv_evt: %p\n", ntv_evt);

			SUBDBG("i: %d, pe_ctx->event_table->num_native_events: %d\n", i, pe_ctx->event_table->num_native_events);

			/* Move this event's hardware config values and other attributes to the perf_events attribute structure */
			memcpy (&pe_ctl->events[i].attr, &ntv_evt->attr, sizeof(perf_event_attr_t));

			/* may need to update the attribute structure with information from event set level domain settings (values set by PAPI_set_domain) */
			/* only done if the event mask which controls each counting domain was not provided */

			/* get pointer to allocated name, will be NULL when adding preset events to event set */
			char *aName = ntv_evt->allocated_name;
			if ((aName == NULL)  ||  (strstr(aName, ":u=") == NULL)) {
				SUBDBG("set exclude_user attribute from eventset level domain flags, encode: %d, eventset: %d\n", pe_ctl->events[i].attr.exclude_user, !(pe_ctl->domain & PAPI_DOM_USER));
				pe_ctl->events[i].attr.exclude_user = !(pe_ctl->domain & PAPI_DOM_USER);
			}
			if ((aName == NULL)  ||  (strstr(aName, ":k=") == NULL)) {
				SUBDBG("set exclude_kernel attribute from eventset level domain flags, encode: %d, eventset: %d\n", pe_ctl->events[i].attr.exclude_kernel, !(pe_ctl->domain & PAPI_DOM_KERNEL));
				pe_ctl->events[i].attr.exclude_kernel = !(pe_ctl->domain & PAPI_DOM_KERNEL);
			}

			// libpfm4 supports mh (monitor host) and mg (monitor guest) event masks
			// perf_events supports exclude_hv and exclude_idle attributes
			// PAPI_set_domain supports PAPI_DOM_SUPERVISOR and PAPI_DOM_OTHER domain attributes
			// not sure how these perf_event attributes, and PAPI domain attributes relate to each other
			// if that can be figured out then there should probably be code here to set some perf_events attributes based on what was set in a PAPI_set_domain call
			// the code sample below is one possibility
//			if (strstr(ntv_evt->allocated_name, ":mg=") == NULL) {
//				SUBDBG("set exclude_hv attribute from eventset level domain flags, encode: %d, eventset: %d\n", pe_ctl->events[i].attr.exclude_hv, !(pe_ctl->domain & PAPI_DOM_SUPERVISOR));
//				pe_ctl->events[i].attr.exclude_hv = !(pe_ctl->domain & PAPI_DOM_SUPERVISOR);
//			}


			// set the cpu number provided with an event mask if there was one (will be -1 if mask not provided)
			pe_ctl->events[i].cpu = ntv_evt->cpu;
			// if cpu event mask not provided, then set the cpu to use to what may have been set on call to PAPI_set_opt (will still be -1 if not called)
			if (pe_ctl->events[i].cpu == -1) {
				pe_ctl->events[i].cpu = pe_ctl->cpu;
			}
      } else {
    	  /* This case happens when called from _pe_set_overflow and _pe_ctl */
          /* Those callers put things directly into the pe_ctl structure so it is already set for the open call */
      }

      /* Copy the inherit flag into the attribute block that will be passed to the kernel */
      pe_ctl->events[i].attr.inherit = pe_ctl->inherit;

      /* Set the position in the native structure */
      /* We just set up events linearly           */
      if ( native ) {
    	  native[i].ni_position = i;
    	  SUBDBG( "&native[%d]: %p, ni_papi_code: %#x, ni_event: %#x, ni_position: %d, ni_owners: %d\n",
			i, &(native[i]), native[i].ni_papi_code, native[i].ni_event, native[i].ni_position, native[i].ni_owners);
      }
   }

	if (count <= skipped_events) {
		SUBDBG("EXIT: No events to count, they all contained invalid umasks\n");
		return PAPI_ENOEVNT;
	}

	pe_ctl->num_events = count - skipped_events;

	/* actually open the events */
	ret = open_pe_events( pe_ctx, pe_ctl );
	if ( ret != PAPI_OK ) {
		SUBDBG("EXIT: open_pe_events returned: %d\n", ret);
      		/* Restore values ? */
		return ret;
	}

	SUBDBG( "EXIT: PAPI_OK\n" );
	return PAPI_OK;
}

/* Set various options on a control state */
static int
_pe_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option )
{
   int ret;
   pe_context_t *pe_ctx = ( pe_context_t *) ctx;
   pe_control_t *pe_ctl = NULL;

   switch ( code ) {
      case PAPI_MULTIPLEX:
	   pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state );
	   ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain,
				    pe_ctl->granularity,
				    1, pe_ctl->inherit );
           if (ret != PAPI_OK) {
	      return ret;
	   }

	   /* looks like we are allowed, so set multiplexed attribute */
	   pe_ctl->multiplexed = 1;
	   ret = _pe_update_control_state( pe_ctl, NULL,
						pe_ctl->num_events, pe_ctx );
	   if (ret != PAPI_OK) {
	      pe_ctl->multiplexed = 0;
	   }
	   return ret;

      case PAPI_ATTACH:
	   pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state );
	   ret = check_permissions( option->attach.tid, pe_ctl->cpu,
				  pe_ctl->domain, pe_ctl->granularity,
				  pe_ctl->multiplexed,
				    pe_ctl->inherit );
	   if (ret != PAPI_OK) {
	      return ret;
	   }

	   pe_ctl->tid = option->attach.tid;

	   /* If events have already been added, something may */
	   /* have been done to the kernel, so update */
	   ret =_pe_update_control_state( pe_ctl, NULL,
						pe_ctl->num_events, pe_ctx);

	   return ret;

      case PAPI_DETACH:
	   pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state );

	   pe_ctl->tid = 0;
	   return PAPI_OK;

      case PAPI_CPU_ATTACH:
	   pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state );
	   ret = check_permissions( pe_ctl->tid, option->cpu.cpu_num,
				    pe_ctl->domain, pe_ctl->granularity,
				    pe_ctl->multiplexed,
				    pe_ctl->inherit );
           if (ret != PAPI_OK) {
	       return ret;
	   }
	   /* looks like we are allowed so set cpu number */

	   /* this tells the kernel not to count for a thread   */
	   /* should we warn if we try to set both?  perf_event */
	   /* will reject it.                                   */
	   pe_ctl->tid = -1;

	   pe_ctl->cpu = option->cpu.cpu_num;

	   return PAPI_OK;

      case PAPI_DOMAIN:
	   pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state );
	   ret = check_permissions( pe_ctl->tid, pe_ctl->cpu,
				    option->domain.domain,
				    pe_ctl->granularity,
				    pe_ctl->multiplexed,
				    pe_ctl->inherit );
           if (ret != PAPI_OK) {
	      return ret;
	   }
	   /* looks like we are allowed, so set event set level counting domains */
       pe_ctl->domain = option->domain.domain;
	   return PAPI_OK;

      case PAPI_GRANUL:
	   pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state );

	   /* FIXME: we really don't support this yet */

           switch ( option->granularity.granularity  ) {
              case PAPI_GRN_PROCG:
              case PAPI_GRN_SYS_CPU:
              case PAPI_GRN_PROC:
		   return PAPI_ECMP;

	      /* Currently we only support thread and CPU granularity */
              case PAPI_GRN_SYS:
	 	   pe_ctl->granularity=PAPI_GRN_SYS;
		   pe_ctl->cpu=_papi_getcpu();
		   break;

              case PAPI_GRN_THR:
	 	   pe_ctl->granularity=PAPI_GRN_THR;
		   break;


              default:
		   return PAPI_EINVAL;
	   }
           return PAPI_OK;

      case PAPI_INHERIT:
	   pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state );
	   ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain,
				  pe_ctl->granularity, pe_ctl->multiplexed,
				    option->inherit.inherit );
           if (ret != PAPI_OK) {
	      return ret;
	   }
	   /* looks like we are allowed, so set the requested inheritance */
	   if (option->inherit.inherit) {
	      /* children will inherit counters */
	      pe_ctl->inherit = 1;
	   } else {
	      /* children won't inherit counters */
	      pe_ctl->inherit = 0;
	   }
	   return PAPI_OK;

      case PAPI_DATA_ADDRESS:
	   return PAPI_ENOSUPP;
#if 0
	   pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
	   ret = set_default_domain( pe_ctl, option->address_range.domain );
	   if ( ret != PAPI_OK ) {
	      return ret;
	   }
	   set_drange( pe_ctx, pe_ctl, option );
	   return PAPI_OK;
#endif
      case PAPI_INSTR_ADDRESS:
	   return PAPI_ENOSUPP;
#if 0
	   pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
	   ret = set_default_domain( pe_ctl, option->address_range.domain );
	   if ( ret != PAPI_OK ) {
	      return ret;
	   }
	   set_irange( pe_ctx, pe_ctl, option );
	   return PAPI_OK;
#endif

      case PAPI_DEF_ITIMER:
	   /* What should we be checking for here?                   */
	   /* This seems like it should be OS-specific not component */
	   /* specific.                                              */

	   return PAPI_OK;

      case PAPI_DEF_MPX_NS:
	   /* Defining a given ns per set is not currently supported */
	   return PAPI_ENOSUPP;

      case PAPI_DEF_ITIMER_NS:
	   /* We don't support this... */
	   return PAPI_OK;

      default:
	   return PAPI_ENOSUPP;
   }
}


/* Initialize a new control state */
static int
_pe_init_control_state( hwd_control_state_t *ctl )
{
	pe_control_t *pe_ctl = ( pe_control_t *) ctl;

	/* clear the contents */
	memset( pe_ctl, 0, sizeof ( pe_control_t ) );

	/* Set the domain */
	_pe_set_domain( ctl, _perf_event_vector.cmp_info.default_domain );

	/* default granularity */
	pe_ctl->granularity= _perf_event_vector.cmp_info.default_granularity;

	/* overflow signal */
	pe_ctl->overflow_signal=_perf_event_vector.cmp_info.hardware_intr_sig;

	pe_ctl->cidx=our_cidx;

	/* Set cpu number in the control block to show events */
	/* are not tied to specific cpu                       */
	pe_ctl->cpu = -1;

	return PAPI_OK;
}


/****************** EVENT NAME HANDLING CODE *****************/

static int
_pe_ntv_enum_events( unsigned int *PapiEventCode, int modifier )
{
	return _pe_libpfm4_ntv_enum_events(PapiEventCode, modifier, our_cidx,
			&perf_native_event_table);
}

static int
_pe_ntv_name_to_code( const char *name, unsigned int *event_code)
{
	return _pe_libpfm4_ntv_name_to_code(name,event_code, our_cidx,
			&perf_native_event_table);
}

static int
_pe_ntv_code_to_name(unsigned int EventCode,
			char *ntv_name, int len)
{
	return _pe_libpfm4_ntv_code_to_name(EventCode,
					ntv_name, len,
					&perf_native_event_table);
}

static int
_pe_ntv_code_to_descr( unsigned int EventCode,
			char *ntv_descr, int len)
{

	return _pe_libpfm4_ntv_code_to_descr(EventCode,ntv_descr,len,
					&perf_native_event_table);
}

static int
_pe_ntv_code_to_info(unsigned int EventCode,
			PAPI_event_info_t *info) {

	return _pe_libpfm4_ntv_code_to_info(EventCode, info,
					&perf_native_event_table);
}


/*********************** SAMPLING / PROFILING *******************/


/* Find a native event specified by a profile index */
static int
find_profile_index( EventSetInfo_t *ESI, int evt_idx, int *flags,
		unsigned int *native_index, int *profile_index )
{
	int pos, esi_index, count;

	for ( count = 0; count < ESI->profile.event_counter; count++ ) {
		esi_index = ESI->profile.EventIndex[count];
		pos = ESI->EventInfoArray[esi_index].pos[0];

		if ( pos == evt_idx ) {
			*profile_index = count;
			*native_index = ESI->NativeInfoArray[pos].ni_event &
					PAPI_NATIVE_AND_MASK;
			*flags = ESI->profile.flags;
			SUBDBG( "Native event %d is at profile index %d, flags %d\n",
				*native_index, *profile_index, *flags );
			return PAPI_OK;
		}
	}
	PAPIERROR( "wrong count: %d vs. ESI->profile.event_counter %d",
			count, ESI->profile.event_counter );
	return PAPI_EBUG;
}


/* What exactly does this do? */
static int
process_smpl_buf( int evt_idx, ThreadInfo_t **thr, int cidx )
{
	int ret, flags, profile_index;
	unsigned native_index;
	pe_control_t *ctl;

	ret = find_profile_index( ( *thr )->running_eventset[cidx], evt_idx,
			&flags, &native_index, &profile_index );
	if ( ret != PAPI_OK ) {
		return ret;
	}

	ctl= (*thr)->running_eventset[cidx]->ctl_state;

	mmap_read( cidx, thr, &(ctl->events[evt_idx]), profile_index );

	return PAPI_OK;
}

/*
 * This function is used when hardware overflows are working or when
 * software overflows are forced
 */

static void
_pe_dispatch_timer( int n, hwd_siginfo_t *info, void *uc)
{
	( void ) n;                           /*unused */
	_papi_hwi_context_t hw_context;
	int found_evt_idx = -1, fd = info->si_fd;
	caddr_t address;
	ThreadInfo_t *thread = _papi_hwi_lookup_thread( 0 );
	int i;
	pe_control_t *ctl;
	int cidx = _perf_event_vector.cmp_info.CmpIdx;

	if ( thread == NULL ) {
		PAPIERROR( "thread == NULL in _papi_pe_dispatch_timer for fd %d!", fd );
		return;
	}

	if ( thread->running_eventset[cidx] == NULL ) {
		PAPIERROR( "thread->running_eventset == NULL in "
				"_papi_pe_dispatch_timer for fd %d!",fd );
		return;
	}

	if ( thread->running_eventset[cidx]->overflow.flags == 0 ) {
		PAPIERROR( "thread->running_eventset->overflow.flags == 0 in "
			"_papi_pe_dispatch_timer for fd %d!", fd );
		return;
	}

	hw_context.si = info;
	hw_context.ucontext = ( hwd_ucontext_t * ) uc;

	if ( thread->running_eventset[cidx]->overflow.flags &
			PAPI_OVERFLOW_FORCE_SW ) {
		address = GET_OVERFLOW_ADDRESS( hw_context );
		_papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
					address, NULL, 0,
					0, &thread, cidx );
		return;
	}

	if ( thread->running_eventset[cidx]->overflow.flags !=
		PAPI_OVERFLOW_HARDWARE ) {
			PAPIERROR( "thread->running_eventset->overflow.flags "
				"is set to something other than "
				"PAPI_OVERFLOW_HARDWARE or "
				"PAPI_OVERFLOW_FORCE_SW for fd %d (%#x)",
				fd,
				thread->running_eventset[cidx]->overflow.flags);
	}

	/* convoluted way to get ctl */
	ctl= thread->running_eventset[cidx]->ctl_state;

	/* See if the fd is one that's part of this thread's context */
	for( i=0; i < ctl->num_events; i++ ) {
		if ( fd == ctl->events[i].event_fd ) {
			found_evt_idx = i;
			break;
		}
	}

	if ( found_evt_idx == -1 ) {
		PAPIERROR( "Unable to find fd %d among the open event fds "
				"_papi_hwi_dispatch_timer!", fd );
		return;
	}

	if (ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL ) == -1 ) {
		PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) failed");
	}

	if ( ( thread->running_eventset[cidx]->state & PAPI_PROFILING ) &&
		!( thread->running_eventset[cidx]->profile.flags &
		PAPI_PROFIL_FORCE_SW ) ) {
		process_smpl_buf( found_evt_idx, &thread, cidx );
	}
	else {
		uint64_t ip;
		unsigned int head;
		pe_event_info_t *pe = &(ctl->events[found_evt_idx]);
		unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize(  );

	/*
	* Read the most recent IP from the sample in the mmap buffer.  To
	* do this, we make the assumption that all of the records in the
	* mmap buffer are the same size, and that they all contain the IP as
	* their only record element.  This means that we can use the
	* data_head element from the user page and move backward one record
	* from that point and read the data.  Since we don't actually need
	* to access the header of the record, we can just subtract 8 (size
	* of the IP) from data_head and read that word from the mmap
	* buffer.  After we subtract 8, we account for mmap buffer wrapping
	* by AND'ing this offset with the buffer mask.
	*/
		head = mmap_read_head( pe );

		if ( head == 0 ) {
			PAPIERROR( "Attempting to access memory "
				"which may be inaccessable" );
			return;
		}
		ip = *( uint64_t * ) ( data + ( ( head - 8 ) & pe->mask ) );
	/*
	* Update the tail to the current head pointer.
	*
	* Note that if we were to read the record at the tail pointer,
	* rather than the one at the head (as you might otherwise think
	* would be natural), we could run into problems.  Signals don't
	* stack well on Linux, particularly if not using RT signals, and if
	* they come in rapidly enough, we can lose some.  Over time, the head
	* could catch up to the tail and monitoring would be stopped, and
	* since no more signals are coming in, this problem will never be
	* resolved, resulting in a complete loss of overflow notification
	* from that point on.  So the solution we use here will result in
	* only the most recent IP value being read every time there are two
	* or more samples in the buffer (for that one overflow signal).  But
	* the handler will always bring up the tail, so the head should
	* never run into the tail.
	*/
		mmap_write_tail( pe, head );

	/*
	* The fourth parameter is supposed to be a vector of bits indicating
	* the overflowed hardware counters, but it's not really clear that
	* it's useful, because the actual hardware counters used are not
	* exposed to the PAPI user.  For now, I'm just going to set the bit
	* that indicates which event register in the array overflowed.  The
	* result is that the overflow vector will not be identical to the
	* perfmon implementation, and part of that is due to the fact that
	* which hardware register is actually being used is opaque at the
	* user level (the kernel event dispatcher hides that info).
	*/

		_papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
					( caddr_t ) ( unsigned long ) ip,
					NULL, ( 1 << found_evt_idx ), 0,
					&thread, cidx );

	}

	/* Restart the counters */
	if (ioctl( fd, PERF_EVENT_IOC_REFRESH, PAPI_REFRESH_VALUE ) == -1) {
		PAPIERROR( "overflow refresh failed", 0 );
	}
}

/* Stop profiling */
/* FIXME: does this actually stop anything? */
/* It looks like it is only actually called from PAPI_stop() */
/* So the event will be destroyed soon after anyway. */
static int
_pe_stop_profiling( ThreadInfo_t *thread, EventSetInfo_t *ESI )
{
	int i, ret = PAPI_OK;
	pe_control_t *ctl;
	int cidx;

	ctl=ESI->ctl_state;

	cidx=ctl->cidx;

	/* Loop through all of the events and process those which are */
	/* being used for profiling.                                   */
	for ( i = 0; i < ctl->num_events; i++ ) {
		/* The profiling flag marks events whose */
		/* fds are being used for profiling.     */
		if ( ctl->events[i].profiling ) {
			/* Process any remaining samples in the sample buffer */
			ret = process_smpl_buf( i, &thread, cidx );
			if ( ret ) {
				PAPIERROR( "process_smpl_buf returned error %d", ret );
				return ret;
			}
			ctl->events[i].profiling=0;
		}
	}

	return ret;
}

/* Set up an event to cause overflow */
/* If threshold==0 then disable overflow for that event */
static int
_pe_set_overflow( EventSetInfo_t *ESI, int EventIndex, int threshold )
{
	SUBDBG("ENTER: ESI: %p, EventIndex: %d, threshold: %d\n",
		ESI, EventIndex, threshold);

	pe_context_t *ctx;
	pe_control_t *ctl = (pe_control_t *) ( ESI->ctl_state );
	int i, evt_idx, found_non_zero_sample_period = 0, retval = PAPI_OK;
	int cidx;

	cidx = ctl->cidx;
	ctx = ( pe_context_t *) ( ESI->master->context[cidx] );

	/* pos[0] is the first native event */
	/* derived events might be made up of multiple native events */
	evt_idx = ESI->EventInfoArray[EventIndex].pos[0];

	SUBDBG("Attempting to set overflow for index %d (%d) of EventSet %d\n",
		evt_idx,EventIndex,ESI->EventSetIndex);

	if (evt_idx<0) {
		SUBDBG("EXIT: evt_idx: %d\n", evt_idx);
		return PAPI_EINVAL;
	}

	/* It's an error to disable overflow if it wasn't set in the	*/
	/* first place.							*/
	if (( threshold == 0 ) &&
		( ctl->events[evt_idx].attr.sample_period == 0 ) ) {
			SUBDBG("EXIT: PAPI_EINVAL, Tried to clear "
				"sample threshold when it was not set\n");
			return PAPI_EINVAL;
	}

	/* Set the sample period to threshold */
	ctl->events[evt_idx].attr.sample_period = threshold;

	if (threshold == 0) {
		ctl->events[evt_idx].sampling = 0;
	}
	else {
		ctl->events[evt_idx].sampling = 1;

		/* Setting wakeup_events to one means issue a wakeup on every */
		/* counter overflow (not mmap page overflow).                 */
		ctl->events[evt_idx].attr.wakeup_events = 1;
		/* We need the IP to pass to the overflow handler */
		ctl->events[evt_idx].attr.sample_type = PERF_SAMPLE_IP;
	}


	/* Check to see if any events in the EventSet are setup to sample */
	/* Do we actually handle multiple overflow events at once? --vmw  */
	for ( i = 0; i < ctl->num_events; i++ ) {
		if ( ctl->events[i].attr.sample_period ) {
			found_non_zero_sample_period = 1;
			break;
		}
	}

	if ( found_non_zero_sample_period ) {
		/* turn on internal overflow flag for this event set */
		ctl->overflow = 1;

		/* Enable the signal handler */
		retval = _papi_hwi_start_signal(
				    ctl->overflow_signal,
				    1, ctl->cidx );
		if (retval != PAPI_OK) {
			SUBDBG("Call to _papi_hwi_start_signal "
				"returned: %d\n", retval);
		}
	} else {

		/* turn off internal overflow flag for this event set */
		ctl->overflow = 0;

		/* Remove the signal handler, if there are no remaining */
		/* non-zero sample_periods set                          */
		retval = _papi_hwi_stop_signal(ctl->overflow_signal);
		if ( retval != PAPI_OK ) {
			SUBDBG("Call to _papi_hwi_stop_signal "
				"returned: %d\n", retval);
			return retval;
		}
	}

	retval = _pe_update_control_state( ctl, NULL,
				((pe_control_t *)(ESI->ctl_state) )->num_events,
				ctx );

	SUBDBG("EXIT: return: %d\n", retval);

	return retval;
}
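
/*
 * For reference, this overflow path is normally armed from user code via
 * PAPI_overflow().  A minimal sketch (illustrative only; error checking
 * omitted, and the event name and threshold are arbitrary examples):
 *
 *   void handler( int EventSet, void *address,
 *                 long long overflow_vector, void *context )
 *   {
 *        ... record address / overflow_vector ...
 *   }
 *
 *   int EventSet = PAPI_NULL;
 *   PAPI_library_init( PAPI_VER_CURRENT );
 *   PAPI_create_eventset( &EventSet );
 *   PAPI_add_named_event( EventSet, "PAPI_TOT_INS" );
 *   PAPI_overflow( EventSet, PAPI_TOT_INS, 1000000, 0, handler );
 *   PAPI_start( EventSet );
 *
 * Each time the threshold is crossed, the signal path above ends up in
 * _pe_dispatch_timer(), which invokes the registered handler.
 */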

/* Enable/disable profiling */
/* If threshold is zero, we disable */
static int
_pe_set_profile( EventSetInfo_t *ESI, int EventIndex, int threshold )
{
	int ret;
	int evt_idx;
	pe_control_t *ctl = ( pe_control_t *) ( ESI->ctl_state );

	/* Since you can't profile on a derived event,	*/
	/* the event is always the first and only event	*/
	/* in the native event list.			*/
	evt_idx = ESI->EventInfoArray[EventIndex].pos[0];

	/* If threshold is zero we want to *disable*    */
	/* profiling on the event                       */
	if ( threshold == 0 ) {
//		SUBDBG( "MUNMAP(%p,%"PRIu64")\n",
//			ctl->events[evt_idx].mmap_buf,
//			( uint64_t ) ctl->events[evt_idx].nr_mmap_pages *
//			getpagesize() );

//		if ( ctl->events[evt_idx].mmap_buf ) {
//			munmap( ctl->events[evt_idx].mmap_buf,
//				ctl->events[evt_idx].nr_mmap_pages *
//				getpagesize() );
//		}
//		ctl->events[evt_idx].mmap_buf = NULL;
//		ctl->events[evt_idx].nr_mmap_pages = 0;

		/* no longer sample on IP */
		ctl->events[evt_idx].attr.sample_type &= ~PERF_SAMPLE_IP;

		/* Clear any residual overflow flags */
		/* ??? old warning says "This should be handled somewhere else" */
		ESI->state &= ~( PAPI_OVERFLOWING );
		ESI->overflow.flags &= ~( PAPI_OVERFLOW_HARDWARE );

		ctl->events[evt_idx].profiling=0;

	} else {

		/* Otherwise, we are *enabling* profiling */

		/* Look up the native event code */

		if ( ESI->profile.flags & (PAPI_PROFIL_DATA_EAR |
						PAPI_PROFIL_INST_EAR)) {
			/* Not supported yet... */
			return PAPI_ENOSUPP;
		}

		if ( ESI->profile.flags & PAPI_PROFIL_RANDOM ) {
			/* This requires an ability to randomly alter the    */
			/* sample_period within a given range.		     */
			/* Linux currently does not have this ability. FIXME */
			return PAPI_ENOSUPP;
		}
		ctl->events[evt_idx].profiling=1;
	}

	ret = _pe_set_overflow( ESI, EventIndex, threshold );
	if ( ret != PAPI_OK ) return ret;

	return PAPI_OK;
}


/************ INITIALIZATION / SHUTDOWN CODE *********************/


/* Shutdown the perf_event component */
static int
_pe_shutdown_component( void ) {

	/* deallocate our event table */
	_pe_libpfm4_shutdown(&_perf_event_vector, &perf_native_event_table);

	/* Shutdown libpfm4 */
	_papi_libpfm4_shutdown(&_perf_event_vector);

	return PAPI_OK;
}


/* Check the mmap page for rdpmc support */
static int _pe_detect_rdpmc(void) {

	struct perf_event_attr pe;
	int fd,rdpmc_exists=1;
	void *addr;
	struct perf_event_mmap_page *our_mmap;
	int page_size=getpagesize();

#if !defined(__i386__) && !defined(__x86_64__)
	/* We only support rdpmc on x86 for now */
	return 0;
#endif

	/* There were various subtle bugs in rdpmc support before	*/
	/* the Linux 4.13 release.					*/
	if (_papi_os_info.os_version < LINUX_VERSION(4,13,0)) {
		return 0;
	}

	/* Create a fake instructions event so we can read a mmap page */
	memset(&pe,0,sizeof(struct perf_event_attr));

	pe.type=PERF_TYPE_HARDWARE;
	pe.size=sizeof(struct perf_event_attr);
	pe.config=PERF_COUNT_HW_INSTRUCTIONS;
	pe.exclude_kernel=1;
	pe.disabled=1;

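	/* Open the probe event on the calling thread (pid 0), on any CPU */
	/* (-1), with no group leader (-1) and no flags.                   */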
	perf_event_dump_attr(&pe,0,-1,-1,0);
	fd=sys_perf_event_open(&pe,0,-1,-1,0);

	/* This is not expected to happen, though this could be the */
	/* very first attempt to open a perf_event on this system   */
	/* and the failure could be genuine.                         */
	if (fd<0) {
		SUBDBG("FAILED perf_event_open trying to detect rdpmc support");
		return PAPI_ESYS;
	}

	/* create the mmap page */
	addr=mmap(NULL, page_size, PROT_READ, MAP_SHARED,fd,0);
	if (addr == MAP_FAILED) {
		SUBDBG("FAILED mmap trying to detect rdpmc support");
		close(fd);
		return PAPI_ESYS;
	}

	/* get the rdpmc info from the mmap page */
	our_mmap=(struct perf_event_mmap_page *)addr;

	/* If cap_usr_rdpmc bit is set to 1, we have support! */
	if (our_mmap->cap_usr_rdpmc!=0) {
		rdpmc_exists=1;
	}
	else if ((!our_mmap->cap_bit0_is_deprecated) && (our_mmap->cap_bit0)) {
		/* 3.4 to 3.11 had somewhat broken rdpmc support */
		/* This convoluted test is the "official" way to detect this */
		/* To make things easier we don't support these kernels */
		rdpmc_exists=0;
	}
	else {
		rdpmc_exists=0;
	}

	/* close the fake event */
	munmap(addr,page_size);
	close(fd);

	return rdpmc_exists;

}
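
/*
 * For reference, when fast_counter_read ends up enabled, the user-space
 * read of a self-monitored counter follows the lock/retry sequence
 * documented for the mmap control page (see perf_event_open(2)).  A
 * minimal sketch, assuming pc points at the mmap page, rdpmc() wraps the
 * x86 instruction, and barrier() is a compiler barrier:
 *
 *   unsigned int seq, idx;
 *   unsigned long long count;
 *
 *   do {
 *       seq = pc->lock;
 *       barrier();
 *       idx   = pc->index;
 *       count = pc->offset;
 *       if (pc->cap_user_rdpmc && idx)
 *           count += rdpmc( idx - 1 );
 *       barrier();
 *   } while (pc->lock != seq);
 */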


static int
_pe_handle_paranoid(papi_vector_t *component) {

	FILE *fff;
	int paranoid_level;
	int retval;

	/* This is the official way to detect if perf_event support exists. */
	/* The file is called perf_counter_paranoid on 2.6.31 kernels;      */
	/* currently we are lazy and do not support those.                   */

	fff=fopen("/proc/sys/kernel/perf_event_paranoid","r");
	if (fff==NULL) {
		strncpy(component->cmp_info.disabled_reason,
			"perf_event support not detected",PAPI_MAX_STR_LEN);
		return PAPI_ENOCMP;
	}

	/* 3 (vendor patch) means completely disabled */
	/* 2 means no kernel measurements allowed   */
	/* 1 means normal counter access            */
	/* 0 means you can access CPU-specific data */
	/* -1 means no restrictions                 */
	retval=fscanf(fff,"%d",&paranoid_level);
	if (retval!=1) fprintf(stderr,"Error reading paranoid level\n");
	fclose(fff);

	if (paranoid_level==3) {
		strncpy(component->cmp_info.disabled_reason,
			"perf_event support disabled by Linux with paranoid=3",PAPI_MAX_STR_LEN);
		return PAPI_ENOCMP;
	}

	if ((paranoid_level==2) && (getuid()!=0)) {
		SUBDBG("/proc/sys/kernel/perf_event_paranoid prohibits kernel counts");
		component->cmp_info.available_domains &=~PAPI_DOM_KERNEL;
	}

	return PAPI_OK;

}
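
/*
 * Note: when the paranoid setting disables kernel-domain counting (or the
 * whole component), an administrator can usually relax it, for example:
 *
 *   sysctl kernel.perf_event_paranoid=1
 *   (or: echo 1 > /proc/sys/kernel/perf_event_paranoid)
 */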

#if (OBSOLETE_WORKAROUNDS==1)
/* Version based workarounds */
/* perf_event has many bugs; PAPI has to work around a number of them, */
/* but for the most part all of those were fixed by Linux 2.6.34 (May 2010). */
/* Unfortunately it's not easy to auto-detect these, so we went by the */
/* uname() version number. */
/* To complicate things, some vendors (such as Red Hat) backport fixes, */
/* so even though a kernel reports as 2.6.32 it may have the fixes. */
/* As of PAPI 5.6 we default to disabling the workarounds. */
/* They are left here, ifdefed out, for the time being. */
static int
_pe_version_workarounds(papi_vector_t *component) {

	/* Kernel multiplexing is broken prior to kernel 2.6.34 */
	/* The fix was probably git commit:                     */
	/*     45e16a6834b6af098702e5ea6c9a40de42ff77d8         */
	if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) {
		component->cmp_info.kernel_multiplex = 0;
		component->cmp_info.num_mpx_cntrs = PAPI_MAX_SW_MPX_EVENTS;
	}

	/* Check that processor is supported */
	if (processor_supported(_papi_hwi_system_info.hw_info.vendor,
			_papi_hwi_system_info.hw_info.cpuid_family)!=PAPI_OK) {
		fprintf(stderr,"warning, your processor is unsupported\n");
		/* should not return error, as software events should still work */
	}

	/* Update the default function pointers */
	/* Based on features/bugs               */
	if (bug_sync_read()) {
		component->read = _pe_read_bug_sync;
	}

	return PAPI_OK;

}

#endif




/* Initialize the perf_event component */
static int
_pe_init_component( int cidx )
{

	int retval;

	our_cidx=cidx;

	/* Update component behavior based on paranoid setting */
	retval=_pe_handle_paranoid(_papi_hwd[cidx]);
	if (retval!=PAPI_OK) return retval;

#if (OBSOLETE_WORKAROUNDS==1)
	/* Handle any kernel version related workarounds */
	_pe_version_workarounds(_papi_hwd[cidx]);
#endif

	/* Set up mmtimers, if appropriate */
	retval=mmtimer_setup();
	if (retval) {
		strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
			"Error initializing mmtimer",PAPI_MAX_STR_LEN);
		return retval;
	}

	/* Set the overflow signal */
	_papi_hwd[cidx]->cmp_info.hardware_intr_sig = SIGRTMIN + 2;

	/* Run Vendor-specific fixups */
	pe_vendor_fixups(_papi_hwd[cidx]);

	/* Detect if we can use rdpmc (or equivalent) */
	retval=_pe_detect_rdpmc();
	_papi_hwd[cidx]->cmp_info.fast_counter_read = retval;
	if (retval < 0 ) {
		/* Don't actually fail here, as this may be a survivable bug. */
		/* If perf_event_open/mmap truly are failing we will          */
		/* likely catch it pretty quickly elsewhere.                  */
		_papi_hwd[cidx]->cmp_info.fast_counter_read = 0;
	}

#if (USE_PERFEVENT_RDPMC!=1)
	/* Force fast_counter_read off if --enable-perfevent-rdpmc=no */
	_papi_hwd[cidx]->cmp_info.fast_counter_read = 0;
#endif

	/* Run the libpfm4-specific setup */
	retval = _papi_libpfm4_init(_papi_hwd[cidx]);
	if (retval) {

		strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
			"Error initializing libpfm4",PAPI_MAX_STR_LEN);
		return retval;

	}

	/* Now that libpfm4 is initialized */
	/* Try to setup the perf_event component events */

	retval = _pe_libpfm4_init(_papi_hwd[cidx], cidx,
				&perf_native_event_table,
				PMU_TYPE_CORE | PMU_TYPE_OS);
	if (retval) {
		switch(retval) {
			case PAPI_ENOMEM:
				strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
					"Error libpfm4 memory allocation",
					PAPI_MAX_STR_LEN);
				break;
			case PAPI_ENOSUPP:
				strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
					"Error libpfm4 no PMUs found",
					PAPI_MAX_STR_LEN);
				break;
			case PAPI_ENOCMP:
				strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
					"Error libpfm4 no default PMU found",
					PAPI_MAX_STR_LEN);
				break;
			case PAPI_ECOUNT:
				strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
					"Error libpfm4 too many default PMUs found",
					PAPI_MAX_STR_LEN);
				break;
			case PAPI_ENOEVNT:
				strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
					"Error loading preset events",
					PAPI_MAX_STR_LEN);
				break;
			default:
				printf("PAPI error %d\n",retval);
				strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
					"Unknown libpfm4 related error",
					PAPI_MAX_STR_LEN);

		}
		return retval;
	}

	/* Detect NMI watchdog which can steal counters */
	/* FIXME: on Intel we should also halve the count if SMT enabled */
	if (_linux_detect_nmi_watchdog()) {
		if (_papi_hwd[cidx]->cmp_info.num_cntrs>0) {
			_papi_hwd[cidx]->cmp_info.num_cntrs--;
		}
		SUBDBG("The Linux nmi_watchdog is using one of the performance "
			"counters, reducing the total number available.\n");
	}

	/* check for exclude_guest issue */
	check_exclude_guest();

	return PAPI_OK;

}



/* Our component vector */

papi_vector_t _perf_event_vector = {
   .cmp_info = {
       /* component information (unspecified values initialized to 0) */
      .name = "perf_event",
      .short_name = "perf",
      .version = "5.0",
      .description = "Linux perf_event CPU counters",

      .default_domain = PAPI_DOM_USER,
      .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR,
      .default_granularity = PAPI_GRN_THR,
      .available_granularities = PAPI_GRN_THR | PAPI_GRN_SYS,

      .hardware_intr = 1,
      .kernel_profile = 1,

      /* component specific cmp_info initializations */
      .fast_virtual_timer = 0,
      .attach = 1,
      .attach_must_ptrace = 1,
      .cpu = 1,
      .inherit = 1,
      .cntr_umasks = 1,

      .kernel_multiplex = 1,
      .num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS,

  },

  /* sizes of framework-opaque component-private structures */
  .size = {
      .context = sizeof ( pe_context_t ),
      .control_state = sizeof ( pe_control_t ),
      .reg_value = sizeof ( int ),
      .reg_alloc = sizeof ( int ),
  },

  /* function pointers in this component */
  .init_component =        _pe_init_component,
  .shutdown_component =    _pe_shutdown_component,
  .init_thread =           _pe_init_thread,
  .init_control_state =    _pe_init_control_state,
  .dispatch_timer =        _pe_dispatch_timer,

  /* function pointers from the shared perf_event lib */
  .start =                 _pe_start,
  .stop =                  _pe_stop,
  .read =                  _pe_read,
  .shutdown_thread =       _pe_shutdown_thread,
  .ctl =                   _pe_ctl,
  .update_control_state =  _pe_update_control_state,
  .set_domain =            _pe_set_domain,
  .reset =                 _pe_reset,
  .set_overflow =          _pe_set_overflow,
  .set_profile =           _pe_set_profile,
  .stop_profiling =        _pe_stop_profiling,
  .write =                 _pe_write,


  /* from counter name mapper */
  .ntv_enum_events =   _pe_ntv_enum_events,
  .ntv_name_to_code =  _pe_ntv_name_to_code,
  .ntv_code_to_name =  _pe_ntv_code_to_name,
  .ntv_code_to_descr = _pe_ntv_code_to_descr,
  .ntv_code_to_info =  _pe_ntv_code_to_info,
};