/*
* File:    perf_event_uncore.c
*
* Author:  Vince Weaver
*          vincent.weaver@maine.edu
* Mods:    Gary Mohr
*          gary.mohr@bull.com
*          Modified the perf_event_uncore component to use PFM_OS_PERF_EVENT_EXT mode in libpfm4.
*          This adds several new event masks, including cpu=, u=, and k= which give the user
*          the ability to set cpu number to use or control the domain (user, kernel, or both)
*          in which the counter should be incremented.  These are event masks so it is now
*          possible to have multiple events in the same event set that count activity from
*          differennt cpu's or count activity in different domains.
*/

#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <syscall.h>
#include <sys/utsname.h>
#include <sys/mman.h>
#include <sys/ioctl.h>

/* PAPI-specific includes */
#include "papi.h"
#include "papi_memory.h"
#include "papi_internal.h"
#include "papi_vector.h"
#include "extras.h"

/* libpfm4 includes */
#include "papi_libpfm4_events.h"
#include "components/perf_event/pe_libpfm4_events.h"
#include "perfmon/pfmlib.h"
#include PEINCLUDE

/* Linux-specific includes */
#include "mb.h"
#include "linux-memory.h"
#include "linux-timer.h"
#include "linux-common.h"
#include "linux-context.h"

#include "components/perf_event/perf_event_lib.h"

/* Forward declaration */
papi_vector_t _perf_event_uncore_vector;

/* Globals */
struct native_event_table_t uncore_native_event_table;
static int our_cidx;
//int
//_peu_libpfm4_get_cidx() {
//	return our_cidx;
//}

/* Defines for ctx->state */
#define PERF_EVENTS_OPENED  0x01
#define PERF_EVENTS_RUNNING 0x02

static int _peu_set_domain( hwd_control_state_t *ctl, int domain);


/******************************************************************/
/******** Kernel Version Dependent Routines  **********************/
/******************************************************************/

/* The read format on perf_event varies based on various flags that */
/* are passed into it.  This helper avoids copying this logic       */
/* multiple places.                                                 */
static unsigned int
get_read_format( unsigned int multiplex,
		 unsigned int inherit,
		 int format_group )
{
   unsigned int format = 0;

   /* if we need read format options for multiplexing, add them now */
   if (multiplex) {
      format |= PERF_FORMAT_TOTAL_TIME_ENABLED;
      format |= PERF_FORMAT_TOTAL_TIME_RUNNING;
   }

   /* If we are not using inherit, add the group read options     */
   if (!inherit) {
      if (format_group) {
	 format |= PERF_FORMAT_GROUP;
      }
   }

   SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: %#x\n",
	  multiplex, inherit, format_group, format);

   return format;
}

/*****************************************************************/
/********* End Kernel-version Dependent Routines  ****************/
/*****************************************************************/

/********************************************************************/
/* Low-level perf_event calls                                       */
/********************************************************************/

/* In case headers aren't new enough to have __NR_perf_event_open */
#ifndef __NR_perf_event_open

#ifdef __powerpc__
#define __NR_perf_event_open	319
#elif defined(__x86_64__)
#define __NR_perf_event_open	298
#elif defined(__i386__)
#define __NR_perf_event_open	336
#elif defined(__arm__)          366+0x900000
#define __NR_perf_event_open
#endif

#endif

static long
sys_perf_event_open( struct perf_event_attr *hw_event, pid_t pid, int cpu,
					   int group_fd, unsigned long flags )
{
   int ret;

   SUBDBG("sys_perf_event_open(hw_event: %p, pid: %d, cpu: %d, group_fd: %d, flags: %lx\n",hw_event,pid,cpu,group_fd,flags);
   SUBDBG("   type: %d\n",hw_event->type);
   SUBDBG("   size: %d\n",hw_event->size);
   SUBDBG("   config: %#"PRIx64" (%"PRIu64")\n",hw_event->config,
	  hw_event->config);
   SUBDBG("   sample_period: %"PRIu64"\n",hw_event->sample_period);
   SUBDBG("   sample_type: %"PRIu64"\n",hw_event->sample_type);
   SUBDBG("   read_format: %"PRIu64"\n",hw_event->read_format);
   SUBDBG("   disabled: %d\n",hw_event->disabled);
   SUBDBG("   inherit: %d\n",hw_event->inherit);
   SUBDBG("   pinned: %d\n",hw_event->pinned);
   SUBDBG("   exclusive: %d\n",hw_event->exclusive);
   SUBDBG("   exclude_user: %d\n",hw_event->exclude_user);
   SUBDBG("   exclude_kernel: %d\n",hw_event->exclude_kernel);
   SUBDBG("   exclude_hv: %d\n",hw_event->exclude_hv);
   SUBDBG("   exclude_idle: %d\n",hw_event->exclude_idle);
   SUBDBG("   mmap: %d\n",hw_event->mmap);
   SUBDBG("   comm: %d\n",hw_event->comm);
   SUBDBG("   freq: %d\n",hw_event->freq);
   SUBDBG("   inherit_stat: %d\n",hw_event->inherit_stat);
   SUBDBG("   enable_on_exec: %d\n",hw_event->enable_on_exec);
   SUBDBG("   task: %d\n",hw_event->task);
   SUBDBG("   watermark: %d\n",hw_event->watermark);
   SUBDBG("   precise_ip: %d\n",hw_event->precise_ip);
   SUBDBG("   mmap_data: %d\n",hw_event->mmap_data);
   SUBDBG("   sample_id_all: %d\n",hw_event->sample_id_all);
   SUBDBG("   exclude_host: %d\n",hw_event->exclude_host);
   SUBDBG("   exclude_guest: %d\n",hw_event->exclude_guest);
   SUBDBG("   exclude_callchain_kernel: %d\n",hw_event->exclude_callchain_kernel);
   SUBDBG("   exclude_callchain_user: %d\n",hw_event->exclude_callchain_user);
   SUBDBG("   wakeup_watermark: %d\n",hw_event->wakeup_watermark);
   SUBDBG("   bp_type: %d\n",hw_event->bp_type);
   SUBDBG("   config1: %#lx (%lu)\n",hw_event->config1,hw_event->config1);
   SUBDBG("   config2: %#lx (%lu)\n",hw_event->config2,hw_event->config2);
   SUBDBG("   branch_sample_type: %lu\n",hw_event->branch_sample_type);
   SUBDBG("   sample_regs_user: %lu\n",hw_event->sample_regs_user);
   SUBDBG("   sample_stack_user: %d\n",hw_event->sample_stack_user);

	ret = syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags );
	SUBDBG("Returned %d %d %s\n",ret,
	       ret<0?errno:0,
	       ret<0?strerror(errno):" ");
	return ret;
}


static int map_perf_event_errors_to_papi(int perf_event_error) {

   int ret;

   /* These mappings are approximate.
      EINVAL in particular can mean lots of different things */
   switch(perf_event_error) {
      case EPERM:
      case EACCES:
           ret = PAPI_EPERM;
	   break;
      case ENODEV:
      case EOPNOTSUPP:
	   ret = PAPI_ENOSUPP;
           break;
      case ENOENT:
	   ret = PAPI_ENOEVNT;
           break;
      case ENOSYS:
      case EAGAIN:
      case EBUSY:
      case E2BIG:
	   ret = PAPI_ESYS;
	   break;
      case ENOMEM:
	   ret = PAPI_ENOMEM;
	   break;
      case EINVAL:
      default:
	   ret = PAPI_EINVAL;
           break;
   }
   return ret;
}

/* Maximum size we ever expect to read from a perf_event fd   */
/*  (this is the number of 64-bit values)                     */
/* We use this to size the read buffers                       */
/* The three is for event count, time_enabled, time_running   */
/*  and the counter term is count value and count id for each */
/*  possible counter value.                                   */
#define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS))


/* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */
/* implementations (e.g. x86 before 2.6.33) which don't do a static event */
/* scheduability check in sys_perf_event_open.  It is also needed if the  */
/* kernel is stealing an event, such as when NMI watchdog is enabled.     */

static int
check_scheduability( pe_context_t *ctx, pe_control_t *ctl)
{
   SUBDBG("ENTER: ctx: %p, ctl: %p\n", ctx, ctl);
   int retval = 0, cnt = -1;
   ( void ) ctx;	     /*unused */
   long long papi_pe_buffer[READ_BUFFER_SIZE];
   int i;

   /* If the kernel isn't tracking scheduability right       */
   /* Then we need to start/stop/read to force the event     */
   /* to be scheduled and see if an error condition happens. */

   /* start all events */
   for( i = 0; i < ctl->num_events; i++) {
      retval = ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL );
      if (retval == -1) {
	 SUBDBG("EXIT: Enable failed event index: %d, num_events: %d, return PAPI_ESYS\n", i, ctl->num_events);
	 return PAPI_ESYS;
      }
   }

   /* stop all events */
   for( i = 0; i < ctl->num_events; i++) {
      retval = ioctl(ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL );
      if (retval == -1) {
	 SUBDBG("EXIT: Disable failed: event index: %d, num_events: %d, return PAPI_ESYS\n", i, ctl->num_events);
	 return PAPI_ESYS;
      }
   }

   /* See if a read of each event returns results */
   for( i = 0; i < ctl->num_events; i++) {
      cnt = read( ctl->events[i].event_fd, papi_pe_buffer, sizeof(papi_pe_buffer));
      if ( cnt == -1 ) {
	 SUBDBG( "EXIT: read failed: event index: %d, num_events: %d, return PAPI_ESYS.  Should never happen.\n", i, ctl->num_events);
	 return PAPI_ESYS;
      }

      if ( cnt == 0 ) {
	 /* We read 0 bytes if we could not schedule the event */
	 /* The kernel should have detected this at open       */
	 /* but various bugs (including NMI watchdog)          */
	 /* result in this behavior                            */

	 SUBDBG( "EXIT: read returned 0: event index: %d, num_events: %d, return PAPI_ECNFLCT.\n", i, ctl->num_events);
	 return PAPI_ECNFLCT;
      }
   }

   /* Reset all of the counters (opened so far) back to zero      */
   /* from the above brief enable/disable call pair.              */

   /* We have to reset all events because reset of group leader      */
   /* does not reset all.                                            */
   /* we assume that the events are being added one by one and that  */
   /* we do not need to reset higher events (doing so may reset ones */
   /* that have not been initialized yet.                            */

   /* Note... PERF_EVENT_IOC_RESET does not reset time running       */
   /* info if multiplexing, so we should avoid coming here if        */
   /* we are multiplexing the event.                                 */
   for( i = 0; i < ctl->num_events; i++) {
      retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
      if (retval == -1) {
	 SUBDBG("EXIT: Reset failed: event index: %d, num_events: %d, return PAPI_ESYS\n", i, ctl->num_events);
	 return PAPI_ESYS;
      }
   }
   SUBDBG("EXIT: return PAPI_OK\n");
   return PAPI_OK;
}


/* Open all events in the control state */
static int
open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
{

   int i, ret = PAPI_OK;
   long pid;

   if (ctl->granularity==PAPI_GRN_SYS) {
      pid = -1;
   }
   else {
      pid = ctl->tid;
   }

   for( i = 0; i < ctl->num_events; i++ ) {

      ctl->events[i].event_opened=0;

      /* set up the attr structure.  We don't set up all fields here */
      /* as some have already been set up previously.                */

/*
 * The following code controls how the uncore component interfaces with the 
 * kernel for uncore events.  The code inside the ifdef will use grouping of 
 * uncore events which can make the cost of reading the results more efficient.
 * The problem with it is that the uncore component supports 20 different uncore 
 * PMU's.  The kernel requires that all events in a group must be for the same PMU.
 * This means that with grouping enabled papi applications can count events on only
 * one of the 20 PMU's during a run.
 * 
 * The code inside the else clause treats each event in the event set as 
 * independent.  When running in this mode the kernel allows the papi multiple 
 * uncore PMU's at the same time.
 * 
 * Example:
 *  An application wants to measure all the L3 cache write requests.
 *  The event to do this is part of a cbox pmu (there are 8 cbox pmu's).
 *  When built with the code in the ifdef, the application would have to be 
 *    run 8 times and count write requests from one pmu at a time.
 *  When built with the code in the else, the write requests in all 8 cbox 
 *    pmu's could be counted in the same run.
 * 
 */
// #define GROUPIT 1       // remove the comment on this line to force event grouping
#ifdef GROUPIT
      /* group leader (event 0) is special                */
      /* If we're multiplexed, everyone is a group leader */
      if (( i == 0 ) || (ctl->multiplexed)) {
         ctl->events[i].attr.pinned = !ctl->multiplexed;
	 ctl->events[i].attr.disabled = 1;
	 ctl->events[i].group_leader_fd=-1;
         ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed,
							   ctl->inherit,
							   !ctl->multiplexed );
      } else {
	 ctl->events[i].attr.pinned=0;
	 ctl->events[i].attr.disabled = 0;
	 ctl->events[i].group_leader_fd=ctl->events[0].event_fd,
         ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed,
							   ctl->inherit,
							   0 );
      }
#else
             ctl->events[i].attr.pinned = !ctl->multiplexed;
         	 ctl->events[i].attr.disabled = 1;
         	 ctl->inherit = 1;
         	 ctl->events[i].group_leader_fd=-1;
             ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, ctl->inherit, 0 );
#endif


      /* try to open */
      ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr,
						     pid,
						     ctl->events[i].cpu,
			       ctl->events[i].group_leader_fd,
						     0 /* flags */
						     );

      /* Try to match Linux errors to PAPI errors */
      if ( ctl->events[i].event_fd == -1 ) {
	 SUBDBG("sys_perf_event_open returned error on event #%d."
		"  Error: %s\n",
		i, strerror( errno ) );
         ret=map_perf_event_errors_to_papi(errno);

    goto open_peu_cleanup;
      }

      SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d,"
              " group_leader/fd: %d, event_fd: %d,"
              " read_format: %"PRIu64"\n",
	      pid, ctl->events[i].cpu, ctl->events[i].group_leader_fd,
	      ctl->events[i].event_fd, ctl->events[i].attr.read_format);

      ctl->events[i].event_opened=1;
   }


   /* in many situations the kernel will indicate we opened fine */
   /* yet things will fail later.  So we need to double check    */
   /* we actually can use the events we've set up.               */

   /* This is not necessary if we are multiplexing, and in fact */
   /* we cannot do this properly if multiplexed because         */
   /* PERF_EVENT_IOC_RESET does not reset the time running info */
   if (!ctl->multiplexed) {
	ret = check_scheduability( ctx, ctl);

	if ( ret != PAPI_OK ) {
	    /* the last event did open, so we need to bump the counter */
	    /* before doing the cleanup                                */
	    i++;
	    goto open_peu_cleanup;
	}
   }

   /* Now that we've successfully opened all of the events, do whatever  */
   /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */
   /* and so on.                                                         */
   for ( i = 0; i < ctl->num_events; i++ ) {

      /* No sampling if uncore */
      ctl->events[i].mmap_buf = NULL;
   }

   /* Set num_evts only if completely successful */
   ctx->state |= PERF_EVENTS_OPENED;

   return PAPI_OK;

open_peu_cleanup:
   /* We encountered an error, close up the fds we successfully opened.  */
   /* We go backward in an attempt to close group leaders last, although */
   /* That's probably not strictly necessary.                            */
   while ( i > 0 ) {
      i--;
      if (ctl->events[i].event_fd>=0) {
	 close( ctl->events[i].event_fd );
	 ctl->events[i].event_opened=0;
      }
   }

   return ret;
}

/* Close all of the opened events */
static int
close_pe_events( pe_context_t *ctx, pe_control_t *ctl )
{
   int i;
   int num_closed=0;
   int events_not_opened=0;

   /* should this be a more serious error? */
   if ( ctx->state & PERF_EVENTS_RUNNING ) {
      SUBDBG("Closing without stopping first\n");
   }

   /* Close child events first */
   for( i=0; i<ctl->num_events; i++ ) {

      if (ctl->events[i].event_opened) {

         if (ctl->events[i].group_leader_fd!=-1) {
            if ( ctl->events[i].mmap_buf ) {
	       if ( munmap ( ctl->events[i].mmap_buf,
		             ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
	          PAPIERROR( "munmap of fd = %d returned error: %s",
			     ctl->events[i].event_fd, strerror( errno ) );
	          return PAPI_ESYS;
	       }
	    }

            if ( close( ctl->events[i].event_fd ) ) {
	       PAPIERROR( "close of fd = %d returned error: %s",
		       ctl->events[i].event_fd, strerror( errno ) );
	       return PAPI_ESYS;
	    } else {
	       num_closed++;
	    }
	    ctl->events[i].event_opened=0;
	 }
      }
      else {
	events_not_opened++;
      }
   }

   /* Close the group leaders last */
   for( i=0; i<ctl->num_events; i++ ) {

      if (ctl->events[i].event_opened) {

         if (ctl->events[i].group_leader_fd==-1) {
            if ( ctl->events[i].mmap_buf ) {
	       if ( munmap ( ctl->events[i].mmap_buf,
		             ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
	          PAPIERROR( "munmap of fd = %d returned error: %s",
			     ctl->events[i].event_fd, strerror( errno ) );
	          return PAPI_ESYS;
	       }
	    }


            if ( close( ctl->events[i].event_fd ) ) {
	       PAPIERROR( "close of fd = %d returned error: %s",
		       ctl->events[i].event_fd, strerror( errno ) );
	       return PAPI_ESYS;
	    } else {
	       num_closed++;
	    }
	    ctl->events[i].event_opened=0;
	 }
      }
   }


   if (ctl->num_events!=num_closed) {
      if (ctl->num_events!=(num_closed+events_not_opened)) {
         PAPIERROR("Didn't close all events: "
		   "Closed %d Not Opened: %d Expected %d\n",
		   num_closed,events_not_opened,ctl->num_events);
         return PAPI_EBUG;
      }
   }

   ctl->num_events=0;

   ctx->state &= ~PERF_EVENTS_OPENED;

   return PAPI_OK;
}


/********************************************************************/
/* Component Interface                                              */
/********************************************************************/


/* Initialize a thread */
static int
_peu_init_thread( hwd_context_t *hwd_ctx )
{

  pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx;

  /* clear the context structure and mark as initialized */
  memset( pe_ctx, 0, sizeof ( pe_context_t ) );
  pe_ctx->initialized=1;

  pe_ctx->event_table=&uncore_native_event_table;
  pe_ctx->cidx=our_cidx;

  return PAPI_OK;
}

/* Initialize a new control state */
static int
_peu_init_control_state( hwd_control_state_t *ctl )
{
  pe_control_t *pe_ctl = ( pe_control_t *) ctl;

  /* clear the contents */
  memset( pe_ctl, 0, sizeof ( pe_control_t ) );

  /* Set the default domain */
  _peu_set_domain( ctl, _perf_event_uncore_vector.cmp_info.default_domain );

  /* Set the default granularity */
  pe_ctl->granularity=_perf_event_uncore_vector.cmp_info.default_granularity;

  pe_ctl->cidx=our_cidx;

  /* Set cpu number in the control block to show events */
  /* are not tied to specific cpu                       */
  pe_ctl->cpu = -1;
  return PAPI_OK;
}


/* Initialize the perf_event uncore component */
static int
_peu_init_component( int cidx )
{

   int retval;
   int paranoid_level;

   FILE *fff;

   our_cidx=cidx;

   /* The is the official way to detect if perf_event support exists */
   /* The file is called perf_counter_paranoid on 2.6.31             */
   /* currently we are lazy and do not support 2.6.31 kernels        */

   fff=fopen("/proc/sys/kernel/perf_event_paranoid","r");
   if (fff==NULL) {
     strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
	    "perf_event support not detected",PAPI_MAX_STR_LEN);
     return PAPI_ENOCMP;
   }
   retval=fscanf(fff,"%d",&paranoid_level);
   if (retval!=1) fprintf(stderr,"Error reading paranoid level\n");
   fclose(fff);


   /* Run the libpfm4-specific setup */

   retval = _papi_libpfm4_init(_papi_hwd[cidx]);
   if (retval) {
     strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
	     "Error initializing libpfm4",PAPI_MAX_STR_LEN);
     return PAPI_ENOCMP;
   }


   /* Run the uncore specific libpfm4 setup */

   retval = _peu_libpfm4_init(_papi_hwd[cidx], our_cidx,
			       &uncore_native_event_table,
                               PMU_TYPE_UNCORE);
   if (retval) {
     strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
	     "Error setting up libpfm4",PAPI_MAX_STR_LEN);
     return PAPI_ENOCMP;
   }

   /* Check if no uncore events found */

   if (_papi_hwd[cidx]->cmp_info.num_native_events==0) {
     strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
	     "No uncore PMUs or events found",PAPI_MAX_STR_LEN);
     return PAPI_ENOCMP;
   }

   /* Check if we have enough permissions for uncore */

   /* 2 means no kernel measurements allowed   */
   /* 1 means normal counter access            */
   /* 0 means you can access CPU-specific data */
   /* -1 means no restrictions                 */

   if ((paranoid_level>0) && (getuid()!=0)) {
      strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
	    "Insufficient permissions for uncore access.  Set /proc/sys/kernel/perf_event_paranoid to 0 or run as root.",
	    PAPI_MAX_STR_LEN);
     return PAPI_ENOCMP;
   }

   return PAPI_OK;

}

/* Shutdown the perf_event component */
static int
_peu_shutdown_component( void ) {

	/* deallocate our event table */
	_pe_libpfm4_shutdown(&_perf_event_uncore_vector,
				&uncore_native_event_table);

	/* Shutdown libpfm4 */
	_papi_libpfm4_shutdown(&_perf_event_uncore_vector);

	return PAPI_OK;
}

/* This function clears the current contents of the control structure and
   updates it with whatever resources are allocated for all the native events
   in the native info structure array. */

int
_peu_update_control_state( hwd_control_state_t *ctl,
			       NativeInfo_t *native,
			       int count, hwd_context_t *ctx )
{
	int i;
	int j;
	int ret;
	int skipped_events=0;
	struct native_event_t *ntv_evt;
   pe_context_t *pe_ctx = ( pe_context_t *) ctx;
   pe_control_t *pe_ctl = ( pe_control_t *) ctl;

   /* close all of the existing fds and start over again */
   /* In theory we could have finer-grained control and know if             */
   /* things were changed, but it's easier to tear things down and rebuild. */
   close_pe_events( pe_ctx, pe_ctl );

   /* Calling with count==0 should be OK, it's how things are deallocated */
   /* when an eventset is destroyed.                                      */
   if ( count == 0 ) {
      SUBDBG( "Called with count == 0\n" );
      return PAPI_OK;
   }

   /* set up all the events */
   for( i = 0; i < count; i++ ) {
      if ( native ) {
			// get the native event pointer used for this papi event
			int ntv_idx = _papi_hwi_get_ntv_idx((unsigned)(native[i].ni_papi_code));
			if (ntv_idx < -1) {
				SUBDBG("papi_event_code: %#x known by papi but not by the component\n", native[i].ni_papi_code);
				continue;
			}
			// if native index is -1, then we have an event without a mask and need to find the right native index to use
			if (ntv_idx == -1) {
				// find the native event index we want by matching for the right papi event code
				for (j=0 ; j<pe_ctx->event_table->num_native_events ; j++) {
					if (pe_ctx->event_table->native_events[j].papi_event_code == native[i].ni_papi_code) {
						ntv_idx = j;
					}
				}
			}

			// if native index is still negative, we did not find event we wanted so just return error
			if (ntv_idx < 0) {
				SUBDBG("papi_event_code: %#x not found in native event tables\n", native[i].ni_papi_code);
				continue;
			}

			// this native index is positive so there was a mask with the event, the ntv_idx identifies which native event to use
			ntv_evt = (struct native_event_t *)(&(pe_ctx->event_table->native_events[ntv_idx]));

			SUBDBG("ntv_evt: %p\n", ntv_evt);

			SUBDBG("i: %d, pe_ctx->event_table->num_native_events: %d\n", i, pe_ctx->event_table->num_native_events);

	    	// Move this events hardware config values and other attributes to the perf_events attribute structure
			memcpy (&pe_ctl->events[i].attr, &ntv_evt->attr, sizeof(perf_event_attr_t));

			// may need to update the attribute structure with information from event set level domain settings (values set by PAPI_set_domain)
			// only done if the event mask which controls each counting domain was not provided

			// get pointer to allocated name, will be NULL when adding preset events to event set
			char *aName = ntv_evt->allocated_name;
			if ((aName == NULL)  ||  (strstr(aName, ":u=") == NULL)) {
				SUBDBG("set exclude_user attribute from eventset level domain flags, encode: %d, eventset: %d\n", pe_ctl->events[i].attr.exclude_user, !(pe_ctl->domain & PAPI_DOM_USER));
				pe_ctl->events[i].attr.exclude_user = !(pe_ctl->domain & PAPI_DOM_USER);
			}
			if ((aName == NULL)  ||  (strstr(aName, ":k=") == NULL)) {
				SUBDBG("set exclude_kernel attribute from eventset level domain flags, encode: %d, eventset: %d\n", pe_ctl->events[i].attr.exclude_kernel, !(pe_ctl->domain & PAPI_DOM_KERNEL));
				pe_ctl->events[i].attr.exclude_kernel = !(pe_ctl->domain & PAPI_DOM_KERNEL);
			}

			// set the cpu number provided with an event mask if there was one (will be -1 if mask not provided)
			pe_ctl->events[i].cpu = ntv_evt->cpu;
			// if cpu event mask not provided, then set the cpu to use to what may have been set on call to PAPI_set_opt (will still be -1 if not called)
			if (pe_ctl->events[i].cpu == -1) {
				pe_ctl->events[i].cpu = pe_ctl->cpu;
			}
      } else {
    	  // This case happens when called from _pe_set_overflow and _pe_ctl
          // Those callers put things directly into the pe_ctl structure so it is already set for the open call
      }

      // Copy the inherit flag into the attribute block that will be passed to the kernel
      pe_ctl->events[i].attr.inherit = pe_ctl->inherit;

      /* Set the position in the native structure */
      /* We just set up events linearly           */
      if ( native ) {
    	  native[i].ni_position = i;
    	  SUBDBG( "&native[%d]: %p, ni_papi_code: %#x, ni_event: %#x, ni_position: %d, ni_owners: %d\n",
			i, &(native[i]), native[i].ni_papi_code, native[i].ni_event, native[i].ni_position, native[i].ni_owners);
      }
   }

	if (count <= skipped_events) {
		SUBDBG("EXIT: No events to count, they all contained invalid umasks\n");
		return PAPI_ENOEVNT;
	}

  pe_ctl->num_events = count - skipped_events;

   /* actuall open the events */
   /* (why is this a separate function?) */
   ret = open_pe_events( pe_ctx, pe_ctl );
   if ( ret != PAPI_OK ) {
      SUBDBG("open_pe_events failed\n");
      /* Restore values ? */
      return ret;
   }

   SUBDBG( "EXIT: PAPI_OK\n" );
   return PAPI_OK;
}

/********************************************************************/
/********************************************************************/
/* Start with functions that are exported via the module interface  */
/********************************************************************/
/********************************************************************/


/* set the domain. perf_events allows per-event control of this, papi allows it to be set at the event level or at the event set level. */
/* this will set the event set level domain values but they only get used if no event level domain mask (u= or k=) was specified. */
static int
_peu_set_domain( hwd_control_state_t *ctl, int domain)
{
   pe_control_t *pe_ctl = ( pe_control_t *) ctl;

   SUBDBG("old control domain %d, new domain %d\n",
	  pe_ctl->domain,domain);

   pe_ctl->domain = domain;
   return PAPI_OK;
}

/* Shutdown a thread */
static int
_peu_shutdown_thread( hwd_context_t *ctx )
{
    pe_context_t *pe_ctx = ( pe_context_t *) ctx;

    pe_ctx->initialized=0;

    return PAPI_OK;
}


/* reset the hardware counters */
/* Note: PAPI_reset() does not necessarily call this */
/* unless the events are actually running.           */
static int
_peu_reset( hwd_context_t *ctx, hwd_control_state_t *ctl )
{
   int i, ret;
   pe_control_t *pe_ctl = ( pe_control_t *) ctl;

   ( void ) ctx;			 /*unused */

   /* We need to reset all of the events, not just the group leaders */
   for( i = 0; i < pe_ctl->num_events; i++ ) {
      ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
      if ( ret == -1 ) {
	 PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) "
		   "returned error, Linux says: %s",
		   pe_ctl->events[i].event_fd, strerror( errno ) );
	 return PAPI_ESYS;
      }
   }

   return PAPI_OK;
}


/* write (set) the hardware counters */
/* Current we do not support this.   */
static int
_peu_write( hwd_context_t *ctx, hwd_control_state_t *ctl,
		long long *from )
{
   ( void ) ctx;			 /*unused */
   ( void ) ctl;			 /*unused */
   ( void ) from;			 /*unused */
   /*
    * Counters cannot be written.  Do we need to virtualize the
    * counters so that they can be written, or perhaps modify code so that
    * they can be written? FIXME ?
    */

    return PAPI_ENOSUPP;
}

/*
 * perf_event provides a complicated read interface.
 *  the info returned by read() varies depending on whether
 *  you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED,
 *  PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set
 *
 * To simplify things we just always ask for everything.  This might
 * lead to overhead when reading more than we need, but it makes the
 * read code a lot simpler than the original implementation we had here.
 *
 * For more info on the layout see include/linux/perf_event.h
 *
 */

static int
_peu_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
	       long long **events, int flags )
{
    SUBDBG("ENTER: ctx: %p, ctl: %p, events: %p, flags: %#x\n", ctx, ctl, events, flags);

   ( void ) flags;			 /*unused */
   int i, ret = -1;
   /* pe_context_t *pe_ctx = ( pe_context_t *) ctx; */ 
   (void) ctx; /*unused*/
   pe_control_t *pe_ctl = ( pe_control_t *) ctl;
   long long papi_pe_buffer[READ_BUFFER_SIZE];
   long long tot_time_running, tot_time_enabled, scale;

   /* Handle case where we are multiplexing */
   if (pe_ctl->multiplexed) {

      /* currently we handle multiplexing by having individual events */
      /* so we read from each in turn.                                */

      for ( i = 0; i < pe_ctl->num_events; i++ ) {

         ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer,
		    sizeof ( papi_pe_buffer ) );
         if ( ret == -1 ) {
	    PAPIERROR("read returned an error: ", strerror( errno ));
       SUBDBG("EXIT: PAPI_ESYS\n");
	    return PAPI_ESYS;
	 }

	 /* We should read 3 64-bit values from the counter */
	 if (ret<(signed)(3*sizeof(long long))) {
	    PAPIERROR("Error!  short read!\n");
       SUBDBG("EXIT: PAPI_ESYS\n");
	    return PAPI_ESYS;
	 }

         SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
	        pe_ctl->events[i].event_fd,
		(long)pe_ctl->tid, pe_ctl->events[i].cpu, ret);
         SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0],
	        papi_pe_buffer[1],papi_pe_buffer[2]);

         tot_time_enabled = papi_pe_buffer[1];
         tot_time_running = papi_pe_buffer[2];

         SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * "
		"tot_time_enabled %lld) / tot_time_running %lld\n",
		i, 0,papi_pe_buffer[0],
		tot_time_enabled,tot_time_running);

         if (tot_time_running == tot_time_enabled) {
	    /* No scaling needed */
	    pe_ctl->counts[i] = papi_pe_buffer[0];
         } else if (tot_time_running && tot_time_enabled) {
	    /* Scale factor of 100 to avoid overflows when computing */
	    /*enabled/running */

	    scale = (tot_time_enabled * 100LL) / tot_time_running;
	    scale = scale * papi_pe_buffer[0];
	    scale = scale / 100LL;
	    pe_ctl->counts[i] = scale;
	 } else {
	   /* This should not happen, but Phil reports it sometime does. */
	    SUBDBG("perf_event kernel bug(?) count, enabled, "
		   "running: %lld, %lld, %lld\n",
		   papi_pe_buffer[0],tot_time_enabled,
		   tot_time_running);

	    pe_ctl->counts[i] = papi_pe_buffer[0];
	 }
      }
   }

   /* Handle cases where we cannot use FORMAT GROUP */
   else if (pe_ctl->inherit) {

      /* we must read each counter individually */
      for ( i = 0; i < pe_ctl->num_events; i++ ) {

         ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 
		    sizeof ( papi_pe_buffer ) );
         if ( ret == -1 ) {
	    PAPIERROR("read returned an error: ", strerror( errno ));
       SUBDBG("EXIT: PAPI_ESYS\n");
	    return PAPI_ESYS;
	 }

	 /* we should read one 64-bit value from each counter */
	 if (ret!=sizeof(long long)) {
	    PAPIERROR("Error!  short read!\n");
	    PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
		   pe_ctl->events[i].event_fd,
		   (long)pe_ctl->tid, pe_ctl->events[i].cpu, ret);
       SUBDBG("EXIT: PAPI_ESYS\n");
	    return PAPI_ESYS;
	 }

         SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
	        pe_ctl->events[i].event_fd, (long)pe_ctl->tid,
		pe_ctl->events[i].cpu, ret);
         SUBDBG("read: %lld\n",papi_pe_buffer[0]);

	 pe_ctl->counts[i] = papi_pe_buffer[0];
      }
   }


   /* Handle cases where we are using FORMAT_GROUP   */
   /* We assume only one group leader, in position 0 */

   else {
      if (pe_ctl->events[0].group_leader_fd!=-1) {
	 PAPIERROR("Was expecting group leader!\n");
      }

      ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer,
		  sizeof ( papi_pe_buffer ) );

      if ( ret == -1 ) {
	 PAPIERROR("read returned an error: ", strerror( errno ));
       SUBDBG("EXIT: PAPI_ESYS\n");
	 return PAPI_ESYS;
      }

      /* we read 1 64-bit value (number of events) then     */
      /* num_events more 64-bit values that hold the counts */
      if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) {
	 PAPIERROR("Error! short read!\n");
       SUBDBG("EXIT: PAPI_ESYS\n");
	 return PAPI_ESYS;
      }

      SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
	     pe_ctl->events[0].event_fd,
	     (long)pe_ctl->tid, pe_ctl->events[0].cpu, ret);
      {
	 int j;
	 for(j=0;j<ret/8;j++) {
            SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]);
	 }
      }

      /* Make sure the kernel agrees with how many events we have */
      if (papi_pe_buffer[0]!=pe_ctl->num_events) {
	 PAPIERROR("Error!  Wrong number of events!\n");
       SUBDBG("EXIT: PAPI_ESYS\n");
	 return PAPI_ESYS;
      }

      /* put the count values in their proper location */
      for(i=0;i<pe_ctl->num_events;i++) {
         pe_ctl->counts[i] = papi_pe_buffer[1+i];
      }
   }

   /* point PAPI to the values we read */
   *events = pe_ctl->counts;

   SUBDBG("EXIT: PAPI_OK\n");
   return PAPI_OK;
}

/* Start counting events */
static int
_peu_start( hwd_context_t *ctx, hwd_control_state_t *ctl )
{
   int ret;
   int i;
   int did_something = 0;
   pe_context_t *pe_ctx = ( pe_context_t *) ctx;
   pe_control_t *pe_ctl = ( pe_control_t *) ctl;

   /* Reset the counters first.  Is this necessary? */
   ret = _peu_reset( pe_ctx, pe_ctl );
   if ( ret ) {
      return ret;
   }

   /* Enable all of the group leaders                */
   /* All group leaders have a group_leader_fd of -1 */
   for( i = 0; i < pe_ctl->num_events; i++ ) {
      if (pe_ctl->events[i].group_leader_fd == -1) {
	 SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd);
	 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ; 

	 /* ioctls always return -1 on failure */
         if (ret == -1) {
            PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
            return PAPI_ESYS;
	 }

	 did_something++;
      } 
   }

   if (!did_something) {
      PAPIERROR("Did not enable any counters.\n");
      return PAPI_EBUG;
   }

   pe_ctx->state |= PERF_EVENTS_RUNNING;

   return PAPI_OK;

}

/* Stop all of the counters */
static int
_peu_stop( hwd_context_t *ctx, hwd_control_state_t *ctl )
{

   int ret;
   int i;
   pe_context_t *pe_ctx = ( pe_context_t *) ctx;
   pe_control_t *pe_ctl = ( pe_control_t *) ctl;

   /* Just disable the group leaders */
   for ( i = 0; i < pe_ctl->num_events; i++ ) {
      if ( pe_ctl->events[i].group_leader_fd == -1 ) {
	 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL);
	 if ( ret == -1 ) {
	    PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) "
		       "returned error, Linux says: %s",
		       pe_ctl->events[i].event_fd, strerror( errno ) );
	    return PAPI_EBUG;
	 }
      }
   }

   pe_ctx->state &= ~PERF_EVENTS_RUNNING;

   return PAPI_OK;
}

/* Set various options on a control state */
static int
_peu_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option )
{
   int ret;
   pe_context_t *pe_ctx = ( pe_context_t *) ctx;
   pe_control_t *pe_ctl = NULL;

   switch ( code ) {
      case PAPI_MULTIPLEX:
	   pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state );

	   pe_ctl->multiplexed = 1;
	   ret = _peu_update_control_state( pe_ctl, NULL,
						pe_ctl->num_events, pe_ctx );
	   if (ret != PAPI_OK) {
	      pe_ctl->multiplexed = 0;
	   }
	   return ret;

      case PAPI_ATTACH:
	   pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state );

	   pe_ctl->tid = option->attach.tid;

	   /* If events have been already been added, something may */
	   /* have been done to the kernel, so update */
	   ret =_peu_update_control_state( pe_ctl, NULL,
						pe_ctl->num_events, pe_ctx);

	   return ret;

      case PAPI_DETACH:
	   pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state );

	   pe_ctl->tid = 0;
	   return PAPI_OK;

      case PAPI_CPU_ATTACH:
	   pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state );

	   /* this tells the kernel not to count for a thread   */
	   /* should we warn if we try to set both?  perf_event */
	   /* will reject it.                                   */
	   pe_ctl->tid = -1;

	   pe_ctl->cpu = option->cpu.cpu_num;

	   return PAPI_OK;

      case PAPI_DOMAIN:
	   pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state );

	   /* looks like we are allowed, so set event set level counting domains */
       pe_ctl->domain = option->domain.domain;
	   return PAPI_OK;

      case PAPI_GRANUL:
	   pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state );

	   /* FIXME: we really don't support this yet */

           switch ( option->granularity.granularity  ) {
              case PAPI_GRN_PROCG:
              case PAPI_GRN_SYS_CPU:
              case PAPI_GRN_PROC:
		   return PAPI_ECMP;

	      /* Currently we only support thread and CPU granularity */
              case PAPI_GRN_SYS:
	 	   pe_ctl->granularity=PAPI_GRN_SYS;
		   break;

              case PAPI_GRN_THR:
	 	   pe_ctl->granularity=PAPI_GRN_THR;
		   break;


              default:
		   return PAPI_EINVAL;
	   }
           return PAPI_OK;

      case PAPI_INHERIT:
	   pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state );

	   if (option->inherit.inherit) {
	      /* children will inherit counters */
	      pe_ctl->inherit = 1;
	   } else {
	      /* children won't inherit counters */
	      pe_ctl->inherit = 0;
	   }
	   return PAPI_OK;

      case PAPI_DATA_ADDRESS:
	   return PAPI_ENOSUPP;

      case PAPI_INSTR_ADDRESS:
	   return PAPI_ENOSUPP;

      case PAPI_DEF_ITIMER:
	   return PAPI_ENOSUPP;

      case PAPI_DEF_MPX_NS:
	   return PAPI_ENOSUPP;

      case PAPI_DEF_ITIMER_NS:
	   return PAPI_ENOSUPP;

      default:
	   return PAPI_ENOSUPP;
   }
}


static int
_peu_ntv_enum_events( unsigned int *PapiEventCode, int modifier )
{

  if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT;


  return _pe_libpfm4_ntv_enum_events(PapiEventCode, modifier, our_cidx,
                                       &uncore_native_event_table);
}

static int
_peu_ntv_name_to_code( const char *name, unsigned int *event_code) {

  if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT;

  return _pe_libpfm4_ntv_name_to_code(name,event_code, our_cidx,
                                        &uncore_native_event_table);
}

static int
_peu_ntv_code_to_name(unsigned int EventCode,
                          char *ntv_name, int len) {

   if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT;

   return _pe_libpfm4_ntv_code_to_name(EventCode,
                                         ntv_name, len, 
					 &uncore_native_event_table);
}

static int
_peu_ntv_code_to_descr( unsigned int EventCode,
                            char *ntv_descr, int len) {

   if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT;

   return _pe_libpfm4_ntv_code_to_descr(EventCode,ntv_descr,len,
                                          &uncore_native_event_table);
}

static int
_peu_ntv_code_to_info(unsigned int EventCode,
                          PAPI_event_info_t *info) {

  if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT;

  return _pe_libpfm4_ntv_code_to_info(EventCode, info,
                                        &uncore_native_event_table);
}

/* Our component vector */

papi_vector_t _perf_event_uncore_vector = {
   .cmp_info = {
       /* component information (unspecified values initialized to 0) */
      .name = "perf_event_uncore",
      .short_name = "peu",
      .version = "5.0",
      .description = "Linux perf_event CPU uncore and northbridge",

      .default_domain = PAPI_DOM_ALL,
      .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR,
      .default_granularity = PAPI_GRN_SYS,
      .available_granularities = PAPI_GRN_SYS,

      .num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS,

      /* component specific cmp_info initializations */
      .fast_virtual_timer = 0,
      .attach = 1,
      .attach_must_ptrace = 1,
      .cpu = 1,
      .inherit = 1,
      .cntr_umasks = 1,

  },

  /* sizes of framework-opaque component-private structures */
  .size = {
      .context = sizeof ( pe_context_t ),
      .control_state = sizeof ( pe_control_t ),
      .reg_value = sizeof ( int ),
      .reg_alloc = sizeof ( int ),
  },

  /* function pointers in this component */
  .init_component =        _peu_init_component,
  .shutdown_component =    _peu_shutdown_component,
  .init_thread =           _peu_init_thread,
  .init_control_state =    _peu_init_control_state,
  .start =                 _peu_start,
  .stop =                  _peu_stop,
  .read =                  _peu_read,
  .shutdown_thread =       _peu_shutdown_thread,
  .ctl =                   _peu_ctl,
  .update_control_state =  _peu_update_control_state,
  .set_domain =            _peu_set_domain,
  .reset =                 _peu_reset,
  .write =                 _peu_write,

  /* from counter name mapper */
  .ntv_enum_events =   _peu_ntv_enum_events,
  .ntv_name_to_code =  _peu_ntv_name_to_code,
  .ntv_code_to_name =  _peu_ntv_code_to_name,
  .ntv_code_to_descr = _peu_ntv_code_to_descr,
  .ntv_code_to_info =  _peu_ntv_code_to_info,
};