/* * File: perf_event_uncore.c * * Author: Vince Weaver * vincent.weaver@maine.edu * Mods: Gary Mohr * gary.mohr@bull.com * Modified the perf_event_uncore component to use PFM_OS_PERF_EVENT_EXT mode in libpfm4. * This adds several new event masks, including cpu=, u=, and k= which give the user * the ability to set cpu number to use or control the domain (user, kernel, or both) * in which the counter should be incremented. These are event masks so it is now * possible to have multiple events in the same event set that count activity from * differennt cpu's or count activity in different domains. */ #include #include #include #include #include #include #include #include #include /* PAPI-specific includes */ #include "papi.h" #include "papi_memory.h" #include "papi_internal.h" #include "papi_vector.h" #include "extras.h" /* libpfm4 includes */ #include "papi_libpfm4_events.h" #include "components/perf_event/pe_libpfm4_events.h" #include "perfmon/pfmlib.h" #include PEINCLUDE /* Linux-specific includes */ #include "mb.h" #include "linux-memory.h" #include "linux-timer.h" #include "linux-common.h" #include "linux-context.h" #include "components/perf_event/perf_event_lib.h" /* Forward declaration */ papi_vector_t _perf_event_uncore_vector; /* Globals */ struct native_event_table_t uncore_native_event_table; static int our_cidx; //int //_peu_libpfm4_get_cidx() { // return our_cidx; //} /* Defines for ctx->state */ #define PERF_EVENTS_OPENED 0x01 #define PERF_EVENTS_RUNNING 0x02 static int _peu_set_domain( hwd_control_state_t *ctl, int domain); /******************************************************************/ /******** Kernel Version Dependent Routines **********************/ /******************************************************************/ /* The read format on perf_event varies based on various flags that */ /* are passed into it. This helper avoids copying this logic */ /* multiple places. */ static unsigned int get_read_format( unsigned int multiplex, unsigned int inherit, int format_group ) { unsigned int format = 0; /* if we need read format options for multiplexing, add them now */ if (multiplex) { format |= PERF_FORMAT_TOTAL_TIME_ENABLED; format |= PERF_FORMAT_TOTAL_TIME_RUNNING; } /* If we are not using inherit, add the group read options */ if (!inherit) { if (format_group) { format |= PERF_FORMAT_GROUP; } } SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: %#x\n", multiplex, inherit, format_group, format); return format; } /*****************************************************************/ /********* End Kernel-version Dependent Routines ****************/ /*****************************************************************/ /********************************************************************/ /* Low-level perf_event calls */ /********************************************************************/ /* In case headers aren't new enough to have __NR_perf_event_open */ #ifndef __NR_perf_event_open #ifdef __powerpc__ #define __NR_perf_event_open 319 #elif defined(__x86_64__) #define __NR_perf_event_open 298 #elif defined(__i386__) #define __NR_perf_event_open 336 #elif defined(__arm__) 366+0x900000 #define __NR_perf_event_open #endif #endif static long sys_perf_event_open( struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags ) { int ret; SUBDBG("sys_perf_event_open(hw_event: %p, pid: %d, cpu: %d, group_fd: %d, flags: %lx\n",hw_event,pid,cpu,group_fd,flags); SUBDBG(" type: %d\n",hw_event->type); SUBDBG(" size: %d\n",hw_event->size); SUBDBG(" config: %#"PRIx64" (%"PRIu64")\n",hw_event->config, hw_event->config); SUBDBG(" sample_period: %"PRIu64"\n",hw_event->sample_period); SUBDBG(" sample_type: %"PRIu64"\n",hw_event->sample_type); SUBDBG(" read_format: %"PRIu64"\n",hw_event->read_format); SUBDBG(" disabled: %d\n",hw_event->disabled); SUBDBG(" inherit: %d\n",hw_event->inherit); SUBDBG(" pinned: %d\n",hw_event->pinned); SUBDBG(" exclusive: %d\n",hw_event->exclusive); SUBDBG(" exclude_user: %d\n",hw_event->exclude_user); SUBDBG(" exclude_kernel: %d\n",hw_event->exclude_kernel); SUBDBG(" exclude_hv: %d\n",hw_event->exclude_hv); SUBDBG(" exclude_idle: %d\n",hw_event->exclude_idle); SUBDBG(" mmap: %d\n",hw_event->mmap); SUBDBG(" comm: %d\n",hw_event->comm); SUBDBG(" freq: %d\n",hw_event->freq); SUBDBG(" inherit_stat: %d\n",hw_event->inherit_stat); SUBDBG(" enable_on_exec: %d\n",hw_event->enable_on_exec); SUBDBG(" task: %d\n",hw_event->task); SUBDBG(" watermark: %d\n",hw_event->watermark); SUBDBG(" precise_ip: %d\n",hw_event->precise_ip); SUBDBG(" mmap_data: %d\n",hw_event->mmap_data); SUBDBG(" sample_id_all: %d\n",hw_event->sample_id_all); SUBDBG(" exclude_host: %d\n",hw_event->exclude_host); SUBDBG(" exclude_guest: %d\n",hw_event->exclude_guest); SUBDBG(" exclude_callchain_kernel: %d\n",hw_event->exclude_callchain_kernel); SUBDBG(" exclude_callchain_user: %d\n",hw_event->exclude_callchain_user); SUBDBG(" wakeup_watermark: %d\n",hw_event->wakeup_watermark); SUBDBG(" bp_type: %d\n",hw_event->bp_type); SUBDBG(" config1: %#lx (%lu)\n",hw_event->config1,hw_event->config1); SUBDBG(" config2: %#lx (%lu)\n",hw_event->config2,hw_event->config2); SUBDBG(" branch_sample_type: %lu\n",hw_event->branch_sample_type); SUBDBG(" sample_regs_user: %lu\n",hw_event->sample_regs_user); SUBDBG(" sample_stack_user: %d\n",hw_event->sample_stack_user); ret = syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags ); SUBDBG("Returned %d %d %s\n",ret, ret<0?errno:0, ret<0?strerror(errno):" "); return ret; } static int map_perf_event_errors_to_papi(int perf_event_error) { int ret; /* These mappings are approximate. EINVAL in particular can mean lots of different things */ switch(perf_event_error) { case EPERM: case EACCES: ret = PAPI_EPERM; break; case ENODEV: case EOPNOTSUPP: ret = PAPI_ENOSUPP; break; case ENOENT: ret = PAPI_ENOEVNT; break; case ENOSYS: case EAGAIN: case EBUSY: case E2BIG: ret = PAPI_ESYS; break; case ENOMEM: ret = PAPI_ENOMEM; break; case EINVAL: default: ret = PAPI_EINVAL; break; } return ret; } /* Maximum size we ever expect to read from a perf_event fd */ /* (this is the number of 64-bit values) */ /* We use this to size the read buffers */ /* The three is for event count, time_enabled, time_running */ /* and the counter term is count value and count id for each */ /* possible counter value. */ #define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS)) /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */ /* implementations (e.g. x86 before 2.6.33) which don't do a static event */ /* scheduability check in sys_perf_event_open. It is also needed if the */ /* kernel is stealing an event, such as when NMI watchdog is enabled. */ static int check_scheduability( pe_context_t *ctx, pe_control_t *ctl) { SUBDBG("ENTER: ctx: %p, ctl: %p\n", ctx, ctl); int retval = 0, cnt = -1; ( void ) ctx; /*unused */ long long papi_pe_buffer[READ_BUFFER_SIZE]; int i; /* If the kernel isn't tracking scheduability right */ /* Then we need to start/stop/read to force the event */ /* to be scheduled and see if an error condition happens. */ /* start all events */ for( i = 0; i < ctl->num_events; i++) { retval = ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL ); if (retval == -1) { SUBDBG("EXIT: Enable failed event index: %d, num_events: %d, return PAPI_ESYS\n", i, ctl->num_events); return PAPI_ESYS; } } /* stop all events */ for( i = 0; i < ctl->num_events; i++) { retval = ioctl(ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL ); if (retval == -1) { SUBDBG("EXIT: Disable failed: event index: %d, num_events: %d, return PAPI_ESYS\n", i, ctl->num_events); return PAPI_ESYS; } } /* See if a read of each event returns results */ for( i = 0; i < ctl->num_events; i++) { cnt = read( ctl->events[i].event_fd, papi_pe_buffer, sizeof(papi_pe_buffer)); if ( cnt == -1 ) { SUBDBG( "EXIT: read failed: event index: %d, num_events: %d, return PAPI_ESYS. Should never happen.\n", i, ctl->num_events); return PAPI_ESYS; } if ( cnt == 0 ) { /* We read 0 bytes if we could not schedule the event */ /* The kernel should have detected this at open */ /* but various bugs (including NMI watchdog) */ /* result in this behavior */ SUBDBG( "EXIT: read returned 0: event index: %d, num_events: %d, return PAPI_ECNFLCT.\n", i, ctl->num_events); return PAPI_ECNFLCT; } } /* Reset all of the counters (opened so far) back to zero */ /* from the above brief enable/disable call pair. */ /* We have to reset all events because reset of group leader */ /* does not reset all. */ /* we assume that the events are being added one by one and that */ /* we do not need to reset higher events (doing so may reset ones */ /* that have not been initialized yet. */ /* Note... PERF_EVENT_IOC_RESET does not reset time running */ /* info if multiplexing, so we should avoid coming here if */ /* we are multiplexing the event. */ for( i = 0; i < ctl->num_events; i++) { retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL ); if (retval == -1) { SUBDBG("EXIT: Reset failed: event index: %d, num_events: %d, return PAPI_ESYS\n", i, ctl->num_events); return PAPI_ESYS; } } SUBDBG("EXIT: return PAPI_OK\n"); return PAPI_OK; } /* Open all events in the control state */ static int open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) { int i, ret = PAPI_OK; long pid; if (ctl->granularity==PAPI_GRN_SYS) { pid = -1; } else { pid = ctl->tid; } for( i = 0; i < ctl->num_events; i++ ) { ctl->events[i].event_opened=0; /* set up the attr structure. We don't set up all fields here */ /* as some have already been set up previously. */ /* * The following code controls how the uncore component interfaces with the * kernel for uncore events. The code inside the ifdef will use grouping of * uncore events which can make the cost of reading the results more efficient. * The problem with it is that the uncore component supports 20 different uncore * PMU's. The kernel requires that all events in a group must be for the same PMU. * This means that with grouping enabled papi applications can count events on only * one of the 20 PMU's during a run. * * The code inside the else clause treats each event in the event set as * independent. When running in this mode the kernel allows the papi multiple * uncore PMU's at the same time. * * Example: * An application wants to measure all the L3 cache write requests. * The event to do this is part of a cbox pmu (there are 8 cbox pmu's). * When built with the code in the ifdef, the application would have to be * run 8 times and count write requests from one pmu at a time. * When built with the code in the else, the write requests in all 8 cbox * pmu's could be counted in the same run. * */ // #define GROUPIT 1 // remove the comment on this line to force event grouping #ifdef GROUPIT /* group leader (event 0) is special */ /* If we're multiplexed, everyone is a group leader */ if (( i == 0 ) || (ctl->multiplexed)) { ctl->events[i].attr.pinned = !ctl->multiplexed; ctl->events[i].attr.disabled = 1; ctl->events[i].group_leader_fd=-1; ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, ctl->inherit, !ctl->multiplexed ); } else { ctl->events[i].attr.pinned=0; ctl->events[i].attr.disabled = 0; ctl->events[i].group_leader_fd=ctl->events[0].event_fd, ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, ctl->inherit, 0 ); } #else ctl->events[i].attr.pinned = !ctl->multiplexed; ctl->events[i].attr.disabled = 1; ctl->inherit = 1; ctl->events[i].group_leader_fd=-1; ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, ctl->inherit, 0 ); #endif /* try to open */ ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr, pid, ctl->events[i].cpu, ctl->events[i].group_leader_fd, 0 /* flags */ ); /* Try to match Linux errors to PAPI errors */ if ( ctl->events[i].event_fd == -1 ) { SUBDBG("sys_perf_event_open returned error on event #%d." " Error: %s\n", i, strerror( errno ) ); ret=map_perf_event_errors_to_papi(errno); goto open_peu_cleanup; } SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d," " group_leader/fd: %d, event_fd: %d," " read_format: %"PRIu64"\n", pid, ctl->events[i].cpu, ctl->events[i].group_leader_fd, ctl->events[i].event_fd, ctl->events[i].attr.read_format); ctl->events[i].event_opened=1; } /* in many situations the kernel will indicate we opened fine */ /* yet things will fail later. So we need to double check */ /* we actually can use the events we've set up. */ /* This is not necessary if we are multiplexing, and in fact */ /* we cannot do this properly if multiplexed because */ /* PERF_EVENT_IOC_RESET does not reset the time running info */ if (!ctl->multiplexed) { ret = check_scheduability( ctx, ctl); if ( ret != PAPI_OK ) { /* the last event did open, so we need to bump the counter */ /* before doing the cleanup */ i++; goto open_peu_cleanup; } } /* Now that we've successfully opened all of the events, do whatever */ /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */ /* and so on. */ for ( i = 0; i < ctl->num_events; i++ ) { /* No sampling if uncore */ ctl->events[i].mmap_buf = NULL; } /* Set num_evts only if completely successful */ ctx->state |= PERF_EVENTS_OPENED; return PAPI_OK; open_peu_cleanup: /* We encountered an error, close up the fds we successfully opened. */ /* We go backward in an attempt to close group leaders last, although */ /* That's probably not strictly necessary. */ while ( i > 0 ) { i--; if (ctl->events[i].event_fd>=0) { close( ctl->events[i].event_fd ); ctl->events[i].event_opened=0; } } return ret; } /* Close all of the opened events */ static int close_pe_events( pe_context_t *ctx, pe_control_t *ctl ) { int i; int num_closed=0; int events_not_opened=0; /* should this be a more serious error? */ if ( ctx->state & PERF_EVENTS_RUNNING ) { SUBDBG("Closing without stopping first\n"); } /* Close child events first */ for( i=0; inum_events; i++ ) { if (ctl->events[i].event_opened) { if (ctl->events[i].group_leader_fd!=-1) { if ( ctl->events[i].mmap_buf ) { if ( munmap ( ctl->events[i].mmap_buf, ctl->events[i].nr_mmap_pages * getpagesize() ) ) { PAPIERROR( "munmap of fd = %d returned error: %s", ctl->events[i].event_fd, strerror( errno ) ); return PAPI_ESYS; } } if ( close( ctl->events[i].event_fd ) ) { PAPIERROR( "close of fd = %d returned error: %s", ctl->events[i].event_fd, strerror( errno ) ); return PAPI_ESYS; } else { num_closed++; } ctl->events[i].event_opened=0; } } else { events_not_opened++; } } /* Close the group leaders last */ for( i=0; inum_events; i++ ) { if (ctl->events[i].event_opened) { if (ctl->events[i].group_leader_fd==-1) { if ( ctl->events[i].mmap_buf ) { if ( munmap ( ctl->events[i].mmap_buf, ctl->events[i].nr_mmap_pages * getpagesize() ) ) { PAPIERROR( "munmap of fd = %d returned error: %s", ctl->events[i].event_fd, strerror( errno ) ); return PAPI_ESYS; } } if ( close( ctl->events[i].event_fd ) ) { PAPIERROR( "close of fd = %d returned error: %s", ctl->events[i].event_fd, strerror( errno ) ); return PAPI_ESYS; } else { num_closed++; } ctl->events[i].event_opened=0; } } } if (ctl->num_events!=num_closed) { if (ctl->num_events!=(num_closed+events_not_opened)) { PAPIERROR("Didn't close all events: " "Closed %d Not Opened: %d Expected %d\n", num_closed,events_not_opened,ctl->num_events); return PAPI_EBUG; } } ctl->num_events=0; ctx->state &= ~PERF_EVENTS_OPENED; return PAPI_OK; } /********************************************************************/ /* Component Interface */ /********************************************************************/ /* Initialize a thread */ static int _peu_init_thread( hwd_context_t *hwd_ctx ) { pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx; /* clear the context structure and mark as initialized */ memset( pe_ctx, 0, sizeof ( pe_context_t ) ); pe_ctx->initialized=1; pe_ctx->event_table=&uncore_native_event_table; pe_ctx->cidx=our_cidx; return PAPI_OK; } /* Initialize a new control state */ static int _peu_init_control_state( hwd_control_state_t *ctl ) { pe_control_t *pe_ctl = ( pe_control_t *) ctl; /* clear the contents */ memset( pe_ctl, 0, sizeof ( pe_control_t ) ); /* Set the default domain */ _peu_set_domain( ctl, _perf_event_uncore_vector.cmp_info.default_domain ); /* Set the default granularity */ pe_ctl->granularity=_perf_event_uncore_vector.cmp_info.default_granularity; pe_ctl->cidx=our_cidx; /* Set cpu number in the control block to show events */ /* are not tied to specific cpu */ pe_ctl->cpu = -1; return PAPI_OK; } /* Initialize the perf_event uncore component */ static int _peu_init_component( int cidx ) { int retval; int paranoid_level; FILE *fff; our_cidx=cidx; /* The is the official way to detect if perf_event support exists */ /* The file is called perf_counter_paranoid on 2.6.31 */ /* currently we are lazy and do not support 2.6.31 kernels */ fff=fopen("/proc/sys/kernel/perf_event_paranoid","r"); if (fff==NULL) { strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, "perf_event support not detected",PAPI_MAX_STR_LEN); return PAPI_ENOCMP; } retval=fscanf(fff,"%d",¶noid_level); if (retval!=1) fprintf(stderr,"Error reading paranoid level\n"); fclose(fff); /* Run the libpfm4-specific setup */ retval = _papi_libpfm4_init(_papi_hwd[cidx]); if (retval) { strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, "Error initializing libpfm4",PAPI_MAX_STR_LEN); return PAPI_ENOCMP; } /* Run the uncore specific libpfm4 setup */ retval = _peu_libpfm4_init(_papi_hwd[cidx], our_cidx, &uncore_native_event_table, PMU_TYPE_UNCORE); if (retval) { strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, "Error setting up libpfm4",PAPI_MAX_STR_LEN); return PAPI_ENOCMP; } /* Check if no uncore events found */ if (_papi_hwd[cidx]->cmp_info.num_native_events==0) { strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, "No uncore PMUs or events found",PAPI_MAX_STR_LEN); return PAPI_ENOCMP; } /* Check if we have enough permissions for uncore */ /* 2 means no kernel measurements allowed */ /* 1 means normal counter access */ /* 0 means you can access CPU-specific data */ /* -1 means no restrictions */ if ((paranoid_level>0) && (getuid()!=0)) { strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, "Insufficient permissions for uncore access. Set /proc/sys/kernel/perf_event_paranoid to 0 or run as root.", PAPI_MAX_STR_LEN); return PAPI_ENOCMP; } return PAPI_OK; } /* Shutdown the perf_event component */ static int _peu_shutdown_component( void ) { /* deallocate our event table */ _pe_libpfm4_shutdown(&_perf_event_uncore_vector, &uncore_native_event_table); /* Shutdown libpfm4 */ _papi_libpfm4_shutdown(&_perf_event_uncore_vector); return PAPI_OK; } /* This function clears the current contents of the control structure and updates it with whatever resources are allocated for all the native events in the native info structure array. */ int _peu_update_control_state( hwd_control_state_t *ctl, NativeInfo_t *native, int count, hwd_context_t *ctx ) { int i; int j; int ret; int skipped_events=0; struct native_event_t *ntv_evt; pe_context_t *pe_ctx = ( pe_context_t *) ctx; pe_control_t *pe_ctl = ( pe_control_t *) ctl; /* close all of the existing fds and start over again */ /* In theory we could have finer-grained control and know if */ /* things were changed, but it's easier to tear things down and rebuild. */ close_pe_events( pe_ctx, pe_ctl ); /* Calling with count==0 should be OK, it's how things are deallocated */ /* when an eventset is destroyed. */ if ( count == 0 ) { SUBDBG( "Called with count == 0\n" ); return PAPI_OK; } /* set up all the events */ for( i = 0; i < count; i++ ) { if ( native ) { // get the native event pointer used for this papi event int ntv_idx = _papi_hwi_get_ntv_idx((unsigned)(native[i].ni_papi_code)); if (ntv_idx < -1) { SUBDBG("papi_event_code: %#x known by papi but not by the component\n", native[i].ni_papi_code); continue; } // if native index is -1, then we have an event without a mask and need to find the right native index to use if (ntv_idx == -1) { // find the native event index we want by matching for the right papi event code for (j=0 ; jevent_table->num_native_events ; j++) { if (pe_ctx->event_table->native_events[j].papi_event_code == native[i].ni_papi_code) { ntv_idx = j; } } } // if native index is still negative, we did not find event we wanted so just return error if (ntv_idx < 0) { SUBDBG("papi_event_code: %#x not found in native event tables\n", native[i].ni_papi_code); continue; } // this native index is positive so there was a mask with the event, the ntv_idx identifies which native event to use ntv_evt = (struct native_event_t *)(&(pe_ctx->event_table->native_events[ntv_idx])); SUBDBG("ntv_evt: %p\n", ntv_evt); SUBDBG("i: %d, pe_ctx->event_table->num_native_events: %d\n", i, pe_ctx->event_table->num_native_events); // Move this events hardware config values and other attributes to the perf_events attribute structure memcpy (&pe_ctl->events[i].attr, &ntv_evt->attr, sizeof(perf_event_attr_t)); // may need to update the attribute structure with information from event set level domain settings (values set by PAPI_set_domain) // only done if the event mask which controls each counting domain was not provided // get pointer to allocated name, will be NULL when adding preset events to event set char *aName = ntv_evt->allocated_name; if ((aName == NULL) || (strstr(aName, ":u=") == NULL)) { SUBDBG("set exclude_user attribute from eventset level domain flags, encode: %d, eventset: %d\n", pe_ctl->events[i].attr.exclude_user, !(pe_ctl->domain & PAPI_DOM_USER)); pe_ctl->events[i].attr.exclude_user = !(pe_ctl->domain & PAPI_DOM_USER); } if ((aName == NULL) || (strstr(aName, ":k=") == NULL)) { SUBDBG("set exclude_kernel attribute from eventset level domain flags, encode: %d, eventset: %d\n", pe_ctl->events[i].attr.exclude_kernel, !(pe_ctl->domain & PAPI_DOM_KERNEL)); pe_ctl->events[i].attr.exclude_kernel = !(pe_ctl->domain & PAPI_DOM_KERNEL); } // set the cpu number provided with an event mask if there was one (will be -1 if mask not provided) pe_ctl->events[i].cpu = ntv_evt->cpu; // if cpu event mask not provided, then set the cpu to use to what may have been set on call to PAPI_set_opt (will still be -1 if not called) if (pe_ctl->events[i].cpu == -1) { pe_ctl->events[i].cpu = pe_ctl->cpu; } } else { // This case happens when called from _pe_set_overflow and _pe_ctl // Those callers put things directly into the pe_ctl structure so it is already set for the open call } // Copy the inherit flag into the attribute block that will be passed to the kernel pe_ctl->events[i].attr.inherit = pe_ctl->inherit; /* Set the position in the native structure */ /* We just set up events linearly */ if ( native ) { native[i].ni_position = i; SUBDBG( "&native[%d]: %p, ni_papi_code: %#x, ni_event: %#x, ni_position: %d, ni_owners: %d\n", i, &(native[i]), native[i].ni_papi_code, native[i].ni_event, native[i].ni_position, native[i].ni_owners); } } if (count <= skipped_events) { SUBDBG("EXIT: No events to count, they all contained invalid umasks\n"); return PAPI_ENOEVNT; } pe_ctl->num_events = count - skipped_events; /* actuall open the events */ /* (why is this a separate function?) */ ret = open_pe_events( pe_ctx, pe_ctl ); if ( ret != PAPI_OK ) { SUBDBG("open_pe_events failed\n"); /* Restore values ? */ return ret; } SUBDBG( "EXIT: PAPI_OK\n" ); return PAPI_OK; } /********************************************************************/ /********************************************************************/ /* Start with functions that are exported via the module interface */ /********************************************************************/ /********************************************************************/ /* set the domain. perf_events allows per-event control of this, papi allows it to be set at the event level or at the event set level. */ /* this will set the event set level domain values but they only get used if no event level domain mask (u= or k=) was specified. */ static int _peu_set_domain( hwd_control_state_t *ctl, int domain) { pe_control_t *pe_ctl = ( pe_control_t *) ctl; SUBDBG("old control domain %d, new domain %d\n", pe_ctl->domain,domain); pe_ctl->domain = domain; return PAPI_OK; } /* Shutdown a thread */ static int _peu_shutdown_thread( hwd_context_t *ctx ) { pe_context_t *pe_ctx = ( pe_context_t *) ctx; pe_ctx->initialized=0; return PAPI_OK; } /* reset the hardware counters */ /* Note: PAPI_reset() does not necessarily call this */ /* unless the events are actually running. */ static int _peu_reset( hwd_context_t *ctx, hwd_control_state_t *ctl ) { int i, ret; pe_control_t *pe_ctl = ( pe_control_t *) ctl; ( void ) ctx; /*unused */ /* We need to reset all of the events, not just the group leaders */ for( i = 0; i < pe_ctl->num_events; i++ ) { ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL ); if ( ret == -1 ) { PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) " "returned error, Linux says: %s", pe_ctl->events[i].event_fd, strerror( errno ) ); return PAPI_ESYS; } } return PAPI_OK; } /* write (set) the hardware counters */ /* Current we do not support this. */ static int _peu_write( hwd_context_t *ctx, hwd_control_state_t *ctl, long long *from ) { ( void ) ctx; /*unused */ ( void ) ctl; /*unused */ ( void ) from; /*unused */ /* * Counters cannot be written. Do we need to virtualize the * counters so that they can be written, or perhaps modify code so that * they can be written? FIXME ? */ return PAPI_ENOSUPP; } /* * perf_event provides a complicated read interface. * the info returned by read() varies depending on whether * you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED, * PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set * * To simplify things we just always ask for everything. This might * lead to overhead when reading more than we need, but it makes the * read code a lot simpler than the original implementation we had here. * * For more info on the layout see include/linux/perf_event.h * */ static int _peu_read( hwd_context_t *ctx, hwd_control_state_t *ctl, long long **events, int flags ) { SUBDBG("ENTER: ctx: %p, ctl: %p, events: %p, flags: %#x\n", ctx, ctl, events, flags); ( void ) flags; /*unused */ int i, ret = -1; /* pe_context_t *pe_ctx = ( pe_context_t *) ctx; */ (void) ctx; /*unused*/ pe_control_t *pe_ctl = ( pe_control_t *) ctl; long long papi_pe_buffer[READ_BUFFER_SIZE]; long long tot_time_running, tot_time_enabled, scale; /* Handle case where we are multiplexing */ if (pe_ctl->multiplexed) { /* currently we handle multiplexing by having individual events */ /* so we read from each in turn. */ for ( i = 0; i < pe_ctl->num_events; i++ ) { ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, sizeof ( papi_pe_buffer ) ); if ( ret == -1 ) { PAPIERROR("read returned an error: ", strerror( errno )); SUBDBG("EXIT: PAPI_ESYS\n"); return PAPI_ESYS; } /* We should read 3 64-bit values from the counter */ if (ret<(signed)(3*sizeof(long long))) { PAPIERROR("Error! short read!\n"); SUBDBG("EXIT: PAPI_ESYS\n"); return PAPI_ESYS; } SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", pe_ctl->events[i].event_fd, (long)pe_ctl->tid, pe_ctl->events[i].cpu, ret); SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0], papi_pe_buffer[1],papi_pe_buffer[2]); tot_time_enabled = papi_pe_buffer[1]; tot_time_running = papi_pe_buffer[2]; SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * " "tot_time_enabled %lld) / tot_time_running %lld\n", i, 0,papi_pe_buffer[0], tot_time_enabled,tot_time_running); if (tot_time_running == tot_time_enabled) { /* No scaling needed */ pe_ctl->counts[i] = papi_pe_buffer[0]; } else if (tot_time_running && tot_time_enabled) { /* Scale factor of 100 to avoid overflows when computing */ /*enabled/running */ scale = (tot_time_enabled * 100LL) / tot_time_running; scale = scale * papi_pe_buffer[0]; scale = scale / 100LL; pe_ctl->counts[i] = scale; } else { /* This should not happen, but Phil reports it sometime does. */ SUBDBG("perf_event kernel bug(?) count, enabled, " "running: %lld, %lld, %lld\n", papi_pe_buffer[0],tot_time_enabled, tot_time_running); pe_ctl->counts[i] = papi_pe_buffer[0]; } } } /* Handle cases where we cannot use FORMAT GROUP */ else if (pe_ctl->inherit) { /* we must read each counter individually */ for ( i = 0; i < pe_ctl->num_events; i++ ) { ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, sizeof ( papi_pe_buffer ) ); if ( ret == -1 ) { PAPIERROR("read returned an error: ", strerror( errno )); SUBDBG("EXIT: PAPI_ESYS\n"); return PAPI_ESYS; } /* we should read one 64-bit value from each counter */ if (ret!=sizeof(long long)) { PAPIERROR("Error! short read!\n"); PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", pe_ctl->events[i].event_fd, (long)pe_ctl->tid, pe_ctl->events[i].cpu, ret); SUBDBG("EXIT: PAPI_ESYS\n"); return PAPI_ESYS; } SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", pe_ctl->events[i].event_fd, (long)pe_ctl->tid, pe_ctl->events[i].cpu, ret); SUBDBG("read: %lld\n",papi_pe_buffer[0]); pe_ctl->counts[i] = papi_pe_buffer[0]; } } /* Handle cases where we are using FORMAT_GROUP */ /* We assume only one group leader, in position 0 */ else { if (pe_ctl->events[0].group_leader_fd!=-1) { PAPIERROR("Was expecting group leader!\n"); } ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer, sizeof ( papi_pe_buffer ) ); if ( ret == -1 ) { PAPIERROR("read returned an error: ", strerror( errno )); SUBDBG("EXIT: PAPI_ESYS\n"); return PAPI_ESYS; } /* we read 1 64-bit value (number of events) then */ /* num_events more 64-bit values that hold the counts */ if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) { PAPIERROR("Error! short read!\n"); SUBDBG("EXIT: PAPI_ESYS\n"); return PAPI_ESYS; } SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", pe_ctl->events[0].event_fd, (long)pe_ctl->tid, pe_ctl->events[0].cpu, ret); { int j; for(j=0;jnum_events) { PAPIERROR("Error! Wrong number of events!\n"); SUBDBG("EXIT: PAPI_ESYS\n"); return PAPI_ESYS; } /* put the count values in their proper location */ for(i=0;inum_events;i++) { pe_ctl->counts[i] = papi_pe_buffer[1+i]; } } /* point PAPI to the values we read */ *events = pe_ctl->counts; SUBDBG("EXIT: PAPI_OK\n"); return PAPI_OK; } /* Start counting events */ static int _peu_start( hwd_context_t *ctx, hwd_control_state_t *ctl ) { int ret; int i; int did_something = 0; pe_context_t *pe_ctx = ( pe_context_t *) ctx; pe_control_t *pe_ctl = ( pe_control_t *) ctl; /* Reset the counters first. Is this necessary? */ ret = _peu_reset( pe_ctx, pe_ctl ); if ( ret ) { return ret; } /* Enable all of the group leaders */ /* All group leaders have a group_leader_fd of -1 */ for( i = 0; i < pe_ctl->num_events; i++ ) { if (pe_ctl->events[i].group_leader_fd == -1) { SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd); ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ; /* ioctls always return -1 on failure */ if (ret == -1) { PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n"); return PAPI_ESYS; } did_something++; } } if (!did_something) { PAPIERROR("Did not enable any counters.\n"); return PAPI_EBUG; } pe_ctx->state |= PERF_EVENTS_RUNNING; return PAPI_OK; } /* Stop all of the counters */ static int _peu_stop( hwd_context_t *ctx, hwd_control_state_t *ctl ) { int ret; int i; pe_context_t *pe_ctx = ( pe_context_t *) ctx; pe_control_t *pe_ctl = ( pe_control_t *) ctl; /* Just disable the group leaders */ for ( i = 0; i < pe_ctl->num_events; i++ ) { if ( pe_ctl->events[i].group_leader_fd == -1 ) { ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL); if ( ret == -1 ) { PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) " "returned error, Linux says: %s", pe_ctl->events[i].event_fd, strerror( errno ) ); return PAPI_EBUG; } } } pe_ctx->state &= ~PERF_EVENTS_RUNNING; return PAPI_OK; } /* Set various options on a control state */ static int _peu_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option ) { int ret; pe_context_t *pe_ctx = ( pe_context_t *) ctx; pe_control_t *pe_ctl = NULL; switch ( code ) { case PAPI_MULTIPLEX: pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state ); pe_ctl->multiplexed = 1; ret = _peu_update_control_state( pe_ctl, NULL, pe_ctl->num_events, pe_ctx ); if (ret != PAPI_OK) { pe_ctl->multiplexed = 0; } return ret; case PAPI_ATTACH: pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state ); pe_ctl->tid = option->attach.tid; /* If events have been already been added, something may */ /* have been done to the kernel, so update */ ret =_peu_update_control_state( pe_ctl, NULL, pe_ctl->num_events, pe_ctx); return ret; case PAPI_DETACH: pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state ); pe_ctl->tid = 0; return PAPI_OK; case PAPI_CPU_ATTACH: pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state ); /* this tells the kernel not to count for a thread */ /* should we warn if we try to set both? perf_event */ /* will reject it. */ pe_ctl->tid = -1; pe_ctl->cpu = option->cpu.cpu_num; return PAPI_OK; case PAPI_DOMAIN: pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state ); /* looks like we are allowed, so set event set level counting domains */ pe_ctl->domain = option->domain.domain; return PAPI_OK; case PAPI_GRANUL: pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state ); /* FIXME: we really don't support this yet */ switch ( option->granularity.granularity ) { case PAPI_GRN_PROCG: case PAPI_GRN_SYS_CPU: case PAPI_GRN_PROC: return PAPI_ECMP; /* Currently we only support thread and CPU granularity */ case PAPI_GRN_SYS: pe_ctl->granularity=PAPI_GRN_SYS; break; case PAPI_GRN_THR: pe_ctl->granularity=PAPI_GRN_THR; break; default: return PAPI_EINVAL; } return PAPI_OK; case PAPI_INHERIT: pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state ); if (option->inherit.inherit) { /* children will inherit counters */ pe_ctl->inherit = 1; } else { /* children won't inherit counters */ pe_ctl->inherit = 0; } return PAPI_OK; case PAPI_DATA_ADDRESS: return PAPI_ENOSUPP; case PAPI_INSTR_ADDRESS: return PAPI_ENOSUPP; case PAPI_DEF_ITIMER: return PAPI_ENOSUPP; case PAPI_DEF_MPX_NS: return PAPI_ENOSUPP; case PAPI_DEF_ITIMER_NS: return PAPI_ENOSUPP; default: return PAPI_ENOSUPP; } } static int _peu_ntv_enum_events( unsigned int *PapiEventCode, int modifier ) { if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; return _pe_libpfm4_ntv_enum_events(PapiEventCode, modifier, our_cidx, &uncore_native_event_table); } static int _peu_ntv_name_to_code( const char *name, unsigned int *event_code) { if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; return _pe_libpfm4_ntv_name_to_code(name,event_code, our_cidx, &uncore_native_event_table); } static int _peu_ntv_code_to_name(unsigned int EventCode, char *ntv_name, int len) { if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; return _pe_libpfm4_ntv_code_to_name(EventCode, ntv_name, len, &uncore_native_event_table); } static int _peu_ntv_code_to_descr( unsigned int EventCode, char *ntv_descr, int len) { if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; return _pe_libpfm4_ntv_code_to_descr(EventCode,ntv_descr,len, &uncore_native_event_table); } static int _peu_ntv_code_to_info(unsigned int EventCode, PAPI_event_info_t *info) { if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; return _pe_libpfm4_ntv_code_to_info(EventCode, info, &uncore_native_event_table); } /* Our component vector */ papi_vector_t _perf_event_uncore_vector = { .cmp_info = { /* component information (unspecified values initialized to 0) */ .name = "perf_event_uncore", .short_name = "peu", .version = "5.0", .description = "Linux perf_event CPU uncore and northbridge", .default_domain = PAPI_DOM_ALL, .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR, .default_granularity = PAPI_GRN_SYS, .available_granularities = PAPI_GRN_SYS, .num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS, /* component specific cmp_info initializations */ .fast_virtual_timer = 0, .attach = 1, .attach_must_ptrace = 1, .cpu = 1, .inherit = 1, .cntr_umasks = 1, }, /* sizes of framework-opaque component-private structures */ .size = { .context = sizeof ( pe_context_t ), .control_state = sizeof ( pe_control_t ), .reg_value = sizeof ( int ), .reg_alloc = sizeof ( int ), }, /* function pointers in this component */ .init_component = _peu_init_component, .shutdown_component = _peu_shutdown_component, .init_thread = _peu_init_thread, .init_control_state = _peu_init_control_state, .start = _peu_start, .stop = _peu_stop, .read = _peu_read, .shutdown_thread = _peu_shutdown_thread, .ctl = _peu_ctl, .update_control_state = _peu_update_control_state, .set_domain = _peu_set_domain, .reset = _peu_reset, .write = _peu_write, /* from counter name mapper */ .ntv_enum_events = _peu_ntv_enum_events, .ntv_name_to_code = _peu_ntv_name_to_code, .ntv_code_to_name = _peu_ntv_code_to_name, .ntv_code_to_descr = _peu_ntv_code_to_descr, .ntv_code_to_info = _peu_ntv_code_to_info, };