/*****************************************************************/
/********* Begin perf_event low-level code ***********************/
/*****************************************************************/
/* In case headers aren't new enough to have __NR_perf_event_open */
/* Fallback syscall numbers for perf_event_open.  They are only   */
/* used when the system headers did not define the macro already. */
/* Only powerpc, x86_64, i386 and arm are covered; on any other   */
/* architecture this block leaves the macro undefined.            */
#ifndef __NR_perf_event_open
#ifdef __powerpc__
#define __NR_perf_event_open 319
#elif defined(__x86_64__)
#define __NR_perf_event_open 298
#elif defined(__i386__)
#define __NR_perf_event_open 336
#elif defined(__arm__)
#define __NR_perf_event_open 364
#endif
#endif
/*
 * Thin wrapper that issues the raw perf_event_open system call via
 * syscall(), forwarding every argument unchanged.
 *
 *   hw_event - attribute structure describing the event
 *   pid      - forwarded as the pid argument
 *   cpu      - forwarded as the cpu argument
 *   group_fd - forwarded as the group_fd argument
 *   flags    - forwarded as the flags argument
 *
 * Returns the value produced by syscall(), narrowed to int before
 * being handed back as a long (per perf_event_open(2) this is a
 * file descriptor, or -1 on failure).
 */
static long
sys_perf_event_open( struct perf_event_attr *hw_event,
		     pid_t pid, int cpu, int group_fd, unsigned long flags )
{
	const int fd = syscall( __NR_perf_event_open,
				hw_event, pid, cpu, group_fd, flags );

	return fd;
}
#if defined(__x86_64__) || defined(__i386__)
/*
 * Execute the x86 "rdtsc" instruction.
 * The instruction leaves its result split across EAX (low half)
 * and EDX (high half); the two halves are glued back together
 * into a single 64-bit value.
 */
static inline unsigned long long rdtsc(void) {
	unsigned int lo, hi;
	unsigned long long tsc;

	__asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));

	tsc = hi;
	tsc <<= 32;
	tsc |= lo;

	return tsc;
}
/*
 * Execute the x86 "rdpmc" instruction for the given counter.
 * The counter number is passed in ECX; the result comes back in
 * EAX (low half) and EDX (high half) and is combined into one
 * 64-bit value.
 */
static inline unsigned long long rdpmc(unsigned int counter) {
	unsigned int lo, hi;
	unsigned long long value;

	__asm__ volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));

	value = hi;
	value = (value << 32) | lo;

	return value;
}
/* Compiler barrier: an empty asm statement with a "memory" clobber. */
/* No instruction is emitted; the clobber tells the compiler that    */
/* memory may have changed, so values it read before the barrier     */
/* must be re-read afterwards.                                       */
#define barrier() __asm__ volatile("" ::: "memory")
/* based on the code in include/uapi/linux/perf_event.h */
/*
 * Read the current value of an event directly from user space using
 * the perf_event mmap control page, without a read() system call.
 *
 *   addr - start of the event's mmap region (a perf_event_mmap_page)
 *   en   - if not NULL, receives the enabled time
 *   ru   - if not NULL, receives the running time
 *
 * Returns the event count: the offset saved by the kernel plus, when
 * rdpmc is permitted and the event index is valid, the value read
 * from the hardware counter.
 */
static inline unsigned long long mmap_read_self(void *addr,
					 unsigned long long *en,
					 unsigned long long *ru) {

	struct perf_event_mmap_page *pc = addr;

	uint32_t seq, time_mult, time_shift, index, width;
	unsigned int shift;
	int64_t count;
	uint64_t enabled, running;
	uint64_t cyc, time_offset;
	int64_t pmc = 0;
	uint64_t quot, rem;
	uint64_t delta;

	do {
		/* The kernel increments pc->lock any time */
		/* perf_event_update_userpage() is called */
		/* So by checking now, and the end, we */
		/* can see if an update happened while we */
		/* were trying to read things, and re-try */
		/* if something changed */
		/* The barrier ensures we get the most up to date */
		/* version of the pc->lock variable */
		seq=pc->lock;
		barrier();

		/* Every attempt starts from scratch: a delta computed */
		/* during an attempt that is later thrown away must    */
		/* not leak into the retry                             */
		delta = 0;

		/* For multiplexing */
		/* time_enabled is time the event was enabled */
		enabled = pc->time_enabled;
		/* time_running is time the event was actually running */
		running = pc->time_running;

		/* if cap_user_time is set, we can use rdtsc */
		/* to calculate more exact enabled/running time */
		/* for more accurate multiplex calculations */
		if ( (pc->cap_user_time) && (enabled != running)) {
			cyc = rdtsc();
			time_offset = pc->time_offset;
			time_mult = pc->time_mult;
			time_shift = pc->time_shift;

			/* Scale the cycle count by time_mult / 2^time_shift, */
			/* split into quotient and remainder parts */
			quot=(cyc>>time_shift);
			rem = cyc & (((uint64_t)1 << time_shift) - 1);
			delta = time_offset + (quot * time_mult) +
				((rem * time_mult) >> time_shift);
		}
		enabled+=delta;

		/* actually do the measurement */

		/* Index of register to read */
		/* 0 means stopped/not-active */
		/* Need to subtract 1 to get actual index to rdpmc() */
		index = pc->index;

		/* Sign extension is done by shifting the value up to */
		/* the top of 64 bits and arithmetically back down.   */
		/* Shifting a 64-bit value by 64 or more is undefined */
		/* so widths outside 1..63 leave the value untouched. */
		width = pc->pmc_width;
		shift = (width >= 1 && width <= 63) ? (64 - width) : 0;

		/* count is the value of the counter the last time */
		/* the kernel read it */
		/* If we don't sign extend it, we get large negative */
		/* numbers which break if an IOC_RESET is done */
		/* The left shift is done on the unsigned type since */
		/* left-shifting a negative signed value is undefined */
		count = pc->offset;
		count = ((int64_t)((uint64_t)count << shift)) >> shift;

		/* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */
		/* while actual perf_event.h has cap_user_rdpmc */

		/* Only read if rdpmc enabled and event index valid */
		/* Otherwise return the older (out of date?) count value */
		if (pc->cap_usr_rdpmc && index) {

			/* Read counter value */
			pmc = rdpmc(index-1);

			/* sign extend result */
			pmc = ((int64_t)((uint64_t)pmc << shift)) >> shift;

			/* add current count into the existing kernel count */
			count+=pmc;

			/* Only adjust if index is valid */
			running+=delta;
		}

		barrier();

	} while (pc->lock != seq);

	if (en) *en=enabled;
	if (ru) *ru=running;

	return count;
}
#else
/*
 * Stand-in used on architectures without a user-space counter read.
 *
 *   addr - ignored
 *   en   - if not NULL, set to 0
 *   ru   - if not NULL, set to 0
 *
 * Always returns (unsigned long long)-1.  Like the x86 version,
 * either output pointer may be NULL.
 */
static inline unsigned long long mmap_read_self(void *addr,
					 unsigned long long *en,
					 unsigned long long *ru) {

	(void)addr;

	if (en) *en=0;
	if (ru) *ru=0;

	return (unsigned long long)(-1);
}
#endif
/* These functions are based on builtin-record.c in the */
/* kernel's tools/perf directory. */
/* This code is from a really ancient version of perf */
/* And should be updated/commented properly */
/*
 * Fetch the current data_head of the event's mmap ring buffer.
 *
 *   pe - event whose mmap_buf points at the perf_event_mmap_page
 *
 * Returns the data_head value, or 0 (after reporting an error) when
 * the buffer has not been mapped.
 */
static uint64_t
mmap_read_head( pe_event_info_t *pe )
{
	struct perf_event_mmap_page *pc = pe->mmap_buf;
	/* Kept at the full width of the return type: holding the  */
	/* position in an int would truncate and sign-extend it    */
	/* once more than 2GB had passed through the buffer        */
	uint64_t head;

	if ( pc == NULL ) {
		PAPIERROR( "perf_event_mmap_page is NULL" );
		return 0;
	}

	head = pc->data_head;
	/* read barrier: data_head must be read before any of the */
	/* record data it describes */
	rmb();

	return head;
}
/*
 * Publish a new data_tail for the event's mmap ring buffer.
 *
 *   pe   - event whose mmap_buf points at the perf_event_mmap_page
 *   tail - position up to which the records have been consumed
 *
 * Does nothing when the buffer has not been mapped.
 */
static void
mmap_write_tail( pe_event_info_t *pe, uint64_t tail )
{
	struct perf_event_mmap_page *pc = pe->mmap_buf;

	/* No control page means there is no tail to update */
	if ( pc == NULL ) {
		return;
	}

	/* ensure all reads are done before we write the tail out. */
	/* A full barrier keeps the reads of the record data from  */
	/* being reordered after the store below.                  */
	__sync_synchronize();
	pc->data_tail = tail;
}
/* Does the kernel define these somewhere? */
/* Record layout used by mmap_read() for PERF_RECORD_SAMPLE:       */
/* the generic record header followed by one 64-bit value.         */
/* NOTE(review): this presumably only matches events whose         */
/* sample_type is exactly PERF_SAMPLE_IP -- confirm against the    */
/* code that configures the event.                                 */
struct ip_event {
struct perf_event_header header;
/* address handed to _papi_hwi_dispatch_profile() by mmap_read() */
uint64_t ip;
};
/* Record layout used by mmap_read() for PERF_RECORD_LOST:        */
/* the generic record header followed by two 64-bit values.       */
struct lost_event {
struct perf_event_header header;
/* printed by mmap_read() as the id of the counter that overflowed */
uint64_t id;
/* printed by mmap_read() as the number of events that were lost */
uint64_t lost;
};
/* One record taken from the mmap buffer, viewed in any of the    */
/* shapes mmap_read() understands.  Every member begins with a    */
/* struct perf_event_header, so header.type can be inspected      */
/* first to decide which of the other members applies.            */
typedef union event_union {
struct perf_event_header header;
struct ip_event ip;
struct lost_event lost;
} perf_sample_event_t;
/*
 * Consume every record currently waiting in an event's mmap buffer.
 *
 *   cidx          - component index, selects (*thr)->running_eventset[]
 *   thr           - thread whose running eventset receives the samples
 *   pe            - event owning the mmap buffer; pe->tail is the
 *                   position reached by the previous call
 *   profile_index - passed through to _papi_hwi_dispatch_profile()
 *
 * Walks the records from pe->tail up to the current data_head.
 * PERF_RECORD_SAMPLE records are dispatched to the profiling code,
 * PERF_RECORD_LOST and unknown records are only logged.  Afterwards
 * the new position is stored in pe->tail and written back as the
 * buffer's data_tail.
 */
static void
mmap_read( int cidx, ThreadInfo_t **thr, pe_event_info_t *pe,
	   int profile_index )
{
	uint64_t head = mmap_read_head( pe );
	uint64_t old = pe->tail;
	unsigned char *data;
	int64_t diff;

	/* mmap_read_head() has already reported the missing buffer; */
	/* there is nothing to read and no tail to write back        */
	if ( pe->mmap_buf == NULL ) {
		return;
	}

	/* The record data starts one page into the mapping, */
	/* right after the control page                      */
	data = ((unsigned char*)pe->mmap_buf) + getpagesize();

	/* Compare at full 64-bit width: squeezing the difference */
	/* into an int would misjudge its sign for large values   */
	diff = ( int64_t ) ( head - old );
	if ( diff < 0 ) {
		SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64
			", tail = %" PRIu64 ". Discarding samples.\n", head, old );
		/* head points to a known good entry, start there. */
		old = head;
	}

	for( ; old != head; ) {
		perf_sample_event_t *event = ( perf_sample_event_t * )& data[old & pe->mask];
		perf_sample_event_t event_copy;

		size_t size = event->header.size;

		/* A record that claims to be empty would never move */
		/* the position forward and the loop would spin      */
		/* forever; give up on whatever is left instead      */
		if ( size == 0 ) {
			SUBDBG( "WARNING: zero-sized record in mmap data at %" PRIu64
				". Discarding samples.\n", old );
			old = head;
			break;
		}

		/* Event straddles the mmap boundary -- header should always */
		/* be inside due to u64 alignment of output.                 */
		if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) {
			uint64_t offset = old;
			uint64_t len = min( sizeof ( *event ), size ), cpy;
			void *dst = &event_copy;

			/* Reassemble the record in event_copy, one chunk */
			/* per side of the wrap-around point; never copy  */
			/* more than event_copy can hold                  */
			do {
				cpy = min( pe->mask + 1 - ( offset & pe->mask ), len );
				memcpy( dst, &data[offset & pe->mask], cpy );
				offset += cpy;
				dst = ((unsigned char*)dst) + cpy;
				len -= cpy;
			} while ( len );

			event = &event_copy;
		}
		old += size;

		SUBDBG( "event->type = %08x\n", event->header.type );
		SUBDBG( "event->size = %d\n", event->header.size );

		switch ( event->header.type ) {
		case PERF_RECORD_SAMPLE:
			_papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx],
						    ( caddr_t ) ( unsigned long ) event->ip.ip,
						    0, profile_index );
			break;

		case PERF_RECORD_LOST:
			SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64
				" events were lost.\n"
				"Loss was recorded when counter id %#"PRIx64
				" overflowed.\n", event->lost.lost, event->lost.id );
			break;

		default:
			SUBDBG( "Error: unexpected header type - %d\n",
				event->header.type );
			break;
		}
	}

	/* Remember how far we got and tell the kernel as well */
	pe->tail = old;
	mmap_write_tail( pe, old );
}