/*****************************************************************/
/********* Begin perf_event low-level code ***********************/
/*****************************************************************/

/* In case headers aren't new enough to have __NR_perf_event_open */
#ifndef __NR_perf_event_open

#ifdef __powerpc__
#define __NR_perf_event_open	319
#elif defined(__x86_64__)
#define __NR_perf_event_open	298
#elif defined(__i386__)
#define __NR_perf_event_open	336
#elif defined(__arm__)
#define __NR_perf_event_open	364
#endif

#endif

static long
sys_perf_event_open( struct perf_event_attr *hw_event,
		pid_t pid, int cpu, int group_fd, unsigned long flags )
{
	long ret;

	ret = syscall( __NR_perf_event_open,
			hw_event, pid, cpu, group_fd, flags );

	return ret;
}
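
/* Illustrative sketch only (not compiled): one typical way to use the */
/* wrapper above is to zero a perf_event_attr, request a hardware      */
/* event, and read the count back with a plain read().  The attr and   */
/* ioctl names are the standard <linux/perf_event.h> ones; the helper  */
/* name below is made up for this example.                             */
#if 0
static long long
example_count_instructions( void )
{
	struct perf_event_attr attr;
	long long count = -1;
	int fd;

	memset( &attr, 0, sizeof(attr) );
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	/* pid=0: calling process, cpu=-1: any cpu, no group, no flags */
	fd = sys_perf_event_open( &attr, 0, -1, -1, 0 );
	if ( fd < 0 ) return -1;

	ioctl( fd, PERF_EVENT_IOC_ENABLE, 0 );
	/* ... code being measured ... */
	ioctl( fd, PERF_EVENT_IOC_DISABLE, 0 );

	if ( read( fd, &count, sizeof(count) ) != sizeof(count) ) count = -1;
	close( fd );

	return count;
}
#endif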

#if defined(__x86_64__) || defined(__i386__)


static inline unsigned long long rdtsc(void) {

	unsigned a,d;

	__asm__ volatile("rdtsc" : "=a" (a), "=d" (d));

	return ((unsigned long long)a) | (((unsigned long long)d) << 32);
}

static inline unsigned long long rdpmc(unsigned int counter) {

	unsigned int low, high;

	__asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));

	return (unsigned long long)low | ((unsigned long long)high) << 32;
}
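
/* Note: rdpmc reads the counter selected by ECX directly from user */
/* space; it faults unless the kernel has enabled user-space rdpmc  */
/* access for this event (see the cap_usr_rdpmc check below).       */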

#define barrier() __asm__ volatile("" ::: "memory")

/* based on the code in include/uapi/linux/perf_event.h */
static inline unsigned long long mmap_read_self(void *addr,
					 unsigned long long *en,
					 unsigned long long *ru) {

	struct perf_event_mmap_page *pc = addr;

	uint32_t seq, time_mult, time_shift, index, width;
	int64_t count;
	uint64_t enabled, running;
	uint64_t cyc, time_offset;
	int64_t pmc = 0;
	uint64_t quot, rem;
	uint64_t delta = 0;


	do {
		/* The kernel increments pc->lock whenever */
		/* perf_event_update_userpage() is called. */
		/* By sampling it here and again at the end we can tell */
		/* whether an update happened while we were reading and */
		/* retry if something changed. */
		/* The barrier ensures we see the current value of the */
		/* pc->lock variable rather than a stale one. */

		seq=pc->lock;
		barrier();

		/* For multiplexing */
		/* time_enabled is time the event was enabled */
		enabled = pc->time_enabled;
		/* time_running is time the event was actually running */
		running = pc->time_running;

		/* if cap_user_time is set, we can use rdtsc */
		/* to calculate more exact enabled/running time */
		/* for more accurate multiplex calculations */
		if ( (pc->cap_user_time) && (enabled != running)) {
			cyc = rdtsc();
			time_offset = pc->time_offset;
			time_mult = pc->time_mult;
			time_shift = pc->time_shift;

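			/* Scaled time is time_offset + (cyc*time_mult) >> time_shift, */
			/* computed as separate quotient/remainder terms so the 64-bit */
			/* multiplication cannot overflow.                              */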
			quot=(cyc>>time_shift);
			rem = cyc & (((uint64_t)1 << time_shift) - 1);
			delta = time_offset + (quot * time_mult) +
				((rem * time_mult) >> time_shift);
		}
		enabled+=delta;

		/* actually do the measurement */

		/* Index of register to read */
		/* 0 means stopped/not-active */
		/* Need to subtract 1 to get actual index to rdpmc() */
		index = pc->index;

		/* count is the value of the counter the last time */
		/* the kernel read it */
		/* If we don't sign extend it, we get large negative */
		/* numbers which break if an IOC_RESET is done */
		width = pc->pmc_width;
		count = pc->offset;
		count<<=(64-width);
		count>>=(64-width);
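		/* e.g. with a 48-bit counter (width==48) the two shifts */
		/* turn a raw 0xffffffffffff into -1 rather than 2^48-1  */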

		/* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */
		/* while actual perf_event.h has cap_user_rdpmc */

		/* Only read if rdpmc enabled and event index valid */
		/* Otherwise return the older (out of date?) count value */
		if (pc->cap_usr_rdpmc && index) {

			/* Read counter value */
			pmc = rdpmc(index-1);

			/* sign extend result */
			pmc<<=(64-width);
			pmc>>=(64-width);

			/* add current count into the existing kernel count */
			count+=pmc;

			/* Only adjust if index is valid */
			running+=delta;
		}

		barrier();

	} while (pc->lock != seq);

	if (en) *en=enabled;
	if (ru) *ru=running;

	return count;
}
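
/* Illustrative sketch only (not compiled): mmap_read_self() expects the */
/* first (control) page of the event fd to have been mapped, which for   */
/* self-monitoring reads can be done read-only.  The helper name is made */
/* up for this example.                                                  */
#if 0
static unsigned long long
example_rdpmc_read( int fd )
{
	void *addr;
	unsigned long long enabled, running, count;

	/* map just the control page */
	addr = mmap( NULL, getpagesize(), PROT_READ, MAP_SHARED, fd, 0 );
	if ( addr == MAP_FAILED ) return (unsigned long long)(-1);

	count = mmap_read_self( addr, &enabled, &running );

	munmap( addr, getpagesize() );

	return count;
}
#endif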

#else
static inline unsigned long long mmap_read_self(void *addr,
					 unsigned long long *en,
					 unsigned long long *ru) {

	(void)addr;

	if (en) *en=0;
	if (ru) *ru=0;

	return (unsigned long long)(-1);
}

#endif

/* These functions are based on builtin-record.c in the  */
/* kernel's tools/perf directory.                        */
/* This code is from a really ancient version of perf */
/* and should be updated/commented properly.           */


static uint64_t
mmap_read_head( pe_event_info_t *pe )
{
	struct perf_event_mmap_page *pc = pe->mmap_buf;
	uint64_t head;

	if ( pc == NULL ) {
		PAPIERROR( "perf_event_mmap_page is NULL" );
		return 0;
	}

	head = pc->data_head;
	rmb();

	return head;
}

static void
mmap_write_tail( pe_event_info_t *pe, uint64_t tail )
{
	struct perf_event_mmap_page *pc = pe->mmap_buf;

	/* ensure all reads are done before we write the tail out. */
	pc->data_tail = tail;
}
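
/* Illustrative sketch only (not compiled): the ring buffer that        */
/* mmap_read_head()/mmap_write_tail() (and mmap_read() below) walk is   */
/* normally mapped as one control page plus a power-of-two number of    */
/* data pages, with pe->mask set to the data size minus one so that     */
/* "offset & pe->mask" wraps inside the data area.  The event_fd field  */
/* name is a guess at how pe_event_info_t stores the descriptor.        */
#if 0
static int
example_setup_ring_buffer( pe_event_info_t *pe, size_t nr_data_pages )
{
	/* one control page followed by nr_data_pages (a power of two) */
	size_t map_size = ( nr_data_pages + 1 ) * getpagesize();

	pe->mmap_buf = mmap( NULL, map_size, PROT_READ | PROT_WRITE,
				MAP_SHARED, pe->event_fd, 0 );
	if ( pe->mmap_buf == MAP_FAILED ) return -1;

	pe->mask = nr_data_pages * getpagesize() - 1;
	pe->tail = 0;

	return 0;
}
#endif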

/* Does the kernel define these somewhere? */
struct ip_event {
	struct perf_event_header header;
 	uint64_t ip;
};
struct lost_event {
	struct perf_event_header header;
	uint64_t id;
	uint64_t lost;
};
typedef union event_union {
	struct perf_event_header header;
	struct ip_event ip;
	struct lost_event lost;
} perf_sample_event_t;
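
/* The ip_event and lost_event layouts above match the PERF_RECORD_SAMPLE */
/* (with PERF_SAMPLE_IP) and PERF_RECORD_LOST records described in the    */
/* kernel's include/uapi/linux/perf_event.h; the kernel documents those   */
/* layouts in comments but does not export struct definitions for them.   */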

/* Should re-write with comments if we ever figure out what's */
/* going on here.                                             */
static void
mmap_read( int cidx, ThreadInfo_t **thr, pe_event_info_t *pe,
           int profile_index )
{
	uint64_t head = mmap_read_head( pe );
	uint64_t old = pe->tail;
	unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize();
	int64_t diff;

	diff = head - old;
	if ( diff < 0 ) {
		SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64
			",  tail = %" PRIu64 ". Discarding samples.\n", head, old );
		/* head points to a known good entry, start there. */
		old = head;
	}

	for( ; old != head; ) {
		perf_sample_event_t *event = ( perf_sample_event_t * )& data[old & pe->mask];
		perf_sample_event_t event_copy;
		size_t size = event->header.size;

		/* Event straddles the mmap boundary -- header should always */
		/* be inside due to u64 alignment of output.                 */
		if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) {
			uint64_t offset = old;
			uint64_t len = min( sizeof ( *event ), size ), cpy;
			void *dst = &event_copy;

			do {
				cpy = min( pe->mask + 1 - ( offset & pe->mask ), len );
				memcpy( dst, &data[offset & pe->mask], cpy );
				offset += cpy;
				dst = ((unsigned char*)dst) + cpy;
				len -= cpy;
			} while ( len );

			event = &event_copy;
		}
		old += size;

		SUBDBG( "event->type = %08x\n", event->header.type );
		SUBDBG( "event->size = %d\n", event->header.size );

		switch ( event->header.type ) {
			case PERF_RECORD_SAMPLE:
				_papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx],
					( caddr_t ) ( unsigned long ) event->ip.ip,
					0, profile_index );
				break;

			case PERF_RECORD_LOST:
				SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64
					" events were lost.\n"
					"Loss was recorded when counter id %#"PRIx64
					" overflowed.\n", event->lost.lost, event->lost.id );
				break;
			default:
				SUBDBG( "Error: unexpected header type - %d\n",
					event->header.type );
				break;
		}
	}

	pe->tail = old;
	mmap_write_tail( pe, old );
}