/* Blob Blame History Raw -- navigation text captured from a web code viewer; not part of the program */
#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <pthread.h>
#include <semaphore.h>
#include <inttypes.h>
#include <syscall.h>
#include <errno.h>
#include <stdarg.h>

#include <perfmon/perfmon.h>

#include "libpfms.h"

/*
 * Debug trace macro. The commented-out variant prints the calling function
 * name and line number to stderr ahead of the formatted message; the active
 * definition expands to nothing, so every dprint() call in this file is
 * compiled out.
 */
//#define dprint(format, arg...) fprintf(stderr, "%s.%d: " format , __FUNCTION__ , __LINE__, ## arg)
#define dprint(format, arg...)

/*
 * Commands stored in pfms_thread_t.cmd by the controlling thread and
 * dispatched by the switch in pfms_thread_mainloop().
 */
typedef enum {	CMD_NONE,	/* no operation, worker reports ret = 0 */
		CMD_CTX,	/* pfm_create_context() on a private copy of the context */
		CMD_LOAD,	/* pfm_load_context() */
		CMD_UNLOAD,	/* pfm_unload_context() */
		CMD_WPMCS,	/* pfm_write_pmcs() */
		CMD_WPMDS,	/* pfm_write_pmds() */
		CMD_RPMDS,	/* pfm_read_pmds() */
		CMD_STOP,	/* pfm_stop() */
		CMD_START,	/* pfm_start() */
		CMD_CLOSE	/* close() the worker's context descriptor */
} pfms_cmd_t;

/*
 * Reusable rendezvous barrier built on a mutex and a condition variable;
 * see barrier_init() and barrier_wait().
 */
typedef struct _barrier {
	pthread_mutex_t mutex;		/* protects counter and generation */
	pthread_cond_t	cond;		/* broadcast when the last thread arrives */
	uint32_t	counter;	/* arrivals still missing in the current round */
	uint32_t	max;		/* number of threads per round; counter is reset to this */
	uint64_t	generation; /* avoid race condition on wake-up */
} barrier_t;

/*
 * Per-CPU descriptor. In this file the type is only referenced by the
 * sizeof() printed in pfms_initialize(); none of its fields are read or
 * written here.
 */
typedef struct {
	uint32_t	cpu;
	uint32_t	fd;
	void		*smpl_vaddr;	/* presumably the sampling buffer address -- confirm against users of this type */
	size_t		smpl_buf_size;	/* presumably the sampling buffer size -- confirm */
} pfms_cpu_t;

/*
 * Descriptor of one per-CPU worker thread; the tds[] table holds one entry
 * per CPU, indexed by CPU number.
 */
typedef struct _pfms_thread {
	uint32_t	cpu;		/* never written in this file */
	pfms_cmd_t	cmd;		/* command to execute, set before cmd_sem is posted */
	void		*data;		/* command argument: context for CMD_CTX, register array for CMD_WPMCS/WPMDS/RPMDS */
	uint32_t	ndata;		/* number of elements in data for the register commands */
	sem_t		cmd_sem;	/* posted by the controlling thread to wake the worker */
	int		ret;		/* result of the last command, written by the worker */
	pthread_t	tid;		/* worker thread id; 0 is treated as "no thread yet" by create_wthreads() */
	barrier_t	*barrier; 	/* barrier of the owning session; NULL when the CPU is free */
} pfms_thread_t;

/*
 * Session object handed back to callers as the opaque `desc` pointer.
 * Worker descriptors belong to the session whose barrier address they hold.
 */
typedef struct  {
	barrier_t	barrier;	/* shared by the session's workers and the controlling thread */
	uint32_t	ncpus;		/* number of CPUs in the session */
} pfms_session_t;

/* number of online processors, set by pfms_initialize() */
static uint32_t	ncpus;
/* worker descriptor table with ncpus entries, allocated by pfms_initialize() */
static pfms_thread_t	*tds;
/* taken by pfms_create() and pfms_close() while entries of tds[] are claimed or released */
static pthread_mutex_t  tds_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Prepare barrier b so that a round completes once `count` threads have
 * called barrier_wait().
 *
 * Returns 0 on success, -1 if the mutex or the condition variable could not
 * be initialized.
 */
static int
barrier_init(barrier_t *b, uint32_t count)
{
	/*
	 * pthread_mutex_init() and pthread_cond_init() report failure with a
	 * non-zero error number, not with -1.
	 */
	if (pthread_mutex_init(&b->mutex, NULL) != 0)
		return -1;

	if (pthread_cond_init(&b->cond, NULL) != 0) {
		/* do not leave a half-initialized barrier behind */
		pthread_mutex_destroy(&b->mutex);
		return -1;
	}

	b->max = b->counter = count;
	b->generation = 0;

	return 0;
}

/*
 * Cleanup handler installed by barrier_wait(): unlocks the mutex of the
 * barrier passed in arg. The unlock status is only used by the debug trace.
 */
static void
cleanup_barrier(void *arg)
{
	barrier_t *barrier = arg;
	int status = pthread_mutex_unlock(&barrier->mutex);

	dprint("free barrier mutex r=%d\n", status);
	(void) status;
}

/*
 * Block the caller until b->max threads have called barrier_wait() on b.
 *
 * The last thread to arrive resets the counter, bumps the generation number
 * and wakes everybody; the other threads sleep on the condition variable
 * until they observe that change. Always returns 0.
 */
static int
barrier_wait(barrier_t *b)
{
	uint64_t generation;
	int oldstate;

	/* cleanup_barrier() unlocks b->mutex if the thread is cancelled below */
	pthread_cleanup_push(cleanup_barrier, b);

	pthread_mutex_lock(&b->mutex);

	/* explicit cancellation point, reached with the mutex held */
	pthread_testcancel();

	if (--b->counter == 0) {

		/* reset barrier */
		b->counter = b->max;
		/*
		 * bump generation number, this avoids thread getting stuck in the
		 * wake up loop below in case a thread just out of the barrier goes
		 * back in right away before all the thread from the previous "round"
		 * have "escaped".
		 */
		b->generation++;

		pthread_cond_broadcast(&b->cond);
	} else {

		/* remember the round we arrived in */
		generation = b->generation;

		/* make sure cancellation is enabled for the duration of the wait */
		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);

		/* loop until the round this thread arrived in has completed */
		while (b->counter != b->max && generation == b->generation) {
			pthread_cond_wait(&b->cond, &b->mutex);
		}

		/* restore the cancel state the caller came in with */
		pthread_setcancelstate(oldstate, NULL);
	}
	pthread_mutex_unlock(&b->mutex);

	/* normal exit: drop the handler without running it */
	pthread_cleanup_pop(0);

	return 0;
}

/*
 * Stand-in for pthread_setaffinity_np(). Getting that call to build while
 * keeping the pthread_*cancel() functions usable proved problematic with
 * both LinuxThreads and NPTL, so the affinity is set with raw system calls
 * until that situation settles.
 *
 * Restricts the calling thread to the single CPU `cpu`.
 * Returns the result of the sched_setaffinity system call, or -1 if the
 * bit vector cannot be allocated.
 */
static int
pin_cpu(uint32_t cpu)
{
	const pid_t self = syscall(__NR_gettid);
	const size_t nbytes = ncpus * sizeof(uint64_t);
	uint64_t *bits;
	int rc;

	bits = calloc(1, nbytes);
	if (bits == NULL) {
		dprint("CPU%u: cannot allocate bitvector\n", cpu);
		return -1;
	}

	/* one bit per CPU, 64 CPUs per word */
	bits[cpu / 64] = 1ULL << (cpu % 64);

	rc = syscall(__NR_sched_setaffinity, self, nbytes, bits);

	free(bits);

	return rc;
}

static void
pfms_thread_mainloop(void *arg)
{
	long k = (long )arg;
	uint32_t mycpu = (uint32_t)k;
	pfarg_ctx_t myctx, *ctx;
	pfarg_load_t load_args;
	int fd = -1;
	pfms_thread_t *td;
	sem_t *cmd_sem;
	int ret = 0;

	memset(&load_args, 0, sizeof(load_args));
	load_args.load_pid = mycpu;
	td = tds+mycpu;

	ret = pin_cpu(mycpu);
	dprint("CPU%u wthread created and pinned ret=%d\n", mycpu, ret);

	cmd_sem = &tds[mycpu].cmd_sem;

	for(;;) {
		dprint("CPU%u waiting for cmd\n", mycpu);

		sem_wait(cmd_sem);

		switch(td->cmd) {
			case CMD_NONE:
				ret = 0;
				break;

			case CMD_CTX:

				/*
				 * copy context to get private fd
				 */
				ctx = td->data;
				myctx = *ctx;

				fd = pfm_create_context(&myctx, NULL, NULL, 0);
				ret = fd < 0 ? -1 : 0;
				dprint("CPU%u CMD_CTX ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;

			case CMD_LOAD:
				ret = pfm_load_context(fd, &load_args);
				dprint("CPU%u CMD_LOAD ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_UNLOAD:
				ret = pfm_unload_context(fd);
				dprint("CPU%u CMD_UNLOAD ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_START:
				ret = pfm_start(fd, NULL);
				dprint("CPU%u CMD_START ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_STOP:
				ret = pfm_stop(fd);
				dprint("CPU%u CMD_STOP ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_WPMCS:
				ret = pfm_write_pmcs(fd,(pfarg_pmc_t *)td->data, td->ndata);
				dprint("CPU%u CMD_WPMCS ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_WPMDS:
				ret = pfm_write_pmds(fd,(pfarg_pmd_t *)td->data, td->ndata);
				dprint("CPU%u CMD_WPMDS ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_RPMDS:
				ret = pfm_read_pmds(fd,(pfarg_pmd_t *)td->data, td->ndata);
				dprint("CPU%u CMD_RPMDS ret=%d errno=%d fd=%d\n", mycpu, ret, errno, fd);
				break;
			case CMD_CLOSE:
				dprint("CPU%u CMD_CLOSE fd=%d\n", mycpu, fd);
				ret = close(fd);
				fd = -1;
				break;
			default:
				break;
		}
		td->ret = ret;

		dprint("CPU%u td->ret=%d\n", mycpu, ret);

		barrier_wait(td->barrier);
	}
}

/*
 * Initialize the command semaphore of tds[cpu] and start the worker thread
 * for that CPU.
 *
 * Returns 0 on success and a non-zero value on failure. On failure the
 * semaphore is destroyed again and tds[cpu].tid is reset to 0, the value
 * create_wthreads() uses to recognise a descriptor without a thread, so a
 * later attempt can retry cleanly.
 */
static int
create_one_wthread(int cpu)
{
	int ret;

	/* sem_init() returns -1 and sets errno on failure */
	if (sem_init(&tds[cpu].cmd_sem, 0, 0) == -1)
		return errno ? errno : -1;

	ret = pthread_create(&tds[cpu].tid,
			     NULL,
			     (void *(*)(void *))pfms_thread_mainloop,
			     (void *)(long)cpu);
	if (ret) {
		/* the contents of tid are unspecified after a failed pthread_create() */
		sem_destroy(&tds[cpu].cmd_sem);
		tds[cpu].tid = 0;
	}
	return ret;
}

/*
 * must be called with tds_lock held
 */
static int
create_wthreads(uint64_t *cpu_list, uint32_t n)
{
	uint64_t v;
	uint32_t i,k, cpu;
	int ret = 0;

	for(k=0, cpu = 0; k < n; k++, cpu+= 64) {
		v = cpu_list[k];
		for(i=0; v && i < 63; i++, v>>=1, cpu++) {
			if ((v & 0x1) && tds[cpu].tid == 0) {
				ret = create_one_wthread(cpu);
				if (ret) break;
			}
		}
	}

	if (ret)
		dprint("cannot create wthread on CPU%u\n", cpu);

	return ret;
}

int
pfms_initialize(void)
{
	printf("cpu_t=%zu thread=%zu session_t=%zu\n",
		sizeof(pfms_cpu_t),
		sizeof(pfms_thread_t),
		sizeof(pfms_session_t));

	ncpus = (uint32_t)sysconf(_SC_NPROCESSORS_ONLN);
	if (ncpus == -1) {
		dprint("cannot retrieve number of online processors\n");
		return -1;
	}

	dprint("configured for %u CPUs\n", ncpus);

	/*
	 * XXX: assuming CPU are contiguously indexed
	 */
	tds = calloc(ncpus, sizeof(*tds));
	if (tds == NULL) {
		dprint("cannot allocate thread descriptors\n");
		return -1;
	}
	return 0;
}

int
pfms_create(uint64_t *cpu_list, size_t n, pfarg_ctx_t *ctx, pfms_ovfl_t *ovfl, void **desc)
{
	uint64_t v;
	size_t k, i;
	uint32_t num, cpu;
	pfms_session_t *s;
	int ret;

	if (cpu_list == NULL || n == 0 || ctx == NULL || desc == NULL) {
		dprint("invalid parameters\n");
		return -1;
	}

	if ((ctx->ctx_flags & PFM_FL_SYSTEM_WIDE) == 0) {
		dprint("only works for system wide\n");
		return -1;
	}

	*desc = NULL;

	/*
	 * XXX: assuming CPU are contiguously indexed
	 */
	num = 0;
	for(k=0, cpu = 0; k < n; k++, cpu+=64) {
		v = cpu_list[k];
		for(i=0; v && i < 63; i++, v>>=1, cpu++) {
			if (v & 0x1) {
				if (cpu >= ncpus) {
					dprint("unavailable CPU%u\n", cpu);
					return -1;
				}
				num++;
			}
		}
	}

	if (num == 0)
		return 0;

	s = calloc(1, sizeof(*s));
	if (s == NULL) {
		dprint("cannot allocate %u contexts\n", num);
		return -1;
	}
	s->ncpus = num;

	printf("%u-way  session\n", num);

	/*
	 * +1 to account for main thread waiting
	 */
	ret = barrier_init(&s->barrier, num + 1);
	if (ret) {
		dprint("cannot init barrier\n");
		goto error_free;
	}

	/*
	 * lock thread descriptor table, no other create_session, close_session
	 * can occur
	 */
	pthread_mutex_lock(&tds_lock);

	if (create_wthreads(cpu_list, n))
		goto error_free_unlock;

	/*
	 * check all needed threads are available
	 */
	for(k=0, cpu = 0; k < n; k++, cpu += 64) {
		v = cpu_list[k];
		for(i=0; v && i < 63; i++, v>>=1, cpu++) {
			if (v & 0x1) {
				if (tds[cpu].barrier) {
					dprint("CPU%u already managing a session\n", cpu);
					goto error_free_unlock;
				}

			}
		}
	}

	/*
	 * send create context order
	 */
	for(k=0, cpu = 0; k < n; k++, cpu += 64) {
		v = cpu_list[k];
		for(i=0; v && i < 63; i++, v>>=1, cpu++) {
			if (v & 0x1) {
				tds[cpu].cmd  = CMD_CTX;
				tds[cpu].data = ctx;
				tds[cpu].barrier = &s->barrier;
				sem_post(&tds[cpu].cmd_sem);
			}
		}
	}
	barrier_wait(&s->barrier);

	ret = 0;

	/*
	 * check for errors
	 */
	for(k=0; k < ncpus; k++) {
		if (tds[k].barrier == &s->barrier) {
			ret = tds[k].ret;
			if (ret)
				break;
		}
	}
	/*
	 * undo if error found
	 */
	if (k < ncpus) {
		for(k=0; k < ncpus; k++) {
			if (tds[k].barrier == &s->barrier) {
				if (tds[k].ret == 0) {
					tds[k].cmd = CMD_CLOSE;
					sem_post(&tds[k].cmd_sem);
				}
				/* mark as free */
				tds[k].barrier = NULL;
			}
		}
	}
	pthread_mutex_unlock(&tds_lock);

	if (ret == 0) *desc = s;

	return ret ? -1 : 0;

error_free_unlock:
	pthread_mutex_unlock(&tds_lock);

error_free:
	free(s);
	return -1;
}

int
pfms_load(void *desc)
{
	uint32_t k;
	pfms_session_t *s;
	int ret;

	if (desc == NULL) {
		dprint("invalid parameters\n");
		return -1;
	}
	s = (pfms_session_t *)desc;

	if (s->ncpus == 0) {
		dprint("invalid session content 0 CPUS\n");
		return -1;
	}
	/*
	 * send create context order
	 */
	for(k=0; k < ncpus; k++) {
		if (tds[k].barrier == &s->barrier) {
			tds[k].cmd  = CMD_LOAD;
			sem_post(&tds[k].cmd_sem);
		}
	}

	barrier_wait(&s->barrier);

	ret = 0;

	/*
	 * check for errors
	 */
	for(k=0; k < ncpus; k++) {
		if (tds[k].barrier == &s->barrier) {
			ret = tds[k].ret;
			if (ret) {
				dprint("failure on CPU%u\n", k);
				break;
			}
		}
	}

	/*
	 * if error, unload all others
	 */
	if (k < ncpus) {
		for(k=0; k < ncpus; k++) {
			if (tds[k].barrier == &s->barrier) {
				if (tds[k].ret == 0) {
					tds[k].cmd = CMD_UNLOAD;
					sem_post(&tds[k].cmd_sem);
				}
			}
		}
	}
	return ret ? -1 : 0;
}

/*
 * Run one command on every CPU of a session and wait for completion.
 *
 * cmd   command to execute
 * desc  session descriptor
 * data  argument stored in each worker descriptor (same pointer for all)
 * n     element count stored alongside data
 *
 * Returns 0 if all workers reported success, -1 if desc is invalid or at
 * least one worker reported a failure. Nothing is undone on failure.
 */
static int
__pfms_do_simple_cmd(pfms_cmd_t cmd, void *desc, void *data, uint32_t n)
{
	pfms_session_t *s = (pfms_session_t *)desc;
	size_t k;
	int failed = 0;

	if (s == NULL) {
		dprint("invalid parameters\n");
		return -1;
	}

	if (s->ncpus == 0) {
		dprint("invalid session content 0 CPUS\n");
		return -1;
	}

	/* hand the command to each worker that belongs to this session */
	for (k = 0; k < ncpus; k++) {
		if (tds[k].barrier != &s->barrier)
			continue;
		tds[k].cmd  = cmd;
		tds[k].data = data;
		tds[k].ndata = n;
		sem_post(&tds[k].cmd_sem);
	}
	barrier_wait(&s->barrier);

	/* stop at the first worker that reports a failure */
	for (k = 0; k < ncpus && !failed; k++) {
		if (tds[k].barrier != &s->barrier)
			continue;
		if (tds[k].ret) {
			dprint("failure on CPU%zu\n", k);
			failed = 1;
		}
	}

	/* simple commands cannot be undone */
	return failed ? -1 : 0;
}

/*
 * Unload the context on every CPU of the session by issuing CMD_UNLOAD
 * through __pfms_do_simple_cmd().
 * Returns 0 on success, -1 if desc is invalid or any CPU reported a failure.
 */
int
pfms_unload(void *desc)
{
	return __pfms_do_simple_cmd(CMD_UNLOAD, desc, NULL, 0);
}

/*
 * Start monitoring on every CPU of the session by issuing CMD_START
 * through __pfms_do_simple_cmd().
 * Returns 0 on success, -1 if desc is invalid or any CPU reported a failure.
 */
int
pfms_start(void *desc)
{
	return __pfms_do_simple_cmd(CMD_START, desc, NULL, 0);
}

/*
 * Stop monitoring on every CPU of the session by issuing CMD_STOP
 * through __pfms_do_simple_cmd().
 * Returns 0 on success, -1 if desc is invalid or any CPU reported a failure.
 */
int
pfms_stop(void *desc)
{
	return __pfms_do_simple_cmd(CMD_STOP, desc, NULL, 0);
}

/*
 * Write the same n PMC descriptors from pmcs on every CPU of the session
 * (CMD_WPMCS through __pfms_do_simple_cmd()).
 * Returns 0 on success, -1 if desc is invalid or any CPU reported a failure.
 */
int
pfms_write_pmcs(void *desc, pfarg_pmc_t *pmcs, uint32_t n)
{
	return __pfms_do_simple_cmd(CMD_WPMCS, desc, pmcs, n);
}

/*
 * Write the same n PMD descriptors from pmds on every CPU of the session
 * (CMD_WPMDS through __pfms_do_simple_cmd()).
 * Returns 0 on success, -1 if desc is invalid or any CPU reported a failure.
 */
int
pfms_write_pmds(void *desc, pfarg_pmd_t *pmds, uint32_t n)
{
	return __pfms_do_simple_cmd(CMD_WPMDS, desc, pmds, n);
}

/*
 * Close the context on every CPU of the session, release the CPUs and free
 * the session. desc must not be used after this call.
 *
 * Returns 0 if every close succeeded, -1 if desc is invalid or any worker
 * reported a failure. The CPUs are released and the session is freed in
 * both cases once the close round has run.
 *
 * NOTE(review): the session, and with it the barrier, is freed shortly after
 * barrier_wait() returns here; workers may still be on their way out of the
 * barrier at that point -- confirm the barrier lifetime.
 */
int
pfms_close(void *desc)
{
	pfms_session_t *s = (pfms_session_t *)desc;
	size_t k;
	int status = 0;

	if (s == NULL) {
		dprint("invalid parameters\n");
		return -1;
	}

	if (s->ncpus == 0) {
		dprint("invalid session content 0 CPUS\n");
		return -1;
	}

	/* ask every worker of the session to close its context */
	for (k = 0; k < ncpus; k++) {
		if (tds[k].barrier != &s->barrier)
			continue;
		tds[k].cmd  = CMD_CLOSE;
		sem_post(&tds[k].cmd_sem);
	}
	barrier_wait(&s->barrier);

	/* collect the results and hand the CPUs back under the table lock */
	pthread_mutex_lock(&tds_lock);
	for (k = 0; k < ncpus; k++) {
		if (tds[k].barrier != &s->barrier)
			continue;
		if (tds[k].ret) {
			dprint("failure on CPU%zu\n", k);
		}
		status |= tds[k].ret;
		tds[k].barrier = NULL;
	}
	pthread_mutex_unlock(&tds_lock);

	free(s);

	/*
	 * XXX: we cannot undo close
	 */
	return status ? -1 : 0;
}

/*
 * Read PMD registers on every CPU of the session.
 *
 * pmds is split into equal consecutive slices, one per session CPU in
 * ascending CPU order; n is the total number of elements and must be a
 * multiple of the number of CPUs in the session.
 *
 * Returns 0 on success, -1 if desc is invalid, n is not a multiple of the
 * session's CPU count, or any CPU reported a failure.
 */
int
pfms_read_pmds(void *desc, pfarg_pmd_t *pmds, uint32_t n)
{
	pfms_session_t *s = (pfms_session_t *)desc;
	pfarg_pmd_t *slice = pmds;
	uint32_t k, pmds_per_cpu;
	int failed = 0;

	if (s == NULL) {
		dprint("invalid parameters\n");
		return -1;
	}

	if (s->ncpus == 0) {
		dprint("invalid session content 0 CPUS\n");
		return -1;
	}
	if (n % s->ncpus) {
		dprint("invalid number of pfarg_pmd_t provided, must be multiple of %u\n", s->ncpus);
		return -1;
	}
	pmds_per_cpu = n / s->ncpus;

	dprint("n=%u ncpus=%u per_cpu=%u\n", n, s->ncpus, pmds_per_cpu);

	/* each worker of the session reads into its own slice of the array */
	for (k = 0; k < ncpus; k++) {
		if (tds[k].barrier != &s->barrier)
			continue;
		tds[k].cmd  = CMD_RPMDS;
		tds[k].data = slice;
		tds[k].ndata= pmds_per_cpu;
		sem_post(&tds[k].cmd_sem);
		slice += pmds_per_cpu;
	}
	barrier_wait(&s->barrier);

	/* stop at the first worker that reports a failure */
	for (k = 0; k < ncpus && !failed; k++) {
		if (tds[k].barrier != &s->barrier)
			continue;
		if (tds[k].ret) {
			dprint("failure on CPU%u\n", k);
			failed = 1;
		}
	}

	/*
	 * cannot undo pfm_read_pmds
	 */
	return failed ? -1 : 0;
}
#if 0

/*
 * beginning of test program
 */
#include <perfmon/pfmlib.h>

#define NUM_PMCS PFMLIB_MAX_PMCS
#define NUM_PMDS PFMLIB_MAX_PMDS

/*
 * Print a printf-style message to stderr and terminate the process with
 * exit status 1. Never returns.
 */
static void __attribute__((noreturn))
fatal_error(char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	exit(1);
}

/*
 * Return the number of bits set in c.
 */
static uint32_t
popcount(uint64_t c)
{
	uint32_t nbits = 0;

	while (c) {
		nbits += (uint32_t)(c & 0x1);
		c >>= 1;
	}
	return nbits;
}
/*
 * Test program: counts cycles and retired instructions system-wide for ten
 * seconds on the CPUs selected by the bitmask given as first argument
 * (default 0x3) and prints the per-CPU counts.
 */
int
main(int argc, char **argv)
{
	pfarg_ctx_t ctx;
	pfarg_pmc_t pc[NUM_PMCS];
	pfarg_pmd_t *pd;
	pfmlib_input_param_t inp;
	pfmlib_output_param_t outp;
	uint64_t cpu_list;
	void *desc;
	unsigned int num_counters;
	uint32_t i, j, k, l, ncpus, npmds;
	size_t len;
	int ret;
	char *name;

	if (pfm_initialize() != PFMLIB_SUCCESS)
		fatal_error("cannot initialize libpfm\n");

	if (pfms_initialize())
		fatal_error("cannot initialize libpfms\n");

	pfm_get_num_counters(&num_counters);
	pfm_get_max_event_name_len(&len);

	name = malloc(len+1);
	if (name == NULL)
		fatal_error("cannot allocate memory for event name\n");

	memset(&ctx, 0, sizeof(ctx));
	memset(pc, 0, sizeof(pc));
	memset(&inp,0, sizeof(inp));
	memset(&outp,0, sizeof(outp));

	/* the mask is 64 bits wide, so parse it with the unsigned long long variant */
	cpu_list = argc > 1 ? strtoull(argv[1], NULL, 0) : 0x3;

	ncpus = popcount(cpu_list);

	if (pfm_get_cycle_event(&inp.pfp_events[0].event) != PFMLIB_SUCCESS)
		fatal_error("cannot find cycle event\n");

	if (pfm_get_inst_retired_event(&inp.pfp_events[1].event) != PFMLIB_SUCCESS)
		fatal_error("cannot find inst retired event\n");

	i = 2;

	inp.pfp_dfl_plm = PFM_PLM3|PFM_PLM0;

	if (i > num_counters) {
		i = num_counters;
		/* both arguments are unsigned */
		printf("too many events provided (max=%u events), using first %u event(s)\n", num_counters, i);
	}
	/*
	 * how many counters we use
	 */
	inp.pfp_event_count = i;

	/*
	 * indicate we are using the monitors for a system-wide session.
	 * This may impact the way the library sets up the PMC values.
	 */
	inp.pfp_flags = PFMLIB_PFP_SYSTEMWIDE;

	/*
	 * let the library figure out the values for the PMCS
	 */
	if ((ret=pfm_dispatch_events(&inp, NULL, &outp, NULL)) != PFMLIB_SUCCESS)
		fatal_error("cannot configure events: %s\n", pfm_strerror(ret));

	npmds = ncpus * inp.pfp_event_count;
	dprint("ncpus=%u npmds=%u\n", ncpus, npmds);

	pd = calloc(npmds, sizeof(pfarg_pmd_t));
	if (pd == NULL)
		fatal_error("cannot allocate pd array\n");

	for (i=0; i < outp.pfp_pmc_count; i++) {
		pc[i].reg_num   = outp.pfp_pmcs[i].reg_num;
		pc[i].reg_value = outp.pfp_pmcs[i].reg_value;
	}

	/*
	 * fill in the PMD register numbers, one group of
	 * inp.pfp_event_count entries per CPU
	 */
	for(l=0, k = 0; l < ncpus; l++) {
		for (i=0, j=0; i < inp.pfp_event_count; i++, k++) {
			pd[k].reg_num   = outp.pfp_pmcs[j].reg_pmd_num;
			for(; j < outp.pfp_pmc_count; j++)  if (outp.pfp_pmcs[j].reg_evt_idx != i) break;
		}
	}

	/*
	 * create a context on all CPUs we asked for
	 *
	 * libpfms only works for system-wide, so we set the flag in
	 * the master context. the context argument is not modified by
	 * call.
	 *
	 * desc is an opaque descriptor used to identify session.
	 */
	ctx.ctx_flags = PFM_FL_SYSTEM_WIDE;

	ret = pfms_create(&cpu_list, 1, &ctx, NULL, &desc);
	if (ret == -1)
		fatal_error("create error %d\n", ret);

	/*
	 * program the PMC registers on all CPUs of interest
	 */
	ret = pfms_write_pmcs(desc, pc, outp.pfp_pmc_count);
	if (ret == -1)
		fatal_error("write_pmcs error %d\n", ret);

	/*
	 * program the PMD registers on all CPUs of interest
	 */
	ret = pfms_write_pmds(desc, pd, inp.pfp_event_count);
	if (ret == -1)
		fatal_error("write_pmds error %d\n", ret);

	/*
	 * load context on all CPUs of interest
	 */
	ret = pfms_load(desc);
	if (ret == -1)
		fatal_error("load error %d\n", ret);

	/*
	 * start monitoring on all CPUs of interest
	 */
	ret = pfms_start(desc);
	if (ret == -1)
		fatal_error("start error %d\n", ret);

	/*
	 * simulate some work
	 */
	sleep(10);

	/*
	 * stop monitoring on all CPUs of interest
	 */
	ret = pfms_stop(desc);
	if (ret == -1)
		fatal_error("stop error %d\n", ret);

	/*
	 * read the PMD registers on all CPUs of interest.
	 * The pd[] array must be organized such that to
	 * read 2 PMDs on each CPU you need:
	 * 	- 2 * number of CPUs of interest
	 * 	- the first 2 elements of pd[] read on 1st CPU
	 * 	- the next  2 elements of pd[] read on the 2nd CPU
	 * 	- and so on
	 */
	ret = pfms_read_pmds(desc, pd, npmds);
	if (ret == -1)
		fatal_error("read_pmds error %d\n", ret);

	/*
	 * print per-CPU results
	 */
	for(j=0, k= 0; j < ncpus; j++) {
		for (i=0; i < inp.pfp_event_count; i++, k++) {
			pfm_get_full_event_name(&inp.pfp_events[i], name, len);
			/* j is unsigned, hence %u */
			printf("CPU%-3u PMD%u %20"PRIu64" %s\n",
			j,
			pd[k].reg_num,
			pd[k].reg_value,
			name);
		}
	}

	/*
	 * destroy context  on all CPUs of interest.
	 * After this call desc is invalid
	 */
	ret = pfms_close(desc);
	if (ret == -1)
		fatal_error("close error %d\n", ret);

	free(pd);
	free(name);

	return 0;
}
#endif