/* rtop.c - a simple PMU-based CPU utilization tool
 *
 * Copyright (c) 2004-2006 Hewlett-Packard Development Company, L.P.
 * Contributed by Stephane Eranian <eranian@hpl.hp.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
 * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#ifndef _GNU_SOURCE
  #define _GNU_SOURCE /* for getline */
#endif
#include <sys/types.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <errno.h>
#include <unistd.h>
#include <string.h>
#include <syscall.h>
#include <getopt.h>
#include <curses.h>
#include <termios.h>
#include <signal.h>
#include <ctype.h>
#include <math.h>
#include <pthread.h>
#include <semaphore.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/poll.h>

#include <perfmon/pfmlib.h>
#include <perfmon/perfmon.h>

#include "detect_pmcs.h"

#define SWITCH_TIMEOUT	1000000000 /* in nanoseconds */

#define RTOP_VERSION "0.1"

#define MAX_EVT_NAME_LEN	128
#define RTOP_NUM_PMCS 		4
#define RTOP_NUM_PMDS 		4

/*
 * max number of cpus (threads) supported
 *
 * The value must be a multiple of RTOP_CPUMASK_BITS so that
 * RTOP_CPUMASK_COUNT covers every CPU exactly.
 */
#define RTOP_MAX_CPUS		1024 /* MUST BE power of 2 */
#define RTOP_CPUMASK_BITS	(sizeof(unsigned long)<<3)
#define RTOP_CPUMASK_COUNT	(RTOP_MAX_CPUS/RTOP_CPUMASK_BITS)

/*
 * CPU mask helpers: m is an rtop_cpumask_t, g is the CPU number.
 * g is evaluated twice, so it must not have side effects.
 */
#define RTOP_CPUMASK_SET(m, g)		((m)[(g)/RTOP_CPUMASK_BITS] |=  (1UL << ((g) % RTOP_CPUMASK_BITS)))
#define RTOP_CPUMASK_CLEAR(m, g)	((m)[(g)/RTOP_CPUMASK_BITS] &= ~(1UL << ((g) % RTOP_CPUMASK_BITS)))
#define RTOP_CPUMASK_ISSET(m, g)	((m)[(g)/RTOP_CPUMASK_BITS] &   (1UL << ((g) % RTOP_CPUMASK_BITS)))

/* one bit per possible CPU, packed into unsigned long words */
typedef unsigned long rtop_cpumask_t[RTOP_CPUMASK_COUNT];

/*
 * Program-wide configuration, filled in from the command line and
 * from system probing (CPU count, CPU speed).
 */
typedef struct {
	struct {
		int	opt_verbose;	/* set by -v / --verbose */
		int	opt_delay;	/* refresh delay in second */
		int	opt_delay_set;	/* non-zero once a delay was given on the command line */
	} program_opt_flags;
	rtop_cpumask_t	cpu_mask;	  /* which CPUs to use in system wide mode */
	long		online_cpus;	/* value returned by sysconf(_SC_NPROCESSORS_ONLN) */
	long		selected_cpus;	/* number of CPUs selected in cpu_mask */
	unsigned long	cpu_mhz;	/* value returned by find_cpu_speed() */
	char		*outfile;	/* --outfile argument, NULL when not given */
} program_options_t;

/* shorthands so the flags can be accessed directly as options.opt_xxx */
#define opt_verbose program_opt_flags.opt_verbose
#define opt_delay program_opt_flags.opt_delay
#define opt_delay_set program_opt_flags.opt_delay_set


typedef struct {
	char *name;
	unsigned int plm;
} eventdesc_t;

/*
 * State of a worker thread, as published in thread_info[].state.
 */
typedef enum {
	THREAD_STARTED,	/* value 0; never assigned explicitly in this file */
	THREAD_RUN,	/* session attached, worker ready to measure */
	THREAD_DONE,	/* worker left its measurement loop normally */
	THREAD_ERROR	/* worker hit an error and is terminating */
} thread_state_t;


/*
 * Per-worker descriptor; one entry per monitored CPU in thread_info[].
 */
typedef struct {
	pthread_t	tid;		/* logical thread identification */
	long		cpuid;		/* CPU this worker pins itself to and monitors */
	unsigned int	id;		/* index of this entry in thread_info[] */
	thread_state_t	state;		/* updated by the worker, checked by the master */
	int		is_last;	/* set to 1 by the master on the last created worker */
	sem_t		his_sem;	/* posted by the master, waited on by the worker */
	sem_t		my_sem;		/* posted by the worker, waited on by the master */
	FILE		*fp;		/* not referenced by the code in this file */
	uint64_t	nsamples;	/* number of samples processed by the set handler */
	int		has_msg;	/* not referenced by the code in this file */
} thread_desc_t;

/*
 * Description of one event set: the events to measure, the register
 * values computed by libpfm and the handler invoked at each refresh.
 */
typedef struct _setdesc_t {
	pfarg_pmr_t		pc[RTOP_NUM_PMCS];	/* PMC registers, written with PFM_RW_PMC */
	pfarg_pmr_t		pd[RTOP_NUM_PMDS];	/* PMD registers, written/read with PFM_RW_PMD */
	pfmlib_input_param_t	inp;			/* input to pfm_dispatch_events() */
	pfmlib_output_param_t	outp;			/* output of pfm_dispatch_events() */
	uint16_t		set_id;			/* copied into pfarg_set_desc_t.set_id */
	uint32_t		set_flags;		/* copied into pfarg_set_desc_t.set_flags */
	uint32_t		set_timeout; /* actual timeout */
	/* called by the worker thread for this set at each refresh */
	int			(*handler)(int fd, FILE *fp, thread_desc_t *td, struct _setdesc_t *my_sdesc);
	void 			*data;			/* handler private data, NULL until first use */
	eventdesc_t		*evt_desc;		/* event list, ended by an entry with a NULL name */
} setdesc_t;

/*
 * Reusable barrier built on a mutex and a condition variable.
 */
typedef struct _barrier {
	pthread_mutex_t mutex;		/* protects all the fields below */
	pthread_cond_t	cond;		/* broadcast when the last participant arrives */
	unsigned long	counter;	/* participants still expected in the current round */
	unsigned long	max;		/* total number of participants */
	unsigned long   generation; /* avoid race condition on wake-up */
} barrier_t;

/*
 * Global state of the measurement session, written by the master thread.
 */
typedef enum {
	SESSION_INIT,		/* value 0, the initial value of the static session_state */
	SESSION_RUN,		/* workers keep measuring while this state is set */
	SESSION_STOP,		/* set by the master when leaving its polling loop */
	SESSION_ABORTED		/* set by the master before cancelling the workers */
} session_state_t;

/*
 * Private data of handler_set0(): counter values seen at the previous
 * refresh, used to compute per-interval deltas.
 */
typedef struct {
	uint64_t prev_k_cycles;	/* previous value of pd[0] (event counted at PFM_PLM0) */
	uint64_t prev_u_cycles;	/* previous value of pd[1] (event counted at PFM_PLM3) */
} set0_data_t;

static barrier_t 		barrier;	/* synchronizes the master and all workers */
static session_state_t		session_state;	/* written by the master, read by workers */
static program_options_t 	options;	/* command line and probed settings */
static thread_desc_t		*thread_info;	/* one descriptor per worker thread */
static struct termios		saved_tty;	/* tty settings restored by close_screen() */
/*
 * Set from the SIGINT handler and polled by the master loop, hence
 * volatile sig_atomic_t rather than a plain int.
 */
static volatile sig_atomic_t	time_to_quit;
static int			term_rows, term_cols;	/* current terminal dimensions */

/*
 * Events of set0: the cycle event ("*") counted once at PFM_PLM0 and
 * once at PFM_PLM3. The entry with a NULL name ends the table.
 */
static eventdesc_t	set0_evt[]={
	{ .name  = "*", .plm = PFM_PLM0 },
	{ .name  = "*", .plm = PFM_PLM3 },
	{ .name  = NULL}
 };

/* handler of set0, defined further down */
static int handler_set0(int fd, FILE *fp, thread_desc_t *td, setdesc_t *my_sdesc);

/*
 * Template table of event sets. setup_measurement() fills in the
 * libpfm parameters and register values; each worker thread then
 * works on its own copy of the table.
 */
static setdesc_t setdesc_tab[]={
	{ .set_id    = 0,
	  .evt_desc  = set0_evt,
	  .handler   = handler_set0
	}
};
/* number of entries in setdesc_tab */
#define RTOP_NUM_SDESC (sizeof(setdesc_tab)/sizeof(setdesc_t))

/*
 * Initialize barrier b for count participants.
 *
 * Returns 0 on success, -1 when the mutex or the condition variable
 * cannot be initialized.
 */
static int
barrier_init(barrier_t *b, unsigned long count)
{
	/*
	 * pthread_mutex_init()/pthread_cond_init() return 0 on success and
	 * an error number on failure; they never return -1.
	 */
	if (pthread_mutex_init(&b->mutex, NULL) != 0)
		return -1;

	if (pthread_cond_init(&b->cond, NULL) != 0) {
		/* do not leave a half-initialized barrier behind */
		pthread_mutex_destroy(&b->mutex);
		return -1;
	}

	b->max = b->counter = count;
	b->generation = 0;

	return 0;
}

/*
 * Cancellation cleanup handler for barrier_wait(): releases the
 * barrier mutex. arg is the barrier_t being waited on.
 */
static void
cleanup_barrier(void *arg)
{
	barrier_t *bar = arg;

	/* the unlock status is deliberately ignored */
	(void)pthread_mutex_unlock(&bar->mutex);
}


/*
 * Wait on barrier b until all b->max participants have arrived.
 *
 * The last thread to arrive resets the counter, bumps the generation
 * number and wakes up everybody else. The other threads sleep on the
 * condition variable with cancellation enabled.
 *
 * Always returns 0.
 */
static int
barrier_wait(barrier_t *b)
{
	unsigned long generation;
	int oldstate;

	/* make sure the mutex is released if the thread is cancelled while holding it */
	pthread_cleanup_push(cleanup_barrier, b);

	pthread_mutex_lock(&b->mutex);

	/* act on a pending cancellation request, if any */
	pthread_testcancel();

	if (--b->counter == 0) {

		/* reset barrier */
		b->counter = b->max;
		/*
		 * bump generation number, this avoids thread getting stuck in the
		 * wake up loop below in case a thread just out of the barrier goes
		 * back in right away before all the thread from the previous "round"
		 * have "escaped".
		 */
		b->generation++;

		pthread_cond_broadcast(&b->cond);
	} else {

		/* remember the round we entered in */
		generation = b->generation;

		pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate);

		/* sleep until the barrier was reset or the round has changed */
		while (b->counter != b->max && generation == b->generation) {
			pthread_cond_wait(&b->cond, &b->mutex);
		}

		pthread_setcancelstate(oldstate, NULL);
	}
	pthread_mutex_unlock(&b->mutex);

	/* normal exit: remove the handler without running it */
	pthread_cleanup_pop(0);

	return 0;
}


/* declared ahead of its definition to attach the noreturn attribute */
static void fatal_error(char *fmt,...) __attribute__((noreturn));

/*
 * Print a printf-style message on stderr and carry on.
 */
static void
warning(char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	(void)vfprintf(stderr, fmt, args);
	va_end(args);
}


/*
 * Print a printf-style message on stderr, then terminate the
 * program with exit status 1. Does not return.
 */
static void
fatal_error(char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	(void)vfprintf(stderr, fmt, args);
	va_end(args);

	exit(1);
}

/*
 * Return the kernel thread id of the calling thread, obtained
 * directly through the gettid system call.
 */
int
gettid(void)
{
	return (int)syscall(__NR_gettid);
}

#ifndef __NR_sched_setaffinity
#error "you need to define __NR_sched_setaffinity"
#endif

/*
 * Pin the calling thread onto the given CPU.
 *
 * Hack to get this to work without libc support: the system call is
 * invoked directly. The affinity mask is sized according to the CPU
 * number because a single unsigned long can only describe the first
 * CPUs and shifting by more than its width is undefined.
 *
 * Returns the value of the system call, or -1 if the mask cannot be
 * allocated.
 */
int
pin_self_cpu(unsigned int cpu)
{
	const size_t bits_per_word = sizeof(unsigned long) * CHAR_BIT;
	size_t nwords = (size_t)cpu / bits_per_word + 1;
	unsigned long *my_mask;
	int ret;

	my_mask = calloc(nwords, sizeof(*my_mask));
	if (my_mask == NULL)
		return -1;

	my_mask[cpu / bits_per_word] = 1UL << (cpu % bits_per_word);

	ret = syscall(__NR_sched_setaffinity, gettid(), nwords * sizeof(*my_mask), my_mask);

	free(my_mask);

	return ret;
}

/*
 * SIGINT handler: ask the master loop to terminate.
 */
static void
sigint_handler(int n)
{
	(void)n;	/* the signal number is not needed */

	time_to_quit = 1;
}

/*
 * Determine the CPU speed by parsing /proc/cpuinfo.
 *
 * Returns the rounded value of the first "cpu MHz" entry. When no such
 * entry is found, returns the rounded value of the last "BogoMIPS"
 * entry seen. Returns 0 when the file cannot be opened or when neither
 * entry can be parsed.
 */
static unsigned long
find_cpu_speed(void)
{
	FILE *fp1;
	unsigned long f1 = 0, f2 = 0;
	char buffer[128], *p, *value;
	float fl;

	fp1 = fopen("/proc/cpuinfo", "r");
	if (fp1 == NULL)
		return 0;

	while (fgets(buffer, sizeof(buffer), fp1) != NULL) {

		/*
		 * lines without a separator (blank lines, tail of a line
		 * longer than the buffer) carry no key: skip them instead
		 * of giving up on the rest of the file
		 */
		p = strchr(buffer, ':');
		if (p == NULL)
			continue;

		/* split into key (buffer) and value (text after the colon) */
		*p = '\0';
		value = p + 1;

		/*
		 * sscanf() skips the leading blanks and stops at the newline,
		 * so the value needs no trimming. Its result is checked so that
		 * an unparsable value is never used.
		 */
		if (!strncasecmp("cpu MHz", buffer, 7)) {
			if (sscanf(value, "%f", &fl) == 1 && fl > 0.0f) {
				f1 = lroundf(fl);
				break;
			}
		} else if (!strncasecmp("BogoMIPS", buffer, 8)) {
			if (sscanf(value, "%f", &fl) == 1 && fl > 0.0f)
				f2 = lroundf(fl);
		}
	}
	fclose(fp1);

	return f1 == 0 ? f2 : f1;
}

/*
 * Query the terminal size on file descriptor 1 and record it in
 * term_rows/term_cols. When 10 rows or fewer are reported, a size of
 * 80x24 is assumed. Terminates the program when the size cannot be
 * obtained or when there are fewer rows than selected CPUs.
 */
static void
get_term_size(void)
{
	struct winsize dims;

	if (ioctl(1, TIOCGWINSZ, &dims) == -1)
		fatal_error("cannot determine screen size\n");

	if (dims.ws_row <= 10) {
		term_cols = 80;
		term_rows = 24;
	} else {
		term_cols = dims.ws_col;
		term_rows = dims.ws_row;
	}

	if (term_rows < options.selected_cpus)
		fatal_error("you need at least %ld rows on your terminal to display all CPUs\n", options.selected_cpus);
}

/*
 * SIGWINCH handler: re-read the terminal dimensions.
 */
static void
sigwinch_handler(int n)
{
	(void)n;	/* the signal number is not needed */

	get_term_size();
}

/*
 * Save the tty settings of file descriptor 0, then start curses and
 * size the screen to the current terminal dimensions.
 */
static void
setup_screen(void)
{
	if (tcgetattr(0, &saved_tty) == -1)
		fatal_error("cannot save tty settings\n");

	get_term_size();

	initscr();
	nocbreak();
	resizeterm(term_rows, term_cols);
}

/*
 * Leave curses mode and restore the tty settings saved by
 * setup_screen(). A failure to restore is only reported.
 */
static void
close_screen(void)
{
	endwin();

	if (tcsetattr(0, TCSAFLUSH, &saved_tty) == -1)
		warning("cannot restore tty settings\n");
}


/*
 * Install the SIGINT and SIGWINCH handlers of the master thread.
 * Terminates the program if a handler cannot be installed.
 */
static void
setup_signals(void)
{
	struct sigaction act;

	/*
	 * SIGINT is a asynchronous signal
	 * sent to the process (not a specific thread). POSIX states
	 * that one and only one thread will execute the handler. This
	 * could be any thread that does not have the signal blocked.
	 */

	/*
	 * install SIGINT handler
	 */
	memset(&act, 0, sizeof(act));
	sigemptyset(&act.sa_mask);
	act.sa_handler = sigint_handler;
	if (sigaction(SIGINT, &act, NULL) == -1)
		fatal_error("cannot install SIGINT handler\n");

	/*
	 * install SIGWINCH handler
	 */
	memset(&act, 0, sizeof(act));
	sigemptyset(&act.sa_mask);
	act.sa_handler = sigwinch_handler;
	if (sigaction(SIGWINCH, &act, NULL) == -1)
		fatal_error("cannot install SIGWINCH handler\n");
}

/*
 * Block SIGINT and SIGWINCH in the calling (worker) thread.
 */
static void
setup_worker_signals(void)
{
	sigset_t blocked;

	/*
	 * SIGINT is a asynchronous signal
	 * sent to the process (not a specific thread). POSIX states
	 * that one and only one thread will execute the handler. This
	 * could be any thread that does not have the signal blocked.
	 */

	/*
	 * block SIGINT and SIGWINCH, forcing them to master thread only
	 */
	sigemptyset(&blocked);
	sigaddset(&blocked, SIGINT);
	sigaddset(&blocked, SIGWINCH);

	pthread_sigmask(SIG_BLOCK, &blocked, NULL);
}


/*
 * Long options. Fields are { name, has_arg, flag, val }: when flag is
 * 0, getopt_long() returns val, which main() dispatches on.
 */
static struct option rtop_cmd_options[]={
	{ "help", 0, 0, 1 },
	{ "version", 0, 0, 2 },
	{ "delay", 1, 0, 3 },	/* takes a value: main() reads optarg for it */
	{ "cpu-list", 1, 0, 4 },
	{ "outfile", 1, 0, 5 },

	{ "verbose", 0, &options.opt_verbose, 1 },
	{ 0, 0, 0, 0}
};



/*
 * Handler of set0: read the two cycle counters, convert the deltas
 * since the previous call into percentages of the expected number of
 * cycles, and display them with printw(). When fp is not NULL the
 * sample is also appended to it.
 *
 * Returns 0 on success, -1 if the private data cannot be allocated
 * or the counters cannot be read.
 */
int
handler_set0(int fd, FILE *fp, thread_desc_t *td, setdesc_t *my_sdesc)
{
	const long cpu = td->cpuid;
	set0_data_t *prev;
	uint64_t expected_cycles;
	double sys_pct, usr_pct, idle_pct;

	/*
	 * private data is allocated (zeroed) on the first call
	 */
	prev = my_sdesc->data;
	if (prev == NULL) {
		prev = calloc(1, sizeof(set0_data_t));
		my_sdesc->data = prev;
		if (prev == NULL)
			return -1;
	}

	/*
	 * now read the results
	 */
	if (pfm_read(fd, 0, PFM_RW_PMD, my_sdesc->pd, my_sdesc->inp.pfp_event_count * sizeof(pfarg_pmr_t)) == -1) {
		warning( "CPU%ld pfm_readerror errno %d\n", cpu, errno);
		return -1;
	}

	/*
	 * expected maximum duration with monitoring active for this set
	 * set_timeout is in nanoseconds, we need to divide mhz by 1000
	 * to get cycles.
	 * NOTE(review): yields 0 when options.cpu_mhz is 0, which makes the
	 * divisions below produce inf/NaN - confirm callers rule that out.
	 */
	expected_cycles = (my_sdesc->set_timeout*(uint64_t)options.cpu_mhz)/1000;

	sys_pct  = (double)(my_sdesc->pd[0].reg_value - prev->prev_k_cycles)*100.0/ (double)expected_cycles;
	usr_pct  = (double)(my_sdesc->pd[1].reg_value - prev->prev_u_cycles)*100.0/ (double)expected_cycles;
	idle_pct = 100.0 - (sys_pct + usr_pct);

	/*
	 * adjust for rounding errors
	 */
	if (idle_pct < 0.0)
		idle_pct = 0.0;
	if (idle_pct > 100.0)
		idle_pct = 100.0;
	if (sys_pct > 100.0)
		sys_pct = 100.0;
	if (usr_pct > 100.0)
		usr_pct = 100.0;

	printw("CPU%-2ld %6.2f%% usr %6.2f%% sys %6.2f%% idle\n",
		cpu,
		usr_pct,
		sys_pct,
		idle_pct);

	/* remember the raw counts for the next delta */
	prev->prev_k_cycles = my_sdesc->pd[0].reg_value;
	prev->prev_u_cycles = my_sdesc->pd[1].reg_value;

	if (fp != NULL) {
		fprintf(fp, "%"PRIu64" %6.2f %6.2f %6.2f\n",
			td->nsamples,
			usr_pct,
			sys_pct,
			idle_pct);
	}
	td->nsamples++;

	return 0;
}

/*
 * Body of a worker thread; data points to the thread_desc_t of the
 * worker.
 *
 * The worker pins itself onto its CPU, creates a system-wide perfmon
 * session, programs the registers of every set and attaches to the
 * CPU. It then loops: start monitoring, wait for the master to post
 * his_sem, stop monitoring, run the set handlers, and signal the
 * master through my_sem.
 *
 * Never returns: the thread terminates through pthread_exit() with 0
 * on normal termination and ~0 on error.
 */
static void
do_measure_one_cpu(void *data)
{
	thread_desc_t	*arg= (thread_desc_t *)data;
	pfarg_set_desc_t	setd;
	setdesc_t 	*my_sdesc, *my_sdesc_tab = NULL;
	long		mycpu;
	sem_t		*his_sem;
	unsigned int	id, j;
	int		fd = -1, ret;	/* fd stays -1 until the session exists */
	int		old_rows;
	char		cpu_str[32];
	char		*fn;
	FILE		*fp = NULL;

	setup_worker_signals();

	mycpu    = arg->cpuid;
	id       = arg->id;
	his_sem  = &arg->his_sem;
	old_rows = term_rows;

	/*
	 * optional per-CPU output file: <outfile>.cpu<N>
	 */
	if (options.outfile) {
		snprintf(cpu_str, sizeof(cpu_str), ".cpu%ld", mycpu);

		fn = malloc(strlen(options.outfile)+1+strlen(cpu_str));
		if (fn == NULL) goto error;

		strcpy(fn, options.outfile);
		strcat(fn, cpu_str);

		fp = fopen(fn, "w");
		if (fp == NULL) {
			warning("cannot open %s\n", fn);
			free(fn);
			goto error;
		}
		free(fn);
		fprintf(fp, "# Results for CPU%ld\n"
				"# sample delay %d seconds\n"
				"# Column1 : sample number\n"
				"# Column2 : %% user time\n"
				"# Column3 : %% system time\n"
				"# Column4 : %% idle\n"
				"# Column5 : kernel entry-exit\n",
				mycpu,
				options.opt_delay);

	}

	memset(&setd, 0, sizeof(setd));

	/* a pinning failure is reported but is not fatal */
	ret = pin_self_cpu(mycpu);
	if (ret) {
		warning("CPU%ld cannot pin\n", mycpu);
	}

	/*
	 * each worker works on its own copy of the set table
	 */
	my_sdesc_tab = malloc(sizeof(setdesc_t)*RTOP_NUM_SDESC);
	if (my_sdesc_tab == NULL) {
		warning("CPU%ld cannot allocate sdesc\n", mycpu);
		goto error;
	}

	memcpy(my_sdesc_tab, setdesc_tab, sizeof(setdesc_t)*RTOP_NUM_SDESC);

	fd = pfm_create(PFM_FL_SYSTEM_WIDE, NULL);
	if (fd == -1) {
		if (errno == ENOSYS) {
			fatal_error("Your kernel does not have performance monitoring support!\n");
		}
		warning("CPU%ld cannot create session: %d\n", mycpu, errno);
		goto error;
	}

	/*
	 * WARNING: on processors where the idle loop goes into some power-saving
	 * state, the results of this program may be incorrect
	 */
	for(j=0; j < RTOP_NUM_SDESC; j++) {

		my_sdesc       = my_sdesc_tab+j;

		setd.set_id    = my_sdesc->set_id;
		setd.set_flags = my_sdesc->set_flags;
		setd.set_timeout =  SWITCH_TIMEOUT; /* in nsecs */

		/*
		 * do not bother if we have only one set
		 */
		if (RTOP_NUM_SDESC > 1 && pfm_create_sets(fd, 0, &setd, 1) == -1) {
			warning("CPU%ld cannot create set%u: %d\n", mycpu, j, errno);
			goto error;
		}
		my_sdesc->set_timeout = setd.set_timeout;

		if (pfm_write(fd, 0, PFM_RW_PMC, my_sdesc->pc, my_sdesc->outp.pfp_pmc_count * sizeof(pfarg_pmr_t)) == -1) {
			warning("CPU%ld pfm_write error errno %d\n", mycpu, errno);
			goto error;
		}

		/*
		 * To be read, each PMD must be either written or declared
		 * as being part of a sample (reg_smpl_pmds)
		 */
		if (pfm_write(fd, 0, PFM_RW_PMD, my_sdesc->pd, my_sdesc->inp.pfp_event_count * sizeof(pfarg_pmr_t)) == -1) {
			warning("CPU%ld pfm_write(PMD) error errno %d\n", mycpu, errno);
			goto error;
		}
	}

	/*
	 * in system-wide mode, this field must provide the CPU the caller wants
	 * to monitor. The kernel checks and if calling from the wrong CPU, the
	 * call fails. The affinity is not affected.
	 */
	if (pfm_attach(fd, 0, mycpu) == -1) {
		warning("CPU%ld pfm_attach error errno %d\n", mycpu, errno);
		goto error;
	}

	thread_info[id].state = THREAD_RUN;

	barrier_wait(&barrier);

	/*
	 * must wait until we are sure the master is out of its thread_create loop
	 */

	barrier_wait(&barrier);

	for(;session_state == SESSION_RUN;) {

		if (pfm_set_state(fd, 0, PFM_ST_START) == -1) {
			warning("CPU%ld pfm_set_state(start) error errno %d\n", mycpu, errno);
			goto error;
		}

		/*
		 * wait for order from master
		 */
		sem_wait(his_sem);

		if (pfm_set_state(fd, 0, PFM_ST_STOP) == -1) {
			warning("CPU%ld pfm_set_state(stop) error %d\n", mycpu, errno);
			goto error;
		}

		/* follow terminal size changes recorded by the SIGWINCH handler */
		if (old_rows != term_rows) {
			resizeterm(term_rows, term_cols);
			old_rows = term_rows;
		}

		for(j=0; j < RTOP_NUM_SDESC; j++) {

			/* one screen line per worker and per set */
			move(id*RTOP_NUM_SDESC+j, 0);

			if (my_sdesc_tab[j].handler)
				(*my_sdesc_tab[j].handler)(fd, fp, arg, my_sdesc_tab+j);
		}
		if (session_state == SESSION_RUN) {
			sem_post(&arg->my_sem);
			barrier_wait(&barrier);
		}
	}
	if (fp) fclose(fp);

	close(fd);

	/* release the private copy of the set table and the handler data */
	for(j=0; j < RTOP_NUM_SDESC; j++)
		free(my_sdesc_tab[j].data);
	free(my_sdesc_tab);

	thread_info[id].state = THREAD_DONE;

	pthread_exit((void *)(0));
error:
	thread_info[id].state = THREAD_ERROR;
	barrier_wait(&barrier);
	if (fp) fclose(fp);
	/* the session may not exist yet when the error happened early */
	if (fd != -1) close(fd);
	if (my_sdesc_tab) {
		for(j=0; j < RTOP_NUM_SDESC; j++)
			free(my_sdesc_tab[j].data);
		free(my_sdesc_tab);
	}
	pthread_exit((void *)(~0));
}

static void
mainloop(void)
{
	long i, j, ncpus = 0;
	int ret;
	void *retval;
	struct pollfd	fds;

	ncpus = options.selected_cpus;

	barrier_init(&barrier, ncpus+1);

	thread_info = malloc(sizeof(thread_desc_t)*ncpus);
	if (thread_info == NULL) {
		fatal_error("cannot allocate thread_desc for %ld CPUs\n", ncpus);
	}

	for(i=0, j = 0; ncpus; i++) {

		if (RTOP_CPUMASK_ISSET(options.cpu_mask, i) == 0) continue;

		thread_info[j].id    = j;
		thread_info[j].cpuid = i;

		sem_init(&thread_info[j].his_sem, 0, 0);
		sem_init(&thread_info[j].my_sem, 0, 0);

		ret = pthread_create(&thread_info[j].tid, 
				     NULL, 
				     (void *(*)(void *))do_measure_one_cpu, 
				     (void *)(thread_info+j));
		if (ret != 0) goto abort;

		ncpus--;
		j++;
	}

	/* set last marker */
	thread_info[j-1].is_last = 1;

	ncpus = j;

	barrier_wait(&barrier);
	/*
	 * check if some threads got problems
	 */
	for(i=0; i < ncpus ; i++) {
		if (thread_info[i].state == THREAD_ERROR) {
			printw("aborting\n"); refresh();
			goto abort;
		}
	}

	session_state = SESSION_RUN;

	barrier_wait(&barrier);

	fds.fd      = 0;
	fds.events  = POLLIN;
	fds.revents = 0;

	for(;time_to_quit == 0;) {
		ret = poll(&fds, 1, options.opt_delay*1000);
		switch(ret) {
			case 0:
				for(i=0; i < ncpus ; i++) {
					/* give order to print */
					sem_post(&thread_info[i].his_sem);
					/* wait for thread to be done */
					sem_wait(&thread_info[i].my_sem);
				}
				/* give order to start measuring again */
				refresh();
				barrier_wait(&barrier);
				break;
			case -1:
				/* restart in case of signal */
				if (errno == EINTR)
					continue;
				warning("polling error: %s\n", strerror(errno));
				 /* fall through */
			default: time_to_quit = 1;
		}
	}
	session_state = SESSION_STOP;

	/*
	 * get worker thread out of their mainloop
	 */
	for (i=0; i < ncpus; i++)
		sem_post(&thread_info[i].his_sem);
join_all:
	for(i=0; i< ncpus; i++) {
		ret = pthread_join(thread_info[i].tid, &retval);
		if (ret !=0) fatal_error("cannot join thread %ld\n", i);

	}
	free(thread_info);
	return;
abort:
	session_state = SESSION_ABORTED;

	for(i=0; i < ncpus; i++) {
		pthread_cancel(thread_info[i].tid);
	}
	goto join_all;
}

static void
setup_measurement(void)
{
	pfarg_sinfo_t sif;
	pfmlib_options_t pfmlib_options;
	eventdesc_t *evt;
	setdesc_t *sdesc;
	pfmlib_event_t trigger_event;
	unsigned int i, j;
	int ret;

	/*
	 * pass options to library (optional)
	 */
	memset(&pfmlib_options, 0, sizeof(pfmlib_options));
	pfmlib_options.pfm_debug = 0;
	pfmlib_options.pfm_verbose = 0;
	pfm_set_options(&pfmlib_options);

	/*
	 * Initialize pfm library (required before we can use it)
	 */
	ret = pfm_initialize();
	if (ret != PFMLIB_SUCCESS)
		fatal_error("Cannot initialize library: %s\n", pfm_strerror(ret));

	if (pfm_get_cycle_event(&trigger_event) != PFMLIB_SUCCESS)
		fatal_error("cannot find cycle event for trigger\n");

	memset(&sif, 0, sizeof(sif));
	get_sif(PFM_FL_SYSTEM_WIDE, &sif);

	for(i=0; i < RTOP_NUM_SDESC; i++) {

		sdesc = setdesc_tab+i;

		sdesc->inp.pfp_dfl_plm   = PFM_PLM3|PFM_PLM0;
		/*
		 * indicate we are using the monitors for a system-wide session.
		 * This may impact the way the library sets up the PMC values.
		 */
		sdesc->inp.pfp_flags = PFMLIB_PFP_SYSTEMWIDE;
		evt = sdesc->evt_desc;

		for(j=0; evt[j].name ; j++) {
			if (*evt[j].name == '*')
				sdesc->inp.pfp_events[j] = trigger_event;
			else if (pfm_find_full_event(evt[j].name, &sdesc->inp.pfp_events[j]) != PFMLIB_SUCCESS)
				fatal_error("cannot find %s event\n", evt[j].name);
			sdesc->inp.pfp_events[j].plm   = evt[j].plm;
		}

		/*
		 * how many counters we use in this set (add the overflow trigger)
		 */
		sdesc->inp.pfp_event_count = j;
		/*
	 	 * build the pfp_unavail_pmcs bitmask by looking
	 	 * at what perfmon has available. It is not always
	 	 * the case that all PMU registers are actually available
	 	 * to applications. For instance, on IA-32 platforms, some
	 	 * registers may be reserved for the NMI watchdog timer.
	 	 *
	 	 * With this bitmap, the library knows which registers NOT to
	 	 * use. Of source, it is possible that no valid assignement may
	 	 * be possible if certina PMU registers  are not available.
	 	 */
		detect_unavail_pmu_regs(&sif, &sdesc->inp.pfp_unavail_pmcs, NULL);

		/*
		 * let the library figure out the values for the PMCS
		 */
		if ((ret=pfm_dispatch_events(&sdesc->inp, NULL, &sdesc->outp, NULL)) != PFMLIB_SUCCESS)
			fatal_error("cannot configure events: %s\n", pfm_strerror(ret));

		for (j=0; j < sdesc->outp.pfp_pmc_count; j++) {
			sdesc->pc[j].reg_set   = i;
			sdesc->pc[j].reg_num   = sdesc->outp.pfp_pmcs[j].reg_num;
			sdesc->pc[j].reg_value = sdesc->outp.pfp_pmcs[j].reg_value;
		}
		for (j=0; j < sdesc->outp.pfp_pmd_count; j++) {
			sdesc->pd[j].reg_num = sdesc->outp.pfp_pmds[j].reg_num;
			sdesc->pd[j].reg_set = i;
		}
	}
}

/*
 * Fill options.cpu_mask, options.online_cpus and options.selected_cpus.
 *
 * cpu_list is a comma separated list of CPU numbers or ranges
 * (e.g. "0,2-3"); NULL selects all the online CPUs. Terminates the
 * program if the list is malformed, selects no CPU, or names a CPU
 * that is not online or is beyond RTOP_MAX_CPUS.
 */
void
populate_cpumask(char *cpu_list)
{
	char *p;
	unsigned long start_cpu, end_cpu = 0;
	unsigned long i, count = 0;

	options.online_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	if (options.online_cpus == -1)
		fatal_error("cannot figure out the number of online processors\n");

	if (cpu_list == NULL)  {
		/*
		 * The limit is mostly driven by the affinity support in NPTL and glibc __CPU_SETSIZE.
		 * the kernel interface does not expose any limitation.
		 */
		if (options.online_cpus >= RTOP_MAX_CPUS)
			fatal_error("rtop can only handle to %u CPUs\n", RTOP_MAX_CPUS);

		for(i=0; i < (unsigned long)options.online_cpus; i++) {
			RTOP_CPUMASK_SET(options.cpu_mask, i);
		}
		options.selected_cpus = options.online_cpus;

		return;
	}

	/* <ctype.h> functions need a value representable as unsigned char */
	while(isdigit((unsigned char)*cpu_list)) {
		p = NULL;
		start_cpu = strtoul(cpu_list, &p, 0); /* auto-detect base */

		if (start_cpu == ULONG_MAX || (*p != '\0' && *p != ',' && *p != '-')) goto invalid;

		if (p && *p == '-') {
			cpu_list = ++p;
			p = NULL;

			end_cpu = strtoul(cpu_list, &p, 0); /* auto-detect base */

			if (end_cpu == ULONG_MAX || (*p != '\0' && *p != ',')) goto invalid;
			if (end_cpu < start_cpu) goto invalid_range;
		} else {
			end_cpu = start_cpu;
		}

		if (start_cpu >= RTOP_MAX_CPUS || end_cpu >= RTOP_MAX_CPUS) goto too_big;

		for (; start_cpu <= end_cpu; start_cpu++) {

			if (start_cpu >= (unsigned long)options.online_cpus) goto not_online; /* XXX: assume contiguous range of CPUs */

			/* a CPU listed twice is only counted once */
			if (RTOP_CPUMASK_ISSET(options.cpu_mask, start_cpu)) continue;

			RTOP_CPUMASK_SET(options.cpu_mask, start_cpu);

			count++;
		}

		/* skip the separator */
		if (*p) ++p;

		cpu_list = p;
	}

	/*
	 * the loop stops at the first element not starting with a digit:
	 * anything left over means the list is malformed, and an empty
	 * selection cannot be monitored
	 */
	if (*cpu_list != '\0' || count == 0) goto invalid;

	options.selected_cpus = count;

	return;
invalid:
	fatal_error("invalid cpu list argument: %s\n", cpu_list);
	/* no return */
not_online:
	fatal_error("cpu %lu is not online\n", start_cpu);
	/* no return */
invalid_range:
	fatal_error("cpu range %lu - %lu is invalid\n", start_cpu, end_cpu);
	/* no return */
too_big:
	fatal_error("rtop is limited to %u CPUs\n", RTOP_MAX_CPUS);
	/* no return */
}


/*
 * Print the command line help on stdout.
 */
static void
usage(void)
{
	printf(	"usage: rtop [options]:\n"
		"-h, --help\t\t\tdisplay this help and exit\n"
		"-v, --verbose\t\t\tverbose output\n"
		"-V, --version\t\t\tshow version and exit\n"
		"-d nsec, --delay=nsec\t\tnumber of seconds between refresh (default=1s)\n"
		"--cpu-list=cpu1,cpu2\t\tlist of CPUs to monitor(default=all)\n"
		"--outfile=prefix\t\tsave the samples of each CPU into prefix.cpuN\n"
	);

}

/*
 * Entry point: parse the command line, probe the CPU speed and the
 * CPUs to monitor, set up the measurement, then run the master loop
 * until the user quits.
 */
int
main(int argc, char **argv)
{
	int c;
	long delay;
	char *endp;
	char *cpu_list = NULL;

	while ((c=getopt_long(argc, argv,"+vhVd:", rtop_cmd_options, 0)) != -1) {
		switch(c) {
			case   0: continue; /* fast path for options */
			case 'v': options.opt_verbose = 1;
				  break;
			case 1:
			case 'h':
				usage();
				exit(0);
			case 2:
			case 'V':
				printf("rtop version " RTOP_VERSION " Date: " __DATE__ "\n"
					"Copyright (C) 2004 Hewlett-Packard Company\n");
				exit(0);
			case 3:
			case 'd':
				if (options.opt_delay_set) fatal_error("cannot set delay twice\n");
				if (optarg == NULL) fatal_error("--delay needs an argument\n");

				/* strtol() lets us reject garbage and out of range values */
				errno = 0;
				endp  = NULL;
				delay = strtol(optarg, &endp, 10);
				if (errno != 0 || endp == optarg || *endp != '\0')
					fatal_error("invalid delay: %s\n", optarg);
				if (delay < 0) {
					fatal_error("invalid delay, must be >= 0\n");
				}
				/* the delay is later converted to milliseconds in an int */
				if (delay > INT_MAX / 1000)
					fatal_error("invalid delay: %s\n", optarg);

				options.opt_delay     = (int)delay;
				options.opt_delay_set = 1;
				break;
			case 4:
				if (cpu_list) fatal_error("cannot specify --cpu-list more than once\n");
				if (*optarg == '\0') fatal_error("--cpu-list needs an argument\n");
				cpu_list = optarg;
				break;
			case 5:
				if (options.outfile) fatal_error("cannot specify --outfile more than once\n");
				if (*optarg == '\0') fatal_error("--outfile needs an argument\n");
				options.outfile = optarg;
				break;
			default:
				fatal_error("unknown option\n");
		}
	}
	/*
	 * default refresh delay
	 */
	if (options.opt_delay_set == 0) options.opt_delay = 1;

	/*
	 * the CPU speed is the divisor used to turn cycle counts into
	 * percentages: results are meaningless without it
	 */
	options.cpu_mhz = find_cpu_speed();
	if (options.cpu_mhz == 0)
		fatal_error("cannot determine CPU speed\n");

	populate_cpumask(cpu_list);

	setup_measurement();
	setup_signals();
	setup_screen();
	mainloop();
	close_screen();

	return 0;
}