/* $Id: virtual.c,v 1.88.2.28 2009/06/11 08:09:12 mikpe Exp $
 * Virtual per-process performance counters.
 *
 * Copyright (C) 1999-2009  Mikael Pettersson
 */
#include <linux/version.h>
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
#include <linux/config.h>
#endif
#define __NO_VERSION__
#include <linux/module.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/ptrace.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/perfctr.h>

#include <asm/io.h>
#include <asm/uaccess.h>

#include "compat.h"
#include "virtual.h"
#include "marshal.h"

/****************************************************************
 *								*
 * Data types and macros.					*
 *								*
 ****************************************************************/

struct vperfctr {
/* User-visible fields: (must be first for mmap()) */
	struct perfctr_cpu_state cpu_state;
/* Kernel-private fields: */
	int si_signo;
	atomic_t count;
	spinlock_t owner_lock;
	struct task_struct *owner;
	/* sampling_timer and bad_cpus_allowed are frequently
	   accessed, so they get to share a cache line */
	unsigned int sampling_timer ____cacheline_aligned;
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
	atomic_t bad_cpus_allowed;
	cpumask_t cpumask;
#endif
	pid_t updater_tgid;	/* to detect self vs remote vperfctr_control races */
#if 0 && defined(CONFIG_PERFCTR_DEBUG)
	unsigned start_smp_id;
	unsigned suspended;
#endif
#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT
	unsigned int iresume_cstatus;
#endif
	unsigned int flags;
};
#define IS_RUNNING(perfctr)	perfctr_cstatus_enabled((perfctr)->cpu_state.cstatus)

/* XXX: disabled: called from switch_to() where printk() is disallowed */
#if 0 && defined(CONFIG_PERFCTR_DEBUG)
#define debug_free(perfctr) \
do { \
	int i; \
	for(i = 0; i < PAGE_SIZE/sizeof(int); ++i) \
		((int*)(perfctr))[i] = 0xfedac0ed; \
} while(0)
#define debug_init(perfctr)	do { (perfctr)->suspended = 1; } while(0)
#define debug_suspend(perfctr) \
do { \
	if ((perfctr)->suspended) \
		printk(KERN_ERR "%s: BUG! suspending non-running perfctr (pid %d, comm %s)\n", \
		       __FUNCTION__, current->pid, current->comm); \
	(perfctr)->suspended = 1; \
} while(0)
#define debug_resume(perfctr) \
do { \
	if (!(perfctr)->suspended) \
		printk(KERN_ERR "%s: BUG! resuming non-suspended perfctr (pid %d, comm %s)\n", \
		       __FUNCTION__, current->pid, current->comm); \
	(perfctr)->suspended = 0; \
} while(0)
#define debug_check_smp_id(perfctr) \
do { \
	if ((perfctr)->start_smp_id != smp_processor_id()) { \
		printk(KERN_ERR "%s: BUG! current cpu %u differs from start cpu %u (pid %d, comm %s)\n", \
		       __FUNCTION__, smp_processor_id(), (perfctr)->start_smp_id, \
		       current->pid, current->comm); \
		return; \
	} \
} while(0)
#define debug_set_smp_id(perfctr) \
	do { (perfctr)->start_smp_id = smp_processor_id(); } while(0)
#else	/* CONFIG_PERFCTR_DEBUG */
#define debug_free(perfctr)		do{}while(0)
#define debug_init(perfctr)		do{}while(0)
#define debug_suspend(perfctr)		do{}while(0)
#define debug_resume(perfctr)		do{}while(0)
#define debug_check_smp_id(perfctr)	do{}while(0)
#define debug_set_smp_id(perfctr)	do{}while(0)
#endif	/* CONFIG_PERFCTR_DEBUG */

#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT

static void vperfctr_ihandler(unsigned long pc);
static void vperfctr_handle_overflow(struct task_struct*, struct vperfctr*);

static inline void vperfctr_set_ihandler(void)
{
	perfctr_cpu_set_ihandler(vperfctr_ihandler);
}

static inline void vperfctr_clear_iresume_cstatus(struct vperfctr *perfctr)
{
	perfctr->iresume_cstatus = 0;
}

#else
static inline void vperfctr_set_ihandler(void) { }
static inline void vperfctr_clear_iresume_cstatus(struct vperfctr *perfctr) { }
#endif

#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK

static inline void vperfctr_init_bad_cpus_allowed(struct vperfctr *perfctr)
{
	atomic_set(&perfctr->bad_cpus_allowed, 0);
}

static inline void vperfctr_init_cpumask(struct vperfctr *perfctr)
{
	cpus_setall(perfctr->cpumask);
}

/* Concurrent set_cpus_allowed() is possible. The only lock it
   can take is the task lock, so we have to take it as well.
   task_lock/unlock also disables/enables preemption. */

static inline void vperfctr_task_lock(struct task_struct *p)
{
	task_lock(p);
}

static inline void vperfctr_task_unlock(struct task_struct *p)
{
	task_unlock(p);
}

#else	/* !CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK */

static inline void vperfctr_init_bad_cpus_allowed(struct vperfctr *perfctr) { }

static inline void vperfctr_init_cpumask(struct vperfctr *perfctr) { }

/* Concurrent set_cpus_allowed() is impossible or irrelevant.
   Disabling and enabling preemption suffices for an atomic region. */

static inline void vperfctr_task_lock(struct task_struct *p)
{
	preempt_disable();
}

static inline void vperfctr_task_unlock(struct task_struct *p)
{
	preempt_enable();
}

#endif	/* !CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK */

/* How to lock around find_task_by_vpid(). The tasklist_lock always
   works, but it's no longer exported starting with kernel 2.6.18.
   For kernels 2.6.18 and newer use rcu_read_{lock,unlock}(). */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18)
static inline void vperfctr_lock_find_task_by_vpid(void)
{
	rcu_read_lock();
}

static inline void vperfctr_unlock_find_task_by_vpid(void)
{
	rcu_read_unlock();
}
#else	/* < 2.6.18 */
static inline void vperfctr_lock_find_task_by_vpid(void)
{
	read_lock(&tasklist_lock);
}

static inline void vperfctr_unlock_find_task_by_vpid(void)
{
	read_unlock(&tasklist_lock);
}
#endif	/* < 2.6.18 */

/****************************************************************
 *								*
 * Resource management.						*
 *								*
 ****************************************************************/

/* XXX: perhaps relax this to number of _live_ perfctrs */
static DEFINE_MUTEX(nrctrs_mutex);
static int nrctrs;
static const char this_service[] = __FILE__;

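/* Reserve the perfctr hardware for this driver when the first counter
   is set up, and release it again when the last one goes away. If
   another in-kernel user already holds the hardware, the increment is
   rolled back and -EBUSY is returned. */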
static int inc_nrctrs(void)
{
	const char *other;

	other = NULL;
	mutex_lock(&nrctrs_mutex);
	if (++nrctrs == 1) {
		other = perfctr_cpu_reserve(this_service);
		if (other)
			nrctrs = 0;
	}
	mutex_unlock(&nrctrs_mutex);
	if (other) {
		printk(KERN_ERR __FILE__
		       ": cannot operate, perfctr hardware taken by '%s'\n",
		       other);
		return -EBUSY;
	}
	vperfctr_set_ihandler();
	return 0;
}

static void dec_nrctrs(void)
{
	mutex_lock(&nrctrs_mutex);
	if (--nrctrs == 0)
		perfctr_cpu_release(this_service);
	mutex_unlock(&nrctrs_mutex);
}

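/* A vperfctr occupies exactly one zeroed page. The page is marked
   reserved so that it can later be mapped read-only into the owner's
   address space by vperfctr_mmap(). */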
static struct vperfctr *vperfctr_alloc(void)
{
	unsigned long page;

	if (inc_nrctrs() != 0)
		return ERR_PTR(-EBUSY);
	page = get_zeroed_page(GFP_KERNEL);
	if (!page) {
		dec_nrctrs();
		return ERR_PTR(-ENOMEM);
	}
	SetPageReserved(virt_to_page(page));
	return (struct vperfctr*) page;
}

static void vperfctr_free(struct vperfctr *perfctr)
{
	debug_free(perfctr);
	ClearPageReserved(virt_to_page(perfctr));
	free_page((unsigned long)perfctr);
	dec_nrctrs();
}

static struct vperfctr *get_empty_vperfctr(void)
{
	struct vperfctr *perfctr = vperfctr_alloc();
	if (!IS_ERR(perfctr)) {
		atomic_set(&perfctr->count, 1);
		vperfctr_init_bad_cpus_allowed(perfctr);
		vperfctr_init_cpumask(perfctr);
		spin_lock_init(&perfctr->owner_lock);
		debug_init(perfctr);
	}
	return perfctr;
}

static void put_vperfctr(struct vperfctr *perfctr)
{
	if (atomic_dec_and_test(&perfctr->count))
		vperfctr_free(perfctr);
}

/****************************************************************
 *								*
 * Basic counter operations.					*
 * These must all be called by the owner process only.		*
 * These must all be called with preemption disabled.		*
 *								*
 ****************************************************************/

/* PRE: IS_RUNNING(perfctr)
 * Suspend the counters.
 */
static inline void vperfctr_suspend(struct vperfctr *perfctr)
{
	debug_suspend(perfctr);
	debug_check_smp_id(perfctr);
	perfctr_cpu_suspend(&perfctr->cpu_state);
}

static inline void vperfctr_reset_sampling_timer(struct vperfctr *perfctr)
{
	/* XXX: base the value on perfctr_info.cpu_khz instead! */
	perfctr->sampling_timer = HZ/2;
}

/* PRE: perfctr == current->thread.perfctr && IS_RUNNING(perfctr)
 * Restart the counters.
 */
static inline void vperfctr_resume(struct vperfctr *perfctr)
{
	debug_resume(perfctr);
	perfctr_cpu_resume(&perfctr->cpu_state);
	vperfctr_reset_sampling_timer(perfctr);
	debug_set_smp_id(perfctr);
}

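/* Like vperfctr_resume(), but if an overflow interrupt is already
   pending the overflow is handled instead of restarting the counters. */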
static inline void vperfctr_resume_with_overflow_check(struct vperfctr *perfctr)
{
#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT
	if (perfctr_cpu_has_pending_interrupt(&perfctr->cpu_state)) {
		vperfctr_handle_overflow(current, perfctr);
		return;
	}
#endif
	vperfctr_resume(perfctr);
}

/* Sample the counters but do not suspend them. */
static void vperfctr_sample(struct vperfctr *perfctr)
{
	if (IS_RUNNING(perfctr)) {
		debug_check_smp_id(perfctr);
		perfctr_cpu_sample(&perfctr->cpu_state);
		vperfctr_reset_sampling_timer(perfctr);
	}
}

#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT
/* vperfctr interrupt handler (XXX: add buffering support) */
/* PREEMPT note: called in IRQ context with preemption disabled. */
static void vperfctr_ihandler(unsigned long pc)
{
	struct task_struct *tsk = current;
	struct vperfctr *perfctr;

	perfctr = tsk->thread.perfctr;
	if (!perfctr) {
		printk(KERN_ERR "%s: BUG! pid %d has no vperfctr\n",
		       __FUNCTION__, tsk->pid);
		return;
	}
	if (!perfctr_cstatus_has_ictrs(perfctr->cpu_state.cstatus)) {
		printk(KERN_ERR "%s: BUG! vperfctr has cstatus %#x (pid %d, comm %s)\n",
		       __FUNCTION__, perfctr->cpu_state.cstatus, tsk->pid, tsk->comm);
		return;
	}
	vperfctr_suspend(perfctr);
	vperfctr_handle_overflow(tsk, perfctr);
}

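/* Identify the overflowed counters, park the current cstatus in
   iresume_cstatus (keeping only the TSC running, if enabled), and
   queue the configured overflow signal to the task without waking
   it up. */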
static void vperfctr_handle_overflow(struct task_struct *tsk,
				     struct vperfctr *perfctr)
{
	unsigned int pmc_mask;
	siginfo_t si;
	sigset_t old_blocked;

	pmc_mask = perfctr_cpu_identify_overflow(&perfctr->cpu_state);
	if (!pmc_mask) {
		printk(KERN_ERR "%s: BUG! pid %d has unidentifiable overflow source\n",
		       __FUNCTION__, tsk->pid);
		return;
	}
	/* suspend a-mode and i-mode PMCs, leaving only TSC on */
	/* XXX: some people also want to suspend the TSC */
	perfctr->iresume_cstatus = perfctr->cpu_state.cstatus;
	if (perfctr_cstatus_has_tsc(perfctr->iresume_cstatus)) {
		perfctr->cpu_state.cstatus = perfctr_mk_cstatus(1, 0, 0);
		vperfctr_resume(perfctr);
	} else
		perfctr->cpu_state.cstatus = 0;
	si.si_signo = perfctr->si_signo;
	si.si_errno = 0;
	si.si_code = SI_PMC_OVF;
	si.si_pmc_ovf_mask = pmc_mask;

	/* deliver signal without waking up the receiver */
	spin_lock_irq(&task_siglock(tsk));
	old_blocked = tsk->blocked;
	sigaddset(&tsk->blocked, si.si_signo);
	spin_unlock_irq(&task_siglock(tsk));

	if (!send_sig_info(si.si_signo, &si, tsk))
		send_sig(si.si_signo, tsk, 1);

	spin_lock_irq(&task_siglock(tsk));
	tsk->blocked = old_blocked;
	recalc_sigpending();
	spin_unlock_irq(&task_siglock(tsk));
}
#endif

/****************************************************************
 *								*
 * Process management operations.				*
 * These must all, with the exception of vperfctr_unlink()	*
 * and __vperfctr_set_cpus_allowed(), be called by the owner	*
 * process only.						*
 *								*
 ****************************************************************/

/* Called from exit_thread() or sys_vperfctr_unlink().
 * If the counters are running, stop them and sample their final values.
 * Detach the vperfctr object from its owner task.
 * PREEMPT note: exit_thread() does not run with preemption disabled.
 */
static void vperfctr_unlink(struct task_struct *owner, struct vperfctr *perfctr)
{
	/* this synchronises with vperfctr_ioctl() */
	spin_lock(&perfctr->owner_lock);
	perfctr->owner = NULL;
	spin_unlock(&perfctr->owner_lock);

	/* perfctr suspend+detach must be atomic wrt process suspend */
	/* this also synchronises with perfctr_set_cpus_allowed() */
	vperfctr_task_lock(owner);
	if (IS_RUNNING(perfctr) && owner == current)
		vperfctr_suspend(perfctr);
	owner->thread.perfctr = NULL;
	vperfctr_task_unlock(owner);

	perfctr->cpu_state.cstatus = 0;
	vperfctr_clear_iresume_cstatus(perfctr);
	put_vperfctr(perfctr);
}

void __vperfctr_exit(struct vperfctr *perfctr)
{
	vperfctr_unlink(current, perfctr);
}

/* sys_execve() -> .. -> flush_old_exec() -> .. -> __vperfctr_flush().
 * Unlink the thread's perfctr state if the CLOEXEC control flag is set.
 * PREEMPT note: flush_old_exec() does not run with preemption disabled.
 */
void __vperfctr_flush(struct vperfctr *perfctr)
{
	if (perfctr->flags & VPERFCTR_CONTROL_CLOEXEC)
		__vperfctr_exit(perfctr);
}

/* schedule() --> switch_to() --> .. --> __vperfctr_suspend().
 * If the counters are running, suspend them.
 * PREEMPT note: switch_to() runs with preemption disabled.
 */
void __vperfctr_suspend(struct vperfctr *perfctr)
{
	if (IS_RUNNING(perfctr))
		vperfctr_suspend(perfctr);
}

/* schedule() --> switch_to() --> .. --> __vperfctr_resume().
 * PRE: perfctr == current->thread.perfctr
 * If the counters are runnable, resume them.
 * PREEMPT note: switch_to() runs with preemption disabled.
 */
void __vperfctr_resume(struct vperfctr *perfctr)
{
	if (IS_RUNNING(perfctr)) {
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
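		/* An unsafe set_cpus_allowed() made CPUs outside the
		   perfctr's cpumask available to this task: disable the
		   counters and send SIGILL instead of resuming. */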
		if (unlikely(atomic_read(&perfctr->bad_cpus_allowed)) &&
		    perfctr_cstatus_nrctrs(perfctr->cpu_state.cstatus)) {
			perfctr->cpu_state.cstatus = 0;
			vperfctr_clear_iresume_cstatus(perfctr);
			BUG_ON(current->state != TASK_RUNNING);
			send_sig(SIGILL, current, 1);
			return;
		}
#endif
		vperfctr_resume_with_overflow_check(perfctr);
	}
}

/* Called from update_one_process() [triggered by timer interrupt].
 * PRE: perfctr == current->thread.perfctr.
 * Sample the counters but do not suspend them.
 * Needed to avoid precision loss due to multiple counter
 * wraparounds between resume/suspend for CPU-bound processes.
 * PREEMPT note: called in IRQ context with preemption disabled.
 */
void __vperfctr_sample(struct vperfctr *perfctr)
{
	if (--perfctr->sampling_timer == 0)
		vperfctr_sample(perfctr);
}

#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
/* Called from set_cpus_allowed().
 * PRE: current holds task_lock(owner)
 * PRE: owner->thread.perfctr == perfctr
 */
void __vperfctr_set_cpus_allowed(struct task_struct *owner,
				 struct vperfctr *perfctr,
				 cpumask_t new_mask)
{
	if (!cpus_subset(new_mask, perfctr->cpumask)) {
		atomic_set(&perfctr->bad_cpus_allowed, 1);
		printk(KERN_WARNING "perfctr: process %d (comm %s) issued unsafe"
		       " set_cpus_allowed() on process %d (comm %s)\n",
		       current->pid, current->comm, owner->pid, owner->comm);
	} else
		atomic_set(&perfctr->bad_cpus_allowed, 0);
}
#endif

/****************************************************************
 *								*
 * Virtual perfctr "system calls".				*
 * These can be called by the owner process (tsk == current),	*
 * a monitor process which has the owner under ptrace ATTACH	*
 * control (tsk && tsk != current), or anyone with a handle to	*
 * an unlinked perfctr (!tsk).					*
 *								*
 ****************************************************************/

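/* Update the perfctr's control data and (re)start the counters.
 * This is done in three steps: suspend the counters and install the
 * new control, narrow the task's CPU affinity if the new control
 * requires it, and finally re-enable the counters unless a concurrent
 * remote update won the race (detected via updater_tgid).
 */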
static int sys_vperfctr_control(struct vperfctr *perfctr,
				struct perfctr_struct_buf *argp,
				struct task_struct *tsk)
{
	struct vperfctr_control control;
	int err;
	unsigned int next_cstatus;
	unsigned int nrctrs, i;
	cpumask_t cpumask;

	if (!tsk)
		return -ESRCH;	/* attempt to update unlinked perfctr */

	err = perfctr_copy_from_user(&control, argp, &vperfctr_control_sdesc);
	if (err)
		return err;

	/* Step 1: Update the control but keep the counters disabled.
	   PREEMPT note: Preemption is disabled since we're updating
	   an active perfctr. */
	preempt_disable();
	if (IS_RUNNING(perfctr)) {
		if (tsk == current)
			vperfctr_suspend(perfctr);
		perfctr->cpu_state.cstatus = 0;
		vperfctr_clear_iresume_cstatus(perfctr);
	}
	perfctr->cpu_state.control = control.cpu_control;
	/* remote access note: perfctr_cpu_update_control() is ok */
	cpus_setall(cpumask);
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
	/* make a stopped vperfctr have an unconstrained cpumask */
	perfctr->cpumask = cpumask;
#endif
	err = perfctr_cpu_update_control(&perfctr->cpu_state, &cpumask);
	if (err < 0) {
		next_cstatus = 0;
	} else {
		next_cstatus = perfctr->cpu_state.cstatus;
		perfctr->cpu_state.cstatus = 0;
		perfctr->updater_tgid = current->tgid;
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
		perfctr->cpumask = cpumask;
#endif
	}
	preempt_enable_no_resched();

	if (!perfctr_cstatus_enabled(next_cstatus))
		return err;

#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
	/* Step 2: Update the task's CPU affinity mask.
	   PREEMPT note: Preemption must be enabled for set_cpus_allowed(). */
	if (control.cpu_control.nractrs || control.cpu_control.nrictrs) {
		cpumask_t old_mask, new_mask;

		old_mask = tsk->cpus_allowed;
		cpus_and(new_mask, old_mask, cpumask);

		if (cpus_empty(new_mask))
			return -EINVAL;
		if (!cpus_equal(new_mask, old_mask))
			set_cpus_allowed(tsk, new_mask);
	}
#endif

	/* Step 3: Enable the counters with the new control and affinity.
	   PREEMPT note: Preemption is disabled since we're updating
	   an active perfctr. */
	preempt_disable();

	/* We had to enable preemption above for set_cpus_allowed(), so we
	   may have lost a race with a concurrent update via the remote
	   control interface. If so, we must abort our update of this perfctr. */
	if (perfctr->updater_tgid != current->tgid) {
		printk(KERN_WARNING "perfctr: control update by task %d"
		       " was lost due to race with update by task %d\n",
		       current->tgid, perfctr->updater_tgid);
		err = -EBUSY;
	} else {
		/* XXX: validate si_signo? */
		perfctr->si_signo = control.si_signo;

		perfctr->cpu_state.cstatus = next_cstatus;

		if (!perfctr_cstatus_has_tsc(next_cstatus))
			perfctr->cpu_state.tsc_sum = 0;

		nrctrs = perfctr_cstatus_nrctrs(next_cstatus);
		for(i = 0; i < nrctrs; ++i)
			if (!(control.preserve & (1<<i)))
				perfctr->cpu_state.pmc[i].sum = 0;

		perfctr->flags = control.flags;

		if (tsk == current)
			vperfctr_resume(perfctr);
	}

	preempt_enable();
	return err;
}

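/* Resume i-mode counters after an overflow has been handled.
 * Restores the cstatus saved by vperfctr_handle_overflow() and
 * reloads the i-mode counters before resuming.
 */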
static int sys_vperfctr_iresume(struct vperfctr *perfctr, const struct task_struct *tsk)
{
#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT
	unsigned int iresume_cstatus;

	if (!tsk)
		return -ESRCH;	/* attempt to update unlinked perfctr */

	iresume_cstatus = perfctr->iresume_cstatus;
	if (!perfctr_cstatus_has_ictrs(iresume_cstatus))
		return -EPERM;

	/* PREEMPT note: preemption is disabled over the entire
	   region because we're updating an active perfctr. */
	preempt_disable();

	if (IS_RUNNING(perfctr) && tsk == current)
		vperfctr_suspend(perfctr);

	perfctr->cpu_state.cstatus = iresume_cstatus;
	perfctr->iresume_cstatus = 0;

	/* remote access note: perfctr_cpu_ireload() is ok */
	perfctr_cpu_ireload(&perfctr->cpu_state);

	if (tsk == current)
		vperfctr_resume(perfctr);

	preempt_enable();

	return 0;
#else
	return -ENOSYS;
#endif
}

static int sys_vperfctr_unlink(struct vperfctr *perfctr, struct task_struct *tsk)
{
	if (tsk)
		vperfctr_unlink(tsk, perfctr);
	return 0;
}

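/* Return the accumulated counter sums. If the caller reads its own
 * counters, they are sampled first so the sums are up to date.
 */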
static int sys_vperfctr_read_sum(struct vperfctr *perfctr,
				 struct perfctr_struct_buf *argp,
				 const struct task_struct *tsk)
{
	struct perfctr_sum_ctrs sum;

	if (tsk == current) {
		preempt_disable();
		vperfctr_sample(perfctr);
	}
	//sum = perfctr->cpu_state.sum;
	{
		int j;
		sum.tsc = perfctr->cpu_state.tsc_sum;
		for(j = 0; j < ARRAY_SIZE(sum.pmc); ++j)
			sum.pmc[j] = perfctr->cpu_state.pmc[j].sum;
	}
	if (tsk == current)
		preempt_enable();
	return perfctr_copy_to_user(argp, &sum, &perfctr_sum_ctrs_sdesc);
}

static int sys_vperfctr_read_control(struct vperfctr *perfctr,
				     struct perfctr_struct_buf *argp,
				     const struct task_struct *tsk)
{
	struct vperfctr_control control;

	/* PREEMPT note: While we're reading our own control, another
	   process may ptrace ATTACH to us and update our control.
	   Disable preemption to ensure we get a consistent copy.
	   Not needed for other cases since the perfctr is either
	   unlinked or its owner is ptrace ATTACH suspended by us. */
	if (tsk == current)
		preempt_disable();
	control.si_signo = perfctr->si_signo;
	control.cpu_control = perfctr->cpu_state.control;
	control.flags = perfctr->flags;
	if (tsk == current)
		preempt_enable();
	control.preserve = 0;
	return perfctr_copy_to_user(argp, &control, &vperfctr_control_sdesc);
}

/****************************************************************
 *								*
 * Virtual perfctr file operations.				*
 *								*
 ****************************************************************/

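/* Map the first page of the vperfctr, i.e. its user-visible cpu_state,
 * read-only into user space.
 */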
static int vperfctr_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct vperfctr *perfctr;

#ifdef CONFIG_ARM
#define _PAGE_RW	L_PTE_WRITE
#endif
	/* Only allow read-only mapping of first page. */
	if ((vma->vm_end - vma->vm_start) != PAGE_SIZE ||
	    vma->vm_pgoff != 0 ||
	    (pgprot_val(vma->vm_page_prot) & _PAGE_RW) ||
	    (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
		return -EPERM;
	perfctr = filp->private_data;
	if (!perfctr)
		return -EPERM;
	/* 2.6.29-rc1 changed arch/x86/mm/pat.c to WARN_ON when
	   remap_pfn_range() is applied to plain RAM pages.
	   Comments there indicate that one should set_memory_wc()
	   before the remap, but that doesn't silence the WARN_ON.
	   Luckily vm_insert_page() works without complaints. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
	return vm_insert_page(vma, vma->vm_start, virt_to_page((unsigned long)perfctr));
#else
	return remap_pfn_range(vma, vma->vm_start,
			       virt_to_phys(perfctr) >> PAGE_SHIFT,
			       PAGE_SIZE, vma->vm_page_prot);
#endif
}

static int vperfctr_release(struct inode *inode, struct file *filp)
{
	struct vperfctr *perfctr = filp->private_data;
	filp->private_data = NULL;
	if (perfctr)
		put_vperfctr(perfctr);
	return 0;
}

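/* Dispatch the vperfctr ioctls. The global PERFCTR_* queries need no
 * perfctr object. For the VPERFCTR_* operations, determine whether the
 * target is the calling task itself, a remote task held under ptrace
 * ATTACH, or an unlinked perfctr, and pass its owner (or NULL) along.
 */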
static long vperfctr_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct vperfctr *perfctr;
	struct task_struct *tsk;
	int ret;

	switch (cmd) {
	case PERFCTR_ABI:
		return sys_perfctr_abi((unsigned int*)arg);
	case PERFCTR_INFO:
		return sys_perfctr_info((struct perfctr_struct_buf*)arg);
	case PERFCTR_CPUS:
		return sys_perfctr_cpus((struct perfctr_cpu_mask*)arg);
	case PERFCTR_CPUS_FORBIDDEN:
		return sys_perfctr_cpus_forbidden((struct perfctr_cpu_mask*)arg);
	}
	perfctr = filp->private_data;
	if (!perfctr)
		return -EINVAL;
	tsk = current;
	if (perfctr != current->thread.perfctr) {
		/* this synchronises with vperfctr_unlink() and itself */
		spin_lock(&perfctr->owner_lock);
		tsk = perfctr->owner;
		if (tsk)
			get_task_struct(tsk);
		spin_unlock(&perfctr->owner_lock);
		if (tsk) {
			ret = ptrace_check_attach(tsk, 0);
			if (ret < 0)
				goto out;
		}
	}
	switch (cmd) {
	case VPERFCTR_CONTROL:
		ret = sys_vperfctr_control(perfctr, (struct perfctr_struct_buf*)arg, tsk);
		break;
	case VPERFCTR_UNLINK:
		ret = sys_vperfctr_unlink(perfctr, tsk);
		break;
	case VPERFCTR_READ_SUM:
		ret = sys_vperfctr_read_sum(perfctr, (struct perfctr_struct_buf*)arg, tsk);
		break;
	case VPERFCTR_IRESUME:
		ret = sys_vperfctr_iresume(perfctr, tsk);
		break;
	case VPERFCTR_READ_CONTROL:
		ret = sys_vperfctr_read_control(perfctr, (struct perfctr_struct_buf*)arg, tsk);
		break;
	default:
		ret = -EINVAL;
	}
 out:
	if (tsk && tsk != current)
		put_task_struct(tsk);
	return ret;
}

#if !HAVE_UNLOCKED_IOCTL
static int vperfctr_ioctl_oldstyle(struct inode *inode, struct file *filp,
				   unsigned int cmd, unsigned long arg)
{
	return vperfctr_ioctl(filp, cmd, arg);
}
#endif

static
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
const
#endif
struct file_operations vperfctr_file_ops = {
	.owner = THIS_MODULE,
	.mmap = vperfctr_mmap,
	.release = vperfctr_release,
	/* 2.6.11-rc2 introduced HAVE_UNLOCKED_IOCTL and HAVE_COMPAT_IOCTL */
#if HAVE_UNLOCKED_IOCTL
	.unlocked_ioctl = vperfctr_ioctl,
#else
	.ioctl = vperfctr_ioctl_oldstyle,
#endif
#if defined(CONFIG_IA32_EMULATION) && HAVE_COMPAT_IOCTL
	.compat_ioctl = vperfctr_ioctl,
#endif
};

/****************************************************************
 *								*
 * File system for virtual perfctrs. Based on pipefs.		*
 *								*
 ****************************************************************/

#define VPERFCTRFS_MAGIC (('V'<<24)|('P'<<16)|('M'<<8)|('C'))

#include <linux/mount.h>

/* 2.6 kernels prior to 2.6.11-rc1 don't EXPORT_SYMBOL() get_sb_pseudo().
   This is a verbatim copy, only renamed. */
#if defined(MODULE) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
static
struct super_block *
perfctr_get_sb_pseudo(struct file_system_type *fs_type, char *name,
	struct super_operations *ops, unsigned long magic)
{
	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
	static struct super_operations default_ops = {.statfs = simple_statfs};
	struct dentry *dentry;
	struct inode *root;
	struct qstr d_name = {.name = name, .len = strlen(name)};

	if (IS_ERR(s))
		return s;

	s->s_flags = MS_NOUSER;
	s->s_maxbytes = ~0ULL;
	s->s_blocksize = 1024;
	s->s_blocksize_bits = 10;
	s->s_magic = magic;
	s->s_op = ops ? ops : &default_ops;
	root = new_inode(s);
	if (!root)
		goto Enomem;
	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
	root->i_uid = root->i_gid = 0;
	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
	dentry = d_alloc(NULL, &d_name);
	if (!dentry) {
		iput(root);
		goto Enomem;
	}
	dentry->d_sb = s;
	dentry->d_parent = dentry;
	d_instantiate(dentry, root);
	s->s_root = dentry;
	s->s_flags |= MS_ACTIVE;
	return s;

Enomem:
	up_write(&s->s_umount);
	deactivate_super(s);
	return ERR_PTR(-ENOMEM);
}
#undef get_sb_pseudo
#define get_sb_pseudo perfctr_get_sb_pseudo
#endif	/* MODULE && VERSION < 2.6.11 */

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18)
static int
vperfctrfs_get_sb(struct file_system_type *fs_type,
		  int flags, const char *dev_name, void *data,
		  struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "vperfctr:", NULL, VPERFCTRFS_MAGIC, mnt);
}
#else
static struct super_block *
vperfctrfs_get_sb(struct file_system_type *fs_type,
		  int flags, const char *dev_name, void *data)
{
	return get_sb_pseudo(fs_type, "vperfctr:", NULL, VPERFCTRFS_MAGIC);
}
#endif

static struct file_system_type vperfctrfs_type = {
	.name		= "vperfctrfs",
	.get_sb		= vperfctrfs_get_sb,
	.kill_sb	= kill_anon_super,
};

/* XXX: check if s/vperfctr_mnt/vperfctrfs_type.kern_mnt/ would work */
static struct vfsmount *vperfctr_mnt;

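/* Register and internally mount vperfctrfs. The filesystem is never
   mounted by user space; it only provides inodes and dentries for the
   anonymous perfctr files. */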
static int __init vperfctrfs_init(void)
{
	int err = register_filesystem(&vperfctrfs_type);
	if (!err) {
		vperfctr_mnt = kern_mount(&vperfctrfs_type);
		if (!IS_ERR(vperfctr_mnt))
			return 0;
		err = PTR_ERR(vperfctr_mnt);
		unregister_filesystem(&vperfctrfs_type);
	}
	return err;
}

static void __exit vperfctrfs_exit(void)
{
	unregister_filesystem(&vperfctrfs_type);
	mntput(vperfctr_mnt);
}

static struct inode *vperfctr_get_inode(void)
{
	struct inode *inode;

	inode = new_inode(vperfctr_mnt->mnt_sb);
	if (!inode)
		return NULL;
	inode->i_fop = &vperfctr_file_ops;
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFCHR | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) && !defined(DONT_HAVE_i_blksize)
	inode->i_blksize = 0;
#endif
	return inode;
}

static int vperfctrfs_delete_dentry(struct dentry *dentry)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
	/*
	 * At creation time we pretended this dentry was hashed
	 * (by clearing the DCACHE_UNHASHED bit in d_flags).
	 * At delete time we restore the truth: not hashed,
	 * so that dput() can proceed correctly.
	 */
	dentry->d_flags |= DCACHE_UNHASHED;
	return 0;
#else
	return 1;
#endif
}

static
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,30)
const
#endif
struct dentry_operations vperfctrfs_dentry_operations = {
	.d_delete	= vperfctrfs_delete_dentry,
};

static struct dentry *vperfctr_d_alloc_root(struct inode *inode)
{
	struct qstr this;
	char name[32];
	struct dentry *dentry;

	sprintf(name, "[%lu]", inode->i_ino);
	this.name = name;
	this.len = strlen(name);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
	this.hash = 0;
#else
	this.hash = inode->i_ino; /* will go */
#endif
	dentry = d_alloc(vperfctr_mnt->mnt_sb->s_root, &this);
	if (dentry) {
		dentry->d_op = &vperfctrfs_dentry_operations;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
		/*
		 * We don't want to publish this dentry in the global dentry
		 * hash table, so we pretend it is already hashed, by clearing
		 * the DCACHE_UNHASHED bit in d_flags. This permits a working
		 * /proc/$pid/fd/XXX on vperfctrs.
		 */
		dentry->d_flags &= ~DCACHE_UNHASHED;
		d_instantiate(dentry, inode);
#else
		d_add(dentry, inode);
#endif
	}
	return dentry;
}

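/* Create an anonymous vperfctrfs file to be handed out as a perfctr
 * file descriptor by vperfctr_attach().
 */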
static struct file *vperfctr_get_filp(void)
{
	struct file *filp;
	struct inode *inode;
	struct dentry *dentry;

	inode = vperfctr_get_inode();
	if (!inode)
		goto out;
	dentry = vperfctr_d_alloc_root(inode);
	if (!dentry)
		goto out_inode;
	/*
	 * Create the filp _after_ the inode and dentry, to avoid
	 * needing access to put_filp(), which is no longer exported
	 * starting with kernel 2.6.10-rc1. fput() is available but
	 * doesn't work on incomplete files. We now need access to
	 * dput() instead, but that's Ok.
	 */
	filp = get_empty_filp();
	if (!filp)
		goto out_dentry;

	filp_vfsmnt(filp) = mntget(vperfctr_mnt);
	filp_dentry(filp) = dentry;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,2)
	filp->f_mapping = dentry->d_inode->i_mapping;
#endif

	filp->f_pos = 0;
	filp->f_flags = 0;
	filp->f_op = fops_get(&vperfctr_file_ops); /* fops_get() for MODULE */
	filp->f_mode = FMODE_READ;
	filp->f_version = 0;

	return filp;

 out_dentry:
	dput(dentry);
	goto out; /* dput() also does iput() */
 out_inode:
	iput(inode);
 out:
	return NULL;
}

/* tid is the actual task/thread id (née pid, stored as ->pid);
   pid/tgid is the 2.6 thread group id (stored as ->tgid). */
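/* Open a perfctr file descriptor for task 'tid' (0 means the calling
 * task). If 'creat' is set, allocate a new vperfctr and link it to the
 * task; otherwise attach to whatever perfctr the task already has, if
 * any.
 */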
int vperfctr_attach(int tid, int creat)
{
	struct file *filp;
	struct task_struct *tsk;
	struct vperfctr *perfctr;
	int err;
	int fd;

	filp = vperfctr_get_filp();
	if (!filp)
		return -ENOMEM;
	err = fd = get_unused_fd();
	if (err < 0)
		goto err_filp;
	perfctr = NULL;
	if (creat) {
		perfctr = get_empty_vperfctr(); /* may sleep */
		if (IS_ERR(perfctr)) {
			err = PTR_ERR(perfctr);
			goto err_fd;
		}
	}
	tsk = current;
	if (tid != 0 && tid != task_pid_vnr(tsk)) { /* remote? */
		vperfctr_lock_find_task_by_vpid();
		tsk = find_task_by_vpid(tid);
		if (tsk)
			get_task_struct(tsk);
		vperfctr_unlock_find_task_by_vpid();
		err = -ESRCH;
		if (!tsk)
			goto err_perfctr;
		err = ptrace_check_attach(tsk, 0);
		if (err < 0)
			goto err_tsk;
	}
	if (creat) {
		/* check+install must be atomic to prevent remote-control races */
		vperfctr_task_lock(tsk);
		if (!tsk->thread.perfctr) {
			perfctr->owner = tsk;
			tsk->thread.perfctr = perfctr;
			err = 0;
		} else
			err = -EEXIST;
		vperfctr_task_unlock(tsk);
		if (err)
			goto err_tsk;
	} else {
		perfctr = tsk->thread.perfctr;
		/* PERFCTR_ABI and PERFCTR_INFO don't need the perfctr.
		   Hence no non-NULL check here. */
	}
	filp->private_data = perfctr;
	if (perfctr)
		atomic_inc(&perfctr->count);
	if (tsk != current)
		put_task_struct(tsk);
	fd_install(fd, filp);
	return fd;
 err_tsk:
	if (tsk != current)
		put_task_struct(tsk);
 err_perfctr:
	if (perfctr)	/* can only occur if creat != 0 */
		put_vperfctr(perfctr);
 err_fd:
	put_unused_fd(fd);
 err_filp:
	fput(filp);
	return err;
}

/****************************************************************
 *								*
 * module_init/exit						*
 *								*
 ****************************************************************/

#ifdef MODULE
static struct vperfctr_stub off;

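/* When built as a module, install this driver's handlers in the
   vperfctr_stub callback table through which the kernel invokes them,
   and restore the previous contents on module unload. */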
static void vperfctr_stub_init(void)
{
	off = vperfctr_stub;
	vperfctr_stub.owner = THIS_MODULE;
	vperfctr_stub.exit = __vperfctr_exit;
	vperfctr_stub.flush = __vperfctr_flush;
	vperfctr_stub.suspend = __vperfctr_suspend;
	vperfctr_stub.resume = __vperfctr_resume;
	vperfctr_stub.sample = __vperfctr_sample;
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
	vperfctr_stub.set_cpus_allowed = __vperfctr_set_cpus_allowed;
#endif
}

static void vperfctr_stub_exit(void)
{
	vperfctr_stub = off;
}
#else
static inline void vperfctr_stub_init(void) { }
static inline void vperfctr_stub_exit(void) { }
#endif	/* MODULE */

int __init vperfctr_init(void)
{
	int err = vperfctrfs_init();
	if (err)
		return err;
	vperfctr_stub_init();
	return 0;
}

void __exit vperfctr_exit(void)
{
	vperfctrfs_exit();
	vperfctr_stub_exit();
}