/* $Id: virtual.c,v 1.88.2.28 2009/06/11 08:09:12 mikpe Exp $
 * Virtual per-process performance counters.
 *
 * Copyright (C) 1999-2009  Mikael Pettersson
 */
#include <linux/version.h>
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
#include <linux/config.h>
#endif
#define __NO_VERSION__
/* XXX: reconstructed header list; names inferred from the symbols used below */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/ptrace.h>
#include <linux/perfctr.h>
#include <linux/spinlock.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include "compat.h"
#include "virtual.h"
#include "marshal.h"

/****************************************************************
 *                                                              *
 * Data types and macros.                                       *
 *                                                              *
 ****************************************************************/

struct vperfctr {
/* User-visible fields: (must be first for mmap()) */
        struct perfctr_cpu_state cpu_state;
/* Kernel-private fields: */
        int si_signo;
        atomic_t count;
        spinlock_t owner_lock;
        struct task_struct *owner;
        /* sampling_timer and bad_cpus_allowed are frequently accessed,
           so they get to share a cache line */
        unsigned int sampling_timer ____cacheline_aligned;
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
        atomic_t bad_cpus_allowed;
        cpumask_t cpumask;
#endif
        pid_t updater_tgid; /* to detect self vs remote vperfctr_control races */
#if 0 && defined(CONFIG_PERFCTR_DEBUG)
        unsigned start_smp_id;
        unsigned suspended;
#endif
#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT
        unsigned int iresume_cstatus;
#endif
        unsigned int flags;
};

#define IS_RUNNING(perfctr)     perfctr_cstatus_enabled((perfctr)->cpu_state.cstatus)

/* XXX: disabled: called from switch_to() where printk() is disallowed */
#if 0 && defined(CONFIG_PERFCTR_DEBUG)
#define debug_free(perfctr) \
do { \
        int i; \
        for(i = 0; i < PAGE_SIZE/sizeof(int); ++i) \
                ((int*)(perfctr))[i] = 0xfedac0ed; \
} while(0)
#define debug_init(perfctr)     do { (perfctr)->suspended = 1; } while(0)
#define debug_suspend(perfctr) \
do { \
        if ((perfctr)->suspended) \
                printk(KERN_ERR "%s: BUG! suspending non-running perfctr (pid %d, comm %s)\n", \
                       __FUNCTION__, current->pid, current->comm); \
        (perfctr)->suspended = 1; \
} while(0)
#define debug_resume(perfctr) \
do { \
        if (!(perfctr)->suspended) \
                printk(KERN_ERR "%s: BUG! resuming non-suspended perfctr (pid %d, comm %s)\n", \
                       __FUNCTION__, current->pid, current->comm); \
        (perfctr)->suspended = 0; \
} while(0)
#define debug_check_smp_id(perfctr) \
do { \
        if ((perfctr)->start_smp_id != smp_processor_id()) { \
                printk(KERN_ERR "%s: BUG! current cpu %u differs from start cpu %u (pid %d, comm %s)\n", \
                       __FUNCTION__, smp_processor_id(), (perfctr)->start_smp_id, \
                       current->pid, current->comm); \
                return; \
        } \
} while(0)
#define debug_set_smp_id(perfctr) \
        do { (perfctr)->start_smp_id = smp_processor_id(); } while(0)
#else   /* CONFIG_PERFCTR_DEBUG */
#define debug_free(perfctr)             do{}while(0)
#define debug_init(perfctr)             do{}while(0)
#define debug_suspend(perfctr)          do{}while(0)
#define debug_resume(perfctr)           do{}while(0)
#define debug_check_smp_id(perfctr)     do{}while(0)
#define debug_set_smp_id(perfctr)       do{}while(0)
#endif  /* CONFIG_PERFCTR_DEBUG */

#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT

static void vperfctr_ihandler(unsigned long pc);
static void vperfctr_handle_overflow(struct task_struct*, struct vperfctr*);

static inline void vperfctr_set_ihandler(void)
{
        perfctr_cpu_set_ihandler(vperfctr_ihandler);
}

static inline void vperfctr_clear_iresume_cstatus(struct vperfctr *perfctr)
{
        perfctr->iresume_cstatus = 0;
}

#else
static inline void vperfctr_set_ihandler(void) { }
static inline void vperfctr_clear_iresume_cstatus(struct vperfctr *perfctr) { }
#endif

#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK

static inline void vperfctr_init_bad_cpus_allowed(struct vperfctr *perfctr)
{
        atomic_set(&perfctr->bad_cpus_allowed, 0);
}

static inline void vperfctr_init_cpumask(struct vperfctr *perfctr)
{
        cpus_setall(perfctr->cpumask);
}

/* Concurrent set_cpus_allowed() is possible. The only lock it
   can take is the task lock, so we have to take it as well.
   task_lock/unlock also disables/enables preemption. */

static inline void vperfctr_task_lock(struct task_struct *p)
{
        task_lock(p);
}

static inline void vperfctr_task_unlock(struct task_struct *p)
{
        task_unlock(p);
}

#else   /* !CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK */

static inline void vperfctr_init_bad_cpus_allowed(struct vperfctr *perfctr) { }
static inline void vperfctr_init_cpumask(struct vperfctr *perfctr) { }

/* Concurrent set_cpus_allowed() is impossible or irrelevant.
   Disabling and enabling preemption suffices for an atomic region. */

static inline void vperfctr_task_lock(struct task_struct *p)
{
        preempt_disable();
}

static inline void vperfctr_task_unlock(struct task_struct *p)
{
        preempt_enable();
}

#endif  /* !CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK */

/* How to lock around find_task_by_vpid(). The tasklist_lock always
   works, but it's no longer exported starting with kernel 2.6.18.
   For kernels 2.6.18 and newer use rcu_read_{lock,unlock}(). */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18)
static inline void vperfctr_lock_find_task_by_vpid(void)
{
        rcu_read_lock();
}

static inline void vperfctr_unlock_find_task_by_vpid(void)
{
        rcu_read_unlock();
}
#else   /* < 2.6.18 */
static inline void vperfctr_lock_find_task_by_vpid(void)
{
        read_lock(&tasklist_lock);
}

static inline void vperfctr_unlock_find_task_by_vpid(void)
{
        read_unlock(&tasklist_lock);
}
#endif  /* < 2.6.18 */

/****************************************************************
 *                                                              *
 * Resource management.                                         *
 *                                                              *
 ****************************************************************/

/* XXX: perhaps relax this to number of _live_ perfctrs */
static DEFINE_MUTEX(nrctrs_mutex);
static int nrctrs;
static const char this_service[] = __FILE__;

static int inc_nrctrs(void)
{
        const char *other;

        other = NULL;
        mutex_lock(&nrctrs_mutex);
        if (++nrctrs == 1) {
                other = perfctr_cpu_reserve(this_service);
                if (other)
                        nrctrs = 0;
        }
        mutex_unlock(&nrctrs_mutex);
        if (other) {
                printk(KERN_ERR __FILE__
                       ": cannot operate, perfctr hardware taken by '%s'\n",
                       other);
                return -EBUSY;
        }
        vperfctr_set_ihandler();
        return 0;
}

static void dec_nrctrs(void)
{
        mutex_lock(&nrctrs_mutex);
        if (--nrctrs == 0)
                perfctr_cpu_release(this_service);
        mutex_unlock(&nrctrs_mutex);
}

static struct vperfctr *vperfctr_alloc(void)
{
        unsigned long page;

        if (inc_nrctrs() != 0)
                return ERR_PTR(-EBUSY);
        page = get_zeroed_page(GFP_KERNEL);
        if (!page) {
                dec_nrctrs();
                return ERR_PTR(-ENOMEM);
        }
        SetPageReserved(virt_to_page(page));
        return (struct vperfctr*) page;
}

static void vperfctr_free(struct vperfctr *perfctr)
{
        debug_free(perfctr);
        ClearPageReserved(virt_to_page(perfctr));
        free_page((unsigned long)perfctr);
        dec_nrctrs();
}

static struct vperfctr *get_empty_vperfctr(void)
{
        struct vperfctr *perfctr = vperfctr_alloc();
        if (!IS_ERR(perfctr)) {
                atomic_set(&perfctr->count, 1);
                vperfctr_init_bad_cpus_allowed(perfctr);
                vperfctr_init_cpumask(perfctr);
                spin_lock_init(&perfctr->owner_lock);
                debug_init(perfctr);
        }
        return perfctr;
}

static void put_vperfctr(struct vperfctr *perfctr)
{
        if (atomic_dec_and_test(&perfctr->count))
                vperfctr_free(perfctr);
}

/****************************************************************
 *                                                              *
 * Basic counter operations.                                    *
 * These must all be called by the owner process only.          *
 * These must all be called with preemption disabled.           *
 *                                                              *
 ****************************************************************/

/* PRE: IS_RUNNING(perfctr)
 * Suspend the counters.
 */
static inline void vperfctr_suspend(struct vperfctr *perfctr)
{
        debug_suspend(perfctr);
        debug_check_smp_id(perfctr);
        perfctr_cpu_suspend(&perfctr->cpu_state);
}

static inline void vperfctr_reset_sampling_timer(struct vperfctr *perfctr)
{
        /* XXX: base the value on perfctr_info.cpu_khz instead! */
        perfctr->sampling_timer = HZ/2;
}

/* PRE: perfctr == current->thread.perfctr && IS_RUNNING(perfctr)
 * Restart the counters.
 */
static inline void vperfctr_resume(struct vperfctr *perfctr)
{
        debug_resume(perfctr);
        perfctr_cpu_resume(&perfctr->cpu_state);
        vperfctr_reset_sampling_timer(perfctr);
        debug_set_smp_id(perfctr);
}

static inline void vperfctr_resume_with_overflow_check(struct vperfctr *perfctr)
{
#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT
        if (perfctr_cpu_has_pending_interrupt(&perfctr->cpu_state)) {
                vperfctr_handle_overflow(current, perfctr);
                return;
        }
#endif
        vperfctr_resume(perfctr);
}

/* Sample the counters but do not suspend them. */
static void vperfctr_sample(struct vperfctr *perfctr)
{
        if (IS_RUNNING(perfctr)) {
                debug_check_smp_id(perfctr);
                perfctr_cpu_sample(&perfctr->cpu_state);
                vperfctr_reset_sampling_timer(perfctr);
        }
}

#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT

/* vperfctr interrupt handler (XXX: add buffering support) */
/* PREEMPT note: called in IRQ context with preemption disabled. */
static void vperfctr_ihandler(unsigned long pc)
{
        struct task_struct *tsk = current;
        struct vperfctr *perfctr;

        perfctr = tsk->thread.perfctr;
        if (!perfctr) {
                printk(KERN_ERR "%s: BUG! pid %d has no vperfctr\n",
                       __FUNCTION__, tsk->pid);
                return;
        }
        if (!perfctr_cstatus_has_ictrs(perfctr->cpu_state.cstatus)) {
                printk(KERN_ERR "%s: BUG! vperfctr has cstatus %#x (pid %d, comm %s)\n",
                       __FUNCTION__, perfctr->cpu_state.cstatus, tsk->pid, tsk->comm);
                return;
        }
        vperfctr_suspend(perfctr);
        vperfctr_handle_overflow(tsk, perfctr);
}

static void vperfctr_handle_overflow(struct task_struct *tsk,
                                     struct vperfctr *perfctr)
{
        unsigned int pmc_mask;
        siginfo_t si;
        sigset_t old_blocked;

        pmc_mask = perfctr_cpu_identify_overflow(&perfctr->cpu_state);
        if (!pmc_mask) {
                printk(KERN_ERR "%s: BUG! pid %d has unidentifiable overflow source\n",
                       __FUNCTION__, tsk->pid);
                return;
        }
        /* suspend a-mode and i-mode PMCs, leaving only TSC on */
        /* XXX: some people also want to suspend the TSC */
        perfctr->iresume_cstatus = perfctr->cpu_state.cstatus;
        if (perfctr_cstatus_has_tsc(perfctr->iresume_cstatus)) {
                perfctr->cpu_state.cstatus = perfctr_mk_cstatus(1, 0, 0);
                vperfctr_resume(perfctr);
        } else
                perfctr->cpu_state.cstatus = 0;
        si.si_signo = perfctr->si_signo;
        si.si_errno = 0;
        si.si_code = SI_PMC_OVF;
        si.si_pmc_ovf_mask = pmc_mask;
        /* deliver signal without waking up the receiver */
        spin_lock_irq(&task_siglock(tsk));
        old_blocked = tsk->blocked;
        sigaddset(&tsk->blocked, si.si_signo);
        spin_unlock_irq(&task_siglock(tsk));
        if (!send_sig_info(si.si_signo, &si, tsk))
                send_sig(si.si_signo, tsk, 1);
        spin_lock_irq(&task_siglock(tsk));
        tsk->blocked = old_blocked;
        recalc_sigpending();
        spin_unlock_irq(&task_siglock(tsk));
}
#endif
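/*
 * Example: catching the overflow notification in user space.  A minimal
 * sketch, assuming the perfctr user ABI (SI_PMC_OVF, si_pmc_ovf_mask)
 * and a perfctr fd from the attach interface; "perfctr_fd" and the
 * choice of SIGIO are illustrative, and error handling is omitted.
 *
 *      #include <signal.h>
 *      #include <sys/ioctl.h>
 *
 *      static int perfctr_fd;
 *
 *      static void on_pmc_ovf(int signo, siginfo_t *si, void *uc)
 *      {
 *              if (si->si_code != SI_PMC_OVF)
 *                      return;
 *              // si->si_pmc_ovf_mask tells which i-mode PMCs overflowed.
 *              // The kernel suspended the counters (only the TSC keeps
 *              // running, see vperfctr_handle_overflow() above), so
 *              // re-arm them with the iresume ioctl.
 *              ioctl(perfctr_fd, VPERFCTR_IRESUME, 0);
 *      }
 *
 *      struct sigaction sa = { .sa_sigaction = on_pmc_ovf,
 *                              .sa_flags = SA_SIGINFO };
 *      sigaction(SIGIO, &sa, NULL);    // must match control.si_signo
 *
 * Note that the kernel delivers the signal without waking the target,
 * so the handler only runs once the task is scheduled again.
 */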
/****************************************************************
 *                                                              *
 * Process management operations.                               *
 * These must all, with the exception of vperfctr_unlink()      *
 * and __vperfctr_set_cpus_allowed(), be called by the owner    *
 * process only.                                                *
 *                                                              *
 ****************************************************************/

/* Called from exit_thread() or sys_vperfctr_unlink().
 * If the counters are running, stop them and sample their final values.
 * Detach the vperfctr object from its owner task.
 * PREEMPT note: exit_thread() does not run with preemption disabled.
 */
static void vperfctr_unlink(struct task_struct *owner, struct vperfctr *perfctr)
{
        /* this synchronises with vperfctr_ioctl() */
        spin_lock(&perfctr->owner_lock);
        perfctr->owner = NULL;
        spin_unlock(&perfctr->owner_lock);

        /* perfctr suspend+detach must be atomic wrt process suspend */
        /* this also synchronises with perfctr_set_cpus_allowed() */
        vperfctr_task_lock(owner);
        if (IS_RUNNING(perfctr) && owner == current)
                vperfctr_suspend(perfctr);
        owner->thread.perfctr = NULL;
        vperfctr_task_unlock(owner);

        perfctr->cpu_state.cstatus = 0;
        vperfctr_clear_iresume_cstatus(perfctr);
        put_vperfctr(perfctr);
}

void __vperfctr_exit(struct vperfctr *perfctr)
{
        vperfctr_unlink(current, perfctr);
}

/* sys_execve() -> .. -> flush_old_exec() -> .. -> __vperfctr_flush().
 * Unlink the thread's perfctr state, if the CLOEXEC control flag is set.
 * PREEMPT note: flush_old_exec() does not run with preemption disabled.
 */
void __vperfctr_flush(struct vperfctr *perfctr)
{
        if (perfctr->flags & VPERFCTR_CONTROL_CLOEXEC)
                __vperfctr_exit(perfctr);
}

/* schedule() --> switch_to() --> .. --> __vperfctr_suspend().
 * If the counters are running, suspend them.
 * PREEMPT note: switch_to() runs with preemption disabled.
 */
void __vperfctr_suspend(struct vperfctr *perfctr)
{
        if (IS_RUNNING(perfctr))
                vperfctr_suspend(perfctr);
}

/* schedule() --> switch_to() --> .. --> __vperfctr_resume().
 * PRE: perfctr == current->thread.perfctr
 * If the counters are runnable, resume them.
 * PREEMPT note: switch_to() runs with preemption disabled.
 */
void __vperfctr_resume(struct vperfctr *perfctr)
{
        if (IS_RUNNING(perfctr)) {
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
                if (unlikely(atomic_read(&perfctr->bad_cpus_allowed)) &&
                    perfctr_cstatus_nrctrs(perfctr->cpu_state.cstatus)) {
                        perfctr->cpu_state.cstatus = 0;
                        vperfctr_clear_iresume_cstatus(perfctr);
                        BUG_ON(current->state != TASK_RUNNING);
                        send_sig(SIGILL, current, 1);
                        return;
                }
#endif
                vperfctr_resume_with_overflow_check(perfctr);
        }
}

/* Called from update_one_process() [triggered by timer interrupt].
 * PRE: perfctr == current->thread.perfctr.
 * Sample the counters but do not suspend them.
 * Needed to avoid precision loss due to multiple counter
 * wraparounds between resume/suspend for CPU-bound processes.
 * PREEMPT note: called in IRQ context with preemption disabled.
 */
void __vperfctr_sample(struct vperfctr *perfctr)
{
        if (--perfctr->sampling_timer == 0)
                vperfctr_sample(perfctr);
}

#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
/* Called from set_cpus_allowed().
 * PRE: current holds task_lock(owner)
 * PRE: owner->thread.perfctr == perfctr
 */
void __vperfctr_set_cpus_allowed(struct task_struct *owner,
                                 struct vperfctr *perfctr,
                                 cpumask_t new_mask)
{
        if (!cpus_subset(new_mask, perfctr->cpumask)) {
                atomic_set(&perfctr->bad_cpus_allowed, 1);
                printk(KERN_WARNING "perfctr: process %d (comm %s) issued unsafe"
                       " set_cpus_allowed() on process %d (comm %s)\n",
                       current->pid, current->comm, owner->pid, owner->comm);
        } else
                atomic_set(&perfctr->bad_cpus_allowed, 0);
}
#endif
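/*
 * Example: avoiding the SIGILL path above from user space.  If a task's
 * affinity is widened beyond the vperfctr's cpumask, the next resume
 * kills it with SIGILL, so a monitor should keep the affinity inside
 * the allowed CPUs.  A minimal sketch, assuming the global
 * PERFCTR_CPUS_FORBIDDEN command and a perfctr_cpu_mask-style layout
 * (nrwords followed by bitmask words); "perfctr_fd" is illustrative
 * and error handling is omitted.
 *
 *      #define _GNU_SOURCE
 *      #include <sched.h>
 *      #include <sys/ioctl.h>
 *
 *      struct { unsigned int nrwords; unsigned int mask[4]; } fm = { .nrwords = 4 };
 *      cpu_set_t set;
 *      unsigned int cpu;
 *
 *      ioctl(perfctr_fd, PERFCTR_CPUS_FORBIDDEN, &fm);
 *      CPU_ZERO(&set);
 *      for(cpu = 0; cpu < 32*fm.nrwords; ++cpu)
 *              if (!(fm.mask[cpu/32] & (1u << (cpu%32))))
 *                      CPU_SET(cpu, &set);
 *      sched_setaffinity(pid, sizeof(set), &set); // subset of the allowed
 *                                                 // mask: bad_cpus_allowed
 *                                                 // stays 0
 */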
/****************************************************************
 *                                                              *
 * Virtual perfctr "system calls".                              *
 * These can be called by the owner process (tsk == current),   *
 * a monitor process which has the owner under ptrace ATTACH    *
 * control (tsk && tsk != current), or anyone with a handle to  *
 * an unlinked perfctr (!tsk).                                  *
 *                                                              *
 ****************************************************************/

static int sys_vperfctr_control(struct vperfctr *perfctr,
                                struct perfctr_struct_buf *argp,
                                struct task_struct *tsk)
{
        struct vperfctr_control control;
        int err;
        unsigned int next_cstatus;
        unsigned int nrctrs, i;
        cpumask_t cpumask;

        if (!tsk)
                return -ESRCH;  /* attempt to update unlinked perfctr */

        err = perfctr_copy_from_user(&control, argp, &vperfctr_control_sdesc);
        if (err)
                return err;

        /* Step 1: Update the control but keep the counters disabled.
           PREEMPT note: Preemption is disabled since we're updating
           an active perfctr. */
        preempt_disable();
        if (IS_RUNNING(perfctr)) {
                if (tsk == current)
                        vperfctr_suspend(perfctr);
                perfctr->cpu_state.cstatus = 0;
                vperfctr_clear_iresume_cstatus(perfctr);
        }
        perfctr->cpu_state.control = control.cpu_control;
        /* remote access note: perfctr_cpu_update_control() is ok */
        cpus_setall(cpumask);
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
        /* make a stopped vperfctr have an unconstrained cpumask */
        perfctr->cpumask = cpumask;
#endif
        err = perfctr_cpu_update_control(&perfctr->cpu_state, &cpumask);
        if (err < 0) {
                next_cstatus = 0;
        } else {
                next_cstatus = perfctr->cpu_state.cstatus;
                perfctr->cpu_state.cstatus = 0;
                perfctr->updater_tgid = current->tgid;
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
                perfctr->cpumask = cpumask;
#endif
        }
        preempt_enable_no_resched();

        if (!perfctr_cstatus_enabled(next_cstatus))
                return err;

#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
        /* Step 2: Update the task's CPU affinity mask.
           PREEMPT note: Preemption must be enabled for set_cpus_allowed(). */
        if (control.cpu_control.nractrs || control.cpu_control.nrictrs) {
                cpumask_t old_mask, new_mask;

                old_mask = tsk->cpus_allowed;
                cpus_and(new_mask, old_mask, cpumask);
                if (cpus_empty(new_mask))
                        return -EINVAL;
                if (!cpus_equal(new_mask, old_mask))
                        set_cpus_allowed(tsk, new_mask);
        }
#endif

        /* Step 3: Enable the counters with the new control and affinity.
           PREEMPT note: Preemption is disabled since we're updating
           an active perfctr. */
        preempt_disable();

        /* We had to enable preemption above for set_cpus_allowed() so we may
           have lost a race with a concurrent update via the remote control
           interface. If so then we must abort our update of this perfctr. */
        if (perfctr->updater_tgid != current->tgid) {
                printk(KERN_WARNING "perfctr: control update by task %d"
                       " was lost due to race with update by task %d\n",
                       current->tgid, perfctr->updater_tgid);
                err = -EBUSY;
        } else {
                /* XXX: validate si_signo? */
                perfctr->si_signo = control.si_signo;

                perfctr->cpu_state.cstatus = next_cstatus;

                if (!perfctr_cstatus_has_tsc(next_cstatus))
                        perfctr->cpu_state.tsc_sum = 0;

                nrctrs = perfctr_cstatus_nrctrs(next_cstatus);
                for(i = 0; i < nrctrs; ++i)
                        if (!(control.preserve & (1<<i)))
                                perfctr->cpu_state.pmc[i].sum = 0;

                perfctr->flags = control.flags;

                if (tsk == current)
                        vperfctr_resume(perfctr);
        }

        preempt_enable();
        return err;
}

static int sys_vperfctr_iresume(struct vperfctr *perfctr,
                                const struct task_struct *tsk)
{
#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT
        unsigned int iresume_cstatus;

        if (!tsk)
                return -ESRCH;  /* attempt to update unlinked perfctr */

        iresume_cstatus = perfctr->iresume_cstatus;
        if (!perfctr_cstatus_has_ictrs(iresume_cstatus))
                return -EPERM;

        /* PREEMPT note: preemption is disabled over the entire
           region because we're updating an active perfctr. */
        preempt_disable();

        if (IS_RUNNING(perfctr) && tsk == current)
                vperfctr_suspend(perfctr);

        perfctr->cpu_state.cstatus = iresume_cstatus;
        perfctr->iresume_cstatus = 0;

        /* remote access note: perfctr_cpu_ireload() is ok */
        perfctr_cpu_ireload(&perfctr->cpu_state);

        if (tsk == current)
                vperfctr_resume(perfctr);

        preempt_enable();

        return 0;
#else
        return -ENOSYS;
#endif
}

static int sys_vperfctr_unlink(struct vperfctr *perfctr, struct task_struct *tsk)
{
        if (tsk)
                vperfctr_unlink(tsk, perfctr);
        return 0;
}

static int sys_vperfctr_read_sum(struct vperfctr *perfctr,
                                 struct perfctr_struct_buf *argp,
                                 const struct task_struct *tsk)
{
        struct perfctr_sum_ctrs sum;

        if (tsk == current) {
                preempt_disable();
                vperfctr_sample(perfctr);
        }
        //sum = perfctr->cpu_state.sum;
        {
                int j;
                sum.tsc = perfctr->cpu_state.tsc_sum;
                for(j = 0; j < ARRAY_SIZE(sum.pmc); ++j)
                        sum.pmc[j] = perfctr->cpu_state.pmc[j].sum;
        }
        if (tsk == current)
                preempt_enable();
        return perfctr_copy_to_user(argp, &sum, &perfctr_sum_ctrs_sdesc);
}

static int sys_vperfctr_read_control(struct vperfctr *perfctr,
                                     struct perfctr_struct_buf *argp,
                                     const struct task_struct *tsk)
{
        struct vperfctr_control control;

        /* PREEMPT note: While we're reading our own control, another
           process may ptrace ATTACH to us and update our control.
           Disable preemption to ensure we get a consistent copy.
           Not needed for other cases since the perfctr is either
           unlinked or its owner is ptrace ATTACH suspended by us. */
        if (tsk == current)
                preempt_disable();
        control.si_signo = perfctr->si_signo;
        control.cpu_control = perfctr->cpu_state.control;
        control.flags = perfctr->flags;
        if (tsk == current)
                preempt_enable();
        control.preserve = 0;
        return perfctr_copy_to_user(argp, &control, &vperfctr_control_sdesc);
}
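/*
 * Example: driving these "system calls" from user space.  A minimal
 * self-monitoring sketch, assuming libperfctr's vperfctr_open(),
 * vperfctr_control() and vperfctr_read_ctrs() wrappers (which issue
 * the VPERFCTR_CONTROL and VPERFCTR_READ_SUM ioctls handled below);
 * the evntsel encoding is CPU-specific and illustrative, and error
 * handling is omitted.
 *
 *      #include "libperfctr.h"
 *
 *      struct vperfctr *self = vperfctr_open();
 *      struct vperfctr_control control;
 *      struct perfctr_sum_ctrs sum;
 *
 *      memset(&control, 0, sizeof control);
 *      control.cpu_control.tsc_on = 1;
 *      control.cpu_control.nractrs = 1;        // one a-mode PMC, no i-mode
 *      control.cpu_control.pmc_map[0] = 0;
 *      control.cpu_control.evntsel[0] = 0x00410076; // illustrative event
 *      vperfctr_control(self, &control);       // Steps 1-3 above
 *
 *      do_work();
 *      vperfctr_read_ctrs(self, &sum);         // sys_vperfctr_read_sum()
 *      printf("tsc %llu pmc0 %llu\n", sum.tsc, sum.pmc[0]);
 */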
/****************************************************************
 *                                                              *
 * Virtual perfctr file operations.                             *
 *                                                              *
 ****************************************************************/

static int vperfctr_mmap(struct file *filp, struct vm_area_struct *vma)
{
        struct vperfctr *perfctr;

#ifdef CONFIG_ARM
#define _PAGE_RW        L_PTE_WRITE
#endif
        /* Only allow read-only mapping of first page. */
        if ((vma->vm_end - vma->vm_start) != PAGE_SIZE ||
            vma->vm_pgoff != 0 ||
            (pgprot_val(vma->vm_page_prot) & _PAGE_RW) ||
            (vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
                return -EPERM;
        perfctr = filp->private_data;
        if (!perfctr)
                return -EPERM;
        /* 2.6.29-rc1 changed arch/x86/mm/pat.c to WARN_ON when
           remap_pfn_range() is applied to plain RAM pages.
           Comments there indicate that one should set_memory_wc()
           before the remap, but that doesn't silence the WARN_ON.
           Luckily vm_insert_page() works without complaints. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
        return vm_insert_page(vma, vma->vm_start, virt_to_page((unsigned long)perfctr));
#else
        return remap_pfn_range(vma, vma->vm_start,
                               virt_to_phys(perfctr) >> PAGE_SHIFT,
                               PAGE_SIZE, vma->vm_page_prot);
#endif
}
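/*
 * Example: the point of the read-only mapping.  User space maps the
 * first page (the user-visible cpu_state at the start of struct
 * vperfctr) and reads counter sums without a syscall, using tsc_start
 * as a sequence lock: if it changed during the read, a context switch
 * intervened and the read is retried.  A minimal x86 sketch, assuming
 * a tsc_start field alongside tsc_sum and a user-space rdtsc() helper;
 * error handling omitted.
 *
 *      #include <sys/mman.h>
 *
 *      volatile const struct perfctr_cpu_state *kstate =
 *              mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, perfctr_fd, 0);
 *
 *      unsigned long long read_tsc_sum(void)
 *      {
 *              unsigned int start;
 *              unsigned long long sum;
 *              do {
 *                      start = kstate->tsc_start;
 *                      sum = kstate->tsc_sum + (rdtsc() - start);
 *              } while (kstate->tsc_start != start);
 *              return sum;
 *      }
 */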
static int vperfctr_release(struct inode *inode, struct file *filp)
{
        struct vperfctr *perfctr = filp->private_data;
        filp->private_data = NULL;
        if (perfctr)
                put_vperfctr(perfctr);
        return 0;
}

static long vperfctr_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        struct vperfctr *perfctr;
        struct task_struct *tsk;
        int ret;

        switch (cmd) {
        case PERFCTR_ABI:
                return sys_perfctr_abi((unsigned int*)arg);
        case PERFCTR_INFO:
                return sys_perfctr_info((struct perfctr_struct_buf*)arg);
        case PERFCTR_CPUS:
                return sys_perfctr_cpus((struct perfctr_cpu_mask*)arg);
        case PERFCTR_CPUS_FORBIDDEN:
                return sys_perfctr_cpus_forbidden((struct perfctr_cpu_mask*)arg);
        }
        perfctr = filp->private_data;
        if (!perfctr)
                return -EINVAL;
        tsk = current;
        if (perfctr != current->thread.perfctr) {
                /* this synchronises with vperfctr_unlink() and itself */
                spin_lock(&perfctr->owner_lock);
                tsk = perfctr->owner;
                if (tsk)
                        get_task_struct(tsk);
                spin_unlock(&perfctr->owner_lock);
                if (tsk) {
                        ret = ptrace_check_attach(tsk, 0);
                        if (ret < 0)
                                goto out;
                }
        }
        switch (cmd) {
        case VPERFCTR_CONTROL:
                ret = sys_vperfctr_control(perfctr, (struct perfctr_struct_buf*)arg, tsk);
                break;
        case VPERFCTR_UNLINK:
                ret = sys_vperfctr_unlink(perfctr, tsk);
                break;
        case VPERFCTR_READ_SUM:
                ret = sys_vperfctr_read_sum(perfctr, (struct perfctr_struct_buf*)arg, tsk);
                break;
        case VPERFCTR_IRESUME:
                ret = sys_vperfctr_iresume(perfctr, tsk);
                break;
        case VPERFCTR_READ_CONTROL:
                ret = sys_vperfctr_read_control(perfctr, (struct perfctr_struct_buf*)arg, tsk);
                break;
        default:
                ret = -EINVAL;
        }
 out:
        if (tsk && tsk != current)
                put_task_struct(tsk);
        return ret;
}

#if !HAVE_UNLOCKED_IOCTL
static int vperfctr_ioctl_oldstyle(struct inode *inode, struct file *filp,
                                   unsigned int cmd, unsigned long arg)
{
        return vperfctr_ioctl(filp, cmd, arg);
}
#endif

static
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
const
#endif
struct file_operations vperfctr_file_ops = {
        .owner = THIS_MODULE,
        .mmap = vperfctr_mmap,
        .release = vperfctr_release,
        /* 2.6.11-rc2 introduced HAVE_UNLOCKED_IOCTL and HAVE_COMPAT_IOCTL */
#if HAVE_UNLOCKED_IOCTL
        .unlocked_ioctl = vperfctr_ioctl,
#else
        .ioctl = vperfctr_ioctl_oldstyle,
#endif
#if defined(CONFIG_IA32_EMULATION) && HAVE_COMPAT_IOCTL
        .compat_ioctl = vperfctr_ioctl,
#endif
};
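/*
 * Example: the remote-control path above (tsk != current) requires the
 * monitor to hold the owner stopped under ptrace, or
 * ptrace_check_attach() fails and the ioctl is rejected.  Monitor-side
 * sketch in user space; attach_to_pid() is a hypothetical helper for
 * whatever attach interface produces the target's perfctr fd, and
 * error handling is omitted.
 *
 *      #include <sys/ptrace.h>
 *      #include <sys/wait.h>
 *
 *      ptrace(PTRACE_ATTACH, pid, 0, 0);
 *      waitpid(pid, NULL, 0);                  // owner is now stopped
 *      int fd = attach_to_pid(pid);            // hypothetical helper
 *      ioctl(fd, VPERFCTR_CONTROL, &ctl_buf);  // remote update; races with
 *                                              // a concurrent updater are
 *                                              // caught via updater_tgid
 *      ptrace(PTRACE_DETACH, pid, 0, 0);
 */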
/****************************************************************
 *                                                              *
 * File system for virtual perfctrs. Based on pipefs.           *
 *                                                              *
 ****************************************************************/

#define VPERFCTRFS_MAGIC (('V'<<24)|('P'<<16)|('M'<<8)|('C'))

#include <linux/mount.h>        /* XXX: header name lost; inferred from the vfsmount uses below */

/* 2.6 kernels prior to 2.6.11-rc1 don't EXPORT_SYMBOL() get_sb_pseudo().
   This is a verbatim copy, only renamed. */
#if defined(MODULE) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
static struct super_block *
perfctr_get_sb_pseudo(struct file_system_type *fs_type, char *name,
                      struct super_operations *ops, unsigned long magic)
{
        struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
        static struct super_operations default_ops = {.statfs = simple_statfs};
        struct dentry *dentry;
        struct inode *root;
        struct qstr d_name = {.name = name, .len = strlen(name)};

        if (IS_ERR(s))
                return s;

        s->s_flags = MS_NOUSER;
        s->s_maxbytes = ~0ULL;
        s->s_blocksize = 1024;
        s->s_blocksize_bits = 10;
        s->s_magic = magic;
        s->s_op = ops ? ops : &default_ops;
        root = new_inode(s);
        if (!root)
                goto Enomem;
        root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
        root->i_uid = root->i_gid = 0;
        root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
        dentry = d_alloc(NULL, &d_name);
        if (!dentry) {
                iput(root);
                goto Enomem;
        }
        dentry->d_sb = s;
        dentry->d_parent = dentry;
        d_instantiate(dentry, root);
        s->s_root = dentry;
        s->s_flags |= MS_ACTIVE;
        return s;

Enomem:
        up_write(&s->s_umount);
        deactivate_super(s);
        return ERR_PTR(-ENOMEM);
}
#undef get_sb_pseudo
#define get_sb_pseudo perfctr_get_sb_pseudo
#endif  /* MODULE && VERSION < 2.6.11 */

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18)
static int
vperfctrfs_get_sb(struct file_system_type *fs_type,
                  int flags, const char *dev_name, void *data,
                  struct vfsmount *mnt)
{
        return get_sb_pseudo(fs_type, "vperfctr:", NULL, VPERFCTRFS_MAGIC, mnt);
}
#else
static struct super_block *
vperfctrfs_get_sb(struct file_system_type *fs_type,
                  int flags, const char *dev_name, void *data)
{
        return get_sb_pseudo(fs_type, "vperfctr:", NULL, VPERFCTRFS_MAGIC);
}
#endif

static struct file_system_type vperfctrfs_type = {
        .name           = "vperfctrfs",
        .get_sb         = vperfctrfs_get_sb,
        .kill_sb        = kill_anon_super,
};

/* XXX: check if s/vperfctr_mnt/vperfctrfs_type.kern_mnt/ would work */
static struct vfsmount *vperfctr_mnt;

static int __init vperfctrfs_init(void)
{
        int err = register_filesystem(&vperfctrfs_type);
        if (!err) {
                vperfctr_mnt = kern_mount(&vperfctrfs_type);
                if (!IS_ERR(vperfctr_mnt))
                        return 0;
                err = PTR_ERR(vperfctr_mnt);
                unregister_filesystem(&vperfctrfs_type);
        }
        return err;
}

static void __exit vperfctrfs_exit(void)
{
        unregister_filesystem(&vperfctrfs_type);
        mntput(vperfctr_mnt);
}

static struct inode *vperfctr_get_inode(void)
{
        struct inode *inode;

        inode = new_inode(vperfctr_mnt->mnt_sb);
        if (!inode)
                return NULL;
        inode->i_fop = &vperfctr_file_ops;
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IFCHR | S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) && !defined(DONT_HAVE_i_blksize)
        inode->i_blksize = 0;
#endif
        return inode;
}

static int vperfctrfs_delete_dentry(struct dentry *dentry)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
        /*
         * At creation time, we pretended this dentry was hashed
         * (by clearing the DCACHE_UNHASHED bit in d_flags).
         * At delete time, we restore the truth: not hashed
         * (so that dput() can proceed correctly).
         */
        dentry->d_flags |= DCACHE_UNHASHED;
        return 0;
#else
        return 1;
#endif
}

static
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,30)
const
#endif
struct dentry_operations vperfctrfs_dentry_operations = {
        .d_delete       = vperfctrfs_delete_dentry,
};

static struct dentry *vperfctr_d_alloc_root(struct inode *inode)
{
        struct qstr this;
        char name[32];
        struct dentry *dentry;

        sprintf(name, "[%lu]", inode->i_ino);
        this.name = name;
        this.len = strlen(name);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
        this.hash = 0;
#else
        this.hash = inode->i_ino; /* will go */
#endif
        dentry = d_alloc(vperfctr_mnt->mnt_sb->s_root, &this);
        if (dentry) {
                dentry->d_op = &vperfctrfs_dentry_operations;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
                /*
                 * We don't want to publish this dentry into the global
                 * dentry hash table, so we pretend it is already hashed
                 * by unsetting DCACHE_UNHASHED.  This permits a working
                 * /proc/$pid/fd/XXX on vperfctrs.
                 */
                dentry->d_flags &= ~DCACHE_UNHASHED;
                d_instantiate(dentry, inode);
#else
                d_add(dentry, inode);
#endif
        }
        return dentry;
}

static struct file *vperfctr_get_filp(void)
{
        struct file *filp;
        struct inode *inode;
        struct dentry *dentry;

        inode = vperfctr_get_inode();
        if (!inode)
                goto out;
        dentry = vperfctr_d_alloc_root(inode);
        if (!dentry)
                goto out_inode;

        /*
         * Create the filp _after_ the inode and dentry, to avoid
         * needing access to put_filp(), which is no longer exported
         * starting with kernel 2.6.10-rc1. fput() is available but
         * doesn't work on incomplete files. We now need access to
         * dput() instead, but that's Ok.
         */
        filp = get_empty_filp();
        if (!filp)
                goto out_dentry;

        filp_vfsmnt(filp) = mntget(vperfctr_mnt);
        filp_dentry(filp) = dentry;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,2)
        filp->f_mapping = dentry->d_inode->i_mapping;
#endif

        filp->f_pos = 0;
        filp->f_flags = 0;
        filp->f_op = fops_get(&vperfctr_file_ops); /* fops_get() for MODULE */
        filp->f_mode = FMODE_READ;
        filp->f_version = 0;

        return filp;

 out_dentry:
        dput(dentry);
        goto out;       /* dput() also does iput() */
 out_inode:
        iput(inode);
 out:
        return NULL;
}

/* tid is the actual task/thread id (née pid, stored as ->pid),
   pid/tgid is that 2.6 thread group id crap (stored as ->tgid) */
int vperfctr_attach(int tid, int creat)
{
        struct file *filp;
        struct task_struct *tsk;
        struct vperfctr *perfctr;
        int err;
        int fd;

        filp = vperfctr_get_filp();
        if (!filp)
                return -ENOMEM;
        err = fd = get_unused_fd();
        if (err < 0)
                goto err_filp;
        perfctr = NULL;
        if (creat) {
                perfctr = get_empty_vperfctr(); /* may sleep */
                if (IS_ERR(perfctr)) {
                        err = PTR_ERR(perfctr);
                        goto err_fd;
                }
        }
        tsk = current;
        if (tid != 0 && tid != task_pid_vnr(tsk)) { /* remote? */
                vperfctr_lock_find_task_by_vpid();
                tsk = find_task_by_vpid(tid);
                if (tsk)
                        get_task_struct(tsk);
                vperfctr_unlock_find_task_by_vpid();
                err = -ESRCH;
                if (!tsk)
                        goto err_perfctr;
                err = ptrace_check_attach(tsk, 0);
                if (err < 0)
                        goto err_tsk;
        }
        if (creat) {
                /* check+install must be atomic to prevent remote-control races */
                vperfctr_task_lock(tsk);
                if (!tsk->thread.perfctr) {
                        perfctr->owner = tsk;
                        tsk->thread.perfctr = perfctr;
                        err = 0;
                } else
                        err = -EEXIST;
                vperfctr_task_unlock(tsk);
                if (err)
                        goto err_tsk;
        } else {
                perfctr = tsk->thread.perfctr;
                /* PERFCTR_ABI and PERFCTR_INFO don't need the perfctr.
                   Hence no non-NULL check here. */
        }
        filp->private_data = perfctr;
        if (perfctr)
                atomic_inc(&perfctr->count);
        if (tsk != current)
                put_task_struct(tsk);
        fd_install(fd, filp);
        return fd;
 err_tsk:
        if (tsk != current)
                put_task_struct(tsk);
 err_perfctr:
        if (perfctr)    /* can only occur if creat != 0 */
                put_vperfctr(perfctr);
 err_fd:
        put_unused_fd(fd);
 err_filp:
        fput(filp);
        return err;
}

/****************************************************************
 *                                                              *
 * module_init/exit                                             *
 *                                                              *
 ****************************************************************/

#ifdef MODULE
static struct vperfctr_stub off;

static void vperfctr_stub_init(void)
{
        off = vperfctr_stub;
        vperfctr_stub.owner = THIS_MODULE;
        vperfctr_stub.exit = __vperfctr_exit;
        vperfctr_stub.flush = __vperfctr_flush;
        vperfctr_stub.suspend = __vperfctr_suspend;
        vperfctr_stub.resume = __vperfctr_resume;
        vperfctr_stub.sample = __vperfctr_sample;
#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK
        vperfctr_stub.set_cpus_allowed = __vperfctr_set_cpus_allowed;
#endif
}

static void vperfctr_stub_exit(void)
{
        vperfctr_stub = off;
}
#else
static inline void vperfctr_stub_init(void) { }
static inline void vperfctr_stub_exit(void) { }
#endif  /* MODULE */

int __init vperfctr_init(void)
{
        int err = vperfctrfs_init();
        if (err)
                return err;
        vperfctr_stub_init();
        return 0;
}

void __exit vperfctr_exit(void)
{
        vperfctrfs_exit();
        vperfctr_stub_exit();
}
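/*
 * Example: how the stub table above is reached when this file is built
 * as a module.  The perfctr kernel patch routes its scheduler and exit
 * hooks through vperfctr_stub so that the module may be absent; a
 * simplified sketch of the kernel side (assumed, based on the hook
 * names registered in vperfctr_stub_init() above):
 *
 *      // in the patched kernel, e.g. on the switch_to() path:
 *      static inline void perfctr_suspend_thread(struct thread_struct *t)
 *      {
 *              struct vperfctr *perfctr = t->perfctr;
 *              if (perfctr)
 *                      vperfctr_stub.suspend(perfctr);
 *      }
 *
 * With CONFIG_PERFCTR=y (non-modular) the stubs above are empty and the
 * hooks call __vperfctr_suspend() and friends directly.
 */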