/* dzl-counter.h
 *
 * Copyright (C) 2013-2015 Christian Hergert <christian@hergert.me>
 *
 * This file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This file is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Additionally, this file does not claim copyright over the expansion
 * of macros in your source program.
 */

#ifndef DZL_COUNTER_H
#define DZL_COUNTER_H

#include <glib-object.h>

#include "dzl-version-macros.h"

/*
 * History
 * =======
 *
 * DzlCounter is a performance counter based on ideas from previous work
 * on high performance counters. They are not guaranteed to be 100%
 * correct, but they approach that with no synchronization given new
 * enough hardware. In particular, we use %ecx from rdtscp (the core id)
 * to determine which cacheline to increment the counter within.
 *
 * Given a counter, the value will be split up into NCPU cachelines where
 * NCPU is the number of cores returned from get_nprocs() (on Linux).
 *
 * Updating the counter is very cheap, reading back the counter requires
 * a volatile read of each cacheline. Again, no correctness is guaranteed.
 *
 * In practice, very few values are lost even during tight competing loops.
 * A loss can happen when the thread is pre-empted between the %rdtscp
 * instruction and the addq increment (on x86_64).
 *
 *
 * Using DzlCounter
 * ================
 *
 * To define a counter, you must have support for constructor attributes.
 *
 *   DZL_DEFINE_COUNTER (Symbol, "Category", "Name", "Description")
 *
 * To increment the counter in a function of your choice (but within the
 * same module), use DZL_COUNTER_ADD, DZL_COUNTER_INC, DZL_COUNTER_DEC.
 *
 *   DZL_COUNTER_INC (Symbol);
 *
 *
 * Architecture Support
 * ====================
 *
 * If DZL_HAVE_RDTSCP is not defined, Linux builds select the per-CPU cell
 * by calling dzl_get_current_cpu_call() instead of using rdtscp. On every
 * other platform a 64-bit atomic add is performed on the first cell using
 * __sync_add_and_fetch(). Clearly, we can do some more work here to
 * abstract which implementation is used, but we only support GCC and Clang
 * today, which both have that intrinsic. Some architectures may not have
 * it (such as 32-bit PPC), but I'm not too concerned about that at the
 * moment.
 *
 * The counters are mapped into a shared memory zone using shm_open() and
 * mmap(). An external program can then discover the available counters
 * and print them without blocking the target program. It simply must
 * perform the reads in a volatile manner just like the target process
 * would need to do for readback.
 *
 * DzlCounterArena provides a helper to walk through the counters in the
 * shared memory zone. dzl_counter_arena_foreach().
 *
 * You cannot remove a counter once it has been registered.
 *
 *
 * Accessing Counters Remotely
 * ===========================
 *
 * You can access the counters from out of process. By opening the SHM zone
 * and reading the contents from each cacheline, you can get the approximate
 * state of the target application without blocking it.
 *
 * DzlCounterArena provides a helper for you to do this.
 *
 *   DzlCounterArena *arena;
 *
 *   arena = dzl_counter_arena_new_for_pid (other_process_pid);
 *   dzl_counter_arena_foreach (arena, my_counter_callback, user_data);
 *
 *
 * Data Layout
 * ===========
 *
 * The layout of the shared memory zone is broken into "cells". Each cell
 * is an approximate cacheline (64-bytes) on modern Intel hardware. Indexes
 * to data locations are represented in cells to simplify the math and
 * allow the compiler to know we are working with properly aligned structures.
 *
 * The base pointer in DzlCounter.values is not 64-byte aligned! It is 8-byte
 * aligned and points to the offset within the cacheline for that counter.
 * We pack 8 64-bit counters into a single cacheline. This allows us to avoid
 * an extra MOV instruction when incrementing since we only need to perform
 * the offset from the base pointer.
 *
 * The first two cells are the header which contain information about the
 * underlying shm file and how large the mmap() range should be.
 *
 * After that, begin the counters.
 *
 * The counters are laid out in groups of 8 counters.
 *
 *  [8 CounterInfo Structs (128-bytes each)][N_CPU Data Zones (64-byte each)]
 *
 * See dzl-counter.c for more information on the contents of these structures.
 *
 *
 * Build System Requirements
 * =========================
 *
 * We need to know if rdtscp is available at compile time. In an effort
 * to keep the headers as portable as possible (if that matters here?) we
 * require that you define DZL_HAVE_RDTSCP if the instruction is supported.
 *
 * An example for autoconf might be similar to the following:
 *
 *   AC_MSG_CHECKING([for fast counters with rdtscp])
 *   AC_RUN_IFELSE(
 *     [AC_LANG_SOURCE([[
 *      #include <x86intrin.h>
 *      int main (int argc, char *argv[]) { int cpu; __builtin_ia32_rdtscp (&cpu); return 0; }]])],
 *     [have_rdtscp=yes],
 *     [have_rdtscp=no])
 *   AC_MSG_RESULT([$have_rdtscp])
 *   AS_IF([test "$have_rdtscp" = "yes"],
 *         [CFLAGS="$CFLAGS -DDZL_HAVE_RDTSCP"])
 */

G_BEGIN_DECLS

#ifdef DZL_HAVE_RDTSCP
# include <x86intrin.h>
  /*
   * Returns the identifier of the CPU executing the caller, as reported
   * through the auxiliary output of the rdtscp instruction.
   */
  static inline guint
  dzl_get_current_cpu_rdtscp (void)
  {
    guint tsc_aux;

    /*
     * rdtscp stores IA32_TSC_AUX (delivered in ecx) into tsc_aux. On
     * Linux the low 12 bits of that value are the cpu identifier and the
     * following 10 bits are the node group. The timestamp returned by
     * the builtin is not needed here and is discarded.
     */
    (void) __builtin_ia32_rdtscp (&tsc_aux);

    /* Keep only the 12-bit cpu identifier. */
    return tsc_aux & 0xFFF;
  }
# define dzl_get_current_cpu() dzl_get_current_cpu_rdtscp()
#elif defined(__linux__)
  /* No rdtscp: ask the library for the current CPU with a function call. */
# define dzl_get_current_cpu() dzl_get_current_cpu_call()
#else
  /*
   * No way to identify the current CPU: always use cell 0 and make
   * DZL_COUNTER_ADD() fall back to an atomic add.
   */
# define dzl_get_current_cpu() 0
# define DZL_COUNTER_REQUIRES_ATOMIC 1
#endif

/**
 * DZL_DEFINE_COUNTER:
 * @Identifier: The symbol name of the counter
 * @Category: A string category for the counter.
 * @Name: A string name for the counter.
 * @Description: A string description for the counter.
 *
 * Defines a file-local counter. The expansion consists of a static
 * #DzlCounter named `Identifier_ctr`, initialized with a %NULL value
 * pointer and the three strings, and a static constructor function named
 * `Identifier_ctr_init` that registers the counter with the arena
 * returned by dzl_counter_arena_get_default().
 *
 * Constructor attribute support is required from the compiler.
 *
 * |[<!-- language="C" -->
 * DZL_DEFINE_COUNTER (my_counter, "My", "Counter", "My Counter Description");
 * ]|
 */
#define DZL_DEFINE_COUNTER(Identifier, Category, Name, Description)          \
 static DzlCounter Identifier##_ctr = { NULL, Category, Name, Description }; \
 static void __attribute__((constructor))                                    \
 Identifier##_ctr_init (void)                                                \
 {                                                                           \
   DzlCounterArena *default_arena_ = dzl_counter_arena_get_default ();       \
   dzl_counter_arena_register (default_arena_, &Identifier##_ctr);           \
 }

/**
 * DZL_COUNTER_INC:
 * @Identifier: The identifier of the counter.
 *
 * Adds one to the counter @Identifier. This is shorthand for
 * DZL_COUNTER_ADD() with a 64-bit constant of 1.
 */
#define DZL_COUNTER_INC(Identifier) \
  DZL_COUNTER_ADD (Identifier, G_GINT64_CONSTANT (1))

/**
 * DZL_COUNTER_DEC:
 * @Identifier: The identifier of the counter.
 *
 * Takes one away from the counter @Identifier. This is carried out as
 * an addition of a 64-bit constant of -1 through DZL_COUNTER_ADD().
 */
#define DZL_COUNTER_DEC(Identifier) \
  DZL_COUNTER_ADD (Identifier, G_GINT64_CONSTANT (-1))

/**
 * DZL_COUNTER_SUB:
 * @Identifier: The identifier of the counter.
 * @Count: the amount to subtract.
 *
 * Subtracts @Count from the counter identified by @Identifier by adding
 * the negated amount through DZL_COUNTER_ADD().
 *
 * @Count is converted to #gint64 before it is negated. Negating first
 * would happen in the type of @Count, so an unsigned 32-bit argument
 * (for example a #guint holding 1) would wrap around to 4294967295 and
 * end up being added to the counter instead of subtracted.
 */
#define DZL_COUNTER_SUB(Identifier, Count) DZL_COUNTER_ADD(Identifier, (-((gint64)(Count))))

/**
 * DZL_COUNTER_ADD:
 * @Identifier: The identifier of the counter.
 * @Count: the amount to add to the counter.
 *
 * Adds @Count, converted to #gint64, to the counter @Identifier.
 *
 * Two implementations exist. Unless DZL_COUNTER_REQUIRES_ATOMIC is
 * defined, the cell selected by dzl_get_current_cpu() is updated with a
 * plain, unsynchronized addition. That is fast but not guaranteed to be
 * fully correct: when %rdtscp is used to pick the cacheline owned by the
 * executing CPU, the thread can still be swapped between the %rdtscp and
 * the %addq (on 64-bit Intel), so an update may collide with another one.
 *
 * When DZL_COUNTER_REQUIRES_ATOMIC is defined, the first cell of the
 * counter is updated with an atomic operation instead (and therefore with
 * memory barriers), which may give different guarantees.
 *
 * See #DzlCounter for more information.
 */
#ifndef DZL_COUNTER_REQUIRES_ATOMIC
# define DZL_COUNTER_ADD(Identifier, Count)                                      \
  G_STMT_START {                                                                 \
    (Identifier##_ctr.values[dzl_get_current_cpu ()]).value += (gint64) (Count); \
  } G_STMT_END
#else
# define DZL_COUNTER_ADD(Identifier, Count)                                      \
  G_STMT_START {                                                                 \
    __sync_add_and_fetch (&Identifier##_ctr.values[0].value, (gint64) (Count));  \
  } G_STMT_END
#endif

/*
 * Type names used throughout this header. The layouts of DzlCounter and
 * DzlCounterValue are given further down in this file; no definition of
 * struct _DzlCounterArena appears in this header, so DzlCounterArena is
 * only usable through pointers here.
 */
typedef struct _DzlCounter      DzlCounter;
typedef struct _DzlCounterArena DzlCounterArena;
typedef struct _DzlCounterValue DzlCounterValue;

/**
 * DzlCounterForeachFunc:
 * @counter: the counter.
 * @user_data: data supplied to dzl_counter_arena_foreach().
 *
 * Function prototype for callbacks provided to dzl_counter_arena_foreach().
 *
 * The callback returns nothing, so it has no way to report a result to
 * the caller of dzl_counter_arena_foreach() other than through state
 * reachable from @user_data.
 */
typedef void (*DzlCounterForeachFunc) (DzlCounter *counter,
                                       gpointer    user_data);

/*
 * Descriptor of a single counter. DZL_DEFINE_COUNTER() creates one of
 * these per counter, with @values set to NULL and the three strings taken
 * from the macro arguments, and hands it to dzl_counter_arena_register().
 */
struct _DzlCounter
{
  /*< Private >*/
  /*
   * Per-CPU cells of the counter; DZL_COUNTER_ADD() indexes this array
   * with dzl_get_current_cpu(). NOTE(review): presumably assigned by
   * dzl_counter_arena_register() -- confirm in dzl-counter.c.
   */
  DzlCounterValue *values;
  /* Descriptive strings supplied when the counter was defined. */
  const gchar     *category;
  const gchar     *name;
  const gchar     *description;
} __attribute__ ((aligned(8)));

/*
 * One per-CPU cell of a counter. The struct is eight gint64 wide (one
 * value plus seven of padding), i.e. 64 bytes, so consecutive elements of
 * DzlCounter.values are one 64-byte cell apart.
 *
 * NOTE(review): per the "Data Layout" notes at the top of this header,
 * eight counters are packed into each cacheline and DzlCounter.values is
 * only 8-byte aligned, so the @padding region presumably overlaps the
 * slots of neighbouring counters rather than being unused memory --
 * confirm in dzl-counter.c before touching it.
 */
struct _DzlCounterValue
{
  /* The count itself; volatile so every access really reads/writes memory. */
  volatile gint64 value;
  /* Fills the element out to the 64-byte stride; never accessed in this header. */
  gint64          padding [7];
} __attribute__ ((aligned(8)));

/*
 * Public entry points. Their implementations are not part of this header;
 * notes marked "presumably" are inferred from names and from the overview
 * at the top of this file and should be confirmed against dzl-counter.c.
 */

/* Presumably the GType registered for DzlCounterArena. */
DZL_AVAILABLE_IN_ALL
GType            dzl_counter_arena_get_type     (void);
/*
 * Function-call way of obtaining the current CPU number; it backs
 * dzl_get_current_cpu() on Linux when DZL_HAVE_RDTSCP is not defined.
 */
DZL_AVAILABLE_IN_ALL
guint            dzl_get_current_cpu_call       (void);
/* The arena that counters created with DZL_DEFINE_COUNTER() register with. */
DZL_AVAILABLE_IN_ALL
DzlCounterArena *dzl_counter_arena_get_default  (void);
/*
 * Creates an arena for reading the counters of the process @pid, as shown
 * in "Accessing Counters Remotely" at the top of this file.
 */
DZL_AVAILABLE_IN_ALL
DzlCounterArena *dzl_counter_arena_new_for_pid  (GPid                   pid);
/* Presumably acquires a reference on @arena and returns it. */
DZL_AVAILABLE_IN_ALL
DzlCounterArena *dzl_counter_arena_ref          (DzlCounterArena       *arena);
/* Presumably releases a reference on @arena. */
DZL_AVAILABLE_IN_ALL
void             dzl_counter_arena_unref        (DzlCounterArena       *arena);
/*
 * Registers @counter with @arena; invoked from the constructor generated
 * by DZL_DEFINE_COUNTER(). A counter cannot be removed once registered.
 */
DZL_AVAILABLE_IN_ALL
void             dzl_counter_arena_register     (DzlCounterArena       *arena,
                                                 DzlCounter            *counter);
/*
 * Walks through the counters of @arena, passing each one to @func together
 * with @user_data.
 */
DZL_AVAILABLE_IN_ALL
void             dzl_counter_arena_foreach      (DzlCounterArena       *arena,
                                                 DzlCounterForeachFunc  func,
                                                 gpointer               user_data);
/* Presumably sets the value of @counter back to zero. */
DZL_AVAILABLE_IN_ALL
void             dzl_counter_reset              (DzlCounter            *counter);
/*
 * Reads back the value of @counter. Per the overview at the top of this
 * file, readback requires a volatile read of each cacheline and is not
 * guaranteed to be exact.
 */
DZL_AVAILABLE_IN_ALL
gint64           dzl_counter_get                (DzlCounter            *counter);

G_END_DECLS

#endif /* DZL_COUNTER_H */