/****************************/
/* THIS IS OPEN SOURCE CODE */
/****************************/
/**
* @file linux-lustre.c
* @author Haihang You (in collaboration with Michael Kluge, TU Dresden)
* you@eecs.utk.edu
* @author Heike Jagode
* jagode@eecs.utk.edu
* @author Vince Weaver
* vweaver1@eecs.utk.edu
* @brief A component for the luster filesystem.
*/
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <stdint.h>
#include <ctype.h>
#include "papi.h"
#include "papi_internal.h"
#include "papi_vector.h"
#include "papi_memory.h"
/** describes a single counter with its properties */
typedef struct counter_info_struct
{
int idx;
char *name;
char *description;
char *unit;
unsigned long long value;
} counter_info;
typedef struct
{
int count;
char **data;
} string_list;
/** describes the infos collected from a mounted Lustre filesystem */
typedef struct lustre_fs_struct
{
char *proc_file;
char *proc_file_readahead;
counter_info *write_cntr;
counter_info *read_cntr;
counter_info *readahead_cntr;
struct lustre_fs_struct *next;
} lustre_fs;
#define LUSTRE_MAX_COUNTERS 100
#define LUSTRE_MAX_COUNTER_TERMS LUSTRE_MAX_COUNTERS
typedef counter_info LUSTRE_register_t;
typedef counter_info LUSTRE_native_event_entry_t;
typedef counter_info LUSTRE_reg_alloc_t;
typedef struct LUSTRE_control_state
{
long long start_count[LUSTRE_MAX_COUNTERS];
long long current_count[LUSTRE_MAX_COUNTERS];
long long difference[LUSTRE_MAX_COUNTERS];
int which_counter[LUSTRE_MAX_COUNTERS];
int num_events;
} LUSTRE_control_state_t;
typedef struct LUSTRE_context
{
LUSTRE_control_state_t state;
} LUSTRE_context_t;
/* Default path to lustre stats */
#ifdef FAKE_LUSTRE
const char proc_base_path[] = "./components/lustre/fake_proc/fs/lustre/";
#else
const char proc_base_path[] = "/proc/fs/lustre/";
#endif
static counter_info **lustre_native_table = NULL;
static int num_events = 0;
static int table_size = 32;
/* mount Lustre fs are kept in a list */
static lustre_fs *root_lustre_fs = NULL;
papi_vector_t _lustre_vector;
/******************************************************************************
******** BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT ********
*****************************************************************************/
static int resize_native_table() {
SUBDBG("ENTER:\n");
counter_info** new_table;
int new_size = table_size*2;
new_table = (counter_info**)papi_calloc(new_size, sizeof(counter_info*));
if (NULL==new_table) {
SUBDBG("EXIT: PAPI_ENOMEM\n");
return PAPI_ENOMEM;
}
if ( lustre_native_table) {
memcpy(new_table, lustre_native_table, sizeof(counter_info*) * table_size );
papi_free(lustre_native_table);
}
lustre_native_table = new_table;
table_size*=2;
SUBDBG("EXIT: PAPI_OK\n");
return PAPI_OK;
}
/**
* add a counter to the list of available counters
* @param name the short name of the counter
* @param desc a longer description
* @param unit the unit for this counter
*/
static counter_info *
addCounter( const char *name, const char *desc, const char *unit )
{
SUBDBG("ENTER: name: %s, desc: %s, unit: %s\n", name, desc, unit);
counter_info *cntr;
if ( num_events >= table_size )
if (PAPI_OK != resize_native_table()) {
SUBDBG("EXIT: can not resize native table\n" );
return NULL;
}
cntr = malloc( sizeof ( counter_info ) );
if ( cntr == NULL ) {
SUBDBG("EXIT: can not allocate memory for new counter\n" );
return NULL;
}
cntr->idx=num_events;
cntr->name = strdup( name );
cntr->description = strdup( desc );
cntr->unit = strdup( unit );
cntr->value = 0;
lustre_native_table[num_events]=cntr;
num_events++;
SUBDBG("EXIT: cntr: %p\n", cntr);
return cntr;
}
/**
* adds a Lustre fs to the fs list and creates the counters for it
* @param name fs name
* @param procpath_general path to the 'stats' file in /proc/fs/lustre/... for this fs
* @param procpath_readahead path to the 'readahead' file in /proc/fs/lustre/... for this fs
*/
static int
addLustreFS( const char *name,
const char *procpath_general,
const char *procpath_readahead )
{
lustre_fs *fs, *last;
char counter_name[512];
FILE *fff;
SUBDBG("Adding lustre fs\n");
fs = malloc( sizeof ( lustre_fs ) );
if ( fs == NULL ) {
SUBDBG("can not allocate memory for new Lustre FS description\n" );
return PAPI_ENOMEM;
}
fs->proc_file=strdup(procpath_general);
fff = fopen( procpath_general, "r" );
if ( fff == NULL ) {
SUBDBG("can not open '%s'\n", procpath_general );
free(fs);
return PAPI_ESYS;
}
fclose(fff);
fs->proc_file_readahead = strdup(procpath_readahead);
fff = fopen( procpath_readahead, "r" );
if ( fff == NULL ) {
SUBDBG("can not open '%s'\n", procpath_readahead );
free(fs);
return PAPI_ESYS;
}
fclose(fff);
sprintf( counter_name, "%s_llread", name );
if (NULL == (fs->read_cntr = addCounter( counter_name,
"bytes read on this lustre client",
"bytes" ))) {
free(fs);
return PAPI_ENOMEM;
}
sprintf( counter_name, "%s_llwrite", name );
if ( NULL == (fs->write_cntr = addCounter( counter_name,
"bytes written on this lustre client",
"bytes" ))) {
free(fs->read_cntr);
free(fs);
return PAPI_ENOMEM;
}
sprintf( counter_name, "%s_wrong_readahead", name );
if ( NULL == (fs->readahead_cntr = addCounter( counter_name,
"bytes read but discarded due to readahead",
"bytes" ))) {
free(fs->read_cntr);
free(fs->write_cntr);
free(fs);
return PAPI_ENOMEM;
}
fs->next = NULL;
/* Insert into the linked list */
/* Does this need locking? */
if ( root_lustre_fs == NULL ) {
root_lustre_fs = fs;
} else {
last = root_lustre_fs;
while ( last->next != NULL )
last = last->next;
last->next = fs;
}
return PAPI_OK;
}
/**
* goes through proc and tries to discover all mounted Lustre fs
*/
static int
init_lustre_counters( void )
{
SUBDBG("ENTER:\n");
char lustre_dir[PATH_MAX];
char path[PATH_MAX];
char path_readahead[PATH_MAX],path_stats[PATH_MAX];
char *ptr;
char fs_name[100];
int found_luster_fs = 0;
int idx = 0;
int tmp_fd;
DIR *proc_dir;
struct dirent *entry;
sprintf(lustre_dir,"%s/llite",proc_base_path);
proc_dir = opendir( lustre_dir );
if ( proc_dir == NULL ) {
SUBDBG("EXIT: PAPI_ESYS (Cannot open %s)\n",lustre_dir);
return PAPI_ESYS;
}
while ( (entry = readdir( proc_dir )) != NULL ) {
memset( path, 0, PATH_MAX );
snprintf( path, PATH_MAX - 1, "%s/%s/stats", lustre_dir,
entry->d_name );
SUBDBG("checking for file %s\n", path);
if ( ( tmp_fd = open( path, O_RDONLY ) ) == -1 ) {
SUBDBG("Path: %s, can not be opened.\n", path);
continue;
}
close( tmp_fd );
/* erase \r and \n at the end of path */
/* why is this necessary? */
idx = strlen( path );
idx--;
while ( path[idx] == '\r' || path[idx] == '\n' )
path[idx--] = 0;
/* Lustre paths are of type server-UUID */
idx = 0;
ptr = strstr(path,"llite/") + 6;
if (ptr == NULL) {
SUBDBG("Path: %s, missing llite directory, performance event not created.\n", path);
continue;
}
strncpy(fs_name, ptr, sizeof(fs_name)-1);
fs_name[sizeof(fs_name)-1] = '\0';
SUBDBG("found Lustre FS: %s\n", fs_name);
snprintf( path_stats, PATH_MAX - 1,
"%s/%s/stats",
lustre_dir,
entry->d_name );
SUBDBG("Found file %s\n", path_stats);
snprintf( path_readahead, PATH_MAX - 1,
"%s/%s/read_ahead_stats",
lustre_dir,
entry->d_name );
SUBDBG("Now checking for file %s\n", path_readahead);
strcpy( ptr, "read_ahead_stats" );
addLustreFS( fs_name, path_stats, path_readahead );
found_luster_fs++;
}
closedir( proc_dir );
if (found_luster_fs == 0) {
SUBDBG("EXIT: PAPI_ESYS (No luster file systems found)\n");
return PAPI_ESYS;
}
SUBDBG("EXIT: PAPI_OK\n");
return PAPI_OK;
}
/**
* updates all Lustre related counters
*/
static void
read_lustre_counter( )
{
lustre_fs *fs = root_lustre_fs;
FILE *fff;
char buffer[BUFSIZ];
while ( fs != NULL ) {
/* read values from stats file */
fff=fopen(fs->proc_file,"r" );
if (fff != NULL) {
while(1) {
if (fgets(buffer,BUFSIZ,fff)==NULL) break;
if (strstr( buffer, "write_bytes" )) {
sscanf(buffer,"%*s %*d %*s %*s %*d %*d %llu",&fs->write_cntr->value);
SUBDBG("Read %llu write_bytes\n",fs->write_cntr->value);
}
if (strstr( buffer, "read_bytes" )) {
sscanf(buffer,"%*s %*d %*s %*s %*d %*d %llu",&fs->read_cntr->value);
SUBDBG("Read %llu read_bytes\n",fs->read_cntr->value);
}
}
fclose(fff);
}
fff=fopen(fs->proc_file_readahead,"r");
if (fff != NULL) {
while(1) {
if (fgets(buffer,BUFSIZ,fff)==NULL) break;
if (strstr( buffer, "read but discarded")) {
sscanf(buffer,"%*s %*s %*s %llu",&fs->readahead_cntr->value);
SUBDBG("Read %llu discared\n",fs->readahead_cntr->value);
break;
}
}
fclose(fff);
}
fs = fs->next;
}
}
/**
* frees all allocated resources
*/
static void
host_finalize( void )
{
int i;
lustre_fs *fs, *next_fs;
counter_info *cntr;
for(i=0;i<num_events;i++) {
cntr=lustre_native_table[i];
if ( cntr != NULL ) {
free( cntr->name );
free( cntr->description );
free( cntr->unit );
free( cntr );
}
lustre_native_table[i]=NULL;
}
fs = root_lustre_fs;
while ( fs != NULL ) {
next_fs = fs->next;
free(fs->proc_file);
free(fs->proc_file_readahead);
free( fs );
fs = next_fs;
}
root_lustre_fs = NULL;
}
/*****************************************************************************
******************* BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS *************
*****************************************************************************/
/*
* Component setup and shutdown
*/
static int
_lustre_init_component( int cidx )
{
SUBDBG("ENTER:\n");
int ret = PAPI_OK;
resize_native_table();
ret=init_lustre_counters();
if (ret!=PAPI_OK) {
strncpy(_lustre_vector.cmp_info.disabled_reason,
"No lustre filesystems found",PAPI_MAX_STR_LEN);
SUBDBG("EXIT: ret: %d\n", ret);
return ret;
}
_lustre_vector.cmp_info.num_native_events=num_events;
_lustre_vector.cmp_info.CmpIdx = cidx;
SUBDBG("EXIT: ret: %d\n", ret);
return ret;
}
/*
* This is called whenever a thread is initialized
*/
static int
_lustre_init_thread( hwd_context_t * ctx )
{
(void) ctx;
return PAPI_OK;
}
/*
*
*/
static int
_lustre_shutdown_component( void )
{
SUBDBG("ENTER:\n");
host_finalize( );
papi_free( lustre_native_table );
lustre_native_table = NULL;
num_events = 0;
table_size = 32;
SUBDBG("EXIT:\n");
return PAPI_OK;
}
/*
*
*/
static int
_lustre_shutdown_thread( hwd_context_t * ctx )
{
( void ) ctx;
return PAPI_OK;
}
/*
* Control of counters (Reading/Writing/Starting/Stopping/Setup) functions
*/
static int
_lustre_init_control_state( hwd_control_state_t *ctl )
{
LUSTRE_control_state_t *lustre_ctl = (LUSTRE_control_state_t *)ctl;
memset(lustre_ctl->start_count,0,sizeof(long long)*LUSTRE_MAX_COUNTERS);
memset(lustre_ctl->current_count,0,sizeof(long long)*LUSTRE_MAX_COUNTERS);
return PAPI_OK;
}
/*
*
*/
static int
_lustre_update_control_state( hwd_control_state_t *ctl,
NativeInfo_t *native,
int count,
hwd_context_t *ctx )
{
SUBDBG("ENTER: ctl: %p, native: %p, count: %d, ctx: %p\n", ctl, native, count, ctx);
LUSTRE_control_state_t *lustre_ctl = (LUSTRE_control_state_t *)ctl;
( void ) ctx;
int i, index;
for ( i = 0; i < count; i++ ) {
index = native[i].ni_event;
lustre_ctl->which_counter[i]=index;
native[i].ni_position = i;
}
lustre_ctl->num_events=count;
SUBDBG("EXIT: PAPI_OK\n");
return PAPI_OK;
}
/*
*
*/
static int
_lustre_start( hwd_context_t *ctx, hwd_control_state_t *ctl )
{
( void ) ctx;
LUSTRE_control_state_t *lustre_ctl = (LUSTRE_control_state_t *)ctl;
int i;
read_lustre_counter( );
for(i=0;i<lustre_ctl->num_events;i++) {
lustre_ctl->current_count[i]=
lustre_native_table[lustre_ctl->which_counter[i]]->value;
}
memcpy( lustre_ctl->start_count,
lustre_ctl->current_count,
LUSTRE_MAX_COUNTERS * sizeof ( long long ) );
return PAPI_OK;
}
/*
*
*/
static int
_lustre_stop( hwd_context_t *ctx, hwd_control_state_t *ctl )
{
(void) ctx;
LUSTRE_control_state_t *lustre_ctl = (LUSTRE_control_state_t *)ctl;
int i;
read_lustre_counter( );
for(i=0;i<lustre_ctl->num_events;i++) {
lustre_ctl->current_count[i]=
lustre_native_table[lustre_ctl->which_counter[i]]->value;
}
return PAPI_OK;
}
/*
*
*/
static int
_lustre_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
long long **events, int flags )
{
(void) ctx;
( void ) flags;
LUSTRE_control_state_t *lustre_ctl = (LUSTRE_control_state_t *)ctl;
int i;
read_lustre_counter( );
for(i=0;i<lustre_ctl->num_events;i++) {
lustre_ctl->current_count[i]=
lustre_native_table[lustre_ctl->which_counter[i]]->value;
lustre_ctl->difference[i]=lustre_ctl->current_count[i]-
lustre_ctl->start_count[i];
}
*events = lustre_ctl->difference;
return PAPI_OK;
}
/*
*
*/
static int
_lustre_reset( hwd_context_t * ctx, hwd_control_state_t * ctrl )
{
/* re-initializes counter_start values to current */
_lustre_start(ctx,ctrl);
return PAPI_OK;
}
/*
* Unused lustre write function
*/
/* static int */
/* _lustre_write( hwd_context_t * ctx, hwd_control_state_t * ctrl, long long *from ) */
/* { */
/* ( void ) ctx; */
/* ( void ) ctrl; */
/* ( void ) from; */
/* return PAPI_OK; */
/* } */
/*
* Functions for setting up various options
*/
/* This function sets various options in the component
* The valid codes being passed in are PAPI_SET_DEFDOM,
* PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL * and PAPI_SET_INHERIT
*/
static int
_lustre_ctl( hwd_context_t * ctx, int code, _papi_int_option_t * option )
{
( void ) ctx;
( void ) code;
( void ) option;
return PAPI_OK;
}
/*
* This function can be used to set the event set level domains
* where the events should be counted. In particular: PAPI_DOM_USER,
* PAPI_DOM_KERNEL PAPI_DOM_OTHER. But the lustre component does not
* provide a field in its control_state (LUSTRE_control_state_t) to
* save this information. It would also need some way to control when
* the counts get updated in order to support domain filters for
* event counting.
*
* So we just ignore this call.
*/
static int
_lustre_set_domain( hwd_control_state_t * cntrl, int domain )
{
( void ) cntrl;
( void ) domain;
SUBDBG("ENTER: \n");
// this component does not allow limiting which domains will increment event counts
SUBDBG("EXIT: PAPI_OK\n");
return PAPI_OK;
}
/*
*
*/
static int
_lustre_ntv_code_to_name( unsigned int EventCode, char *name, int len )
{
SUBDBG("ENTER: EventCode: %#x, name: %p, len: %d\n", EventCode, name, len);
int event=EventCode;
if (event >=0 && event < num_events) {
strncpy( name, lustre_native_table[event]->name, len-1 );
name[len-1] = '\0';
SUBDBG("EXIT: event name: %s\n", name);
return PAPI_OK;
}
SUBDBG("EXIT: PAPI_ENOEVNT\n");
return PAPI_ENOEVNT;
}
/*
*
*/
static int
_lustre_ntv_code_to_descr( unsigned int EventCode, char *name, int len )
{
SUBDBG("ENTER: EventCode: %#x, name: %p, len: %d\n", EventCode, name, len);
int event=EventCode;
if (event >=0 && event < num_events) {
strncpy( name, lustre_native_table[event]->description, len-1 );
name[len-1] = '\0';
SUBDBG("EXIT: description: %s\n", name);
return PAPI_OK;
}
SUBDBG("EXIT: PAPI_ENOEVNT\n");
return PAPI_ENOEVNT;
}
/*
*
*/
static int
_lustre_ntv_enum_events( unsigned int *EventCode, int modifier )
{
SUBDBG("ENTER: EventCode: %p, modifier: %d\n", EventCode, modifier);
if ( modifier == PAPI_ENUM_FIRST ) {
if (num_events==0) return PAPI_ENOEVNT;
*EventCode = 0;
SUBDBG("EXIT: *EventCode: %#x\n", *EventCode);
return PAPI_OK;
}
if ( modifier == PAPI_ENUM_EVENTS ) {
int index = *EventCode;
if ((index+1 < num_events) && lustre_native_table[index + 1]) {
*EventCode = *EventCode + 1;
SUBDBG("EXIT: *EventCode: %#x\n", *EventCode);
return PAPI_OK;
} else {
SUBDBG("EXIT: PAPI_ENOEVNT\n");
return PAPI_ENOEVNT;
}
}
SUBDBG("EXIT: PAPI_EINVAL\n");
return PAPI_EINVAL;
}
/*
*
*/
papi_vector_t _lustre_vector = {
.cmp_info = {
/* component information (unspecified values initialized to 0) */
.name = "lustre",
.short_name = "lustre",
.version = "1.9",
.description = "Lustre filesystem statistics",
.num_mpx_cntrs = LUSTRE_MAX_COUNTERS,
.num_cntrs = LUSTRE_MAX_COUNTERS,
.default_domain = PAPI_DOM_ALL,
.default_granularity = PAPI_GRN_SYS,
.available_granularities = PAPI_GRN_SYS,
.hardware_intr_sig = PAPI_INT_SIGNAL,
/* component specific cmp_info initializations */
.fast_real_timer = 0,
.fast_virtual_timer = 0,
.attach = 0,
.attach_must_ptrace = 0,
.available_domains = PAPI_DOM_ALL,
},
/* sizes of framework-opaque component-private structures */
.size = {
.context = sizeof ( LUSTRE_context_t ),
.control_state = sizeof ( LUSTRE_control_state_t ),
.reg_value = sizeof ( LUSTRE_register_t ),
.reg_alloc = sizeof ( LUSTRE_reg_alloc_t ),
},
/* function pointers in this component */
.init_thread = _lustre_init_thread,
.init_component = _lustre_init_component,
.init_control_state = _lustre_init_control_state,
.start = _lustre_start,
.stop = _lustre_stop,
.read = _lustre_read,
.shutdown_thread = _lustre_shutdown_thread,
.shutdown_component = _lustre_shutdown_component,
.ctl = _lustre_ctl,
.update_control_state = _lustre_update_control_state,
.set_domain = _lustre_set_domain,
.reset = _lustre_reset,
.ntv_enum_events = _lustre_ntv_enum_events,
.ntv_code_to_name = _lustre_ntv_code_to_name,
.ntv_code_to_descr = _lustre_ntv_code_to_descr,
};