/**************************************************************************** *C *C matrix-hl.f *C An example of matrix-matrix multiplication and using PAPI high level *C to look at the performance. written by Kevin London *C March 2000 *C Added to c tests to check stop *C**************************************************************************** */ #include #include #include "papi.h" #include "papi_test.h" #include "do_loops.h" int main( int argc, char **argv ) { #define NROWS1 175 #define NCOLS1 225 #define NROWS2 NCOLS1 #define NCOLS2 150 double p[NROWS1][NCOLS1], q[NROWS2][NCOLS2], r[NROWS1][NCOLS2]; int i, j, k, num_events, retval; /* PAPI standardized event to be monitored */ int event[2]; /* PAPI values of the counters */ long long values[2], tmp; int quiet; quiet = tests_quiet( argc, argv ); /* Setup default values */ num_events = 0; /* See how many hardware events at one time are supported * This also initializes the PAPI library */ num_events = PAPI_num_counters( ); if ( num_events < 2 ) { if (!quiet) printf( "This example program requries the architecture to " "support 2 simultaneous hardware events...shutting down.\n" ); test_skip( __FILE__, __LINE__, "PAPI_num_counters", 1 ); } if ( !quiet ) printf( "Number of hardware counters supported: %d\n", num_events ); if ( PAPI_query_event( PAPI_FP_OPS ) == PAPI_OK ) event[0] = PAPI_FP_OPS; else if ( PAPI_query_event( PAPI_FP_INS ) == PAPI_OK ) event[0] = PAPI_FP_INS; else event[0] = PAPI_TOT_INS; /* Time used */ event[1] = PAPI_TOT_CYC; /* matrix 1: read in the matrix values */ for ( i = 0; i < NROWS1; i++ ) for ( j = 0; j < NCOLS1; j++ ) p[i][j] = i * j * 1.0; for ( i = 0; i < NROWS2; i++ ) for ( j = 0; j < NCOLS2; j++ ) q[i][j] = i * j * 1.0; for ( i = 0; i < NROWS1; i++ ) for ( j = 0; j < NCOLS2; j++ ) r[i][j] = i * j * 1.0; /* Set up the counters */ num_events = 2; retval = PAPI_start_counters( event, num_events ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_start_counters", retval ); /* Clear the counter values */ retval = PAPI_read_counters( values, num_events ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_read_counters", retval ); /* Compute the matrix-matrix multiplication */ for ( i = 0; i < NROWS1; i++ ) for ( j = 0; j < NCOLS2; j++ ) for ( k = 0; k < NCOLS1; k++ ) r[i][j] = r[i][j] + p[i][k] * q[k][j]; /* Stop the counters and put the results in the array values */ retval = PAPI_stop_counters( values, num_events ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_stop_counters", retval ); /* Make sure the compiler does not optimize away the multiplication * with dummy(r); */ dummy( r ); if ( !quiet ) { if ( event[0] == PAPI_TOT_INS ) { printf( TAB1, "TOT Instructions:", values[0] ); } else { printf( TAB1, "FP Instructions:", values[0] ); } printf( TAB1, "Cycles:", values[1] ); } /* * Intel Core overreports flops by 50% when using -O * Use -O2 or -O3 to produce the expected # of flops */ if ( event[0] == PAPI_FP_INS ) { /* Compare measured FLOPS to expected value */ tmp = 2 * ( long long ) ( NROWS1 ) * ( long long ) ( NCOLS2 ) * ( long long ) ( NCOLS1 ); if ( abs( ( int ) values[0] - ( int ) tmp ) > ( double ) tmp * 0.05 ) { /* Maybe we are counting FMAs? */ tmp = tmp / 2; if ( abs( ( int ) values[0] - ( int ) tmp ) > ( double ) tmp * 0.05 ) { printf( "\n" TAB1, "Expected operation count: ", 2 * tmp ); printf( TAB1, "Or possibly (using FMA): ", tmp ); printf( TAB1, "Instead I got: ", values[0] ); test_fail( __FILE__, __LINE__, "Unexpected FLOP count (check vector operations)", 1 ); } } } test_pass( __FILE__ ); return 0; }