/* This includes various workloads that had been scattered all over */
/* the various ctests.  The goal is to have them in one place, and */
/* share them, as well as maybe have only one file that has to be */
/* compiled with reduced optimizations */

#include <stdio.h>
#include <stdlib.h>

#include "testcode.h"

#define ROWS	1000
#define COLUMNS	1000

static float float_matrixa[ROWS][COLUMNS],
		float_matrixb[ROWS][COLUMNS],
		float_mresult[ROWS][COLUMNS];

static double double_matrixa[ROWS][COLUMNS],
		double_matrixb[ROWS][COLUMNS],
		double_mresult[ROWS][COLUMNS];


int flops_float_init_matrix(void) {

	int i,j;

	/* Initialize the Matrix arrays */
	/* Non-optimail row major.  Intentional? */
	for ( i = 0; i < ROWS; i++ ) {
		for ( j = 0; j < COLUMNS; j++) {
			float_mresult[j][i] = 0.0;
			float_matrixa[j][i] = ( float ) rand() * ( float ) 1.1;
			float_matrixb[j][i] = ( float ) rand() * ( float ) 1.1;
		}
	}

#if defined(__powerpc__)
	/* Has fused multiply-add */
	return ROWS*ROWS*ROWS;
#else
	return ROWS*ROWS*ROWS*2;
#endif

}

float flops_float_matrix_matrix_multiply(void) {

	int i,j,k;

	/* Matrix-Matrix multiply */
	for ( i = 0; i < ROWS; i++ ) {
		for ( j = 0; j < COLUMNS; j++ ) {
			for ( k = 0; k < COLUMNS; k++ ) {
				float_mresult[i][j] += float_matrixa[i][k] * float_matrixb[k][j];
			}
		}
	}

	return float_mresult[10][10];
}

float flops_float_swapped_matrix_matrix_multiply(void) {

	int i, j, k;

	/* Matrix-Matrix multiply */
	/* With inner loops swapped */

	for (i = 0; i < ROWS; i++) {
		for (k = 0; k < COLUMNS; k++) {
			for (j = 0; j < COLUMNS; j++) {
				float_mresult[i][j] += float_matrixa[i][k] * float_matrixb[k][j];
			}
		}
	}
	return float_mresult[10][10];
}


int flops_double_init_matrix(void) {

	int i,j;

	/* Initialize the Matrix arrays */
	/* Non-optimail row major.  Intentional? */
	for ( i = 0; i < ROWS; i++ ) {
		for ( j = 0; j < COLUMNS; j++) {
			double_mresult[j][i] = 0.0;
			double_matrixa[j][i] = ( double ) rand() * ( double ) 1.1;
			double_matrixb[j][i] = ( double ) rand() * ( double ) 1.1;
		}
	}

#if defined(__powerpc__)
		/* has fused multiply-add */
		return ROWS*ROWS*ROWS;
#else
	return ROWS*ROWS*ROWS*2;
#endif

}

double flops_double_matrix_matrix_multiply(void) {

	int i,j,k;

	/* Matrix-Matrix multiply */
	for ( i = 0; i < ROWS; i++ ) {
		for ( j = 0; j < COLUMNS; j++ ) {
			for ( k = 0; k < COLUMNS; k++ ) {
				double_mresult[i][j] += double_matrixa[i][k] * double_matrixb[k][j];
			}
		}
	}

	return double_mresult[10][10];
}

double flops_double_swapped_matrix_matrix_multiply(void) {

	int i, j, k;

	/* Matrix-Matrix multiply */
	/* With inner loops swapped */

	for (i = 0; i < ROWS; i++) {
		for (k = 0; k < COLUMNS; k++) {
			for (j = 0; j < COLUMNS; j++) {
				double_mresult[i][j] += double_matrixa[i][k] * double_matrixb[k][j];
			}
		}
	}
	return double_mresult[10][10];
}


/* This was originally called "dummy3" in the various sdsc tests */
/* Does a lot of floating point ops near 1.0 */
/* In theory returns a value roughly equal to the number of flops */
double
do_flops3( double x, int iters, int quiet )
{
	int i;
	double w, y, z, a, b, c, d, e, f, g, h;
	double result;
	double one;
	one = 1.0;
	w = x;
	y = x;
	z = x;
	a = x;
	b = x;
	c = x;
	d = x;
	e = x;
	f = x;
	g = x;
	h = x;
	for ( i = 1; i <= iters; i++ ) {
		w = w * 1.000000000001 + one;
		y = y * 1.000000000002 + one;
		z = z * 1.000000000003 + one;
		a = a * 1.000000000004 + one;
		b = b * 1.000000000005 + one;
		c = c * 0.999999999999 + one;
		d = d * 0.999999999998 + one;
		e = e * 0.999999999997 + one;
		f = f * 0.999999999996 + one;
		g = h * 0.999999999995 + one;
		h = h * 1.000000000006 + one;
	}
	result = 2.0 * ( a + b + c + d + e + f + w + x + y + z + g + h );

	if (!quiet) printf("Result = %lf\n", result);

	return result;
}


volatile double a = 0.5, b = 2.2;

double
do_flops( int n, int quiet )
{
        int i;
        double c = 0.11;

        for ( i = 0; i < n; i++ ) {
                c += a * b;
        }

	if (!quiet) printf("%lf\n",c);

	return c;
}