// This program measures the speed of the matrix computation C = C + A * B,
// which is a slight generalization of matrix multiplication.  (If C is
// initially 0, then this would be matrix multiplication.)  The three
// matrices A, B, and C are all of size n*n, where n is passed to this
// program on its command line.  The matrices are initialized with some
// pseudo-random numbers.
//
// The computation is done twice, once with one thread and once with
// two threads.  The speedup (speed multiplier) from using two threads
// is reported.  The results are also compared.  If any element of the
// result matrices differs by more than 1e-6, a failure is reported.
//
// The times used to calculate the speedup are total elapsed time,
// not CPU time (since CPU time wouldn't show a speedup, as it is
// total across CPUs).  This means the test must be done on an
// otherwise idle system if it is to give meaningful results.

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

// The three n*n matrices.  Each is a flat array in which element (i, j)
// lives at index i*n + j.  doComputations accumulates the product a*b into c.
static double *a, *b, *c;
// The dimension of the matrices; main sets it from the command line.
static int n;

// Adds the product a*b into c for one worker's share of the rows.
//
// threadNumber is this worker's index (0 .. threads-1) and threads is the
// total number of workers.  Worker t owns rows t*n/threads (inclusive)
// through (t+1)*n/threads (exclusive), so the workers' row ranges are
// disjoint and together cover every row of c.
static void doComputations(int threadNumber, int threads){
  const int firstRow = threadNumber * n / threads;
  const int endRow = (threadNumber + 1) * n / threads;

  for(int row = firstRow; row < endRow; row++){
    const double *aRow = a + row*n;
    double *cRow = c + row*n;
    // k is the middle loop so that the innermost loop sweeps one row of b
    // and one row of c sequentially in memory.
    for(int k = 0; k < n; k++){
      const double *bRow = b + k*n;
      for(int col = 0; col < n; col++){
        cRow[col] += aRow[k] * bRow[col];
      }
    }
  }
}

// Thread entry point for pthread_create: performs the share of the
// computation belonging to worker 0 of 2.
//
// The argument is unused.  Always returns NULL; the start routine's return
// value becomes the thread's exit status, so it must be well defined even
// though the caller in this file does not collect it.
static void *child(void *ignored){
  (void) ignored;  // no per-thread argument is needed
  doComputations(0, 2);
  return NULL;
}

// Returns the next pseudo-random number from random(), divided by
// 0x7fffffff (2^31 - 1).  POSIX documents random() as returning values in
// the range 0 .. 2^31 - 1, so the result lies between 0 and 1 inclusive.
static double randomDouble(){
  const double scale = (double) 0x7fffffff;
  long raw = random();
  return raw / scale;
}

// Fills all n*n elements of a, b, and c with pseudo-random values.
//
// The generator is reseeded with a fixed seed first, so every call
// produces exactly the same matrix contents.  Values are drawn in the
// order a[i], b[i], c[i] for each index i in turn.
static void initialize(){
  double *nextA = a;
  double *nextB = b;
  double *nextC = c;

  srandom(284); // fixed seed: restart the pseudo-random sequence
  for(int remaining = n*n; remaining > 0; remaining--){
    *nextA++ = randomDouble();
    *nextB++ = randomDouble();
    *nextC++ = randomDouble();
  }
}

// Returns the elapsed wall-clock time from *start to *end, in seconds.
static double elapsedSeconds(const struct timeval *start, const struct timeval *end){
  return ((end->tv_usec - start->tv_usec) * 1e-6) +
    (end->tv_sec - start->tv_sec);
}

// Stores the current wall-clock time in *now.  Returns 0 on success; on
// failure prints a diagnostic to stderr and returns nonzero.  (The call is
// deliberately not wrapped in assert(), which would remove it entirely when
// NDEBUG is defined.)
static int readClock(struct timeval *now){
  if(gettimeofday(now, NULL) != 0){
    perror("gettimeofday");
    return -1;
  }
  return 0;
}

// Parses text as the matrix dimension.  Returns the dimension, or 0 if text
// is not a positive decimal integer whose square fits in an int.  The square
// must fit because the rest of the program computes n*n and element indices
// using int arithmetic.
static int parseDimension(const char *text){
  char *end;
  errno = 0;
  long value = strtol(text, &end, 10);
  if(end == text || *end != '\0' || errno == ERANGE){
    return 0;
  }
  if(value <= 0 || value > INT_MAX / value){
    return 0;
  }
  return (int) value;
}

// Program entry point.  Expects exactly one argument, the matrix dimension n.
//
// Runs C = C + A*B once single-threaded and once split across two threads
// from identical starting matrices, prints the speedup (single-threaded
// elapsed time divided by two-threaded elapsed time) to stdout, and then
// compares the two result matrices element by element.
//
// Returns 0 if the results agree to within 1e-6, and 1 on a usage error,
// allocation failure, clock or thread failure, or result mismatch.
int main(int argc, char* argv[]){
  // Get the value of n from the command line.
  if(argc != 2){
    fprintf(stderr, "usage: %s n\n", argc > 0 ? argv[0] : "program");
    return 1;
  }
  n = parseDimension(argv[1]);
  if(n == 0){
    fprintf(stderr, "n must be a positive integer whose square fits in an int\n");
    return 1;
  }
  size_t elements = (size_t) n * (size_t) n;

  // Allocate space for three n*n arrays of double-precision floating point
  // numbers.  calloc checks the element-count * size product for overflow;
  // initialize() overwrites the contents before any timing starts.
  a = calloc(elements, sizeof *a);
  b = calloc(elements, sizeof *b);
  c = calloc(elements, sizeof *c);
  if(a == NULL || b == NULL || c == NULL){
    fprintf(stderr, "cannot allocate three %d*%d matrices\n", n, n);
    return 1;  // process exit reclaims whatever was allocated
  }

  initialize();

  // Time the single-threaded computation: baseline time and reference result.
  struct timeval before, after;
  if(readClock(&before) != 0){
    return 1;
  }
  doComputations(0, 1);
  if(readClock(&after) != 0){
    return 1;
  }
  double singleThreadedTime = elapsedSeconds(&before, &after);

  double *cReference = c;  // save results for comparison
  c = calloc(elements, sizeof *c); // and allocate new copy
  if(c == NULL){
    fprintf(stderr, "cannot allocate a second %d*%d result matrix\n", n, n);
    return 1;
  }

  initialize();  // go back to the same starting point

  // Time the two-threaded computation.
  if(readClock(&before) != 0){
    return 1;
  }
  pthread_t child_thread; // the child thread will be number 0 of 2
  int code = pthread_create(&child_thread, NULL, child, NULL);
  if(code){
    fprintf(stderr, "pthread_create failed with code %d\n", code);
    return 1;
  }
  doComputations(1, 2); // the parent thread is number 1 of 2
  code = pthread_join(child_thread, NULL);
  if(code){
    // Return without freeing: the child may still be using the matrices.
    fprintf(stderr, "pthread_join failed with code %d\n", code);
    return 1;
  }
  if(readClock(&after) != 0){
    return 1;
  }
  double twoThreadedTime = elapsedSeconds(&before, &after);

  printf("%g\n", singleThreadedTime / twoThreadedTime); // print the speedup

  // Check the results were the same; report only the first mismatch.
  int status = 0;
  for(int i = 0; i < n*n; i++){
    if(fabs(c[i] - cReference[i]) > 1e-6){
      fprintf(stderr, "c[%d] = %g, but cReference[%d] = %g\n", i, c[i], i, cReference[i]);
      status = 1;
      break;
    }
  }

  free(cReference);
  free(c);
  free(b);
  free(a);
  return status;
}