#include <stdio.h>

#ifdef _OPENACC
#include <openacc.h>
#endif



/*
 * matmul - dense matrix product C = A * B.
 *
 * All three matrices are stored as flat, row-major arrays:
 *   A is m x n  (element (i,k) at A[i*n + k])
 *   B is n x p  (element (k,j) at B[k*p + j])
 *   C is m x p  (element (i,j) at C[i*p + j])
 *
 * C:       output buffer with room for m*p floats; every element is overwritten.
 * A, B:    input matrices; only read.
 * m, n, p: matrix dimensions as described above.
 *
 * The outer loop over the rows of C carries the OpenACC / OpenMP
 * annotations; each (i,j) entry is computed independently of the others.
 */
void matmul(float *  C, float * A, float * B,
	    int m, int n, int p)
{
  int i,j,k;
  /* OpenACC: sizes in the data clauses mirror the matrix shapes above */
#pragma acc kernels copyin(A[0:m*n], B[0:n*p])  copyout(C[0:m*p])
  /* OpenMP: loop indices are private, the matrices are shared */
#pragma omp parallel shared(A,B,C) private(i,j,k)
#pragma omp for schedule(static)
  for (i=0; i<m; i++)  {
    /* C has p columns, so j ranges over p (not n) */
    for (j=0; j<p; j++) {
      /* declared inside the loop body so each iteration gets its own accumulator */
      float sum = 0;
      /* the shared (inner) dimension of A and B is n */
      for (k=0; k<n; k++) {
        float a = A[i*n+k];
        float b = B[k*p+j];
        sum += a*b;
      }
      /* row stride of C is p */
      C[i*p+j] = sum;
    }
  }
}

