#include <stddef.h>

#ifdef _OPENACC
#include <openacc.h>
#endif

/*
 * Dense single-precision matrix product C = A * B.
 *
 * All three matrices are stored contiguously in row-major order:
 *   A is m x n  (element (i,k) at A[i*n + k])
 *   B is n x p  (element (k,j) at B[k*p + j])
 *   C is m x p  (element (i,j) at C[i*p + j])
 *
 * When the file is built with OpenACC enabled, the loop nest is placed in a
 * `kernels` region: A and B are copied in, C is copied out.  Without OpenACC
 * the pragma is ignored and the loops run as ordinary host code.
 *
 * The pointers are restrict-qualified, so the three buffers must not overlap.
 *
 * Degenerate sizes:
 *   m <= 0 or p <= 0  ->  C has no elements; nothing is read or written.
 *   n <= 0            ->  every dot product is empty; C is filled with 0.
 */
void matmul_acc(float * restrict C, float * restrict A,
                float * restrict B, int m, int n, int p)
{
  /* No output elements: nothing to compute or transfer. */
  if (m <= 0 || p <= 0)
    return;

  /* Extents in size_t so that element counts and offsets of large matrices
     are not computed in (overflow-prone) int arithmetic. */
  const size_t rows = (size_t)m;
  const size_t cols = (size_t)p;

  if (n <= 0)
    {
      /* Empty inner dimension: the product is the zero matrix.  Handled on
         the host so no zero/negative-length array sections are requested. */
      for (size_t idx = 0; idx < rows * cols; idx++)
        C[idx] = 0.0f;
      return;
    }

  const size_t inner = (size_t)n;

#pragma acc kernels copyin(A[0:rows*inner], B[0:inner*cols]) copyout(C[0:rows*cols])
  {
    for (int i = 0; i < m; i++)
      for (int j = 0; j < p; j++)
        {
          /* Dot product of row i of A with column j of B. */
          float sum = 0;
          for (int k = 0; k < n; k++)
            sum += A[(size_t)i * inner + (size_t)k]
                 * B[(size_t)k * cols + (size_t)j];

          C[(size_t)i * cols + (size_t)j] = sum;
        }
  }
}


