// TODO: write codelet directive with label sgemm2:
// - for CUDA target
// - with atfirstcall transfer policy for alpha and beta
// - with manual transfer policy for vin1, vin2 and vout
// - with atcall transfer policy for n
// - arguments vin1, vin2 and vout should be mirrored

// In-place single-precision update of the n-by-n array vout:
//
//   vout[j][i] = alpha * sum_{k=0..n-1}( vin1[k][i] * vin2[j][k] ) + beta * vout[j][i]
//
// Parameters:
//   n     - extent of both dimensions of vin1, vin2 and vout
//   alpha - scale applied to the accumulated sum of products
//   vin1  - first input, read as vin1[k][i]
//   vin2  - second input, read as vin2[j][k]
//   beta  - scale applied to the value already stored in vout
//   vout  - read and overwritten element by element
//
// Nothing is written when n <= 0 because no loop iteration runs.
// NOTE(review): the indexing looks like a column-major GEMM (first index =
// column) — confirm against the callers' storage convention.
void sgemm( int n, float alpha, const float vin1[n][n], const float vin2[n][n], float beta, float vout[n][n] ) {
  for( int outer = 0 ; outer < n ; ++outer ) {
    // Slices selected by the outer index; they stay fixed for the inner loops.
    const float *rhs = vin2[outer];
    float *dst = vout[outer];

    for( int inner = 0 ; inner < n ; ++inner ) {
      // Accumulate in increasing k so the summation order is well defined.
      float acc = 0.0f;
      int k = 0;
      while( k < n ) {
        acc += vin1[k][inner] * rhs[k];
        ++k;
      }
      // The previous output value is read before being overwritten.
      dst[inner] = alpha * acc + beta * dst[inner];
    }
  }
}
