/*
 * sgemm: in-place update of an n-by-n output matrix from two n-by-n inputs.
 *
 * For every pair (j, i) with 0 <= j, i < n:
 *
 *     vout[j][i] = alpha * sum_{p=0}^{n-1} ( vin1[p][i] * vin2[j][p] )
 *                + beta  * vout[j][i]
 *
 * Parameters:
 *   n      dimension of the three square matrices
 *   alpha  scale applied to the accumulated product term
 *   vin1   first input matrix, read only
 *   vin2   second input matrix, read only
 *   beta   scale applied to the previous contents of vout
 *   vout   read and overwritten; its old value is always read, even when
 *          beta is 0
 *
 * The hmpp directive below declares this function as codelet "sgemm3" for
 * the CUDA target: the three matrix arguments are mirrored with manual
 * transfers, alpha and beta are transferred at first call, and n at each
 * call.
 */
#pragma hmpp sgemm3 codelet, target=CUDA, args[vin1;vin2;vout].mirror, args[vin1;vin2;vout].transfer=manual, args[alpha;beta].transfer=atfirstcall, args[n].transfer=atcall
void sgemm( int n, float alpha, const float vin1[n][n], const float vin2[n][n], float beta, float vout[n][n] ) {
  int j;

  /* Loop-nest tuning: unroll the i and j loops 8 times each, jam the
     unrolled bodies, split on i, and generate no remainder loops.
     The exercise this comes from suggests a factor of 8 for GT200 and
     4 for Fermi.
     NOTE(review): with noremainder, n presumably has to be a multiple of
     the unroll factor -- confirm against the HMPP documentation.
     The directive names the loop indices i and j, so do not rename them. */
#pragma hmppcg unroll i:8, j:8, jam, noremainder(i,j), split(i)
  for( j = 0 ; j < n ; j++ ) {
    int i;
    for( i = 0 ; i < n ; i++ ) {
      /* Accumulate the product term for element (j, i). */
      float acc = 0.0f;
      int p;
      for( p = 0 ; p < n ; p++ ) {
        acc += vin1[p][i] * vin2[j][p];
      }
      /* Blend the scaled sum with the scaled previous output value. */
      vout[j][i] = alpha * acc + beta * vout[j][i];
    }
  }
}
