__kernel void gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA, __global float *A, int lda, __global float *B, int ldb, float BETA, __global float *C, int ldc) { __local float Asub[BLOCK][BLOCK]; __local float Bsub[BLOCK][BLOCK]; int col = get_global_id(0); int row = get_global_id(1); int col_block = get_group_id(0); int row_block = get_group_id(1); col = (col < N) ? col : N - 1; row = (row < M) ? row : M - 1; int x = get_local_id(0); int y = get_local_id(1); int i,j; float val = 0; float orig = C[row*ldc + col]; for(i = 0; i < K; i += BLOCK){ int arow = y + i; int acol = x + row_block*BLOCK; int brow = y + i; int bcol = col; arow = (arow < K) ? arow : K-1; acol = (acol < M) ? acol : M-1; brow = (brow < K) ? brow : K-1; int aind = arow*lda + acol; int bind = brow*ldb + bcol; Asub[x][y] = A[aind]; Bsub[y][x] = B[bind]; barrier(CLK_LOCAL_MEM_FENCE); for(j = 0; j < BLOCK && i+j