__kernel void gemm_nn_fast(int TA, int TB, int M, int N, int K, float ALPHA, 
                    __global float *A, int a_off, int lda, 
                    __global float *B, int b_off, int ldb,
                    float BETA,
                    __global float *C, int c_off, int ldc)
{
    int i, j, k, x, y;
    A += a_off;
    B += b_off;
    C += c_off;

    __local float Asub[TILE]  [TILE_K];
    __local float Bsub[TILE_K][TILE];

    int ctile = get_group_id(0);
    int rtile = get_group_id(1);

    float Breg;
    float Areg[WPT];
    float acc[WPT][WPT];

    A += rtile*TILE*lda;
    B += ctile*TILE;
    C += rtile*TILE*ldc + ctile*TILE;

    for(i = 0; i < WPT; ++i){
        for(j = 0; j < WPT; ++j){
            acc[i][j] = 0;
        }
    }

    int offset = get_local_id(0);

    for(i = 0; i < K; i += TILE_K){
        for(j = 0; j < TILE*TILE_K; j += THREADS){
            int index = j+offset;

            int row = index / TILE_K;
            int col = index % TILE_K;
            Asub[row][col] = A[row*lda + col];

            row = index / TILE;
            col = index % TILE;
            Bsub[row][col] = B[row*ldb + col];
        }

        A += TILE_K;
        B += TILE_K*ldb;

        barrier(CLK_LOCAL_MEM_FENCE);

        for(k = 0; k < TILE_K; ++k){
            for(y = 0; y < WPT; ++y){
                int row = (offset + (y*WPT)*THREADS)/TILE;
                //Areg[y] = Asub[y*WPT][k];
            }
            for(y = 0; y < WPT; ++y){
                for(x = 0; x < WPT; ++x){
                    int index = offset + (y*WPT + x)*THREADS;
                    int row = index / TILE;
                    int col = index % TILE;
                    acc[y][x] += Asub[row][k]*Bsub[k][col];
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    for(y = 0; y < WPT; ++y){
        for(x = 0; x < WPT; ++x){
            int index = offset + (y*WPT + x)*THREADS;
            int row = index / TILE;
            int col = index % TILE;
            C[row*ldc+col] = ALPHA*acc[y][x] + BETA*C[row*ldc+col];
        }
    }
}