Joseph Redmon
2015-01-20 4ac78c89269138b4623993f9f1d81829d8e88131
I am so done with OpenCL; switching to CUDA
3 files modified
45 ■■■■■ changed files
src/cnn.c 2 ●●● patch | view | raw | blame | history
src/gemm.c 6 ●●●● patch | view | raw | blame | history
src/gemm_fast.cl 37 ●●●● patch | view | raw | blame | history
src/cnn.c
@@ -210,7 +210,7 @@
    //network net = parse_network_cfg("/home/pjreddie/imagenet_backup/alexnet_1270.cfg");
    srand(time(0));
    network net = parse_network_cfg(cfgfile);
    set_learning_network(&net, net.learning_rate*10., net.momentum, net.decay);
    set_learning_network(&net, net.learning_rate*100., net.momentum, net.decay);
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = 1024;
    int i = 6600;
src/gemm.c
@@ -164,8 +164,7 @@
#define TILE 64
#define TILE_K 16
#define WPT 8
#define THREADS (TILE*TILE)/(WPT*WPT)
#define THREADS 64
cl_kernel get_gemm_nn_fast_kernel()
{
@@ -175,7 +174,6 @@
        gemm_kernel = get_kernel("src/gemm_fast.cl", "gemm_nn_fast", "-D TILE=" STR(TILE)
                                                                    " -cl-nv-verbose "
                                                                    " -D TILE_K=" STR(TILE_K)
                                                                    " -D WPT=" STR(WPT)
                                                                    " -D THREADS=" STR(THREADS));
        init = 1;
    }
@@ -464,7 +462,6 @@
    test_gpu_accuracy(0,0,128,128,128); 
/*
    time_ongpu(0,0,64,2916,363); 
    time_ongpu_fast(0,0,64,2916,363); 
    time_ongpu(0,0,64,2916,363); 
@@ -483,7 +480,6 @@
    time_ongpu_fast(0,0,128,4096,12544); 
    time_ongpu(0,0,128,4096,4096); 
    time_ongpu_fast(0,0,128,4096,4096); 
    */
//    time_ongpu(1,0,2304,196,256); 
//    time_ongpu_fast(1,0,2304,196,256); 
//    time_ongpu(0,1,256,2304,196); 
src/gemm_fast.cl
@@ -16,16 +16,15 @@
    int ctile = get_group_id(0);
    int rtile = get_group_id(1);
    float Breg;
    float Areg[WPT];
    float acc[WPT][WPT];
    float Areg[TILE];
    float acc[TILE][TILE/THREADS];
    A += rtile*TILE*lda;
    B += ctile*TILE;
    C += rtile*TILE*ldc + ctile*TILE;
    for(i = 0; i < WPT; ++i){
        for(j = 0; j < WPT; ++j){
    for(i = 0; i < TILE; ++i){
        for(j = 0; j < TILE/THREADS; ++j){
            acc[i][j] = 0;
        }
    }
@@ -51,28 +50,26 @@
        barrier(CLK_LOCAL_MEM_FENCE);
        for(k = 0; k < TILE_K; ++k){
            for(y = 0; y < WPT; ++y){
                int row = (offset + (y*WPT)*THREADS)/TILE;
                //Areg[y] = Asub[y*WPT][k];
            #pragma unroll
            for(y = 0; y < TILE; ++y){
                Areg[y] = Asub[y][k];
            }
            for(y = 0; y < WPT; ++y){
                for(x = 0; x < WPT; ++x){
                    int index = offset + (y*WPT + x)*THREADS;
                    int row = index / TILE;
                    int col = index % TILE;
                    acc[y][x] += Asub[row][k]*Bsub[k][col];
            for(x = 0; x < TILE; x += THREADS){
                float Breg = Bsub[k][x+offset];
                #pragma unroll
                for(y = 0; y < TILE; ++y){
                    acc[y][x/THREADS] += Breg * Areg[y];
                }
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    for(y = 0; y < WPT; ++y){
        for(x = 0; x < WPT; ++x){
            int index = offset + (y*WPT + x)*THREADS;
            int row = index / TILE;
            int col = index % TILE;
            C[row*ldc+col] = ALPHA*acc[y][x] + BETA*C[row*ldc+col];
    for(i = 0; i < TILE; ++i){
        for(j = 0; j < TILE/THREADS; ++j){
            int col = j*THREADS + offset;
            int row = i;
            C[row*ldc+col] = ALPHA*acc[i][j] + BETA*C[row*ldc+col];
        }
    }
}