From edbccdfcaf46f11e631afe98796f3e6e170da5d0 Mon Sep 17 00:00:00 2001
From: Joseph Redmon <pjreddie@gmail.com>
Date: Sun, 26 Oct 2014 05:04:34 +0000
Subject: [PATCH] Maybe something changed? (1-D im2col kernel launch, gemm_ongpu routed to gemm_ongpu_new, timing output disabled)

---
 src/network.c             |   13 +++++++++++--
 src/im2col.cl             |   19 ++++++++++---------
 src/convolutional_layer.c |    2 +-
 src/gemm.c                |    4 +++-
 src/cnn.c                 |    7 ++++---
 src/im2col.c              |    8 +++-----
 src/col2im.cl             |    4 ++--
 7 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/src/cnn.c b/src/cnn.c
index 1488c77..2d09582 100644
--- a/src/cnn.c
+++ b/src/cnn.c
@@ -499,7 +499,7 @@
     int iters = 10000/net.batch;
     while(++count <= 2000){
         clock_t start = clock(), end;
-        float loss = train_network_sgd(net, train, iters);
+        float loss = train_network_sgd_gpu(net, train, iters);
         end = clock();
         float test_acc = network_accuracy(net, test);
         //float test_acc = 0;
@@ -957,8 +957,9 @@
 
 int main(int argc, char *argv[])
 {
-    //test_gpu_blas();
-    train_imagenet();
+    test_gpu_blas();
+    //train_imagenet();
+    //train_nist();
     fprintf(stderr, "Success!\n");
     return 0;
 }
diff --git a/src/col2im.cl b/src/col2im.cl
index 00d8f83..38d7af3 100644
--- a/src/col2im.cl
+++ b/src/col2im.cl
@@ -23,11 +23,11 @@
 
     int w_start = (w<ksize)?0:(w-ksize)/stride + 1;
     int w_end = w/stride + 1;
-    if(width_col < w_end) w_end = width_col;
+    w_end = (width_col < w_end) ? width_col : w_end;
 
     int h_start = (h<ksize)?0:(h-ksize)/stride+1;
     int h_end = h/stride + 1;
-    if(height_col < h_end) h_end = height_col;
+    h_end = (height_col < h_end) ? height_col : h_end;
 
     int rows = channels * ksize * ksize;
     int cols = height_col*width_col;
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 541f31b..1587ae8 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -342,7 +342,7 @@
     check_error(cl);
 }
 
-#define TIMEIT
+//#define TIMEIT
 
 void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in)
 {
diff --git a/src/gemm.c b/src/gemm.c
index 6eada0a..63c2950 100644
--- a/src/gemm.c
+++ b/src/gemm.c
@@ -176,12 +176,14 @@
         float BETA,
         cl_mem C_gpu, int ldc)
 {
+/*
     cl_setup();
     cl_command_queue queue = cl.queue;
     cl_event event;
     cl.error = clblasSgemm(clblasRowMajor, TA?clblasTrans:clblasNoTrans, TB?clblasTrans:clblasNoTrans,M, N, K,ALPHA, A_gpu, 0, lda,B_gpu, 0, ldb,BETA, C_gpu, 0, ldc,1, &queue, 0, NULL, &event);
 
-    //gemm_ongpu_new(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
+*/
+    gemm_ongpu_new(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
 }
 
 void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA, 
diff --git a/src/im2col.c b/src/im2col.c
index 08f7ce4..b743e34 100644
--- a/src/im2col.c
+++ b/src/im2col.c
@@ -91,12 +91,10 @@
         width_col = 1 + (width-1) / stride;
     }
 
-    size_t global_size[2];
-    global_size[0] = batch*channels_col;
-    global_size[1] = height_col*width_col;
+    size_t global_size = batch*channels_col*height_col*width_col;
 
-    clEnqueueNDRangeKernel(queue, im2col_kernel, 2, 0,
-            global_size, 0, 0, 0, 0);
+    clEnqueueNDRangeKernel(queue, im2col_kernel, 1, 0,
+            &global_size, 0, 0, 0, 0);
     check_error(cl);
 }
 
diff --git a/src/im2col.cl b/src/im2col.cl
index 877ee52..8169e1a 100644
--- a/src/im2col.cl
+++ b/src/im2col.cl
@@ -16,21 +16,22 @@
     int c,h,w,b;
     int height_col = (height - ksize) / stride + 1;
     int width_col = (width - ksize) / stride + 1;
+    int channels_col = channels * ksize * ksize;
     if (pad){
         height_col = 1 + (height-1) / stride;
         width_col = 1 + (width-1) / stride;
         pad = ksize/2;
     }
-    int gid1 = get_global_id(0);
-    b = gid1%batch;
-    c = gid1/batch;
+    int id = get_global_id(0);
+    w = id % width_col;
+    id /= width_col;
+    h = id % height_col;
+    id /= height_col;
+    c = id % channels_col;
+    id /= channels_col;
+    b = id % batch;
+    id /= batch;
 
-    int gid2 = get_global_id(1);
-    h = gid2%height_col;
-    w = gid2/height_col;
-
-
-    int channels_col = channels * ksize * ksize;
     int col_size = height_col*width_col*channels_col;
     int w_offset = c % ksize;
     int h_offset = (c / ksize) % ksize;
diff --git a/src/network.c b/src/network.c
index 51b0700..8167d85 100644
--- a/src/network.c
+++ b/src/network.c
@@ -38,7 +38,7 @@
     //printf("start\n");
     int i;
     for(i = 0; i < net.n; ++i){
-        clock_t time = clock();
+        //clock_t time = clock();
         if(net.types[i] == CONVOLUTIONAL){
             convolutional_layer layer = *(convolutional_layer *)net.layers[i];
             forward_convolutional_layer_gpu(layer, input);
@@ -63,7 +63,7 @@
             forward_softmax_layer_gpu(layer, input);
             input = layer.output_cl;
         }
-        printf("%d %f\n", i, sec(clock()-time));
+        //printf("%d %f\n", i, sec(clock()-time));
         /*
            else if(net.types[i] == CROP){
            crop_layer layer = *(crop_layer *)net.layers[i];
@@ -386,6 +386,7 @@
 {
     int x_size = get_network_input_size(net)*net.batch;
     int y_size = get_network_output_size(net)*net.batch;
+    clock_t time = clock();
     if(!*net.input_cl){
         *net.input_cl = cl_make_array(x, x_size);
         *net.truth_cl = cl_make_array(y, y_size);
@@ -393,10 +394,18 @@
         cl_write_array(*net.input_cl, x, x_size);
         cl_write_array(*net.truth_cl, y, y_size);
     }
+    //printf("trans %f\n", sec(clock()-time));
+    time = clock();
     forward_network_gpu(net, *net.input_cl, *net.truth_cl, 1);
+    //printf("forw %f\n", sec(clock()-time));
+    time = clock();
     backward_network_gpu(net, *net.input_cl);
+    //printf("back %f\n", sec(clock()-time));
+    time = clock();
     float error = get_network_cost(net);
     update_network_gpu(net);
+    //printf("updt %f\n", sec(clock()-time));
+    time = clock();
     return error;
 }
 

--
Gitblit v1.10.0