From 14303717dcddae43cdc55beb0685dae86f566fd8 Mon Sep 17 00:00:00 2001
From: Joseph Redmon <pjreddie@gmail.com>
Date: Sat, 25 Oct 2014 18:57:26 +0000
Subject: [PATCH] Fast, needs to be faster

---
 Makefile                  |    4 
 src/axpy.c                |    8 
 src/cnn.c                 |   82 +++++++
 src/connected_layer.c     |    6 
 src/connected_layer.h     |    1 
 src/convolutional_layer.c |    8 
 src/convolutional_layer.h |    1 
 src/data.c                |   26 ++
 src/data.h                |    2 
 src/gemm.c                |  146 +++++++++++++
 src/gemm.cl               |    8 
 src/gemm_new.cl           |  162 ++++++++++++++++
 src/image.c               |   20 +
 src/image.h               |    1 
 src/mini_blas.h           |    8 
 src/network.c             |   23 ++
 src/network.h             |    2 
 src/opencl.c              |    6 
 src/parser.c              |   44 ---
 19 files changed, 484 insertions(+), 74 deletions(-)

diff --git a/Makefile b/Makefile
index b5ad1eb..29dccbb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 CC=gcc
 GPU=1
-COMMON=-Wall -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/
+COMMON=-Wall -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/ -I/usr/local/clblas/include/
 ifeq ($(GPU), 1) 
 COMMON+=-DGPU
 else
@@ -15,7 +15,7 @@
 else
 OPTS+= -march=native
 ifeq ($(GPU), 1)
-LDFLAGS= -lOpenCL
+LDFLAGS= -lOpenCL -lclBLAS
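+# note: only -lclBLAS is added here; libclBLAS must already be on the linker's
+# default search path, since no -L flag is set for /usr/local/clblas/lib.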
 endif
 endif
 CFLAGS= $(COMMON) $(OPTS)
diff --git a/src/axpy.c b/src/axpy.c
index c4ec1eb..10ffca4 100644
--- a/src/axpy.c
+++ b/src/axpy.c
@@ -1,24 +1,24 @@
 #include "mini_blas.h"
 
-inline void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
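+/* Plain (non-inline) definitions: "inline" on a definition in C99/gnu89 mode
+ * may emit no external symbol, causing undefined references from other files. */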
+void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX];
 }
 
-inline void scal_cpu(int N, float ALPHA, float *X, int INCX)
+void scal_cpu(int N, float ALPHA, float *X, int INCX)
 {
     int i;
     for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA;
 }
 
-inline void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
+void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];
 }
 
-inline float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
+float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     float dot = 0;
diff --git a/src/cnn.c b/src/cnn.c
index 7e90a80..a31e59c 100644
--- a/src/cnn.c
+++ b/src/cnn.c
@@ -286,14 +286,16 @@
 	srand(2222222);
 	int i = 0;
 	char *labels[] = {"cat","dog"};
+    clock_t time;
 	while(1){
 		i += 1000;
+        time=clock();
 		data train = load_data_image_pathfile_random("data/assira/train.list", imgs*net.batch, labels, 2, 256, 256);
 		normalize_data_rows(train);
-		clock_t start = clock(), end;
-		float loss = train_network_sgd_gpu(net, train, imgs);
-		end = clock();
-		printf("%d: %f, Time: %lf seconds\n", i, loss, (float)(end-start)/CLOCKS_PER_SEC );
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+        time=clock();
+		float loss = train_network_sgd(net, train, imgs);
+		printf("%d: %f, Time: %lf seconds\n", i, loss, sec(clock()-time));
 		free_data(train);
 		if(i%10000==0){
 			char buff[256];
@@ -304,9 +306,69 @@
 	}
 }
 
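+/*
+ * ImageNet classification training: each iteration draws imgs*net.batch
+ * random images (with replacement) from the cropped list, trains on the GPU,
+ * and writes a checkpoint cfg every 10 iterations. imgs = 1000/net.batch+1
+ * rounds up so one iteration covers roughly 1000 images.
+ */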
+void train_imagenet()
+{
+	network net = parse_network_cfg("cfg/imagenet_backup_710.cfg");
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = 1000/net.batch+1;
+    //imgs=1;
+	srand(888888);
+	int i = 0;
+    char **labels = get_labels("/home/pjreddie/data/imagenet/cls.labels.list");
+    list *plist = get_paths("/home/pjreddie/data/imagenet/cls.cropped.list");
+    char **paths = (char **)list_to_array(plist);
+    clock_t time;
+	while(1){
+		i += 1;
+        time=clock();
+		data train = load_data_random(imgs*net.batch, paths, plist->size, labels, 1000, 256, 256);
+		normalize_data_rows(train);
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+        time=clock();
+        #ifdef GPU
+		float loss = train_network_sgd_gpu(net, train, imgs);
+		printf("%d: %f, %lf seconds, %d images\n", i, loss, sec(clock()-time), i*imgs*net.batch);
+        #endif
+		free_data(train);
+		if(i%10==0){
+			char buff[256];
+			sprintf(buff, "/home/pjreddie/imagenet_backup/imagenet_backup_%d.cfg", i);
+			save_network(net, buff);
+		}
+	}
+}
+
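+/*
+ * Interactive test: reads one image path per line from stdin, runs a forward
+ * pass, and prints the top-10 class names with their predicted scores.
+ */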
+void test_imagenet()
+{
+	network net = parse_network_cfg("cfg/imagenet_test.cfg");
+    //imgs=1;
+	srand(2222222);
+	int i = 0;
+    char **names = get_labels("cfg/shortnames.txt");
+    clock_t time;
+    char filename[256];
+    int indexes[10];
+	while(1){
+        // gets() is unchecked and deprecated; read a bounded line and strip the newline
+        if(!fgets(filename, 256, stdin)) break;
+        filename[strcspn(filename, "\n")] = 0;
+        image im = load_image_color(filename, 256, 256);
+        normalize_image(im);
+        printf("%d %d %d\n", im.h, im.w, im.c);
+        float *X = im.data;
+        time=clock();
+        float *predictions = network_predict(net, X);
+        top_predictions(net, 10, indexes);
+		printf("%s: Predicted in %f seconds.\n", filename, sec(clock()-time));
+        for(i = 0; i < 10; ++i){
+            int index = indexes[i];
+            printf("%s: %f\n", names[index], predictions[index]);
+        }
+		free_image(im);
+	}
+}
+
 void test_visualize()
 {
-	network net = parse_network_cfg("cfg/voc_imagenet.cfg");
+	network net = parse_network_cfg("cfg/assira_backup_740000.cfg");
 	srand(2222222);
 	visualize_network(net);
 	cvWaitKey(0);
@@ -322,7 +384,7 @@
 	for(i = 0; i < total; ++i){
 		visualize_network(net);
 		cvWaitKey(100);
-		data test = load_data_image_pathfile_part("images/assira/test.list", i, total, labels, 2, 256, 256);
+		data test = load_data_image_pathfile_part("data/assira/test.list", i, total, labels, 2, 256, 256);
 		image im = float_to_image(256, 256, 3,test.X.vals[0]);
 		show_image(im, "input");
 		cvWaitKey(100);
@@ -437,7 +499,7 @@
     int iters = 10000/net.batch;
     while(++count <= 2000){
         clock_t start = clock(), end;
-        float loss = train_network_sgd_gpu(net, train, iters);
+        float loss = train_network_sgd(net, train, iters);
         end = clock();
         float test_acc = network_accuracy(net, test);
         //float test_acc = 0;
@@ -895,10 +957,14 @@
 
 int main(int argc, char *argv[])
 {
+    test_gpu_blas();
     //test_blas();
-    train_assira();
+    //train_assira();
+	//test_visualize();
     //test_distribution();
     //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+    //train_imagenet();
+    //test_imagenet();
 
     //test_blas();
     //test_visualize();
diff --git a/src/connected_layer.c b/src/connected_layer.c
index b41ae91..dba0b2a 100644
--- a/src/connected_layer.c
+++ b/src/connected_layer.c
@@ -114,6 +114,12 @@
     cl_read_array(layer.biases_cl, layer.biases, layer.outputs);
 }
 
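+/* Inverse of the read-back above: upload host-side weights and biases into
+ * the layer's GPU buffers. */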
+void push_connected_layer(connected_layer layer)
+{
+    cl_write_array(layer.weights_cl, layer.weights, layer.inputs*layer.outputs);
+    cl_write_array(layer.biases_cl, layer.biases, layer.outputs);
+}
+
 void update_connected_layer_gpu(connected_layer layer)
 {
     axpy_ongpu(layer.outputs, layer.learning_rate, layer.bias_updates_cl, 1, layer.biases_cl, 1);
diff --git a/src/connected_layer.h b/src/connected_layer.h
index 19bcfa2..1e5b4a7 100644
--- a/src/connected_layer.h
+++ b/src/connected_layer.h
@@ -48,6 +48,7 @@
 void forward_connected_layer_gpu(connected_layer layer, cl_mem input);
 void backward_connected_layer_gpu(connected_layer layer, cl_mem input, cl_mem delta);
 void update_connected_layer_gpu(connected_layer layer);
+void push_connected_layer(connected_layer layer);
 #endif
 
 #endif
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 0ed5a99..1587ae8 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -212,7 +212,7 @@
 {
     int size = layer.size*layer.size*layer.c*layer.n;
     axpy_cpu(layer.n, layer.learning_rate, layer.bias_updates, 1, layer.biases, 1);
-    scal_cpu(layer.n,layer.momentum, layer.bias_updates, 1);
+    scal_cpu(layer.n, layer.momentum, layer.bias_updates, 1);
 
     scal_cpu(size, 1.-layer.learning_rate*layer.decay, layer.filters, 1);
     axpy_cpu(size, layer.learning_rate, layer.filter_updates, 1, layer.filters, 1);
@@ -434,6 +434,12 @@
     cl_read_array(layer.biases_cl, layer.biases, layer.n);
 }
 
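+/* Upload host-side filters and biases into the layer's GPU buffers. */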
+void push_convolutional_layer(convolutional_layer layer)
+{
+    cl_write_array(layer.filters_cl, layer.filters, layer.c*layer.n*layer.size*layer.size);
+    cl_write_array(layer.biases_cl, layer.biases, layer.n);
+}
+
 void update_convolutional_layer_gpu(convolutional_layer layer)
 {
     int size = layer.size*layer.size*layer.c*layer.n;
diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h
index 465d309..970a9b1 100644
--- a/src/convolutional_layer.h
+++ b/src/convolutional_layer.h
@@ -49,6 +49,7 @@
 void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in);
 void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl);
 void update_convolutional_layer_gpu(convolutional_layer layer);
+void push_convolutional_layer(convolutional_layer layer);
 #endif
 
 convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay);
diff --git a/src/data.c b/src/data.c
index aa8fecf..734fffa 100644
--- a/src/data.c
+++ b/src/data.c
@@ -41,9 +41,11 @@
     d.y = make_matrix(n, k);
 
     for(i = 0; i < n; ++i){
-        image im = load_image(paths[i], h, w);
+        image im = load_image_color(paths[i], h, w);
         d.X.vals[i] = im.data;
         d.X.cols = im.h*im.w*im.c;
+    }
+    for(i = 0; i < n; ++i){
         fill_truth(paths[i], labels, k, d.y.vals[i]);
     }
     return d;
@@ -60,6 +62,14 @@
     return d;
 }
 
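+/* Reads one label per line. The caller never learns the count, so it must
+ * already know how many labels (k) to expect. */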
+char **get_labels(char *filename)
+{
+    list *plist = get_paths(filename);
+    char **labels = (char **)list_to_array(plist);
+    free_list(plist);
+    return labels;
+}
+
 void free_data(data d)
 {
     if(!d.shallow){
@@ -84,6 +94,20 @@
     return d;
 }
 
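+/* Builds a batch of n examples by sampling paths uniformly at random, with
+ * replacement, from the m available paths. */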
+data load_data_random(int n, char **paths, int m, char **labels, int k, int h, int w)
+{
+    char **random_paths = calloc(n, sizeof(char*));
+    int i;
+    for(i = 0; i < n; ++i){
+        int index = rand()%m;
+        random_paths[i] = paths[index];
+        if(i == 0) printf("%s\n", paths[index]);
+    }
+    data d = load_data_image_paths(random_paths, n, labels, k, h, w);
+    free(random_paths);
+    return d;
+}
+
 data load_data_image_pathfile_random(char *filename, int n, char **labels, int k, int h, int w)
 {
     int i;
diff --git a/src/data.h b/src/data.h
index bd677e8..eefef8b 100644
--- a/src/data.h
+++ b/src/data.h
@@ -12,6 +12,7 @@
 
 
 void free_data(data d);
+data load_data_random(int n, char **paths, int m, char **labels, int k, int h, int w);
 data load_data_image_pathfile(char *filename, char **labels, int k, int h, int w);
 data load_data_image_pathfile_part(char *filename, int part, int total, 
                                     char **labels, int k, int h, int w);
@@ -20,6 +21,7 @@
 data load_cifar10_data(char *filename);
 data load_all_cifar10();
 list *get_paths(char *filename);
+char **get_labels(char *filename);
 void get_batch(data d, int n, float *X, float *y);
 data load_categorical_data_csv(char *filename, int target, int k);
 void normalize_data_rows(data d);
diff --git a/src/gemm.c b/src/gemm.c
index fa78daf..2e53b31 100644
--- a/src/gemm.c
+++ b/src/gemm.c
@@ -1,5 +1,5 @@
 #include "mini_blas.h"
-#include <clBLAS.h>
+#include "utils.h"
 
 void gemm(int TA, int TB, int M, int N, int K, float ALPHA, 
         float *A, int lda, 
@@ -104,6 +104,7 @@
 
 #include "opencl.h"
 #include <math.h>
+#include <clBLAS.h>
 
 #define STR_HELPER(x) #x
 #define STR(x) STR_HELPER(x)
@@ -111,7 +112,7 @@
 #ifdef __APPLE__
 #define BLOCK 1
 #else
-#define BLOCK 8
+#define BLOCK 16
 #endif
 
 cl_kernel get_gemm_kernel()
@@ -125,6 +126,44 @@
     return gemm_kernel;
 }
 
+cl_kernel get_gemm_nt_kernel()
+{
+    static int init = 0;
+    static cl_kernel gemm_kernel;
+    if(!init){
+        gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_nt", "-D BLOCK=" STR(BLOCK) );
+        init = 1;
+    }
+    return gemm_kernel;
+}
+
+cl_kernel get_gemm_tn_kernel()
+{
+    static int init = 0;
+    static cl_kernel gemm_kernel;
+    if(!init){
+        gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_tn", "-D BLOCK=" STR(BLOCK) );
+        init = 1;
+    }
+    return gemm_kernel;
+}
+
+cl_kernel get_gemm_nn_kernel()
+{
+    static int init = 0;
+    static cl_kernel gemm_kernel;
+    if(!init){
+        gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_nn", "-D BLOCK=" STR(BLOCK) );
+        init = 1;
+    }
+    return gemm_kernel;
+}
+
+void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA, 
+        cl_mem A_gpu, int lda, 
+        cl_mem B_gpu, int ldb,
+        float BETA,
+        cl_mem C_gpu, int ldc);
 void gemm_ongpu_old(int TA, int TB, int M, int N, int K, float ALPHA, 
         cl_mem A_gpu, int lda, 
         cl_mem B_gpu, int ldb,
@@ -137,10 +176,51 @@
         float BETA,
         cl_mem C_gpu, int ldc)
 {
+    /*
     cl_setup();
-    //cl.error = clblasSgemm(clblasRowMajor, TA?clblasTrans:clblasNoTrans, TB?clblasTrans:clblasNoTrans,M, N, K,ALPHA, A_gpu, 0, lda,B_gpu, 0, ldb,BETA, C_gpu, 0, ldc,1, &queue, 0, NULL, &event);
-    //check_error(cl);
-    gemm_ongpu_old(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
+    cl_command_queue queue = cl.queue;
+    cl_event event;
+    cl.error = clblasSgemm(clblasRowMajor, TA?clblasTrans:clblasNoTrans, TB?clblasTrans:clblasNoTrans,M, N, K,ALPHA, A_gpu, 0, lda,B_gpu, 0, ldb,BETA, C_gpu, 0, ldc,1, &queue, 0, NULL, &event);
+    */
+
+    gemm_ongpu_new(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
+}
+
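+/*
+ * Picks the specialized kernel for the given transpose flags; the TA && TB
+ * case falls through to the generic kernel. The global work size is rounded
+ * up to a multiple of BLOCK per dimension (dim 0 over columns of C, dim 1
+ * over rows); the kernels clamp out-of-range indices themselves. Note that
+ * only the status of the last clSetKernelArg survives in cl.error when
+ * check_error runs.
+ */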
+void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA, 
+        cl_mem A_gpu, int lda, 
+        cl_mem B_gpu, int ldb,
+        float BETA,
+        cl_mem C_gpu, int ldc)
+{
+    //printf("gpu: %d %d %d %d %d\n",TA, TB, M, N, K);
+    cl_setup();
+    cl_kernel      gemm_kernel = get_gemm_kernel();
+    if(!TA && !TB) gemm_kernel = get_gemm_nn_kernel();
+    if(!TA && TB)  gemm_kernel = get_gemm_nt_kernel();
+    if(TA && !TB)  gemm_kernel = get_gemm_tn_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc);
+    check_error(cl);
+
+    const size_t global_size[] = {ceil((float)N/BLOCK)*BLOCK, ceil((float)M/BLOCK)*BLOCK};
+    const size_t local_size[] = {BLOCK, BLOCK};
+
+    // capture the launch status so the check_error below sees this call
+    cl.error = clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0);
+    check_error(cl);
 }
 
 void gemm_ongpu_old(int TA, int TB, int M, int N, int K, float ALPHA, 
@@ -170,7 +250,7 @@
     cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc);
     check_error(cl);
 
-    const size_t global_size[] = {ceil((float)M/BLOCK)*BLOCK, ceil((float)N/BLOCK)*BLOCK};
+    const size_t global_size[] = {ceil((float)N/BLOCK)*BLOCK, ceil((float)M/BLOCK)*BLOCK};
     const size_t local_size[] = {BLOCK, BLOCK};
 
     clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0);
@@ -235,7 +315,7 @@
     float *c = random_matrix(m,n);
     int i;
     clock_t start = clock(), end;
-    for(i = 0; i<10; ++i){
+    for(i = 0; i<32; ++i){
         gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
     }
     end = clock();
@@ -245,6 +325,39 @@
     free(c);
 }
 
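+/* Times iter back-to-back GEMMs on device-resident buffers, so host<->device
+ * transfers are excluded from the reported GFLOPS. */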
+void time_ongpu(int TA, int TB, int m, int k, int n)
+{
+    int iter = 100;
+    float *a = random_matrix(m,k);
+    float *b = random_matrix(k,n);
+
+    int lda = (!TA)?k:m;
+    int ldb = (!TB)?n:k;
+
+    float *c = random_matrix(m,n);
+
+    cl_mem a_cl = cl_make_array(a, m*k);
+    cl_mem b_cl = cl_make_array(b, k*n);
+    cl_mem c_cl = cl_make_array(c, m*n);
+
+    int i;
+    clock_t start = clock(), end;
+    for(i = 0; i<iter; ++i){
+        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
+    }
+    // ~2*k multiply-adds per output element (+3 charges the ALPHA/BETA scaling);
+    // computed in 64-bit, since m*n*(2*k+3)*iter overflows a 32-bit int here.
+    long long flop = (long long)m*n*(2*k+3)*iter;
+    float gflop = flop/pow(10., 9);
+    clFinish(cl.queue); // launches are asynchronous; drain the queue before stopping the clock
+    end = clock();
+    float seconds = sec(end-start);
+    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
+    clReleaseMemObject(a_cl);
+    clReleaseMemObject(b_cl);
+    clReleaseMemObject(c_cl);
+    free(a);
+    free(b);
+    free(c);
+}
+
 void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
 {
     srand(0);
@@ -272,14 +385,16 @@
         //printf("%f %f\n", c[i], c_gpu[i]);
         sse += pow(c[i]-c_gpu[i], 2);
     }
-    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g MSE\n",m,k,k,n, TA, TB, sse/(m*n));
+    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n));
     free(a);
     free(b);
     free(c);
+    free(c_gpu);
 }
 
 void test_gpu_blas()
 {
+    /*
     test_gpu_accuracy(0,0,10,576,75); 
 
     test_gpu_accuracy(0,0,17,10,10); 
@@ -291,6 +406,21 @@
     test_gpu_accuracy(1,0,1000,10,100); 
     test_gpu_accuracy(0,1,1000,10,100); 
     test_gpu_accuracy(1,1,1000,10,100); 
+    */
+    test_gpu_accuracy(0,0,131,4093,1199); 
+    test_gpu_accuracy(0,1,131,4093,1199); 
+    test_gpu_accuracy(1,0,131,4093,1199); 
+    test_gpu_accuracy(1,1,131,4093,1199); 
+
+    time_ongpu(0,0,1024,1024,1024); 
+    time_ongpu(0,1,1024,1024,1024); 
+    time_ongpu(1,0,1024,1024,1024); 
+    time_ongpu(1,1,1024,1024,1024); 
+
+    time_ongpu(0,0,128,4096,1200); 
+    time_ongpu(0,1,128,4096,1200); 
+    time_ongpu(1,0,128,4096,1200); 
+    time_ongpu(1,1,128,4096,1200); 
 
     /*
        time_gpu_random_matrix(0,0,1000,1000,100); 
diff --git a/src/gemm.cl b/src/gemm.cl
index 9e45783..c5a0698 100644
--- a/src/gemm.cl
+++ b/src/gemm.cl
@@ -10,11 +10,11 @@
 
     float val = 0;
     
-    int row_block = get_group_id(0);
-    int col_block = get_group_id(1);
+    int row_block = get_group_id(1);
+    int col_block = get_group_id(0);
 
-    int sub_row = get_local_id(0);
-    int sub_col = get_local_id(1);
+    int sub_row = get_local_id(1);
+    int sub_col = get_local_id(0);
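+    // Dimension 0 now runs over columns and dimension 1 over rows, matching
+    // the swapped global_size in gemm.c; presumably so adjacent work-items
+    // touch adjacent row-major elements and the global accesses coalesce.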
 
     int row = row_block*BLOCK + sub_row;
     int col = col_block*BLOCK + sub_col;
diff --git a/src/gemm_new.cl b/src/gemm_new.cl
new file mode 100644
index 0000000..110807a
--- /dev/null
+++ b/src/gemm_new.cl
@@ -0,0 +1,162 @@
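+/*
+ * Tiled GEMM kernels. Each work-group computes one BLOCK x BLOCK tile of C,
+ * staging BLOCK x BLOCK tiles of A and B in __local memory. Out-of-range
+ * work-items are clamped to the last valid row/column instead of branched
+ * around, so ragged edges are recomputed redundantly (the duplicates write
+ * identical values). Transposed operands are stored transposed into local
+ * memory, so the inner accumulation loop is the same in all three kernels.
+ */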
+__kernel void gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA, 
+                    __global float *A, int lda, 
+                    __global float *B, int ldb,
+                    float BETA,
+                    __global float *C, int ldc)
+{
+    __local float Asub[BLOCK][BLOCK];
+    __local float Bsub[BLOCK][BLOCK];
+
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+
+    int col_block = get_group_id(0);
+    int row_block = get_group_id(1);
+
+    col = (col < N) ? col : N - 1;
+    row = (row < M) ? row : M - 1;
+
+    int x = get_local_id(0);
+    int y = get_local_id(1);
+
+    int i,j;
+
+    float val = 0;
+    float orig = C[row*ldc + col];
+
+    for(i = 0; i < K; i += BLOCK){
+        
+        int arow = y + i;
+        int acol = x + row_block*BLOCK;
+
+        int brow = y + i;
+        int bcol = col;
+
+        arow = (arow < K) ? arow : K-1;
+        acol = (acol < M) ? acol : M-1;
+        brow = (brow < K) ? brow : K-1;
+        
+        int aind = arow*lda + acol;
+        int bind = brow*ldb + bcol;
+        
+        Asub[x][y] = A[aind];
+        Bsub[y][x] = B[bind];
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(j = 0; j < BLOCK && i+j<K; ++j){
+            val += Asub[y][j]*Bsub[j][x];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    C[row*ldc+col] = ALPHA*val + BETA*orig;
+}
+
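+/* A (M x K) times B-transposed (B stored N x K): the tile of B is loaded by
+ * rows of B and written transposed into Bsub so the accumulation loop can
+ * index it as K x N. */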
+__kernel void gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA, 
+                    __global float *A, int lda, 
+                    __global float *B, int ldb,
+                    float BETA,
+                    __global float *C, int ldc)
+{
+    __local float Asub[BLOCK][BLOCK];
+    __local float Bsub[BLOCK][BLOCK];
+
+    
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+
+    int col_block = get_group_id(0);
+    int row_block = get_group_id(1);
+
+    col = (col < N) ? col : N - 1;
+    row = (row < M) ? row : M - 1;
+
+    int x = get_local_id(0);
+    int y = get_local_id(1);
+
+    int i,j;
+
+    float val = 0;
+    float orig = C[row*ldc + col];
+
+    for(i = 0; i < K; i += BLOCK){
+        
+        int arow = row;
+        int acol = x + i;
+
+        int brow = col_block*BLOCK + y;
+        int bcol = x + i;
+
+        brow = (brow < N) ? brow : N-1;
+        acol = (acol < K) ? acol : K-1;
+        bcol = (bcol < K) ? bcol : K-1;
+        
+        int aind = arow*lda + acol;
+        int bind = brow*ldb + bcol;
+        
+        Asub[y][x] = A[aind];
+        Bsub[x][y] = B[bind];
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(j = 0; j < BLOCK && i+j<K; ++j){
+            val += Asub[y][j]*Bsub[j][x];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    C[row*ldc+col] = ALPHA*val + BETA*orig;
+}
+
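+/* Plain row-major A (M x K) times B (K x N): both tiles copy straight
+ * through, and work-item (x, y) accumulates C[row][col] over the K dimension
+ * in BLOCK-sized steps. */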
+__kernel void gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA, 
+                    __global float *A, int lda, 
+                    __global float *B, int ldb,
+                    float BETA,
+                    __global float *C, int ldc)
+{
+    __local float Asub[BLOCK][BLOCK];
+    __local float Bsub[BLOCK][BLOCK];
+
+    int col = get_global_id(0);
+    int row = get_global_id(1);
+
+    col = (col < N) ? col : N - 1;
+    row = (row < M) ? row : M - 1;
+
+    int x = get_local_id(0);
+    int y = get_local_id(1);
+
+    int i,j;
+
+    float orig = C[row*ldc+col];
+    float val = 0;
+    
+    for(i = 0; i < K; i += BLOCK){
+        
+        int arow = row;
+        int acol = x + i;
+
+        int brow = y + i;
+        int bcol = col;
+
+        acol = (acol < K) ? acol : K-1;
+        brow = (brow < K) ? brow : K-1;
+        
+        int aind = arow*lda + acol;
+        int bind = brow*ldb + bcol;
+        
+        Asub[y][x] = A[aind];
+        Bsub[y][x] = B[bind];
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for(j = 0; j < BLOCK && i+j<K; ++j){
+            val += Asub[y][j]*Bsub[j][x];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    C[row*ldc+col] = ALPHA*val + BETA*orig;
+}
+
diff --git a/src/image.c b/src/image.c
index b25bf05..da8b54a 100644
--- a/src/image.c
+++ b/src/image.c
@@ -369,7 +369,6 @@
         // Will do a scaled image resize with the correct aspect ratio.
         outImg = resizeImage(croppedImg, newHeight, newWidth, 0);
         cvReleaseImage( &croppedImg );
-
     }
     else {
 
@@ -415,6 +414,25 @@
     return out;
 }
 
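+/* Forces a 3-channel load (cvLoadImage flag 1) and resizes to h x w when
+ * both are nonzero and differ from the source size. */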
+image load_image_color(char *filename, int h, int w)
+{
+    IplImage* src = 0;
+    if( (src = cvLoadImage(filename, 1)) == 0 )
+    {
+        printf("Cannot load file image %s\n", filename);
+        exit(0);
+    }
+    if(h && w && (src->height != h || src->width != w)){
+        printf("Resized!\n");
+        IplImage *resized = resizeImage(src, h, w, 1);
+        cvReleaseImage(&src);
+        src = resized;
+    }
+    image out = ipl_to_image(src);
+    cvReleaseImage(&src);
+    return out;
+}
+
 image load_image(char *filename, int h, int w)
 {
     IplImage* src = 0;
diff --git a/src/image.h b/src/image.h
index fe25742..9f7fc8e 100644
--- a/src/image.h
+++ b/src/image.h
@@ -45,6 +45,7 @@
 image float_to_image(int h, int w, int c, float *data);
 image copy_image(image p);
 image load_image(char *filename, int h, int w);
+image load_image_color(char *filename, int h, int w);
 image ipl_to_image(IplImage* src);
 
 float get_pixel(image m, int x, int y, int c);
diff --git a/src/mini_blas.h b/src/mini_blas.h
index a155c35..923afc7 100644
--- a/src/mini_blas.h
+++ b/src/mini_blas.h
@@ -55,8 +55,8 @@
                     float *B, int ldb,
                     float BETA,
                     float *C, int ldc);
-inline void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
-inline void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
-inline void scal_cpu(int N, float ALPHA, float *X, int INCX);
-inline float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void scal_cpu(int N, float ALPHA, float *X, int INCX);
+float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
 void test_gpu_blas();
diff --git a/src/network.c b/src/network.c
index 6696769..5ea449c 100644
--- a/src/network.c
+++ b/src/network.c
@@ -621,7 +621,7 @@
     image *prev = 0;
     int i;
     char buff[256];
-    show_image(get_network_image_layer(net, 0), "Crop");
+    //show_image(get_network_image_layer(net, 0), "Crop");
     for(i = 0; i < net.n; ++i){
         sprintf(buff, "Layer %d", i);
         if(net.types[i] == CONVOLUTIONAL){
@@ -635,6 +635,27 @@
     } 
 }
 
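+/*
+ * Top-n selection by repeated scan: pass i takes the largest output strictly
+ * below the previous pass's maximum, so indexes come out in descending score
+ * order. O(n*k), fine for n = 10; exact ties after the first are skipped.
+ */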
+void top_predictions(network net, int n, int *index)
+{
+    int i,j;
+    int k = get_network_output_size(net);
+    float *out = get_network_output(net);
+    float thresh = FLT_MAX;
+    for(i = 0; i < n; ++i){
+        float max = -FLT_MAX;
+        int max_i = -1;
+        for(j = 0; j < k; ++j){
+            float val = out[j];
+            if(val > max && val < thresh){
+                max = val;
+                max_i = j;
+            }
+        }
+        index[i] = max_i;
+        thresh = max;
+    }
+}
+
 float *network_predict(network net, float *input)
 {
     forward_network(net, input, 0, 0);
diff --git a/src/network.h b/src/network.h
index 22e277c..c95f6fa 100644
--- a/src/network.h
+++ b/src/network.h
@@ -52,8 +52,10 @@
 float train_network_batch(network net, data d, int n);
 void train_network(network net, data d);
 matrix network_predict_data(network net, data test);
+float *network_predict(network net, float *input);
 float network_accuracy(network net, data d);
 float network_accuracy_multi(network net, data d, int n);
+void top_predictions(network net, int n, int *index);
 float *get_network_output(network net);
 float *get_network_output_layer(network net, int i);
 float *get_network_delta_layer(network net, int i);
diff --git a/src/opencl.c b/src/opencl.c
index a2e7366..604a2e3 100644
--- a/src/opencl.c
+++ b/src/opencl.c
@@ -4,7 +4,7 @@
 #include <string.h>
 #include <time.h>
 #include <unistd.h>
-//#include <clBLAS.h>
+#include <clBLAS.h>
 
 #include "opencl.h"
 #include "utils.h"
@@ -81,7 +81,7 @@
 
     }
     int index = getpid()%num_devices;
-    index = 0;
+    index = 1;
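+    // NOTE: overrides the pid-based choice above; hard-coded to the second
+    // enumerated OpenCL device on this particular machine.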
     printf("%d rand, %d devices, %d index\n", getpid(), num_devices, index);
     info.device = devices[index];
     fprintf(stderr, "Found %d device(s)\n", num_devices);
@@ -99,7 +99,7 @@
         info.queues[i] = clCreateCommandQueue(info.context, info.device, 0, &info.error);
         check_error(info);
     }
-    //info.error = clblasSetup();
+    info.error = clblasSetup();
     check_error(info);
     info.initialized = 1;
     return info;
diff --git a/src/parser.c b/src/parser.c
index 9bd2eb7..79d4a3a 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -67,7 +67,6 @@
 
 convolutional_layer *parse_convolutional(list *options, network *net, int count)
 {
-    int i;
     int h,w,c;
     float learning_rate, momentum, decay;
     int n = option_find_int(options, "filters",1);
@@ -98,34 +97,19 @@
         if(h == 0) error("Layer before convolutional layer must output image.");
     }
     convolutional_layer *layer = make_convolutional_layer(net->batch,h,w,c,n,size,stride,pad,activation,learning_rate,momentum,decay);
-    char *data = option_find_str(options, "data", 0);
-    if(data){
-        char *curr = data;
-        char *next = data;
-        for(i = 0; i < n; ++i){
-            while(*++next !='\0' && *next != ',');
-            *next = '\0';
-            sscanf(curr, "%g", &layer->biases[i]);
-            curr = next+1;
-        }
-        for(i = 0; i < c*n*size*size; ++i){
-            while(*++next !='\0' && *next != ',');
-            *next = '\0';
-            sscanf(curr, "%g", &layer->filters[i]);
-            curr = next+1;
-        }
-    }
     char *weights = option_find_str(options, "weights", 0);
     char *biases = option_find_str(options, "biases", 0);
-    parse_data(biases, layer->biases, n);
     parse_data(weights, layer->filters, c*n*size*size);
+    parse_data(biases, layer->biases, n);
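+    // parse_data only fills the host arrays; push the result so the
+    // device-side buffers stay in sync.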
+    #ifdef GPU
+    push_convolutional_layer(*layer);
+    #endif
     option_unused(options);
     return layer;
 }
 
 connected_layer *parse_connected(list *options, network *net, int count)
 {
-    int i;
     int input;
     float learning_rate, momentum, decay;
     int output = option_find_int(options, "output",1);
@@ -147,27 +131,13 @@
         input =  get_network_output_size_layer(*net, count-1);
     }
     connected_layer *layer = make_connected_layer(net->batch, input, output, activation,learning_rate,momentum,decay);
-    char *data = option_find_str(options, "data", 0);
-    if(data){
-        char *curr = data;
-        char *next = data;
-        for(i = 0; i < output; ++i){
-            while(*++next !='\0' && *next != ',');
-            *next = '\0';
-            sscanf(curr, "%g", &layer->biases[i]);
-            curr = next+1;
-        }
-        for(i = 0; i < input*output; ++i){
-            while(*++next !='\0' && *next != ',');
-            *next = '\0';
-            sscanf(curr, "%g", &layer->weights[i]);
-            curr = next+1;
-        }
-    }
     char *weights = option_find_str(options, "weights", 0);
     char *biases = option_find_str(options, "biases", 0);
     parse_data(biases, layer->biases, output);
     parse_data(weights, layer->weights, input*output);
+    #ifdef GPU
+    push_connected_layer(*layer);
+    #endif
     option_unused(options);
     return layer;
 }

--
Gitblit v1.10.0