From 76ee68f96d864a27312c9aa09856ddda559a5cd9 Mon Sep 17 00:00:00 2001
From: Joseph Redmon <pjreddie@gmail.com>
Date: Thu, 28 Aug 2014 02:11:46 +0000
Subject: [PATCH] Trying some stuff w/ dropout
---
src/network.c | 53 ---
src/network.h | 4
src/data.c | 16 +
src/gemm.c | 14
src/cnn.c | 55 +++
src/im2col.c | 84 +----
src/convolutional_layer.h | 1
src/data.h | 1
src/activations.cl | 33 ++
src/im2col.cl | 47 ++-
src/convolutional_layer.c | 152 ++++++++++-
src/activations.h | 2
src/col2im.c | 81 ++++-
src/convolutional_layer.cl | 25 +
src/mini_blas.h | 29 +
src/activations.c | 51 +++
src/col2im.cl | 41 +++
src/opencl.c | 61 ++++
18 files changed, 550 insertions(+), 200 deletions(-)
diff --git a/src/activations.c b/src/activations.c
index 04b27c9..4a4bd3f 100644
--- a/src/activations.c
+++ b/src/activations.c
@@ -41,6 +41,12 @@
float ramp_activate(float x){return x*(x>0)+.1*x;}
float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
+float linear_gradient(float x){return 1;}
+float sigmoid_gradient(float x){return (1-x)*x;}
+float relu_gradient(float x){return (x>0);}
+float ramp_gradient(float x){return (x>0)+.1;}
+float tanh_gradient(float x){return 1-x*x;}
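+
+/* These gradients are written in terms of the activated output y, not the
+ * raw input: sigmoid' = y*(1-y), tanh' = 1-y*y, and (y>0) equals (x>0) for
+ * relu and ramp. That is why the backward passes call gradient() on
+ * layer.output. */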
+
float activate(float x, ACTIVATION a)
{
switch(a){
@@ -66,19 +72,19 @@
}
}
-
-float gradient(float x, ACTIVATION a){
+float gradient(float x, ACTIVATION a)
+{
switch(a){
case LINEAR:
- return 1;
+ return linear_gradient(x);
case SIGMOID:
- return (1.-x)*x;
+ return sigmoid_gradient(x);
case RELU:
- return (x>0);
+ return relu_gradient(x);
case RAMP:
- return (x>0) + .1;
+ return ramp_gradient(x);
case TANH:
- return 1-x*x;
+ return tanh_gradient(x);
}
return 0;
}
@@ -107,7 +113,6 @@
return kernel;
}
-
void activate_array_ongpu(cl_mem x, int n, ACTIVATION a)
{
cl_setup();
@@ -125,4 +130,34 @@
clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0);
check_error(cl);
}
+
+cl_kernel get_gradient_kernel()
+{
+ static int init = 0;
+ static cl_kernel kernel;
+ if(!init){
+ kernel = get_kernel("src/activations.cl", "gradient_array", 0);
+ init = 1;
+ }
+ return kernel;
+}
+
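+/* GPU counterpart of the CPU gradient_array(): one work item per element
+ * applies delta[i] *= gradient(x[i], a). */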
+void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta)
+{
+ cl_setup();
+ cl_kernel kernel = get_gradient_kernel();
+ cl_command_queue queue = cl.queue;
+
+ cl_uint i = 0;
+ cl.error = clSetKernelArg(kernel, i++, sizeof(x), (void*) &x);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(n), (void*) &n);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(a), (void*) &a);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(delta), (void*) &delta);
+ check_error(cl);
+
+ size_t gsize = n;
+
+ clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0);
+ check_error(cl);
+}
#endif
diff --git a/src/activations.cl b/src/activations.cl
index 65131c5..da06e8a 100644
--- a/src/activations.cl
+++ b/src/activations.cl
@@ -8,6 +8,12 @@
float ramp_activate(float x){return x*(x>0)+.1*x;}
float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
+float linear_gradient(float x){return 1;}
+float sigmoid_gradient(float x){return (1-x)*x;}
+float relu_gradient(float x){return (x>0);}
+float ramp_gradient(float x){return (x>0)+.1;}
+float tanh_gradient(float x){return 1-x*x;}
+
float activate(float x, ACTIVATION a)
{
switch(a){
@@ -25,9 +31,32 @@
return 0;
}
-__kernel void activate_array(__global float *x,
- const int n, const ACTIVATION a)
+float gradient(float x, ACTIVATION a)
+{
+ switch(a){
+ case LINEAR:
+ return linear_gradient(x);
+ case SIGMOID:
+ return sigmoid_gradient(x);
+ case RELU:
+ return relu_gradient(x);
+ case RAMP:
+ return ramp_gradient(x);
+ case TANH:
+ return tanh_gradient(x);
+ }
+ return 0;
+}
+
+__kernel void activate_array(__global float *x, int n, ACTIVATION a)
{
int i = get_global_id(0);
x[i] = activate(x[i], a);
}
+
+__kernel void gradient_array(__global float *x, int n, ACTIVATION a, __global float *delta)
+{
+ int i = get_global_id(0);
+ delta[i] *= gradient(x[i], a);
+}
+
diff --git a/src/activations.h b/src/activations.h
index 8c4287e..c406c18 100644
--- a/src/activations.h
+++ b/src/activations.h
@@ -14,7 +14,9 @@
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
void activate_array(float *x, const int n, const ACTIVATION a);
#ifdef GPU
+cl_kernel get_activation_kernel();
void activate_array_ongpu(cl_mem x, int n, ACTIVATION a);
+void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta);
#endif
#endif
diff --git a/src/cnn.c b/src/cnn.c
index 72ad4a1..0cd6da3 100644
--- a/src/cnn.c
+++ b/src/cnn.c
@@ -32,6 +32,51 @@
show_image_layers(edge, "Test Convolve");
}
+#ifdef GPU
+
+void test_convolutional_layer()
+{
+ int i;
+ image dog = load_image("data/dog.jpg",256,256);
+ network net = parse_network_cfg("cfg/convolutional.cfg");
+// data test = load_cifar10_data("data/cifar10/test_batch.bin");
+// float *X = calloc(net.batch*test.X.cols, sizeof(float));
+// float *y = calloc(net.batch*test.y.cols, sizeof(float));
+ int in_size = get_network_input_size(net)*net.batch;
+ int size = get_network_output_size(net)*net.batch;
+    float *X = calloc(in_size, sizeof(float));
+ for(i = 0; i < in_size; ++i){
+ X[i] = dog.data[i%get_network_input_size(net)];
+ }
+// get_batch(test, net.batch, X, y);
+ clock_t start, end;
+ cl_mem input_cl = cl_make_array(X, in_size);
+
+ forward_network_gpu(net, input_cl, 1);
+ start = clock();
+ forward_network_gpu(net, input_cl, 1);
+ end = clock();
+ float gpu_sec = (float)(end-start)/CLOCKS_PER_SEC;
+ float *gpu_out = calloc(size, sizeof(float));
+ memcpy(gpu_out, get_network_output(net), size*sizeof(float));
+
+ start = clock();
+ forward_network(net, X, 1);
+ end = clock();
+ float cpu_sec = (float)(end-start)/CLOCKS_PER_SEC;
+ float *cpu_out = calloc(size, sizeof(float));
+ memcpy(cpu_out, get_network_output(net), size*sizeof(float));
+
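+    /* compare GPU and CPU forward passes: sum of squared differences should be ~0 */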
+ float sum = 0;
+ for(i = 0; i < size; ++i) {
+ //printf("%f, %f\n", gpu_out[i], cpu_out[i]);
+ sum += pow(gpu_out[i] - cpu_out[i], 2);
+ }
+ printf("gpu: %f sec, cpu: %f sec, diff: %f, size: %d\n", gpu_sec, cpu_sec, sum, size);
+}
+
+#endif
+
void test_convolve_matrix()
{
image dog = load_image("dog.jpg",300,400);
@@ -325,7 +370,7 @@
void train_nist()
{
srand(222222);
- network net = parse_network_cfg("cfg/nist_final.cfg");
+ network net = parse_network_cfg("cfg/nist.cfg");
data train = load_categorical_data_csv("data/mnist/mnist_train.csv", 0, 10);
data test = load_categorical_data_csv("data/mnist/mnist_test.csv",0,10);
translate_data_rows(train, -144);
@@ -349,7 +394,7 @@
mean_array(get_network_output_layer(net,3), 100),
mean_array(get_network_output_layer(net,4), 100));
*/
- save_network(net, "cfg/nist_final2.cfg");
+ //save_network(net, "cfg/nist_final2.cfg");
//printf("%5d Training Loss: %lf, Params: %f %f %f, ",count*1000, loss, lr, momentum, decay);
//end = clock();
@@ -798,7 +843,7 @@
{
//train_full();
//test_distribution();
- feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+ //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
//test_blas();
//test_visualize();
@@ -809,7 +854,9 @@
//test_split();
//test_ensemble();
//test_nist_single();
- test_nist();
+ //test_nist();
+ train_nist();
+ //test_convolutional_layer();
//test_cifar10();
//train_cifar10();
//test_vince();
diff --git a/src/col2im.c b/src/col2im.c
index fd7de4f..c418fa5 100644
--- a/src/col2im.c
+++ b/src/col2im.c
@@ -1,21 +1,21 @@
#include <stdio.h>
#include <math.h>
inline void col2im_add_pixel(float *im, int height, int width, int channels,
- int row, int col, int channel, int pad, float val)
+ int b, int row, int col, int channel, int pad, float val)
{
row -= pad;
col -= pad;
if (row < 0 || col < 0 ||
row >= height || col >= width) return;
- im[col + width*(row + channel*height)] += val;
+ im[col + width*(row + height*(channel+b*channels))] += val;
}
//This one might be too, can't remember.
-void col2im_cpu(float* data_col,
- const int channels, const int height, const int width,
- const int ksize, const int stride, int pad, float* data_im)
+void col2im_cpu(float* data_col, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, float* data_im)
{
- int c,h,w;
+ int b,c,h,w;
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
if (pad){
@@ -24,20 +24,67 @@
pad = ksize/2;
}
int channels_col = channels * ksize * ksize;
- for (c = 0; c < channels_col; ++c) {
- int w_offset = c % ksize;
- int h_offset = (c / ksize) % ksize;
- int c_im = c / ksize / ksize;
- for (h = 0; h < height_col; ++h) {
- for (w = 0; w < width_col; ++w) {
- int im_row = h_offset + h * stride;
- int im_col = w_offset + w * stride;
- double val = data_col[(c * height_col + h) * width_col + w];
- col2im_add_pixel(data_im, height, width, channels,
- im_row, im_col, c_im, pad, val);
+ int col_size = height_col*width_col*channels_col;
+ for(b = 0; b < batch; ++b){
+ for (c = 0; c < channels_col; ++c) {
+ int w_offset = c % ksize;
+ int h_offset = (c / ksize) % ksize;
+ int c_im = c / ksize / ksize;
+ for (h = 0; h < height_col; ++h) {
+ for (w = 0; w < width_col; ++w) {
+ int im_row = h_offset + h * stride;
+ int im_col = w_offset + w * stride;
+ int col_index = (c * height_col + h) * width_col + w + b*col_size;
+ double val = data_col[col_index];
+ col2im_add_pixel(data_im, height, width, channels,
+ b, im_row, im_col, c_im, pad, val);
+ }
}
}
}
}
+#ifdef GPU
+
+#include "opencl.h"
+
+cl_kernel get_col2im_kernel()
+{
+ static int init = 0;
+    static cl_kernel kernel;
+    if(!init){
+        kernel = get_kernel("src/col2im.cl", "col2im", 0);
+        init = 1;
+    }
+    return kernel;
+}
+
+void col2im_ongpu(cl_mem data_col, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, cl_mem data_im)
+{
+ cl_setup();
+ cl_kernel kernel = get_col2im_kernel();
+ cl_command_queue queue = cl.queue;
+
+ cl_uint i = 0;
+ cl.error = clSetKernelArg(kernel, i++, sizeof(data_col), (void*) &data_col);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(batch), (void*) &batch);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(channels), (void*) &channels);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(height), (void*) &height);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(width), (void*) &width);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(ksize), (void*) &ksize);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(stride), (void*) &stride);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(pad), (void*) &pad);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(data_im), (void*) &data_im);
+ check_error(cl);
+
+    const size_t global_size[] = {channels*height*width*batch};
+
+    clEnqueueNDRangeKernel(queue, kernel, 1, 0,
+            global_size, 0, 0, 0, 0);
+ check_error(cl);
+}
+
+#endif
diff --git a/src/col2im.cl b/src/col2im.cl
index e69de29..c8e3b30 100644
--- a/src/col2im.cl
+++ b/src/col2im.cl
@@ -0,0 +1,41 @@
+__kernel void col2im(__global float *data_col, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, __global float *data_im)
+{
+    int id = get_global_id(0);
+    int w = id%width;
+    id /= width;
+    int h = id%height;
+    id /= height;
+    int c = id%channels;
+    id /= channels;
+    int b = id%batch;
+
+    int height_col = (height - ksize) / stride + 1;
+    int width_col = (width - ksize) / stride + 1;
+    if (pad){
+        height_col = 1 + (height-1) / stride;
+        width_col = 1 + (width-1) / stride;
+        pad = ksize/2;
+    }
+    int col_size = height_col*width_col*channels*ksize*ksize;
+
+    // Gather: one work item per image pixel sums every column entry
+    // that pixel was copied to by im2col.
+    float sum = 0;
+    int i,j;
+    for(i = 0; i < ksize; ++i){
+        for(j = 0; j < ksize; ++j){
+            int im_row = h + pad - i;
+            int im_col = w + pad - j;
+            if(im_row % stride || im_col % stride) continue;
+            int h_col = im_row / stride;
+            int w_col = im_col / stride;
+            if(h_col < 0 || w_col < 0 || h_col >= height_col || w_col >= width_col) continue;
+            int c_col = (c*ksize + i)*ksize + j;
+            int col_index = (c_col*height_col + h_col)*width_col + w_col + b*col_size;
+            sum += data_col[col_index];
+        }
+    }
+    data_im[w + width*(h + height*(c + b*channels))] += sum;
+}
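+
+/* Why the gather works: im2col copies image pixel (h,w) into column row
+ * c_col = (c*ksize + h_offset)*ksize + w_offset at position (h_col,w_col)
+ * exactly when h = h_offset + h_col*stride - pad. Solving for h_col gives
+ * h_col = (h + pad - h_offset)/stride, valid only when the division is exact
+ * and in range -- hence the checks above. Gathering per pixel avoids the
+ * write races a direct port of the CPU scatter loop would have. */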
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 2d4d748..bdbfbfd 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -147,15 +147,9 @@
for(i = 0; i < layer.batch; ++i){
gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
- c += n*m;
- in += layer.h*layer.w*layer.c;
b += k*n;
+ c += n*m;
}
- /*
- int i;
- for(i = 0; i < m*n; ++i) printf("%f, ", layer.output[i]);
- printf("\n");
- */
activate_array(layer.output, m*n*layer.batch, layer.activation);
}
@@ -205,10 +199,10 @@
for(i = 0; i < layer.batch; ++i){
gemm(1,0,m,n,k,1,a,m,b,n,0,c,n);
- col2im_cpu(c, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta);
- c += k*n;
- delta += layer.h*layer.w*layer.c;
+ b += k*n;
+ c += m*n;
}
+ col2im_cpu(layer.col_image, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta);
}
}
@@ -278,22 +272,140 @@
}
#ifdef GPU
+
+cl_kernel get_convolutional_learn_bias_kernel()
+{
+ static int init = 0;
+ static cl_kernel kernel;
+ if(!init){
+ kernel = get_kernel("src/convolutional_layer.cl", "learn_bias", 0);
+ init = 1;
+ }
+ return kernel;
+}
+
+void learn_bias_convolutional_layer_ongpu(convolutional_layer layer)
+{
+ int size = convolutional_out_height(layer) * convolutional_out_width(layer);
+
+ cl_setup();
+ cl_kernel kernel = get_convolutional_learn_bias_kernel();
+ cl_command_queue queue = cl.queue;
+
+ cl_uint i = 0;
+ cl.error = clSetKernelArg(kernel, i++, sizeof(layer.batch), (void*) &layer.batch);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(layer.delta_cl), (void*) &layer.delta_cl);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(layer.bias_updates_cl), (void*) &layer.bias_updates_cl);
+ check_error(cl);
+
+ const size_t global_size[] = {layer.n};
+
+ clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0);
+ check_error(cl);
+}
+
+cl_kernel get_convolutional_bias_kernel()
+{
+ static int init = 0;
+ static cl_kernel kernel;
+ if(!init){
+ kernel = get_kernel("src/convolutional_layer.cl", "bias", 0);
+ init = 1;
+ }
+ return kernel;
+}
+
+void bias_output_gpu(const convolutional_layer layer)
+{
+ int out_h = convolutional_out_height(layer);
+ int out_w = convolutional_out_width(layer);
+ int size = out_h*out_w;
+
+ cl_setup();
+ cl_kernel kernel = get_convolutional_bias_kernel();
+ cl_command_queue queue = cl.queue;
+
+ cl_uint i = 0;
+ cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(layer.biases_cl), (void*) &layer.biases_cl);
+ cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl);
+ check_error(cl);
+
+ const size_t global_size[] = {layer.batch, layer.n*size};
+
+ clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0);
+ check_error(cl);
+}
+
void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in)
{
+ int i;
int m = layer.n;
int k = layer.size*layer.size*layer.c;
int n = convolutional_out_height(layer)*
- convolutional_out_width(layer)*
- layer.batch;
+ convolutional_out_width(layer);
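+    /* per-image GEMM: filters A (m x k) times column matrix B (k x n) gives
+     * output C (m x n); m = #filters, k = size*size*c, n = out_h*out_w */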
- cl_write_array(layer.filters_cl, layer.filters, m*k);
- cl_mem a = layer.filters_cl;
- cl_mem b = layer.col_image_cl;
- cl_mem c = layer.output_cl;
- im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, b);
- gemm_ongpu(0,0,m,n,k,1,a,k,b,n,0,c,n);
- activate_array_ongpu(layer.output_cl, m*n, layer.activation);
- cl_read_array(layer.output_cl, layer.output, m*n);
+ //cl_write_array(layer.filters_cl, layer.filters, m*k);
+ //cl_write_array(layer.biases_cl, layer.biases, m);
+ bias_output_gpu(layer);
+ im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_cl);
+ for(i = 0; i < layer.batch; ++i){
+ cl_mem a = layer.filters_cl;
+ cl_mem b = cl_sub_array(layer.col_image_cl, i*k*n, k*n);
+ cl_mem c = cl_sub_array(layer.output_cl, i*m*n, m*n);
+ gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c,n);
+ clReleaseMemObject(b);
+ clReleaseMemObject(c);
+ }
+ activate_array_ongpu(layer.output_cl, m*n*layer.batch, layer.activation);
+ cl_read_array(layer.output_cl, layer.output, m*n*layer.batch);
}
+
+void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl)
+{
+ int i;
+ int m = layer.n;
+ int n = layer.size*layer.size*layer.c;
+ int k = convolutional_out_height(layer)*
+ convolutional_out_width(layer);
+ gradient_array_ongpu(layer.output_cl, m*k*layer.batch, layer.activation, layer.delta_cl);
+ learn_bias_convolutional_layer_ongpu(layer);
+
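+    /* weight gradient: delta (m x k) times col_image (n x k) transposed,
+     * accumulated over the batch into filter_updates (m x n) */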
+ for(i = 0; i < layer.batch; ++i){
+ cl_mem a = cl_sub_array(layer.delta_cl,i*m*k, m*k);
+ cl_mem b = cl_sub_array(layer.col_image_cl,i*k*n, k*n);
+ cl_mem c = layer.filter_updates_cl;
+
+ gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
+
+ clReleaseMemObject(a);
+ clReleaseMemObject(b);
+ }
+ cl_read_array(layer.filter_updates_cl, layer.filter_updates, m*n);
+ cl_read_array(layer.bias_updates_cl, layer.bias_updates, m);
+
+
+ if(delta_cl){
+ m = layer.size*layer.size*layer.c;
+ k = layer.n;
+ n = convolutional_out_height(layer)*
+ convolutional_out_width(layer);
+
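+        /* propagate to the input: col_image = filters^T * delta per image,
+         * then col2im folds the columns back into the previous delta */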
+ for(i = 0; i < layer.batch; ++i){
+            cl_mem a = layer.filters_cl;
+            cl_mem b = cl_sub_array(layer.delta_cl, i*k*n, k*n);
+            cl_mem c = cl_sub_array(layer.col_image_cl, i*m*n, m*n);
+
+ gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
+ clReleaseMemObject(b);
+ clReleaseMemObject(c);
+ }
+        col2im_ongpu(layer.col_image_cl, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta_cl);
+ }
+}
+
#endif
diff --git a/src/convolutional_layer.cl b/src/convolutional_layer.cl
new file mode 100644
index 0000000..6393c37
--- /dev/null
+++ b/src/convolutional_layer.cl
@@ -0,0 +1,25 @@
+
+__kernel void bias(int n, int size, __global float *biases, __global float *output)
+{
+ int batch = get_global_id(0);
+ int id = get_global_id(1);
+ int filter = id/size;
+
+ output[batch*n*size + id] = biases[filter];
+}
+
+__kernel void learn_bias(int batch, int n, int size, __global float *delta, __global float *bias_updates)
+{
+ int i,b;
+ int filter = get_global_id(0);
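+    /* one work item per filter: reduce delta over batch and spatial size */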
+ float sum = 0;
+ for(b = 0; b < batch; ++b){
+ for(i = 0; i < size; ++i){
+ int index = i + size*(filter + n*b);
+ sum += delta[index];
+ }
+ }
+ bias_updates[filter] += sum;
+}
+
diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h
index f876e8b..cf897a7 100644
--- a/src/convolutional_layer.h
+++ b/src/convolutional_layer.h
@@ -50,6 +50,7 @@
#ifdef GPU
void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in);
+void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl);
#endif
convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay);
diff --git a/src/data.c b/src/data.c
index 846b950..aa8fecf 100644
--- a/src/data.c
+++ b/src/data.c
@@ -148,6 +148,16 @@
return d;
}
+void get_batch(data d, int n, float *X, float *y)
+{
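+    /* draw n random examples (with replacement) from d into X and y */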
+ int j;
+ for(j = 0; j < n; ++j){
+ int index = rand()%d.X.rows;
+ memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
+ memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
+ }
+}
+
data load_all_cifar10()
{
data d;
@@ -158,7 +168,7 @@
d.X = X;
d.y = y;
-
+
for(b = 0; b < 5; ++b){
char buff[256];
sprintf(buff, "data/cifar10/data_batch_%d.bin", b+1);
@@ -176,8 +186,8 @@
fclose(fp);
}
//normalize_data_rows(d);
- translate_data_rows(d, -144);
- scale_data_rows(d, 1./128);
+ translate_data_rows(d, -144);
+ scale_data_rows(d, 1./128);
return d;
}
diff --git a/src/data.h b/src/data.h
index 0a1830e..bd677e8 100644
--- a/src/data.h
+++ b/src/data.h
@@ -20,6 +20,7 @@
data load_cifar10_data(char *filename);
data load_all_cifar10();
list *get_paths(char *filename);
+void get_batch(data d, int n, float *X, float *y);
data load_categorical_data_csv(char *filename, int target, int k);
void normalize_data_rows(data d);
void scale_data_rows(data d, float s);
diff --git a/src/gemm.c b/src/gemm.c
index 1a7bcdd..65542bc 100644
--- a/src/gemm.c
+++ b/src/gemm.c
@@ -6,11 +6,7 @@
float BETA,
float *C, int ldc)
{
-#ifdef GPU
- gemm_gpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
-#else
gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
-#endif
}
void gemm_nn(int M, int N, int K, float ALPHA,
@@ -83,6 +79,7 @@
float BETA,
float *C, int ldc)
{
+ //printf("cpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
int i, j;
for(i = 0; i < M; ++i){
for(j = 0; j < N; ++j){
@@ -107,7 +104,11 @@
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
+#ifdef __APPLE__
+#define BLOCK 1
+#else
#define BLOCK 8
+#endif
cl_kernel get_gemm_kernel()
{
@@ -126,6 +127,7 @@
float BETA,
cl_mem C_gpu, int ldc)
{
+ //printf("gpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
cl_setup();
cl_kernel gemm_kernel = get_gemm_kernel();
cl_command_queue queue = cl.queue;
@@ -256,6 +258,8 @@
void test_gpu_blas()
{
+ test_gpu_accuracy(0,0,10,576,75);
+
test_gpu_accuracy(0,0,17,10,10);
test_gpu_accuracy(1,0,17,10,10);
test_gpu_accuracy(0,1,17,10,10);
@@ -266,6 +270,7 @@
test_gpu_accuracy(0,1,1000,10,100);
test_gpu_accuracy(1,1,1000,10,100);
+/*
time_gpu_random_matrix(0,0,1000,1000,100);
time_random_matrix(0,0,1000,1000,100);
@@ -277,6 +282,7 @@
time_gpu_random_matrix(1,1,1000,1000,100);
time_random_matrix(1,1,1000,1000,100);
+ */
}
#endif
diff --git a/src/im2col.c b/src/im2col.c
index 6ed9d89..08f7ce4 100644
--- a/src/im2col.c
+++ b/src/im2col.c
@@ -1,22 +1,21 @@
#include "mini_blas.h"
#include <stdio.h>
-
inline float im2col_get_pixel(float *im, int height, int width, int channels,
- int row, int col, int channel, int pad)
+ int b, int row, int col, int channel, int pad)
{
row -= pad;
col -= pad;
if (row < 0 || col < 0 ||
row >= height || col >= width) return 0;
- return im[col + width*(row + channel*height)];
+ return im[col + width*(row + height*(channel+b*channels))];
}
//From Berkeley Vision's Caffe!
//https://github.com/BVLC/caffe/blob/master/LICENSE
-void im2col_cpu_batch(float* data_im,
- const int batch, const int channels, const int height, const int width,
- const int ksize, const int stride, int pad, float* data_col)
+void im2col_cpu(float* data_im, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, float* data_col)
{
int c,h,w,b;
int height_col = (height - ksize) / stride + 1;
@@ -27,44 +26,6 @@
pad = ksize/2;
}
int channels_col = channels * ksize * ksize;
- int im_size = height*width*channels;
- //int col_size = height_col*width_col*channels_col;
- for (b = 0; b < batch; ++b) {
- for (c = 0; c < channels_col; ++c) {
- int w_offset = c % ksize;
- int h_offset = (c / ksize) % ksize;
- int c_im = c / ksize / ksize;
- for (h = 0; h < height_col; ++h) {
- for (w = 0; w < width_col; ++w) {
- int im_row = h_offset + h * stride;
- int im_col = w_offset + w * stride;
- int col_index = (c * height_col + h) * width_col + w + (batch-1) * c * height_col*width_col;
- data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
- im_row, im_col, c_im, pad);
- }
- }
- }
- data_im += im_size;
- data_col+= channels_col;
- }
-}
-
-//From Berkeley Vision's Caffe!
-//https://github.com/BVLC/caffe/blob/master/LICENSE
-void im2col_cpu(float* data_im, const int batch,
- const int channels, const int height, const int width,
- const int ksize, const int stride, int pad, float* data_col)
-{
- int c,h,w,b;
- int height_col = (height - ksize) / stride + 1;
- int width_col = (width - ksize) / stride + 1;
- if (pad){
- height_col = 1 + (height-1) / stride;
- width_col = 1 + (width-1) / stride;
- pad = ksize/2;
- }
- int channels_col = channels * ksize * ksize;
- int im_size = height*width*channels;
int col_size = height_col*width_col*channels_col;
for (b = 0; b < batch; ++b) {
for (c = 0; c < channels_col; ++c) {
@@ -75,14 +36,12 @@
for (w = 0; w < width_col; ++w) {
int im_row = h_offset + h * stride;
int im_col = w_offset + w * stride;
- int col_index = (c * height_col + h) * width_col + w;
+ int col_index = (c * height_col + h) * width_col + w + b*col_size;
data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
- im_row, im_col, c_im, pad);
+ b, im_row, im_col, c_im, pad);
}
}
}
- data_im += im_size;
- data_col += col_size;
}
}
@@ -104,9 +63,9 @@
}
-void im2col_ongpu(cl_mem data_im, const int batch,
- const int channels, const int height, const int width,
- const int ksize, const int stride, cl_mem data_col)
+void im2col_ongpu(cl_mem data_im, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, cl_mem data_col)
{
cl_setup();
cl_kernel im2col_kernel = get_im2col_kernel();
@@ -120,29 +79,30 @@
cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(width), (void*) &width);
cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(ksize), (void*) &ksize);
cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(stride), (void*) &stride);
+ cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(pad), (void*) &pad);
cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(data_col), (void*) &data_col);
check_error(cl);
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
+ if (pad){
+ height_col = 1 + (height-1) / stride;
+ width_col = 1 + (width-1) / stride;
+ }
size_t global_size[2];
- size_t local_size[2];
- global_size[0] = batch;
- global_size[1] = channels_col;
- local_size[0] = height_col;
- local_size[1] = width_col;
+ global_size[0] = batch*channels_col;
+ global_size[1] = height_col*width_col;
clEnqueueNDRangeKernel(queue, im2col_kernel, 2, 0,
- global_size, local_size, 0, 0, 0);
+ global_size, 0, 0, 0, 0);
check_error(cl);
}
-void im2col_gpu(float *data_im,
- const int batch, const int channels, const int height, const int width,
- const int ksize, const int stride,
- float *data_col)
+void im2col_gpu(float *data_im, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, float *data_col)
{
cl_setup();
cl_context context = cl.context;
@@ -165,7 +125,7 @@
check_error(cl);
im2col_ongpu(im_gpu, batch, channels, height, width,
- ksize, stride, col_gpu);
+ ksize, stride, pad, col_gpu);
clEnqueueReadBuffer(queue, col_gpu, CL_TRUE, 0, size, data_col, 0, 0, 0);
check_error(cl);
diff --git a/src/im2col.cl b/src/im2col.cl
index 765a92d..6ed5d89 100644
--- a/src/im2col.cl
+++ b/src/im2col.cl
@@ -1,26 +1,43 @@
-__kernel void im2col(__global float *data_im, const int im_offset,
- const int channels, const int height, const int width,
- const int ksize, const int stride, __global float *data_col, const int col_offset)
+float im2col_get_pixel(__global float *im, int height, int width, int channels,
+ int batch, int row, int col, int channel, int pad)
{
- int b = get_global_id(0);
- int c = get_global_id(1);
+ row -= pad;
+ col -= pad;
- int h = get_local_id(0);
- int w = get_local_id(1);
+ if (row < 0 || col < 0 || row >= height || col >= width) return 0;
+ int index = col + width*(row + height*(channel+batch*channels));
+ return im[index];
+}
+__kernel void im2col(__global float *data_im, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, __global float *data_col)
+{
+ int c,h,w,b;
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
+ if (pad){
+ height_col = 1 + (height-1) / stride;
+ width_col = 1 + (width-1) / stride;
+ pad = ksize/2;
+ }
+ int gid1 = get_global_id(0);
+ b = gid1%batch;
+ c = gid1/batch;
+
+ int gid2 = get_global_id(1);
+ h = gid2%height_col;
+ w = gid2/height_col;
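+    /* axis 0 enumerates (batch, column-channel) pairs, axis 1 the output
+     * positions; matches the 2-D global size set in im2col_ongpu */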
+
+
int channels_col = channels * ksize * ksize;
-
- int im_offset = height*width*channels*b;
- int col_offset = height_col*width_col*channels_col*b;
-
+ int col_size = height_col*width_col*channels_col;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
-
- data_col[(c * height_col + h) * width_col + w + col_offset] =
- data_im[(c_im * height + h * stride + h_offset) * width
- + w * stride + w_offset + im_offset];
+ int im_row = h_offset + h * stride;
+ int im_col = w_offset + w * stride;
+ int col_index = (c * height_col + h) * width_col + w + b*col_size;
+ data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, b, im_row, im_col, c_im, pad);
}
diff --git a/src/mini_blas.h b/src/mini_blas.h
index c80e6ad..34905a1 100644
--- a/src/mini_blas.h
+++ b/src/mini_blas.h
@@ -10,13 +10,17 @@
void time_random_matrix(int TA, int TB, int m, int k, int n);
#ifdef GPU
-void im2col_ongpu(cl_mem data_im, const int batch,
- const int channels, const int height, const int width,
- const int ksize, const int stride, cl_mem data_col);
+void im2col_ongpu(cl_mem data_im, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, cl_mem data_col);
-void im2col_gpu(float *data_im,
- const int batch, const int channels, const int height, const int width,
- const int ksize, const int stride, float *data_col);
+void col2im_ongpu(cl_mem data_col, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, cl_mem data_im);
+
+void im2col_gpu(float *data_im, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, float *data_col);
void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
cl_mem A_gpu, int lda,
@@ -25,13 +29,14 @@
cl_mem C_gpu, int ldc);
#endif
-void im2col_cpu(float* data_im, const int batch,
- const int channels, const int height, const int width,
- const int ksize, const int stride, int pad, float* data_col);
+void im2col_cpu(float* data_im, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, float* data_col);
-void col2im_cpu(float* data_col,
- const int channels, const int height, const int width,
- const int ksize, const int stride, int pad, float* data_im);
+void col2im_cpu(float* data_col, int batch,
+ int channels, int height, int width,
+ int ksize, int stride, int pad, float* data_im);
+
void test_blas();
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
diff --git a/src/network.c b/src/network.c
index 292bba0..3761bf9 100644
--- a/src/network.c
+++ b/src/network.c
@@ -28,25 +28,16 @@
}
#ifdef GPU
-void forward_network(network net, float *input, int train)
+void forward_network_gpu(network net, cl_mem input_cl, int train)
{
- cl_setup();
- size_t size = get_network_input_size(net);
- if(!net.input_cl){
- net.input_cl = clCreateBuffer(cl.context,
- CL_MEM_READ_WRITE, size*sizeof(float), 0, &cl.error);
- check_error(cl);
- }
- cl_write_array(net.input_cl, input, size);
- cl_mem input_cl = net.input_cl;
int i;
for(i = 0; i < net.n; ++i){
if(net.types[i] == CONVOLUTIONAL){
convolutional_layer layer = *(convolutional_layer *)net.layers[i];
forward_convolutional_layer_gpu(layer, input_cl);
input_cl = layer.output_cl;
- input = layer.output;
}
+ /*
else if(net.types[i] == CONNECTED){
connected_layer layer = *(connected_layer *)net.layers[i];
forward_connected_layer(layer, input, train);
@@ -72,10 +63,11 @@
forward_normalization_layer(layer, input);
input = layer.output;
}
+ */
}
}
-#else
+#endif
void forward_network(network net, float *input, int train)
{
@@ -118,7 +110,6 @@
}
}
}
-#endif
void update_network(network net)
{
@@ -275,45 +266,13 @@
float *X = calloc(batch*d.X.cols, sizeof(float));
float *y = calloc(batch*d.y.cols, sizeof(float));
- int i,j;
+ int i;
float sum = 0;
- int index = 0;
for(i = 0; i < n; ++i){
- for(j = 0; j < batch; ++j){
- index = rand()%d.X.rows;
- memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
- memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
- }
-
+ get_batch(d, batch, X, y);
float err = train_network_datum(net, X, y);
sum += err;
- //train_network_datum(net, X, y);
- /*
- float *y = d.y.vals[index];
- int class = get_predicted_class_network(net);
- correct += (y[class]?1:0);
- */
-
-/*
- for(j = 0; j < d.y.cols*batch; ++j){
- printf("%6.3f ", y[j]);
- }
- printf("\n");
- for(j = 0; j < d.y.cols*batch; ++j){
- printf("%6.3f ", get_network_output(net)[j]);
- }
- printf("\n");
- printf("\n");
- */
-
-
- //printf("%d %f %f\n", i,net.output[0], d.y.vals[index][0]);
- //if((i+1)%10 == 0){
- // printf("%d: %f\n", (i+1), (float)correct/(i+1));
- //}
}
- //printf("Accuracy: %f\n",(float) correct/n);
- //show_image(float_to_image(32,32,3,X), "Orig");
free(X);
free(y);
return (float)sum/(n*batch);
diff --git a/src/network.h b/src/network.h
index f8666e6..65ace57 100644
--- a/src/network.h
+++ b/src/network.h
@@ -33,6 +33,10 @@
#endif
} network;
+#ifdef GPU
+void forward_network_gpu(network net, cl_mem input, int train);
+#endif
+
network make_network(int n, int batch);
void forward_network(network net, float *input, int train);
float backward_network(network net, float *input, float *truth);
diff --git a/src/opencl.c b/src/opencl.c
index 8f9edd3..bcc0f09 100644
--- a/src/opencl.c
+++ b/src/opencl.c
@@ -11,6 +11,7 @@
void check_error(cl_info info)
{
+ clFinish(cl.queue);
if (info.error != CL_SUCCESS) {
printf("\n Error number %d", info.error);
exit(1);
@@ -27,13 +28,60 @@
// Fetch the Platform and Device IDs; we only want one.
cl_device_id devices[MAX_DEVICES];
info.error=clGetPlatformIDs(1, &info.platform, &num_platforms);
+
+ printf("=== %d OpenCL platform(s) found: ===\n", num_platforms);
+ char buffer[10240];
+ clGetPlatformInfo(info.platform, CL_PLATFORM_PROFILE, 10240, buffer, NULL);
+ printf(" PROFILE = %s\n", buffer);
+ clGetPlatformInfo(info.platform, CL_PLATFORM_VERSION, 10240, buffer, NULL);
+ printf(" VERSION = %s\n", buffer);
+ clGetPlatformInfo(info.platform, CL_PLATFORM_NAME, 10240, buffer, NULL);
+ printf(" NAME = %s\n", buffer);
+ clGetPlatformInfo(info.platform, CL_PLATFORM_VENDOR, 10240, buffer, NULL);
+ printf(" VENDOR = %s\n", buffer);
+ clGetPlatformInfo(info.platform, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
+ printf(" EXTENSIONS = %s\n", buffer);
+
check_error(info);
info.error=clGetDeviceIDs(info.platform, CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &num_devices);
if(num_devices > MAX_DEVICES) num_devices = MAX_DEVICES;
+ printf("=== %d OpenCL device(s) found on platform:\n", num_devices);
+ int i;
+ for (i=0; i<num_devices; i++)
+ {
+ char buffer[10240];
+ cl_uint buf_uint;
+ cl_ulong buf_ulong;
+ printf(" -- %d --\n", i);
+ clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL);
+ printf(" DEVICE_NAME = %s\n", buffer);
+ clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL);
+ printf(" DEVICE_VENDOR = %s\n", buffer);
+ clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL);
+ printf(" DEVICE_VERSION = %s\n", buffer);
+ clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL);
+ printf(" DRIVER_VERSION = %s\n", buffer);
+ clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL);
+ printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
+ clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL);
+ printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
+ clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL);
+ printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
+ clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(buf_ulong), &buf_ulong, NULL);
+ printf(" DEVICE_MAX_WORK_GROUP_SIZE = %llu\n", (unsigned long long)buf_ulong);
+ cl_uint items;
+ clGetDeviceInfo( devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint),
+ &items, NULL);
+ printf(" DEVICE_MAX_WORK_ITEM_DIMENSIONS = %u\n", (unsigned int)items);
+ size_t workitem_size[10];
+        clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), workitem_size, NULL);
+ printf(" DEVICE_MAX_WORK_ITEM_SIZES = %u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]);
+
+ }
int index = getpid()%num_devices;
printf("%d rand, %d devices, %d index\n", getpid(), num_devices, index);
//info.device = devices[index];
- info.device = devices[1];
+ info.device = devices[0];
fprintf(stderr, "Found %d device(s)\n", num_devices);
check_error(info);
@@ -52,8 +100,8 @@
cl_program cl_fprog(char *filename, char *options, cl_info info)
{
size_t srcsize;
- char src[8192];
- memset(src, 0, 8192);
+ char src[64*1024];
+ memset(src, 0, 64*1024);
FILE *fil=fopen(filename,"r");
srcsize=fread(src, sizeof src, 1, fil);
fclose(fil);
@@ -61,12 +109,12 @@
// Submit the source code of the example kernel to OpenCL
cl_program prog=clCreateProgramWithSource(info.context,1, srcptr, &srcsize, &info.error);
check_error(info);
- char build_c[4096];
+ char build_c[1024*64];
// and compile it (after this we could extract the compiled version)
info.error=clBuildProgram(prog, 0, 0, options, 0, 0);
if ( info.error != CL_SUCCESS ) {
fprintf(stderr, "Error Building Program: %d\n", info.error);
- clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 4096, build_c, 0);
+ clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 1024*64, build_c, 0);
fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c);
}
check_error(info);
@@ -115,7 +163,8 @@
cl_buffer_region r;
r.origin = offset*sizeof(float);
r.size = size*sizeof(float);
- cl_mem sub = clCreateSubBuffer(src, CL_MEM_USE_HOST_PTR, CL_BUFFER_CREATE_TYPE_REGION, &r, 0);
+ cl_mem sub = clCreateSubBuffer(src, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &r, &cl.error);
+ check_error(cl);
return sub;
}
--