Trying some stuff w/ dropout
17 files modified
1 file added
| | |
| | | float ramp_activate(float x){return x*(x>0)+.1*x;} |
| | | float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);} |
| | | |
| | | float linear_gradient(float x){return 1;} |
| | | float sigmoid_gradient(float x){return (1-x)*x;} |
| | | float relu_gradient(float x){return (x>0);} |
| | | float ramp_gradient(float x){return (x>0)+.1;} |
| | | float tanh_gradient(float x){return 1-x*x;} |
| | | |
| | | float activate(float x, ACTIVATION a) |
| | | { |
| | | switch(a){ |
| | |
| | | } |
| | | } |
| | | |
| | | |
| | | float gradient(float x, ACTIVATION a){ |
| | | float gradient(float x, ACTIVATION a) |
| | | { |
| | | switch(a){ |
| | | case LINEAR: |
| | | return 1; |
| | | return linear_gradient(x); |
| | | case SIGMOID: |
| | | return (1.-x)*x; |
| | | return sigmoid_gradient(x); |
| | | case RELU: |
| | | return (x>0); |
| | | return relu_gradient(x); |
| | | case RAMP: |
| | | return (x>0) + .1; |
| | | return ramp_gradient(x); |
| | | case TANH: |
| | | return 1-x*x; |
| | | return tanh_gradient(x); |
| | | } |
| | | return 0; |
| | | } |
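Editor's note on the convention in these gradient functions: they take the layer's stored *output* y = f(z), not the pre-activation z. For the sigmoid, f'(z) = y*(1 - y), which is why sigmoid_gradient(x) returns (1-x)*x; the same trick gives tanh_gradient(x) = 1 - x*x. A minimal standalone check (not part of this commit):

    #include <math.h>
    #include <stdio.h>

    /* Verify sigmoid'(z) == y*(1-y) with y = sigmoid(z), numerically. */
    int main()
    {
        float z = 0.5f;
        float y = 1.f/(1.f + expf(-z));
        float from_output = (1.f - y)*y;
        float eps = 1e-3f;
        float numeric = (1.f/(1.f + expf(-(z + eps)))
                       - 1.f/(1.f + expf(-(z - eps))))/(2*eps);
        printf("from output: %f, numeric: %f\n", from_output, numeric);
        return 0;
    }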
| | |
| | | return kernel; |
| | | } |
| | | |
| | | |
| | | void activate_array_ongpu(cl_mem x, int n, ACTIVATION a) |
| | | { |
| | | cl_setup(); |
| | |
| | | clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | cl_kernel get_gradient_kernel() |
| | | { |
| | | static int init = 0; |
| | | static cl_kernel kernel; |
| | | if(!init){ |
| | | kernel = get_kernel("src/activations.cl", "gradient_array", 0); |
| | | init = 1; |
| | | } |
| | | return kernel; |
| | | } |
| | | |
| | | void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta) |
| | | { |
| | | cl_setup(); |
| | | cl_kernel kernel = get_gradient_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | | |
| | | cl_uint i = 0; |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(x), (void*) &x); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(n), (void*) &n); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(a), (void*) &a); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(delta), (void*) &delta); |
| | | check_error(cl); |
| | | |
| | | size_t gsize = n; |
| | | |
| | | clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
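A caveat worth flagging in the arg-setting pattern above: each `cl.error = clSetKernelArg(...)` overwrites the previous code, so `check_error(cl)` only ever sees the status of the last call. A sketch of an accumulating variant (hypothetical helper, not in this commit; needs stdio.h, stdlib.h, and the OpenCL header — error codes are negative, so any nonzero accumulation means some call failed):

    static void set_gradient_args(cl_kernel kernel, cl_mem x, int n, int a, cl_mem delta)
    {
        cl_int err = CL_SUCCESS;
        err |= clSetKernelArg(kernel, 0, sizeof(x), (void*) &x);
        err |= clSetKernelArg(kernel, 1, sizeof(n), (void*) &n);
        err |= clSetKernelArg(kernel, 2, sizeof(a), (void*) &a);
        err |= clSetKernelArg(kernel, 3, sizeof(delta), (void*) &delta);
        if(err != CL_SUCCESS){
            fprintf(stderr, "clSetKernelArg failed\n");
            exit(1);
        }
    }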
| | | #endif |
| | |
| | | float ramp_activate(float x){return x*(x>0)+.1*x;} |
| | | float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);} |
| | | |
| | | float linear_gradient(float x){return 1;} |
| | | float sigmoid_gradient(float x){return (1-x)*x;} |
| | | float relu_gradient(float x){return (x>0);} |
| | | float ramp_gradient(float x){return (x>0)+.1;} |
| | | float tanh_gradient(float x){return 1-x*x;} |
| | | |
| | | float activate(float x, ACTIVATION a) |
| | | { |
| | | switch(a){ |
| | |
| | | return 0; |
| | | } |
| | | |
| | | __kernel void activate_array(__global float *x, |
| | | const int n, const ACTIVATION a) |
| | | float gradient(float x, ACTIVATION a) |
| | | { |
| | | switch(a){ |
| | | case LINEAR: |
| | | return linear_gradient(x); |
| | | case SIGMOID: |
| | | return sigmoid_gradient(x); |
| | | case RELU: |
| | | return relu_gradient(x); |
| | | case RAMP: |
| | | return ramp_gradient(x); |
| | | case TANH: |
| | | return tanh_gradient(x); |
| | | } |
| | | return 0; |
| | | } |
| | | |
| | | __kernel void activate_array(__global float *x, int n, ACTIVATION a) |
| | | { |
| | | int i = get_global_id(0); |
| | | x[i] = activate(x[i], a); |
| | | } |
| | | |
| | | __kernel void gradient_array(__global float *x, int n, ACTIVATION a, __global float *delta) |
| | | { |
| | | int i = get_global_id(0); |
| | | delta[i] *= gradient(x[i], a); |
| | | } |
| | | |
| | |
| | | void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta); |
| | | void activate_array(float *x, const int n, const ACTIVATION a); |
| | | #ifdef GPU |
| | | cl_kernel get_activation_kernel(); |
| | | void activate_array_ongpu(cl_mem x, int n, ACTIVATION a); |
| | | void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta); |
| | | #endif |
| | | |
| | | #endif |
| | |
| | | show_image_layers(edge, "Test Convolve"); |
| | | } |
| | | |
| | | #ifdef GPU |
| | | |
| | | void test_convolutional_layer() |
| | | { |
| | | int i; |
| | | image dog = load_image("data/dog.jpg",256,256); |
| | | network net = parse_network_cfg("cfg/convolutional.cfg"); |
| | | // data test = load_cifar10_data("data/cifar10/test_batch.bin"); |
| | | // float *X = calloc(net.batch*test.X.cols, sizeof(float)); |
| | | // float *y = calloc(net.batch*test.y.cols, sizeof(float)); |
| | | int in_size = get_network_input_size(net)*net.batch; |
| | | int size = get_network_output_size(net)*net.batch; |
| | | float *X = calloc(in_size, sizeof(float)); |
| | | for(i = 0; i < in_size; ++i){ |
| | | X[i] = dog.data[i%get_network_input_size(net)]; |
| | | } |
| | | // get_batch(test, net.batch, X, y); |
| | | clock_t start, end; |
| | | cl_mem input_cl = cl_make_array(X, in_size); |
| | | |
| | | forward_network_gpu(net, input_cl, 1); |
| | | start = clock(); |
| | | forward_network_gpu(net, input_cl, 1); |
| | | end = clock(); |
| | | float gpu_sec = (float)(end-start)/CLOCKS_PER_SEC; |
| | | float *gpu_out = calloc(size, sizeof(float)); |
| | | memcpy(gpu_out, get_network_output(net), size*sizeof(float)); |
| | | |
| | | start = clock(); |
| | | forward_network(net, X, 1); |
| | | end = clock(); |
| | | float cpu_sec = (float)(end-start)/CLOCKS_PER_SEC; |
| | | float *cpu_out = calloc(size, sizeof(float)); |
| | | memcpy(cpu_out, get_network_output(net), size*sizeof(float)); |
| | | |
| | | float sum = 0; |
| | | for(i = 0; i < size; ++i) { |
| | | //printf("%f, %f\n", gpu_out[i], cpu_out[i]); |
| | | sum += pow(gpu_out[i] - cpu_out[i], 2); |
| | | } |
| | | printf("gpu: %f sec, cpu: %f sec, diff: %f, size: %d\n", gpu_sec, cpu_sec, sum, size); |
| | | } |
| | | |
| | | #endif |
| | | |
| | | void test_convolve_matrix() |
| | | { |
| | | image dog = load_image("dog.jpg",300,400); |
| | |
| | | void train_nist() |
| | | { |
| | | srand(222222); |
| | | network net = parse_network_cfg("cfg/nist_final.cfg"); |
| | | network net = parse_network_cfg("cfg/nist.cfg"); |
| | | data train = load_categorical_data_csv("data/mnist/mnist_train.csv", 0, 10); |
| | | data test = load_categorical_data_csv("data/mnist/mnist_test.csv",0,10); |
| | | translate_data_rows(train, -144); |
| | |
| | | mean_array(get_network_output_layer(net,3), 100), |
| | | mean_array(get_network_output_layer(net,4), 100)); |
| | | */ |
| | | save_network(net, "cfg/nist_final2.cfg"); |
| | | //save_network(net, "cfg/nist_final2.cfg"); |
| | | |
| | | //printf("%5d Training Loss: %lf, Params: %f %f %f, ",count*1000, loss, lr, momentum, decay); |
| | | //end = clock(); |
| | |
| | | { |
| | | //train_full(); |
| | | //test_distribution(); |
| | | feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); |
| | | //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); |
| | | |
| | | //test_blas(); |
| | | //test_visualize(); |
| | |
| | | //test_split(); |
| | | //test_ensemble(); |
| | | //test_nist_single(); |
| | | test_nist(); |
| | | //test_nist(); |
| | | train_nist(); |
| | | //test_convolutional_layer(); |
| | | //test_cifar10(); |
| | | //train_cifar10(); |
| | | //test_vince(); |
| | |
| | | #include <stdio.h> |
| | | #include <math.h> |
| | | inline void col2im_add_pixel(float *im, int height, int width, int channels, |
| | | int row, int col, int channel, int pad, float val) |
| | | int b, int row, int col, int channel, int pad, float val) |
| | | { |
| | | row -= pad; |
| | | col -= pad; |
| | | |
| | | if (row < 0 || col < 0 || |
| | | row >= height || col >= width) return; |
| | | im[col + width*(row + channel*height)] += val; |
| | | im[col + width*(row + height*(channel+b*channels))] += val; |
| | | } |
| | | //This one might be too, can't remember. |
| | | void col2im_cpu(float* data_col, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_im) |
| | | void col2im_cpu(float* data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float* data_im) |
| | | { |
| | | int c,h,w; |
| | | int b,c,h,w; |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | if (pad){ |
| | |
| | | pad = ksize/2; |
| | | } |
| | | int channels_col = channels * ksize * ksize; |
| | | int col_size = height_col*width_col*channels_col; |
| | | for(b = 0; b < batch; ++b){ |
| | | for (c = 0; c < channels_col; ++c) { |
| | | int w_offset = c % ksize; |
| | | int h_offset = (c / ksize) % ksize; |
| | |
| | | for (w = 0; w < width_col; ++w) { |
| | | int im_row = h_offset + h * stride; |
| | | int im_col = w_offset + w * stride; |
| | | double val = data_col[(c * height_col + h) * width_col + w]; |
| | | int col_index = (c * height_col + h) * width_col + w + b*col_size; |
| | | double val = data_col[col_index]; |
| | | col2im_add_pixel(data_im, height, width, channels, |
| | | im_row, im_col, c_im, pad, val); |
| | | b, im_row, im_col, c_im, pad, val); |
| | | } |
| | | } |
| | | } |
| | | } |
| | | } |
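For reference, col2im is the adjoint of im2col: where im2col copies each image pixel into every column slot that samples it, col2im adds every column slot back into its source pixel (hence the += in col2im_add_pixel). The geometry is small enough to check by hand; a standalone walkthrough of the unpadded case:

    #include <stdio.h>

    /* Column-buffer geometry for a 4x4 single-channel image, 3x3 kernel,
       stride 1, no padding: matches the height_col/width_col math above. */
    int main()
    {
        int channels = 1, height = 4, width = 4, ksize = 3, stride = 1;
        int height_col = (height - ksize)/stride + 1;      /* 2 */
        int width_col  = (width  - ksize)/stride + 1;      /* 2 */
        int channels_col = channels*ksize*ksize;           /* 9 */
        printf("column matrix is %d x %d per image\n",
                channels_col, height_col*width_col);       /* 9 x 4 */
        return 0;
    }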
| | | |
| | | |
| | | #ifdef GPU |
| | | |
| | | #include "opencl.h" |
| | | |
| | | cl_kernel get_col2im_kernel() |
| | | { |
| | | static int init = 0; |
| | | static cl_kernel im2col_kernel; |
| | | if(!init){ |
| | | im2col_kernel = get_kernel("src/col2im.cl", "col2im", 0); |
| | | init = 1; |
| | | } |
| | | return im2col_kernel; |
| | | } |
| | | |
| | | void col2im_ongpu(cl_mem data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, cl_mem data_im) |
| | | { |
| | | cl_setup(); |
| | | cl_kernel kernel = get_col2im_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | | |
| | | cl_uint i = 0; |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(data_col), (void*) &data_col); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(batch), (void*) &batch); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(channels), (void*) &channels); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(height), (void*) &height); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(width), (void*) &width); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(ksize), (void*) &ksize); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(stride), (void*) &stride); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(pad), (void*) &pad); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(data_im), (void*) &data_im); |
| | | check_error(cl); |
| | | |
| | | size_t global_size = channels*height*width*batch; |
| | | |
| | | clEnqueueNDRangeKernel(queue, kernel, 1, 0, |
| | | &global_size, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | #endif |
| | |
| | | // Unused stub left from development; trivial body added so the file compiles. |
| | | int index(int row, int col) |
| | | { |
| | | return 0; |
| | | } |
| | | |
| | | __kernel void col2im(__global float *data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, __global float *data_im) |
| | | { |
| | | // One work-item per image pixel: decode (b,c,h,w) from the global id. |
| | | int id = get_global_id(0); |
| | | int index = id; |
| | | int w = id%width; |
| | | id /= width; |
| | | int h = id%height; |
| | | id /= height; |
| | | int c = id%channels; |
| | | id /= channels; |
| | | int b = id%batch; |
| | | |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | if (pad){ |
| | | height_col = 1 + (height-1) / stride; |
| | | width_col = 1 + (width-1) / stride; |
| | | pad = ksize/2; |
| | | } |
| | | int channels_col = channels * ksize * ksize; |
| | | |
| | | // The loop body was unfinished in this commit; this completion is the |
| | | // standard gather form of col2im: sum every column entry that im2col |
| | | // sampled from this pixel (the adjoint of the indexing in im2col.cl). |
| | | float val = 0; |
| | | int i,j; |
| | | for(i = 0; i < ksize; ++i){ |
| | | for(j = 0; j < ksize; ++j){ |
| | | int out_row = h + pad - i; |
| | | int out_col = w + pad - j; |
| | | if(out_row % stride || out_col % stride) continue; |
| | | out_row /= stride; |
| | | out_col /= stride; |
| | | if(out_row < 0 || out_row >= height_col) continue; |
| | | if(out_col < 0 || out_col >= width_col) continue; |
| | | int channel_col = (c*ksize + i)*ksize + j; |
| | | int col_index = ((b*channels_col + channel_col)*height_col + out_row)*width_col + out_col; |
| | | val += data_col[col_index]; |
| | | } |
| | | } |
| | | data_im[index] = val; |
| | | } |
| | |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); |
| | | c += n*m; |
| | | in += layer.h*layer.w*layer.c; |
| | | b += k*n; |
| | | c += n*m; |
| | | } |
| | | /* |
| | | int i; |
| | | for(i = 0; i < m*n; ++i) printf("%f, ", layer.output[i]); |
| | | printf("\n"); |
| | | */ |
| | | activate_array(layer.output, m*n*layer.batch, layer.activation); |
| | | } |
| | | |
| | |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | gemm(1,0,m,n,k,1,a,m,b,n,0,c,n); |
| | | col2im_cpu(c, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta); |
| | | c += k*n; |
| | | delta += layer.h*layer.w*layer.c; |
| | | b += k*n; |
| | | c += m*n; |
| | | } |
| | | col2im_cpu(layer.col_image, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta); |
| | | } |
| | | } |
| | | |
| | |
| | | } |
| | | |
| | | #ifdef GPU |
| | | |
| | | cl_kernel get_convolutional_learn_bias_kernel() |
| | | { |
| | | static int init = 0; |
| | | static cl_kernel kernel; |
| | | if(!init){ |
| | | kernel = get_kernel("src/convolutional_layer.cl", "learn_bias", 0); |
| | | init = 1; |
| | | } |
| | | return kernel; |
| | | } |
| | | |
| | | void learn_bias_convolutional_layer_ongpu(convolutional_layer layer) |
| | | { |
| | | int size = convolutional_out_height(layer) * convolutional_out_width(layer); |
| | | |
| | | cl_setup(); |
| | | cl_kernel kernel = get_convolutional_learn_bias_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | | |
| | | cl_uint i = 0; |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.batch), (void*) &layer.batch); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.delta_cl), (void*) &layer.delta_cl); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.bias_updates_cl), (void*) &layer.bias_updates_cl); |
| | | check_error(cl); |
| | | |
| | | const size_t global_size[] = {layer.n}; |
| | | |
| | | clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | cl_kernel get_convolutional_bias_kernel() |
| | | { |
| | | static int init = 0; |
| | | static cl_kernel kernel; |
| | | if(!init){ |
| | | kernel = get_kernel("src/convolutional_layer.cl", "bias", 0); |
| | | init = 1; |
| | | } |
| | | return kernel; |
| | | } |
| | | |
| | | void bias_output_gpu(const convolutional_layer layer) |
| | | { |
| | | int out_h = convolutional_out_height(layer); |
| | | int out_w = convolutional_out_width(layer); |
| | | int size = out_h*out_w; |
| | | |
| | | cl_setup(); |
| | | cl_kernel kernel = get_convolutional_bias_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | | |
| | | cl_uint i = 0; |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.biases_cl), (void*) &layer.biases_cl); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); |
| | | check_error(cl); |
| | | |
| | | const size_t global_size[] = {layer.batch, layer.n*size}; |
| | | |
| | | clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in) |
| | | { |
| | | int i; |
| | | int m = layer.n; |
| | | int k = layer.size*layer.size*layer.c; |
| | | int n = convolutional_out_height(layer)* |
| | | convolutional_out_width(layer)* |
| | | layer.batch; |
| | | convolutional_out_width(layer); |
| | | |
| | | cl_write_array(layer.filters_cl, layer.filters, m*k); |
| | | //cl_write_array(layer.filters_cl, layer.filters, m*k); |
| | | //cl_write_array(layer.biases_cl, layer.biases, m); |
| | | bias_output_gpu(layer); |
| | | im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_cl); |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | cl_mem a = layer.filters_cl; |
| | | cl_mem b = layer.col_image_cl; |
| | | cl_mem c = layer.output_cl; |
| | | im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, b); |
| | | gemm_ongpu(0,0,m,n,k,1,a,k,b,n,0,c,n); |
| | | activate_array_ongpu(layer.output_cl, m*n, layer.activation); |
| | | cl_read_array(layer.output_cl, layer.output, m*n); |
| | | cl_mem b = cl_sub_array(layer.col_image_cl, i*k*n, k*n); |
| | | cl_mem c = cl_sub_array(layer.output_cl, i*m*n, m*n); |
| | | gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c,n); |
| | | clReleaseMemObject(b); |
| | | clReleaseMemObject(c); |
| | | } |
| | | activate_array_ongpu(layer.output_cl, m*n*layer.batch, layer.activation); |
| | | cl_read_array(layer.output_cl, layer.output, m*n*layer.batch); |
| | | } |
| | | |
| | | void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl) |
| | | { |
| | | int i; |
| | | int m = layer.n; |
| | | int n = layer.size*layer.size*layer.c; |
| | | int k = convolutional_out_height(layer)* |
| | | convolutional_out_width(layer); |
| | | gradient_array_ongpu(layer.output_cl, m*k*layer.batch, layer.activation, layer.delta_cl); |
| | | learn_bias_convolutional_layer_ongpu(layer); |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | cl_mem a = cl_sub_array(layer.delta_cl,i*m*k, m*k); |
| | | cl_mem b = cl_sub_array(layer.col_image_cl,i*k*n, k*n); |
| | | cl_mem c = layer.filter_updates_cl; |
| | | |
| | | gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n); |
| | | |
| | | clReleaseMemObject(a); |
| | | clReleaseMemObject(b); |
| | | } |
| | | cl_read_array(layer.filter_updates_cl, layer.filter_updates, m*n); |
| | | cl_read_array(layer.bias_updates_cl, layer.bias_updates, m); |
| | | |
| | | |
| | | if(delta_cl){ |
| | | m = layer.size*layer.size*layer.c; |
| | | k = layer.n; |
| | | n = convolutional_out_height(layer)* |
| | | convolutional_out_width(layer); |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | cl_mem a = layer.filters_cl; |
| | | cl_mem b = cl_sub_array(layer.delta_cl, i*k*n, k*n); |
| | | cl_mem c = cl_sub_array(layer.col_image_cl, i*m*n, m*n); |
| | | |
| | | gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n); |
| | | clReleaseMemObject(b); |
| | | clReleaseMemObject(c); |
| | | } |
| | | col2im_gpu(layer.col_image_cl, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta_cl); |
| | | } |
| | | } |
| | | |
| | | #endif |
| | | |
| New file |
| | |
| | | |
| | | __kernel void bias(int n, int size, __global float *biases, __global float *output) |
| | | { |
| | | int batch = get_global_id(0); |
| | | int id = get_global_id(1); |
| | | int filter = id/size; |
| | | int position = id%size; |
| | | |
| | | output[batch*n*size + id] = biases[filter]; |
| | | } |
| | | |
| | | __kernel void learn_bias(int batch, int n, int size, __global float *delta, __global float *bias_updates) |
| | | { |
| | | int i,b; |
| | | int filter = get_global_id(0); |
| | | float sum = 0; |
| | | for(b = 0; b < batch; ++b){ |
| | | for(i = 0; i < size; ++i){ |
| | | int index = i + size*(filter + n*b); |
| | | sum += delta[index]; |
| | | } |
| | | } |
| | | bias_updates[filter] += sum; |
| | | } |
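The learn_bias kernel computes the bias gradient directly: since a filter's bias is added to every output pixel it produces, dL/db is the sum of that filter's deltas over the batch and all spatial positions. A CPU reference with the same indexing, useful for checking the kernel (a sketch, not in the commit):

    void learn_bias_cpu(int batch, int n, int size, float *delta, float *bias_updates)
    {
        int f, b, i;
        for(f = 0; f < n; ++f){
            float sum = 0;
            for(b = 0; b < batch; ++b){
                for(i = 0; i < size; ++i){
                    sum += delta[i + size*(f + n*b)];
                }
            }
            bias_updates[f] += sum;
        }
    }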
| | | |
| | |
| | | |
| | | #ifdef GPU |
| | | void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in); |
| | | void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl); |
| | | #endif |
| | | |
| | | convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay); |
| | |
| | | return d; |
| | | } |
| | | |
| | | void get_batch(data d, int n, float *X, float *y) |
| | | { |
| | | int j; |
| | | for(j = 0; j < n; ++j){ |
| | | int index = rand()%d.X.rows; |
| | | memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float)); |
| | | memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float)); |
| | | } |
| | | } |
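Usage sketch for get_batch (hypothetical names; the buffers must be preallocated to batch*cols floats, and sampling is with replacement via rand()%rows):

    int batch = 128;                                   /* assumed batch size */
    float *X = calloc(batch*train.X.cols, sizeof(float));
    float *y = calloc(batch*train.y.cols, sizeof(float));
    get_batch(train, batch, X, y);                     /* one random minibatch */
    /* ... train on (X, y) ... */
    free(X);
    free(y);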
| | | |
| | | data load_all_cifar10() |
| | | { |
| | | data d; |
| | |
| | | data load_cifar10_data(char *filename); |
| | | data load_all_cifar10(); |
| | | list *get_paths(char *filename); |
| | | void get_batch(data d, int n, float *X, float *y); |
| | | data load_categorical_data_csv(char *filename, int target, int k); |
| | | void normalize_data_rows(data d); |
| | | void scale_data_rows(data d, float s); |
| | |
| | | float BETA, |
| | | float *C, int ldc) |
| | | { |
| | | #ifdef GPU |
| | | gemm_gpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); |
| | | #else |
| | | gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); |
| | | #endif |
| | | } |
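A small usage example for the row-major gemm wrapper above (standalone aside, assuming the declaration lives in mini_blas.h): C = ALPHA*A*B + BETA*C, with lda/ldb/ldc the row strides, i.e. the number of columns of each row-major matrix.

    #include <stdio.h>
    #include "mini_blas.h"

    int main()
    {
        /* A is 2x3, B is 3x2, C is 2x2; all row-major. */
        float A[] = {1, 2, 3,
                     4, 5, 6};
        float B[] = {1, 0,
                     0, 1,
                     1, 1};
        float C[4] = {0};
        gemm(0, 0, 2, 2, 3, 1, A, 3, B, 2, 0, C, 2);
        printf("%g %g / %g %g\n", C[0], C[1], C[2], C[3]); /* 4 5 / 10 11 */
        return 0;
    }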
| | | |
| | | void gemm_nn(int M, int N, int K, float ALPHA, |
| | |
| | | float BETA, |
| | | float *C, int ldc) |
| | | { |
| | | //printf("cpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc); |
| | | int i, j; |
| | | for(i = 0; i < M; ++i){ |
| | | for(j = 0; j < N; ++j){ |
| | |
| | | #define STR_HELPER(x) #x |
| | | #define STR(x) STR_HELPER(x) |
| | | |
| | | #ifdef __APPLE__ |
| | | #define BLOCK 1 |
| | | #else |
| | | #define BLOCK 8 |
| | | #endif |
| | | |
| | | cl_kernel get_gemm_kernel() |
| | | { |
| | |
| | | float BETA, |
| | | cl_mem C_gpu, int ldc) |
| | | { |
| | | //printf("gpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc); |
| | | cl_setup(); |
| | | cl_kernel gemm_kernel = get_gemm_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | |
| | | |
| | | void test_gpu_blas() |
| | | { |
| | | test_gpu_accuracy(0,0,10,576,75); |
| | | |
| | | test_gpu_accuracy(0,0,17,10,10); |
| | | test_gpu_accuracy(1,0,17,10,10); |
| | | test_gpu_accuracy(0,1,17,10,10); |
| | |
| | | test_gpu_accuracy(0,1,1000,10,100); |
| | | test_gpu_accuracy(1,1,1000,10,100); |
| | | |
| | | /* |
| | | time_gpu_random_matrix(0,0,1000,1000,100); |
| | | time_random_matrix(0,0,1000,1000,100); |
| | | |
| | |
| | | |
| | | time_gpu_random_matrix(1,1,1000,1000,100); |
| | | time_random_matrix(1,1,1000,1000,100); |
| | | */ |
| | | |
| | | } |
| | | #endif |
| | |
| | | #include "mini_blas.h" |
| | | #include <stdio.h> |
| | | |
| | | inline float im2col_get_pixel(float *im, int height, int width, int channels, |
| | | int row, int col, int channel, int pad) |
| | | int b, int row, int col, int channel, int pad) |
| | | { |
| | | row -= pad; |
| | | col -= pad; |
| | | |
| | | if (row < 0 || col < 0 || |
| | | row >= height || col >= width) return 0; |
| | | return im[col + width*(row + channel*height)]; |
| | | return im[col + width*(row + height*(channel+b*channels))]; |
| | | } |
| | | |
| | | //From Berkeley Vision's Caffe! |
| | | //https://github.com/BVLC/caffe/blob/master/LICENSE |
| | | void im2col_cpu_batch(float* data_im, |
| | | const int batch, const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_col) |
| | | void im2col_cpu(float* data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float* data_col) |
| | | { |
| | | int c,h,w,b; |
| | | int height_col = (height - ksize) / stride + 1; |
| | |
| | | pad = ksize/2; |
| | | } |
| | | int channels_col = channels * ksize * ksize; |
| | | int im_size = height*width*channels; |
| | | //int col_size = height_col*width_col*channels_col; |
| | | for (b = 0; b < batch; ++b) { |
| | | for (c = 0; c < channels_col; ++c) { |
| | | int w_offset = c % ksize; |
| | | int h_offset = (c / ksize) % ksize; |
| | | int c_im = c / ksize / ksize; |
| | | for (h = 0; h < height_col; ++h) { |
| | | for (w = 0; w < width_col; ++w) { |
| | | int im_row = h_offset + h * stride; |
| | | int im_col = w_offset + w * stride; |
| | | int col_index = (c * height_col + h) * width_col + w + (batch-1) * c * height_col*width_col; |
| | | data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, |
| | | im_row, im_col, c_im, pad); |
| | | } |
| | | } |
| | | } |
| | | data_im += im_size; |
| | | data_col+= channels_col; |
| | | } |
| | | } |
| | | |
| | | //From Berkeley Vision's Caffe! |
| | | //https://github.com/BVLC/caffe/blob/master/LICENSE |
| | | void im2col_cpu(float* data_im, const int batch, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_col) |
| | | { |
| | | int c,h,w,b; |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | if (pad){ |
| | | height_col = 1 + (height-1) / stride; |
| | | width_col = 1 + (width-1) / stride; |
| | | pad = ksize/2; |
| | | } |
| | | int channels_col = channels * ksize * ksize; |
| | | int im_size = height*width*channels; |
| | | int col_size = height_col*width_col*channels_col; |
| | | for (b = 0; b < batch; ++b) { |
| | | for (c = 0; c < channels_col; ++c) { |
| | |
| | | for (w = 0; w < width_col; ++w) { |
| | | int im_row = h_offset + h * stride; |
| | | int im_col = w_offset + w * stride; |
| | | int col_index = (c * height_col + h) * width_col + w; |
| | | int col_index = (c * height_col + h) * width_col + w + b*col_size; |
| | | data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, |
| | | im_row, im_col, c_im, pad); |
| | | b, im_row, im_col, c_im, pad); |
| | | } |
| | | } |
| | | } |
| | | data_im += im_size; |
| | | data_col += col_size; |
| | | } |
| | | } |
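Note the two output-size regimes in both im2col variants: without padding it is the usual "valid" size (height - ksize)/stride + 1, and with pad set it becomes 1 + (height-1)/stride, which for stride 1 keeps the output the same size as the input (pad is then forced to ksize/2). A quick check of both branches:

    #include <stdio.h>

    int main()
    {
        int height = 4, ksize = 3, stride = 1;
        int valid = (height - ksize)/stride + 1;  /* 2 */
        int same  = 1 + (height - 1)/stride;      /* 4, with pad = ksize/2 = 1 */
        printf("valid: %d, same: %d\n", valid, same);
        return 0;
    }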
| | | |
| | |
| | | } |
| | | |
| | | |
| | | void im2col_ongpu(cl_mem data_im, const int batch, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, cl_mem data_col) |
| | | void im2col_ongpu(cl_mem data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, cl_mem data_col) |
| | | { |
| | | cl_setup(); |
| | | cl_kernel im2col_kernel = get_im2col_kernel(); |
| | |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(width), (void*) &width); |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(ksize), (void*) &ksize); |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(stride), (void*) &stride); |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(pad), (void*) &pad); |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(data_col), (void*) &data_col); |
| | | check_error(cl); |
| | | |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | int channels_col = channels * ksize * ksize; |
| | | if (pad){ |
| | | height_col = 1 + (height-1) / stride; |
| | | width_col = 1 + (width-1) / stride; |
| | | } |
| | | |
| | | size_t global_size[2]; |
| | | size_t local_size[2]; |
| | | global_size[0] = batch; |
| | | global_size[1] = channels_col; |
| | | local_size[0] = height_col; |
| | | local_size[1] = width_col; |
| | | global_size[0] = batch*channels_col; |
| | | global_size[1] = height_col*width_col; |
| | | |
| | | clEnqueueNDRangeKernel(queue, im2col_kernel, 2, 0, |
| | | global_size, local_size, 0, 0, 0); |
| | | global_size, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | void im2col_gpu(float *data_im, |
| | | const int batch, const int channels, const int height, const int width, |
| | | const int ksize, const int stride, |
| | | float *data_col) |
| | | void im2col_gpu(float *data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float *data_col) |
| | | { |
| | | cl_setup(); |
| | | cl_context context = cl.context; |
| | |
| | | check_error(cl); |
| | | |
| | | im2col_ongpu(im_gpu, batch, channels, height, width, |
| | | ksize, stride, col_gpu); |
| | | ksize, stride, pad, col_gpu); |
| | | |
| | | clEnqueueReadBuffer(queue, col_gpu, CL_TRUE, 0, size, data_col, 0, 0, 0); |
| | | check_error(cl); |
| | |
| | | |
| | | __kernel void im2col(__global float *data_im, const int im_offset, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, __global float *data_col, const int col_offset) |
| | | float im2col_get_pixel(__global float *im, int height, int width, int channels, |
| | | int batch, int row, int col, int channel, int pad) |
| | | { |
| | | int b = get_global_id(0); |
| | | int c = get_global_id(1); |
| | | row -= pad; |
| | | col -= pad; |
| | | |
| | | int h = get_local_id(0); |
| | | int w = get_local_id(1); |
| | | if (row < 0 || col < 0 || row >= height || col >= width) return 0; |
| | | int index = col + width*(row + height*(channel+batch*channels)); |
| | | return im[index]; |
| | | } |
| | | |
| | | __kernel void im2col(__global float *data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, __global float *data_col) |
| | | { |
| | | int c,h,w,b; |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | if (pad){ |
| | | height_col = 1 + (height-1) / stride; |
| | | width_col = 1 + (width-1) / stride; |
| | | pad = ksize/2; |
| | | } |
| | | int gid1 = get_global_id(0); |
| | | b = gid1%batch; |
| | | c = gid1/batch; |
| | | |
| | | int gid2 = get_global_id(1); |
| | | h = gid2%height_col; |
| | | w = gid2/height_col; |
| | | |
| | | |
| | | int channels_col = channels * ksize * ksize; |
| | | |
| | | int im_offset = height*width*channels*b; |
| | | int col_offset = height_col*width_col*channels_col*b; |
| | | |
| | | int col_size = height_col*width_col*channels_col; |
| | | int w_offset = c % ksize; |
| | | int h_offset = (c / ksize) % ksize; |
| | | int c_im = c / ksize / ksize; |
| | | |
| | | data_col[(c * height_col + h) * width_col + w + col_offset] = |
| | | data_im[(c_im * height + h * stride + h_offset) * width |
| | | + w * stride + w_offset + im_offset]; |
| | | int im_row = h_offset + h * stride; |
| | | int im_col = w_offset + w * stride; |
| | | int col_index = (c * height_col + h) * width_col + w + b*col_size; |
| | | data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, b, im_row, im_col, c_im, pad); |
| | | } |
| | |
| | | void time_random_matrix(int TA, int TB, int m, int k, int n); |
| | | |
| | | #ifdef GPU |
| | | void im2col_ongpu(cl_mem data_im, const int batch, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, cl_mem data_col); |
| | | void im2col_ongpu(cl_mem data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, cl_mem data_col); |
| | | |
| | | void im2col_gpu(float *data_im, |
| | | const int batch, const int channels, const int height, const int width, |
| | | const int ksize, const int stride, float *data_col); |
| | | void col2im_ongpu(cl_mem data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, cl_mem data_im); |
| | | |
| | | void im2col_gpu(float *data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float *data_col); |
| | | |
| | | void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int lda, |
| | |
| | | cl_mem C_gpu, int ldc); |
| | | #endif |
| | | |
| | | void im2col_cpu(float* data_im, const int batch, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_col); |
| | | void im2col_cpu(float* data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float* data_col); |
| | | |
| | | void col2im_cpu(float* data_col, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_im); |
| | | void col2im_cpu(float* data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float* data_im); |
| | | |
| | | void test_blas(); |
| | | |
| | | void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, |
| | |
| | | } |
| | | |
| | | #ifdef GPU |
| | | void forward_network(network net, float *input, int train) |
| | | void forward_network_gpu(network net, cl_mem input_cl, int train) |
| | | { |
| | | cl_setup(); |
| | | size_t size = get_network_input_size(net); |
| | | if(!net.input_cl){ |
| | | net.input_cl = clCreateBuffer(cl.context, |
| | | CL_MEM_READ_WRITE, size*sizeof(float), 0, &cl.error); |
| | | check_error(cl); |
| | | } |
| | | cl_write_array(net.input_cl, input, size); |
| | | cl_mem input_cl = net.input_cl; |
| | | int i; |
| | | for(i = 0; i < net.n; ++i){ |
| | | if(net.types[i] == CONVOLUTIONAL){ |
| | | convolutional_layer layer = *(convolutional_layer *)net.layers[i]; |
| | | forward_convolutional_layer_gpu(layer, input_cl); |
| | | input_cl = layer.output_cl; |
| | | input = layer.output; |
| | | } |
| | | /* |
| | | else if(net.types[i] == CONNECTED){ |
| | | connected_layer layer = *(connected_layer *)net.layers[i]; |
| | | forward_connected_layer(layer, input, train); |
| | |
| | | forward_normalization_layer(layer, input); |
| | | input = layer.output; |
| | | } |
| | | */ |
| | | } |
| | | } |
| | | |
| | | #else |
| | | #endif |
| | | |
| | | void forward_network(network net, float *input, int train) |
| | | { |
| | |
| | | } |
| | | } |
| | | } |
| | | #endif |
| | | |
| | | void update_network(network net) |
| | | { |
| | |
| | | float *X = calloc(batch*d.X.cols, sizeof(float)); |
| | | float *y = calloc(batch*d.y.cols, sizeof(float)); |
| | | |
| | | int i,j; |
| | | int i; |
| | | float sum = 0; |
| | | int index = 0; |
| | | for(i = 0; i < n; ++i){ |
| | | for(j = 0; j < batch; ++j){ |
| | | index = rand()%d.X.rows; |
| | | memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float)); |
| | | memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float)); |
| | | } |
| | | |
| | | get_batch(d, batch, X, y); |
| | | float err = train_network_datum(net, X, y); |
| | | sum += err; |
| | | //train_network_datum(net, X, y); |
| | | /* |
| | | float *y = d.y.vals[index]; |
| | | int class = get_predicted_class_network(net); |
| | | correct += (y[class]?1:0); |
| | | */ |
| | | |
| | | /* |
| | | for(j = 0; j < d.y.cols*batch; ++j){ |
| | | printf("%6.3f ", y[j]); |
| | | } |
| | | printf("\n"); |
| | | for(j = 0; j < d.y.cols*batch; ++j){ |
| | | printf("%6.3f ", get_network_output(net)[j]); |
| | | } |
| | | printf("\n"); |
| | | printf("\n"); |
| | | */ |
| | | |
| | | |
| | | //printf("%d %f %f\n", i,net.output[0], d.y.vals[index][0]); |
| | | //if((i+1)%10 == 0){ |
| | | // printf("%d: %f\n", (i+1), (float)correct/(i+1)); |
| | | //} |
| | | } |
| | | //printf("Accuracy: %f\n",(float) correct/n); |
| | | //show_image(float_to_image(32,32,3,X), "Orig"); |
| | | free(X); |
| | | free(y); |
| | | return (float)sum/(n*batch); |
| | |
| | | #endif |
| | | } network; |
| | | |
| | | #ifdef GPU |
| | | void forward_network_gpu(network net, cl_mem input, int train); |
| | | #endif |
| | | |
| | | network make_network(int n, int batch); |
| | | void forward_network(network net, float *input, int train); |
| | | float backward_network(network net, float *input, float *truth); |
| | |
| | | |
| | | void check_error(cl_info info) |
| | | { |
| | | clFinish(cl.queue); |
| | | if (info.error != CL_SUCCESS) { |
| | | printf("\n Error number %d", info.error); |
| | | exit(1); |
| | |
| | | // Fetch the Platform and Device IDs; we only want one. |
| | | cl_device_id devices[MAX_DEVICES]; |
| | | info.error=clGetPlatformIDs(1, &info.platform, &num_platforms); |
| | | |
| | | printf("=== %d OpenCL platform(s) found: ===\n", num_platforms); |
| | | char buffer[10240]; |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_PROFILE, 10240, buffer, NULL); |
| | | printf(" PROFILE = %s\n", buffer); |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_VERSION, 10240, buffer, NULL); |
| | | printf(" VERSION = %s\n", buffer); |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_NAME, 10240, buffer, NULL); |
| | | printf(" NAME = %s\n", buffer); |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_VENDOR, 10240, buffer, NULL); |
| | | printf(" VENDOR = %s\n", buffer); |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); |
| | | printf(" EXTENSIONS = %s\n", buffer); |
| | | |
| | | check_error(info); |
| | | info.error=clGetDeviceIDs(info.platform, CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &num_devices); |
| | | if(num_devices > MAX_DEVICES) num_devices = MAX_DEVICES; |
| | | printf("=== %d OpenCL device(s) found on platform:\n", num_devices); |
| | | int i; |
| | | for (i=0; i<num_devices; i++) |
| | | { |
| | | char buffer[10240]; |
| | | cl_uint buf_uint; |
| | | cl_ulong buf_ulong; |
| | | printf(" -- %d --\n", i); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL); |
| | | printf(" DEVICE_NAME = %s\n", buffer); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL); |
| | | printf(" DEVICE_VENDOR = %s\n", buffer); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL); |
| | | printf(" DEVICE_VERSION = %s\n", buffer); |
| | | clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL); |
| | | printf(" DRIVER_VERSION = %s\n", buffer); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL); |
| | | printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL); |
| | | printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL); |
| | | printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(buf_ulong), &buf_ulong, NULL); |
| | | printf(" DEVICE_MAX_WORK_GROUP_SIZE = %llu\n", (unsigned long long)buf_ulong); |
| | | cl_uint items; |
| | | clGetDeviceInfo( devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), |
| | | &items, NULL); |
| | | printf(" DEVICE_MAX_WORK_ITEM_DIMENSIONS = %u\n", (unsigned int)items); |
| | | size_t workitem_size[10]; |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, 10*sizeof(workitem_size), workitem_size, NULL); |
| | | printf(" DEVICE_MAX_WORK_ITEM_SIZES = %u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]); |
| | | |
| | | } |
| | | int index = getpid()%num_devices; |
| | | printf("%d rand, %d devices, %d index\n", getpid(), num_devices, index); |
| | | //info.device = devices[index]; |
| | | info.device = devices[1]; |
| | | info.device = devices[0]; |
| | | fprintf(stderr, "Found %d device(s)\n", num_devices); |
| | | check_error(info); |
| | | |
| | |
| | | cl_program cl_fprog(char *filename, char *options, cl_info info) |
| | | { |
| | | size_t srcsize; |
| | | char src[8192]; |
| | | memset(src, 0, 8192); |
| | | char src[64*1024]; |
| | | memset(src, 0, 64*1024); |
| | | FILE *fil=fopen(filename,"r"); |
| | | srcsize=fread(src, 1, sizeof(src)-1, fil); |
| | | fclose(fil); |
| | |
| | | // Submit the source code of the example kernel to OpenCL |
| | | cl_program prog=clCreateProgramWithSource(info.context,1, srcptr, &srcsize, &info.error); |
| | | check_error(info); |
| | | char build_c[4096]; |
| | | char build_c[1024*64]; |
| | | // and compile it (after this we could extract the compiled version) |
| | | info.error=clBuildProgram(prog, 0, 0, options, 0, 0); |
| | | if ( info.error != CL_SUCCESS ) { |
| | | fprintf(stderr, "Error Building Program: %d\n", info.error); |
| | | clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 4096, build_c, 0); |
| | | clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 1024*64, build_c, 0); |
| | | fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c); |
| | | } |
| | | check_error(info); |
| | |
| | | cl_buffer_region r; |
| | | r.origin = offset*sizeof(float); |
| | | r.size = size*sizeof(float); |
| | | cl_mem sub = clCreateSubBuffer(src, CL_MEM_USE_HOST_PTR, CL_BUFFER_CREATE_TYPE_REGION, &r, 0); |
| | | cl_mem sub = clCreateSubBuffer(src, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &r, &cl.error); |
| | | check_error(cl); |
| | | return sub; |
| | | } |
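One portability caveat for cl_sub_array, not enforced by this commit: clCreateSubBuffer requires the region origin to be a multiple of the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN (reported in bits), otherwise it fails with CL_MISALIGNED_SUB_BUFFER_OFFSET. A sketch of the check, assuming the cl_info struct exposes the chosen device as cl.device:

    cl_uint align_bits;
    clGetDeviceInfo(cl.device, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
            sizeof(align_bits), &align_bits, 0);
    size_t origin = offset*sizeof(float);
    if(origin % (align_bits/8)){
        fprintf(stderr, "sub-buffer origin %zu not %u-byte aligned\n",
                origin, align_bits/8);
    }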
| | | |