Trying some stuff w/ dropout
17 files modified
1 file added
| | |
| | | float ramp_activate(float x){return x*(x>0)+.1*x;} |
| | | float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);} |
| | | |
| | | float linear_gradient(float x){return 1;} |
| | | float sigmoid_gradient(float x){return (1-x)*x;} |
| | | float relu_gradient(float x){return (x>0);} |
| | | float ramp_gradient(float x){return (x>0)+.1;} |
| | | float tanh_gradient(float x){return 1-x*x;} |
| | | |
| | | float activate(float x, ACTIVATION a) |
| | | { |
| | | switch(a){ |
| | |
| | | } |
| | | } |
| | | |
| | | |
| | | float gradient(float x, ACTIVATION a){ |
| | | float gradient(float x, ACTIVATION a) |
| | | { |
| | | switch(a){ |
| | | case LINEAR: |
| | | return 1; |
| | | return linear_gradient(x); |
| | | case SIGMOID: |
| | | return (1.-x)*x; |
| | | return sigmoid_gradient(x); |
| | | case RELU: |
| | | return (x>0); |
| | | return relu_gradient(x); |
| | | case RAMP: |
| | | return (x>0) + .1; |
| | | return ramp_gradient(x); |
| | | case TANH: |
| | | return 1-x*x; |
| | | return tanh_gradient(x); |
| | | } |
| | | return 0; |
| | | } |
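Editor's note on the convention in these gradient functions: they take the layer's stored *output* y = f(z), not the pre-activation z. For the sigmoid, f'(z) = y*(1 - y), which is why sigmoid_gradient(x) returns (1-x)*x; the same trick gives tanh_gradient(x) = 1 - x*x. A minimal standalone check (not part of this commit):

    #include <math.h>
    #include <stdio.h>

    /* Verify sigmoid'(z) == y*(1-y) with y = sigmoid(z), numerically. */
    int main()
    {
        float z = 0.5f;
        float y = 1.f/(1.f + expf(-z));
        float from_output = (1.f - y)*y;
        float eps = 1e-3f;
        float numeric = (1.f/(1.f + expf(-(z + eps)))
                       - 1.f/(1.f + expf(-(z - eps))))/(2*eps);
        printf("from output: %f, numeric: %f\n", from_output, numeric);
        return 0;
    }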
| | |
| | | return kernel; |
| | | } |
| | | |
| | | |
| | | void activate_array_ongpu(cl_mem x, int n, ACTIVATION a) |
| | | { |
| | | cl_setup(); |
| | |
| | | clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | cl_kernel get_gradient_kernel() |
| | | { |
| | | static int init = 0; |
| | | static cl_kernel kernel; |
| | | if(!init){ |
| | | kernel = get_kernel("src/activations.cl", "gradient_array", 0); |
| | | init = 1; |
| | | } |
| | | return kernel; |
| | | } |
| | | |
| | | void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta) |
| | | { |
| | | cl_setup(); |
| | | cl_kernel kernel = get_gradient_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | | |
| | | cl_uint i = 0; |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(x), (void*) &x); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(n), (void*) &n); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(a), (void*) &a); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(delta), (void*) &delta); |
| | | check_error(cl); |
| | | |
| | | size_t gsize = n; |
| | | |
| | | clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
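A caveat worth flagging in the arg-setting pattern above: each `cl.error = clSetKernelArg(...)` overwrites the previous code, so `check_error(cl)` only ever sees the status of the last call. A sketch of an accumulating variant (hypothetical helper, not in this commit; needs stdio.h, stdlib.h, and the OpenCL header — error codes are negative, so any nonzero accumulation means some call failed):

    static void set_gradient_args(cl_kernel kernel, cl_mem x, int n, int a, cl_mem delta)
    {
        cl_int err = CL_SUCCESS;
        err |= clSetKernelArg(kernel, 0, sizeof(x), (void*) &x);
        err |= clSetKernelArg(kernel, 1, sizeof(n), (void*) &n);
        err |= clSetKernelArg(kernel, 2, sizeof(a), (void*) &a);
        err |= clSetKernelArg(kernel, 3, sizeof(delta), (void*) &delta);
        if(err != CL_SUCCESS){
            fprintf(stderr, "clSetKernelArg failed\n");
            exit(1);
        }
    }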
| | | #endif |
| | |
| | | float ramp_activate(float x){return x*(x>0)+.1*x;} |
| | | float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);} |
| | | |
| | | float linear_gradient(float x){return 1;} |
| | | float sigmoid_gradient(float x){return (1-x)*x;} |
| | | float relu_gradient(float x){return (x>0);} |
| | | float ramp_gradient(float x){return (x>0)+.1;} |
| | | float tanh_gradient(float x){return 1-x*x;} |
| | | |
| | | float activate(float x, ACTIVATION a) |
| | | { |
| | | switch(a){ |
| | |
| | | return 0; |
| | | } |
| | | |
| | | __kernel void activate_array(__global float *x, |
| | | const int n, const ACTIVATION a) |
| | | float gradient(float x, ACTIVATION a) |
| | | { |
| | | switch(a){ |
| | | case LINEAR: |
| | | return linear_gradient(x); |
| | | case SIGMOID: |
| | | return sigmoid_gradient(x); |
| | | case RELU: |
| | | return relu_gradient(x); |
| | | case RAMP: |
| | | return ramp_gradient(x); |
| | | case TANH: |
| | | return tanh_gradient(x); |
| | | } |
| | | return 0; |
| | | } |
| | | |
| | | __kernel void activate_array(__global float *x, int n, ACTIVATION a) |
| | | { |
| | | int i = get_global_id(0); |
| | | x[i] = activate(x[i], a); |
| | | } |
| | | |
| | | __kernel void gradient_array(__global float *x, int n, ACTIVATION a, __global float *delta) |
| | | { |
| | | int i = get_global_id(0); |
| | | delta[i] *= gradient(x[i], a); |
| | | } |
| | | |
| | |
| | | void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta); |
| | | void activate_array(float *x, const int n, const ACTIVATION a); |
| | | #ifdef GPU |
| | | cl_kernel get_activation_kernel(); |
| | | void activate_array_ongpu(cl_mem x, int n, ACTIVATION a); |
| | | void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta); |
| | | #endif |
| | | |
| | | #endif |
| | |
| | | show_image_layers(edge, "Test Convolve"); |
| | | } |
| | | |
| | | #ifdef GPU |
| | | |
| | | void test_convolutional_layer() |
| | | { |
| | | int i; |
| | | image dog = load_image("data/dog.jpg",256,256); |
| | | network net = parse_network_cfg("cfg/convolutional.cfg"); |
| | | // data test = load_cifar10_data("data/cifar10/test_batch.bin"); |
| | | // float *X = calloc(net.batch*test.X.cols, sizeof(float)); |
| | | // float *y = calloc(net.batch*test.y.cols, sizeof(float)); |
| | | int in_size = get_network_input_size(net)*net.batch; |
| | | int size = get_network_output_size(net)*net.batch; |
| | | float *X = calloc(in_size, sizeof(float)); |
| | | for(i = 0; i < in_size; ++i){ |
| | | X[i] = dog.data[i%get_network_input_size(net)]; |
| | | } |
| | | // get_batch(test, net.batch, X, y); |
| | | clock_t start, end; |
| | | cl_mem input_cl = cl_make_array(X, in_size); |
| | | |
| | | forward_network_gpu(net, input_cl, 1); |
| | | start = clock(); |
| | | forward_network_gpu(net, input_cl, 1); |
| | | end = clock(); |
| | | float gpu_sec = (float)(end-start)/CLOCKS_PER_SEC; |
| | | float *gpu_out = calloc(size, sizeof(float)); |
| | | memcpy(gpu_out, get_network_output(net), size*sizeof(float)); |
| | | |
| | | start = clock(); |
| | | forward_network(net, X, 1); |
| | | end = clock(); |
| | | float cpu_sec = (float)(end-start)/CLOCKS_PER_SEC; |
| | | float *cpu_out = calloc(size, sizeof(float)); |
| | | memcpy(cpu_out, get_network_output(net), size*sizeof(float)); |
| | | |
| | | float sum = 0; |
| | | for(i = 0; i < size; ++i) { |
| | | //printf("%f, %f\n", gpu_out[i], cpu_out[i]); |
| | | sum += pow(gpu_out[i] - cpu_out[i], 2); |
| | | } |
| | | printf("gpu: %f sec, cpu: %f sec, diff: %f, size: %d\n", gpu_sec, cpu_sec, sum, size); |
| | | } |
| | | |
| | | #endif |
| | | |
| | | void test_convolve_matrix() |
| | | { |
| | | image dog = load_image("dog.jpg",300,400); |
| | |
| | | void train_nist() |
| | | { |
| | | srand(222222); |
| | | network net = parse_network_cfg("cfg/nist_final.cfg"); |
| | | network net = parse_network_cfg("cfg/nist.cfg"); |
| | | data train = load_categorical_data_csv("data/mnist/mnist_train.csv", 0, 10); |
| | | data test = load_categorical_data_csv("data/mnist/mnist_test.csv",0,10); |
| | | translate_data_rows(train, -144); |
| | |
| | | mean_array(get_network_output_layer(net,3), 100), |
| | | mean_array(get_network_output_layer(net,4), 100)); |
| | | */ |
| | | save_network(net, "cfg/nist_final2.cfg"); |
| | | //save_network(net, "cfg/nist_final2.cfg"); |
| | | |
| | | //printf("%5d Training Loss: %lf, Params: %f %f %f, ",count*1000, loss, lr, momentum, decay); |
| | | //end = clock(); |
| | |
| | | { |
| | | //train_full(); |
| | | //test_distribution(); |
| | | feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); |
| | | //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); |
| | | |
| | | //test_blas(); |
| | | //test_visualize(); |
| | |
| | | //test_split(); |
| | | //test_ensemble(); |
| | | //test_nist_single(); |
| | | test_nist(); |
| | | //test_nist(); |
| | | train_nist(); |
| | | //test_convolutional_layer(); |
| | | //test_cifar10(); |
| | | //train_cifar10(); |
| | | //test_vince(); |
| | |
| | | #include <stdio.h> |
| | | #include <math.h> |
| | | inline void col2im_add_pixel(float *im, int height, int width, int channels, |
| | | int row, int col, int channel, int pad, float val) |
| | | int b, int row, int col, int channel, int pad, float val) |
| | | { |
| | | row -= pad; |
| | | col -= pad; |
| | | |
| | | if (row < 0 || col < 0 || |
| | | row >= height || col >= width) return; |
| | | im[col + width*(row + channel*height)] += val; |
| | | im[col + width*(row + height*(channel+b*channels))] += val; |
| | | } |
| | | //This one might be too, can't remember. |
| | | void col2im_cpu(float* data_col, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_im) |
| | | void col2im_cpu(float* data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float* data_im) |
| | | { |
| | | int c,h,w; |
| | | int b,c,h,w; |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | if (pad){ |
| | |
| | | pad = ksize/2; |
| | | } |
| | | int channels_col = channels * ksize * ksize; |
| | | int col_size = height_col*width_col*channels_col; |
| | | for(b = 0; b < batch; ++b){ |
| | | for (c = 0; c < channels_col; ++c) { |
| | | int w_offset = c % ksize; |
| | | int h_offset = (c / ksize) % ksize; |
| | |
| | | for (w = 0; w < width_col; ++w) { |
| | | int im_row = h_offset + h * stride; |
| | | int im_col = w_offset + w * stride; |
| | | double val = data_col[(c * height_col + h) * width_col + w]; |
| | | int col_index = (c * height_col + h) * width_col + w + b*col_size; |
| | | double val = data_col[col_index]; |
| | | col2im_add_pixel(data_im, height, width, channels, |
| | | im_row, im_col, c_im, pad, val); |
| | | b, im_row, im_col, c_im, pad, val); |
| | | } |
| | | } |
| | | } |
| | | } |
| | | } |
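For reference, col2im is the adjoint of im2col: where im2col copies each image pixel into every column slot that samples it, col2im adds every column slot back into its source pixel (hence the += in col2im_add_pixel). The geometry is small enough to check by hand; a standalone walkthrough of the unpadded case:

    #include <stdio.h>

    /* Column-buffer geometry for a 4x4 single-channel image, 3x3 kernel,
       stride 1, no padding: matches the height_col/width_col math above. */
    int main()
    {
        int channels = 1, height = 4, width = 4, ksize = 3, stride = 1;
        int height_col = (height - ksize)/stride + 1;      /* 2 */
        int width_col  = (width  - ksize)/stride + 1;      /* 2 */
        int channels_col = channels*ksize*ksize;           /* 9 */
        printf("column matrix is %d x %d per image\n",
                channels_col, height_col*width_col);       /* 9 x 4 */
        return 0;
    }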
| | | |
| | | |
| | | #ifdef GPU |
| | | |
| | | #include "opencl.h" |
| | | |
| | | cl_kernel get_col2im_kernel() |
| | | { |
| | | static int init = 0; |
| | | static cl_kernel im2col_kernel; |
| | | if(!init){ |
| | | im2col_kernel = get_kernel("src/col2im.cl", "col2im", 0); |
| | | init = 1; |
| | | } |
| | | return im2col_kernel; |
| | | } |
| | | |
| | | void col2im_ongpu(cl_mem data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, cl_mem data_im) |
| | | { |
| | | cl_setup(); |
| | | cl_kernel kernel = get_col2im_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | | |
| | | cl_uint i = 0; |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(data_col), (void*) &data_col); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(batch), (void*) &batch); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(channels), (void*) &channels); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(height), (void*) &height); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(width), (void*) &width); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(ksize), (void*) &ksize); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(stride), (void*) &stride); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(pad), (void*) &pad); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(data_im), (void*) &data_im); |
| | | check_error(cl); |
| | | |
| | | size_t global_size = channels*height*width*batch; |
| | | |
| | | clEnqueueNDRangeKernel(queue, kernel, 1, 0, |
| | | &global_size, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | #endif |
| | |
| | | // Unused stub left from development; trivial body added so the file compiles. |
| | | int index(int row, int col) |
| | | { |
| | | return 0; |
| | | } |
| | | |
| | | __kernel void col2im(__global float *data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, __global float *data_im) |
| | | { |
| | | // One work-item per image pixel: decode (b,c,h,w) from the global id. |
| | | int id = get_global_id(0); |
| | | int index = id; |
| | | int w = id%width; |
| | | id /= width; |
| | | int h = id%height; |
| | | id /= height; |
| | | int c = id%channels; |
| | | id /= channels; |
| | | int b = id%batch; |
| | | |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | if (pad){ |
| | | height_col = 1 + (height-1) / stride; |
| | | width_col = 1 + (width-1) / stride; |
| | | pad = ksize/2; |
| | | } |
| | | int channels_col = channels * ksize * ksize; |
| | | |
| | | // The loop body was unfinished in this commit; this completion is the |
| | | // standard gather form of col2im: sum every column entry that im2col |
| | | // sampled from this pixel (the adjoint of the indexing in im2col.cl). |
| | | float val = 0; |
| | | int i,j; |
| | | for(i = 0; i < ksize; ++i){ |
| | | for(j = 0; j < ksize; ++j){ |
| | | int out_row = h + pad - i; |
| | | int out_col = w + pad - j; |
| | | if(out_row % stride || out_col % stride) continue; |
| | | out_row /= stride; |
| | | out_col /= stride; |
| | | if(out_row < 0 || out_row >= height_col) continue; |
| | | if(out_col < 0 || out_col >= width_col) continue; |
| | | int channel_col = (c*ksize + i)*ksize + j; |
| | | int col_index = ((b*channels_col + channel_col)*height_col + out_row)*width_col + out_col; |
| | | val += data_col[col_index]; |
| | | } |
| | | } |
| | | data_im[index] = val; |
| | | } |
| | |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); |
| | | c += n*m; |
| | | in += layer.h*layer.w*layer.c; |
| | | b += k*n; |
| | | c += n*m; |
| | | } |
| | | /* |
| | | int i; |
| | | for(i = 0; i < m*n; ++i) printf("%f, ", layer.output[i]); |
| | | printf("\n"); |
| | | */ |
| | | activate_array(layer.output, m*n*layer.batch, layer.activation); |
| | | } |
| | | |
| | |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | gemm(1,0,m,n,k,1,a,m,b,n,0,c,n); |
| | | col2im_cpu(c, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta); |
| | | c += k*n; |
| | | delta += layer.h*layer.w*layer.c; |
| | | b += k*n; |
| | | c += m*n; |
| | | } |
| | | col2im_cpu(layer.col_image, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta); |
| | | } |
| | | } |
| | | |
| | |
| | | } |
| | | |
| | | #ifdef GPU |
| | | |
| | | cl_kernel get_convolutional_learn_bias_kernel() |
| | | { |
| | | static int init = 0; |
| | | static cl_kernel kernel; |
| | | if(!init){ |
| | | kernel = get_kernel("src/convolutional_layer.cl", "learn_bias", 0); |
| | | init = 1; |
| | | } |
| | | return kernel; |
| | | } |
| | | |
| | | void learn_bias_convolutional_layer_ongpu(convolutional_layer layer) |
| | | { |
| | | int size = convolutional_out_height(layer) * convolutional_out_width(layer); |
| | | |
| | | cl_setup(); |
| | | cl_kernel kernel = get_convolutional_learn_bias_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | | |
| | | cl_uint i = 0; |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.batch), (void*) &layer.batch); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.delta_cl), (void*) &layer.delta_cl); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.bias_updates_cl), (void*) &layer.bias_updates_cl); |
| | | check_error(cl); |
| | | |
| | | const size_t global_size[] = {layer.n}; |
| | | |
| | | clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | cl_kernel get_convolutional_bias_kernel() |
| | | { |
| | | static int init = 0; |
| | | static cl_kernel kernel; |
| | | if(!init){ |
| | | kernel = get_kernel("src/convolutional_layer.cl", "bias", 0); |
| | | init = 1; |
| | | } |
| | | return kernel; |
| | | } |
| | | |
| | | void bias_output_gpu(const convolutional_layer layer) |
| | | { |
| | | int out_h = convolutional_out_height(layer); |
| | | int out_w = convolutional_out_width(layer); |
| | | int size = out_h*out_w; |
| | | |
| | | cl_setup(); |
| | | cl_kernel kernel = get_convolutional_bias_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | | |
| | | cl_uint i = 0; |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.biases_cl), (void*) &layer.biases_cl); |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); |
| | | check_error(cl); |
| | | |
| | | const size_t global_size[] = {layer.batch, layer.n*size}; |
| | | |
| | | clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in) |
| | | { |
| | | int i; |
| | | int m = layer.n; |
| | | int k = layer.size*layer.size*layer.c; |
| | | int n = convolutional_out_height(layer)* |
| | | convolutional_out_width(layer)* |
| | | layer.batch; |
| | | convolutional_out_width(layer); |
| | | |
| | | cl_write_array(layer.filters_cl, layer.filters, m*k); |
| | | //cl_write_array(layer.filters_cl, layer.filters, m*k); |
| | | //cl_write_array(layer.biases_cl, layer.biases, m); |
| | | bias_output_gpu(layer); |
| | | im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_cl); |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | cl_mem a = layer.filters_cl; |
| | | cl_mem b = layer.col_image_cl; |
| | | cl_mem c = layer.output_cl; |
| | | im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, b); |
| | | gemm_ongpu(0,0,m,n,k,1,a,k,b,n,0,c,n); |
| | | activate_array_ongpu(layer.output_cl, m*n, layer.activation); |
| | | cl_read_array(layer.output_cl, layer.output, m*n); |
| | | cl_mem b = cl_sub_array(layer.col_image_cl, i*k*n, k*n); |
| | | cl_mem c = cl_sub_array(layer.output_cl, i*m*n, m*n); |
| | | gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c,n); |
| | | clReleaseMemObject(b); |
| | | clReleaseMemObject(c); |
| | | } |
| | | activate_array_ongpu(layer.output_cl, m*n*layer.batch, layer.activation); |
| | | cl_read_array(layer.output_cl, layer.output, m*n*layer.batch); |
| | | } |
| | | |
| | | void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl) |
| | | { |
| | | int i; |
| | | int m = layer.n; |
| | | int n = layer.size*layer.size*layer.c; |
| | | int k = convolutional_out_height(layer)* |
| | | convolutional_out_width(layer); |
| | | gradient_array_ongpu(layer.output_cl, m*k*layer.batch, layer.activation, layer.delta_cl); |
| | | learn_bias_convolutional_layer_ongpu(layer); |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | cl_mem a = cl_sub_array(layer.delta_cl,i*m*k, m*k); |
| | | cl_mem b = cl_sub_array(layer.col_image_cl,i*k*n, k*n); |
| | | cl_mem c = layer.filter_updates_cl; |
| | | |
| | | gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n); |
| | | |
| | | clReleaseMemObject(a); |
| | | clReleaseMemObject(b); |
| | | } |
| | | cl_read_array(layer.filter_updates_cl, layer.filter_updates, m*n); |
| | | cl_read_array(layer.bias_updates_cl, layer.bias_updates, m); |
| | | |
| | | |
| | | if(delta_cl){ |
| | | m = layer.size*layer.size*layer.c; |
| | | k = layer.n; |
| | | n = convolutional_out_height(layer)* |
| | | convolutional_out_width(layer); |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | cl_mem a = layer.filters_cl; |
| | | cl_mem b = cl_sub_array(layer.delta_cl, i*k*n, k*n); |
| | | cl_mem c = cl_sub_array(layer.col_image_cl, i*m*n, m*n); |
| | | |
| | | gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n); |
| | | clReleaseMemObject(b); |
| | | clReleaseMemObject(c); |
| | | } |
| | | col2im_gpu(layer.col_image_cl, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta_cl); |
| | | } |
| | | } |
| | | |
| | | #endif |
| | | |
| New file |
| | |
| | | |
| | | __kernel void bias(int n, int size, __global float *biases, __global float *output) |
| | | { |
| | | int batch = get_global_id(0); |
| | | int id = get_global_id(1); |
| | | int filter = id/size; |
| | | int position = id%size; |
| | | |
| | | output[batch*n*size + id] = biases[filter]; |
| | | } |
| | | |
| | | __kernel void learn_bias(int batch, int n, int size, __global float *delta, __global float *bias_updates) |
| | | { |
| | | int i,b; |
| | | int filter = get_global_id(0); |
| | | float sum = 0; |
| | | for(b = 0; b < batch; ++b){ |
| | | for(i = 0; i < size; ++i){ |
| | | int index = i + size*(filter + n*b); |
| | | sum += delta[index]; |
| | | } |
| | | } |
| | | bias_updates[filter] += sum; |
| | | } |
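The learn_bias kernel computes the bias gradient directly: since a filter's bias is added to every output pixel it produces, dL/db is the sum of that filter's deltas over the batch and all spatial positions. A CPU reference with the same indexing, useful for checking the kernel (a sketch, not in the commit):

    void learn_bias_cpu(int batch, int n, int size, float *delta, float *bias_updates)
    {
        int f, b, i;
        for(f = 0; f < n; ++f){
            float sum = 0;
            for(b = 0; b < batch; ++b){
                for(i = 0; i < size; ++i){
                    sum += delta[i + size*(f + n*b)];
                }
            }
            bias_updates[f] += sum;
        }
    }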
| | | |
| | |
| | | |
| | | #ifdef GPU |
| | | void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in); |
| | | void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl); |
| | | #endif |
| | | |
| | | convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay); |
| | |
| | | return d; |
| | | } |
| | | |
| | | void get_batch(data d, int n, float *X, float *y) |
| | | { |
| | | int j; |
| | | for(j = 0; j < n; ++j){ |
| | | int index = rand()%d.X.rows; |
| | | memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float)); |
| | | memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float)); |
| | | } |
| | | } |
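Usage sketch for get_batch (hypothetical names; the buffers must be preallocated to batch*cols floats, and sampling is with replacement via rand()%rows):

    int batch = 128;                                   /* assumed batch size */
    float *X = calloc(batch*train.X.cols, sizeof(float));
    float *y = calloc(batch*train.y.cols, sizeof(float));
    get_batch(train, batch, X, y);                     /* one random minibatch */
    /* ... train on (X, y) ... */
    free(X);
    free(y);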
| | | |
| | | data load_all_cifar10() |
| | | { |
| | | data d; |
| | |
| | | data load_cifar10_data(char *filename); |
| | | data load_all_cifar10(); |
| | | list *get_paths(char *filename); |
| | | void get_batch(data d, int n, float *X, float *y); |
| | | data load_categorical_data_csv(char *filename, int target, int k); |
| | | void normalize_data_rows(data d); |
| | | void scale_data_rows(data d, float s); |
| | |
| | | float BETA, |
| | | float *C, int ldc) |
| | | { |
| | | #ifdef GPU |
| | | gemm_gpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); |
| | | #else |
| | | gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); |
| | | #endif |
| | | } |
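A small usage example for the row-major gemm wrapper above (standalone aside, assuming the declaration lives in mini_blas.h): C = ALPHA*A*B + BETA*C, with lda/ldb/ldc the row strides, i.e. the number of columns of each row-major matrix.

    #include <stdio.h>
    #include "mini_blas.h"

    int main()
    {
        /* A is 2x3, B is 3x2, C is 2x2; all row-major. */
        float A[] = {1, 2, 3,
                     4, 5, 6};
        float B[] = {1, 0,
                     0, 1,
                     1, 1};
        float C[4] = {0};
        gemm(0, 0, 2, 2, 3, 1, A, 3, B, 2, 0, C, 2);
        printf("%g %g / %g %g\n", C[0], C[1], C[2], C[3]); /* 4 5 / 10 11 */
        return 0;
    }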
| | | |
| | | void gemm_nn(int M, int N, int K, float ALPHA, |
| | |
| | | float BETA, |
| | | float *C, int ldc) |
| | | { |
| | | //printf("cpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc); |
| | | int i, j; |
| | | for(i = 0; i < M; ++i){ |
| | | for(j = 0; j < N; ++j){ |
| | |
| | | #define STR_HELPER(x) #x |
| | | #define STR(x) STR_HELPER(x) |
| | | |
| | | #ifdef __APPLE__ |
| | | #define BLOCK 1 |
| | | #else |
| | | #define BLOCK 8 |
| | | #endif |
| | | |
| | | cl_kernel get_gemm_kernel() |
| | | { |
| | |
| | | float BETA, |
| | | cl_mem C_gpu, int ldc) |
| | | { |
| | | //printf("gpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc); |
| | | cl_setup(); |
| | | cl_kernel gemm_kernel = get_gemm_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | |
| | | |
| | | void test_gpu_blas() |
| | | { |
| | | test_gpu_accuracy(0,0,10,576,75); |
| | | |
| | | test_gpu_accuracy(0,0,17,10,10); |
| | | test_gpu_accuracy(1,0,17,10,10); |
| | | test_gpu_accuracy(0,1,17,10,10); |
| | |
| | | test_gpu_accuracy(0,1,1000,10,100); |
| | | test_gpu_accuracy(1,1,1000,10,100); |
| | | |
| | | /* |
| | | time_gpu_random_matrix(0,0,1000,1000,100); |
| | | time_random_matrix(0,0,1000,1000,100); |
| | | |
| | |
| | | |
| | | time_gpu_random_matrix(1,1,1000,1000,100); |
| | | time_random_matrix(1,1,1000,1000,100); |
| | | */ |
| | | |
| | | } |
| | | #endif |
| | |
| | | #include "mini_blas.h" |
| | | #include <stdio.h> |
| | | |
| | | inline float im2col_get_pixel(float *im, int height, int width, int channels, |
| | | int row, int col, int channel, int pad) |
| | | int b, int row, int col, int channel, int pad) |
| | | { |
| | | row -= pad; |
| | | col -= pad; |
| | | |
| | | if (row < 0 || col < 0 || |
| | | row >= height || col >= width) return 0; |
| | | return im[col + width*(row + channel*height)]; |
| | | return im[col + width*(row + height*(channel+b*channels))]; |
| | | } |
| | | |
| | | //From Berkeley Vision's Caffe! |
| | | //https://github.com/BVLC/caffe/blob/master/LICENSE |
| | | void im2col_cpu_batch(float* data_im, |
| | | const int batch, const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_col) |
| | | void im2col_cpu(float* data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float* data_col) |
| | | { |
| | | int c,h,w,b; |
| | | int height_col = (height - ksize) / stride + 1; |
| | |
| | | pad = ksize/2; |
| | | } |
| | | int channels_col = channels * ksize * ksize; |
| | | int im_size = height*width*channels; |
| | | //int col_size = height_col*width_col*channels_col; |
| | | for (b = 0; b < batch; ++b) { |
| | | for (c = 0; c < channels_col; ++c) { |
| | | int w_offset = c % ksize; |
| | | int h_offset = (c / ksize) % ksize; |
| | | int c_im = c / ksize / ksize; |
| | | for (h = 0; h < height_col; ++h) { |
| | | for (w = 0; w < width_col; ++w) { |
| | | int im_row = h_offset + h * stride; |
| | | int im_col = w_offset + w * stride; |
| | | int col_index = (c * height_col + h) * width_col + w + (batch-1) * c * height_col*width_col; |
| | | data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, |
| | | im_row, im_col, c_im, pad); |
| | | } |
| | | } |
| | | } |
| | | data_im += im_size; |
| | | data_col+= channels_col; |
| | | } |
| | | } |
| | | |
| | | //From Berkeley Vision's Caffe! |
| | | //https://github.com/BVLC/caffe/blob/master/LICENSE |
| | | void im2col_cpu(float* data_im, const int batch, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_col) |
| | | { |
| | | int c,h,w,b; |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | if (pad){ |
| | | height_col = 1 + (height-1) / stride; |
| | | width_col = 1 + (width-1) / stride; |
| | | pad = ksize/2; |
| | | } |
| | | int channels_col = channels * ksize * ksize; |
| | | int im_size = height*width*channels; |
| | | int col_size = height_col*width_col*channels_col; |
| | | for (b = 0; b < batch; ++b) { |
| | | for (c = 0; c < channels_col; ++c) { |
| | |
| | | for (w = 0; w < width_col; ++w) { |
| | | int im_row = h_offset + h * stride; |
| | | int im_col = w_offset + w * stride; |
| | | int col_index = (c * height_col + h) * width_col + w; |
| | | int col_index = (c * height_col + h) * width_col + w + b*col_size; |
| | | data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, |
| | | im_row, im_col, c_im, pad); |
| | | b, im_row, im_col, c_im, pad); |
| | | } |
| | | } |
| | | } |
| | | data_im += im_size; |
| | | data_col += col_size; |
| | | } |
| | | } |
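Note the two output-size regimes in both im2col variants: without padding it is the usual "valid" size (height - ksize)/stride + 1, and with pad set it becomes 1 + (height-1)/stride, which for stride 1 keeps the output the same size as the input (pad is then forced to ksize/2). A quick check of both branches:

    #include <stdio.h>

    int main()
    {
        int height = 4, ksize = 3, stride = 1;
        int valid = (height - ksize)/stride + 1;  /* 2 */
        int same  = 1 + (height - 1)/stride;      /* 4, with pad = ksize/2 = 1 */
        printf("valid: %d, same: %d\n", valid, same);
        return 0;
    }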
| | | |
| | |
| | | } |
| | | |
| | | |
| | | void im2col_ongpu(cl_mem data_im, const int batch, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, cl_mem data_col) |
| | | void im2col_ongpu(cl_mem data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, cl_mem data_col) |
| | | { |
| | | cl_setup(); |
| | | cl_kernel im2col_kernel = get_im2col_kernel(); |
| | |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(width), (void*) &width); |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(ksize), (void*) &ksize); |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(stride), (void*) &stride); |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(pad), (void*) &pad); |
| | | cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(data_col), (void*) &data_col); |
| | | check_error(cl); |
| | | |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | int channels_col = channels * ksize * ksize; |
| | | if (pad){ |
| | | height_col = 1 + (height-1) / stride; |
| | | width_col = 1 + (width-1) / stride; |
| | | } |
| | | |
| | | size_t global_size[2]; |
| | | size_t local_size[2]; |
| | | global_size[0] = batch; |
| | | global_size[1] = channels_col; |
| | | local_size[0] = height_col; |
| | | local_size[1] = width_col; |
| | | global_size[0] = batch*channels_col; |
| | | global_size[1] = height_col*width_col; |
| | | |
| | | clEnqueueNDRangeKernel(queue, im2col_kernel, 2, 0, |
| | | global_size, local_size, 0, 0, 0); |
| | | global_size, 0, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | void im2col_gpu(float *data_im, |
| | | const int batch, const int channels, const int height, const int width, |
| | | const int ksize, const int stride, |
| | | float *data_col) |
| | | void im2col_gpu(float *data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float *data_col) |
| | | { |
| | | cl_setup(); |
| | | cl_context context = cl.context; |
| | |
| | | check_error(cl); |
| | | |
| | | im2col_ongpu(im_gpu, batch, channels, height, width, |
| | | ksize, stride, col_gpu); |
| | | ksize, stride, pad, col_gpu); |
| | | |
| | | clEnqueueReadBuffer(queue, col_gpu, CL_TRUE, 0, size, data_col, 0, 0, 0); |
| | | check_error(cl); |
| | |
| | | |
| | | __kernel void im2col(__global float *data_im, const int im_offset, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, __global float *data_col, const int col_offset) |
| | | float im2col_get_pixel(__global float *im, int height, int width, int channels, |
| | | int batch, int row, int col, int channel, int pad) |
| | | { |
| | | int b = get_global_id(0); |
| | | int c = get_global_id(1); |
| | | row -= pad; |
| | | col -= pad; |
| | | |
| | | int h = get_local_id(0); |
| | | int w = get_local_id(1); |
| | | if (row < 0 || col < 0 || row >= height || col >= width) return 0; |
| | | int index = col + width*(row + height*(channel+batch*channels)); |
| | | return im[index]; |
| | | } |
| | | |
| | | __kernel void im2col(__global float *data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, __global float *data_col) |
| | | { |
| | | int c,h,w,b; |
| | | int height_col = (height - ksize) / stride + 1; |
| | | int width_col = (width - ksize) / stride + 1; |
| | | if (pad){ |
| | | height_col = 1 + (height-1) / stride; |
| | | width_col = 1 + (width-1) / stride; |
| | | pad = ksize/2; |
| | | } |
| | | int gid1 = get_global_id(0); |
| | | b = gid1%batch; |
| | | c = gid1/batch; |
| | | |
| | | int gid2 = get_global_id(1); |
| | | h = gid2%height_col; |
| | | w = gid2/height_col; |
| | | |
| | | |
| | | int channels_col = channels * ksize * ksize; |
| | | |
| | | int im_offset = height*width*channels*b; |
| | | int col_offset = height_col*width_col*channels_col*b; |
| | | |
| | | int col_size = height_col*width_col*channels_col; |
| | | int w_offset = c % ksize; |
| | | int h_offset = (c / ksize) % ksize; |
| | | int c_im = c / ksize / ksize; |
| | | |
| | | data_col[(c * height_col + h) * width_col + w + col_offset] = |
| | | data_im[(c_im * height + h * stride + h_offset) * width |
| | | + w * stride + w_offset + im_offset]; |
| | | int im_row = h_offset + h * stride; |
| | | int im_col = w_offset + w * stride; |
| | | int col_index = (c * height_col + h) * width_col + w + b*col_size; |
| | | data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, b, im_row, im_col, c_im, pad); |
| | | } |
| | |
| | | void time_random_matrix(int TA, int TB, int m, int k, int n); |
| | | |
| | | #ifdef GPU |
| | | void im2col_ongpu(cl_mem data_im, const int batch, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, cl_mem data_col); |
| | | void im2col_ongpu(cl_mem data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, cl_mem data_col); |
| | | |
| | | void im2col_gpu(float *data_im, |
| | | const int batch, const int channels, const int height, const int width, |
| | | const int ksize, const int stride, float *data_col); |
| | | void col2im_ongpu(cl_mem data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, cl_mem data_im); |
| | | |
| | | void im2col_gpu(float *data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float *data_col); |
| | | |
| | | void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int lda, |
| | |
| | | cl_mem C_gpu, int ldc); |
| | | #endif |
| | | |
| | | void im2col_cpu(float* data_im, const int batch, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_col); |
| | | void im2col_cpu(float* data_im, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float* data_col); |
| | | |
| | | void col2im_cpu(float* data_col, |
| | | const int channels, const int height, const int width, |
| | | const int ksize, const int stride, int pad, float* data_im); |
| | | void col2im_cpu(float* data_col, int batch, |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float* data_im); |
| | | |
| | | void test_blas(); |
| | | |
| | | void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, |
| | |
| | | } |
| | | |
| | | #ifdef GPU |
| | | void forward_network(network net, float *input, int train) |
| | | void forward_network_gpu(network net, cl_mem input_cl, int train) |
| | | { |
| | | cl_setup(); |
| | | size_t size = get_network_input_size(net); |
| | | if(!net.input_cl){ |
| | | net.input_cl = clCreateBuffer(cl.context, |
| | | CL_MEM_READ_WRITE, size*sizeof(float), 0, &cl.error); |
| | | check_error(cl); |
| | | } |
| | | cl_write_array(net.input_cl, input, size); |
| | | cl_mem input_cl = net.input_cl; |
| | | int i; |
| | | for(i = 0; i < net.n; ++i){ |
| | | if(net.types[i] == CONVOLUTIONAL){ |
| | | convolutional_layer layer = *(convolutional_layer *)net.layers[i]; |
| | | forward_convolutional_layer_gpu(layer, input_cl); |
| | | input_cl = layer.output_cl; |
| | | input = layer.output; |
| | | } |
| | | /* |
| | | else if(net.types[i] == CONNECTED){ |
| | | connected_layer layer = *(connected_layer *)net.layers[i]; |
| | | forward_connected_layer(layer, input, train); |
| | |
| | | forward_normalization_layer(layer, input); |
| | | input = layer.output; |
| | | } |
| | | */ |
| | | } |
| | | } |
| | | |
| | | #else |
| | | #endif |
| | | |
| | | void forward_network(network net, float *input, int train) |
| | | { |
| | |
| | | } |
| | | } |
| | | } |
| | | #endif |
| | | |
| | | void update_network(network net) |
| | | { |
| | |
| | | float *X = calloc(batch*d.X.cols, sizeof(float)); |
| | | float *y = calloc(batch*d.y.cols, sizeof(float)); |
| | | |
| | | int i,j; |
| | | int i; |
| | | float sum = 0; |
| | | int index = 0; |
| | | for(i = 0; i < n; ++i){ |
| | | for(j = 0; j < batch; ++j){ |
| | | index = rand()%d.X.rows; |
| | | memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float)); |
| | | memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float)); |
| | | } |
| | | |
| | | get_batch(d, batch, X, y); |
| | | float err = train_network_datum(net, X, y); |
| | | sum += err; |
| | | //train_network_datum(net, X, y); |
| | | /* |
| | | float *y = d.y.vals[index]; |
| | | int class = get_predicted_class_network(net); |
| | | correct += (y[class]?1:0); |
| | | */ |
| | | |
| | | /* |
| | | for(j = 0; j < d.y.cols*batch; ++j){ |
| | | printf("%6.3f ", y[j]); |
| | | } |
| | | printf("\n"); |
| | | for(j = 0; j < d.y.cols*batch; ++j){ |
| | | printf("%6.3f ", get_network_output(net)[j]); |
| | | } |
| | | printf("\n"); |
| | | printf("\n"); |
| | | */ |
| | | |
| | | |
| | | //printf("%d %f %f\n", i,net.output[0], d.y.vals[index][0]); |
| | | //if((i+1)%10 == 0){ |
| | | // printf("%d: %f\n", (i+1), (float)correct/(i+1)); |
| | | //} |
| | | } |
| | | //printf("Accuracy: %f\n",(float) correct/n); |
| | | //show_image(float_to_image(32,32,3,X), "Orig"); |
| | | free(X); |
| | | free(y); |
| | | return (float)sum/(n*batch); |
| | |
| | | #endif |
| | | } network; |
| | | |
| | | #ifdef GPU |
| | | void forward_network_gpu(network net, cl_mem input, int train); |
| | | #endif |
| | | |
| | | network make_network(int n, int batch); |
| | | void forward_network(network net, float *input, int train); |
| | | float backward_network(network net, float *input, float *truth); |
| | |
| | | |
| | | void check_error(cl_info info) |
| | | { |
| | | clFinish(cl.queue); |
| | | if (info.error != CL_SUCCESS) { |
| | | printf("\n Error number %d", info.error); |
| | | exit(1); |
| | |
| | | // Fetch the Platform and Device IDs; we only want one. |
| | | cl_device_id devices[MAX_DEVICES]; |
| | | info.error=clGetPlatformIDs(1, &info.platform, &num_platforms); |
| | | |
| | | printf("=== %d OpenCL platform(s) found: ===\n", num_platforms); |
| | | char buffer[10240]; |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_PROFILE, 10240, buffer, NULL); |
| | | printf(" PROFILE = %s\n", buffer); |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_VERSION, 10240, buffer, NULL); |
| | | printf(" VERSION = %s\n", buffer); |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_NAME, 10240, buffer, NULL); |
| | | printf(" NAME = %s\n", buffer); |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_VENDOR, 10240, buffer, NULL); |
| | | printf(" VENDOR = %s\n", buffer); |
| | | clGetPlatformInfo(info.platform, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); |
| | | printf(" EXTENSIONS = %s\n", buffer); |
| | | |
| | | check_error(info); |
| | | info.error=clGetDeviceIDs(info.platform, CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &num_devices); |
| | | if(num_devices > MAX_DEVICES) num_devices = MAX_DEVICES; |
| | | printf("=== %d OpenCL device(s) found on platform:\n", num_devices); |
| | | int i; |
| | | for (i=0; i<num_devices; i++) |
| | | { |
| | | char buffer[10240]; |
| | | cl_uint buf_uint; |
| | | cl_ulong buf_ulong; |
| | | printf(" -- %d --\n", i); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL); |
| | | printf(" DEVICE_NAME = %s\n", buffer); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL); |
| | | printf(" DEVICE_VENDOR = %s\n", buffer); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL); |
| | | printf(" DEVICE_VERSION = %s\n", buffer); |
| | | clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL); |
| | | printf(" DRIVER_VERSION = %s\n", buffer); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL); |
| | | printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL); |
| | | printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL); |
| | | printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(buf_ulong), &buf_ulong, NULL); |
| | | printf(" DEVICE_MAX_WORK_GROUP_SIZE = %llu\n", (unsigned long long)buf_ulong); |
| | | cl_uint items; |
| | | clGetDeviceInfo( devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), |
| | | &items, NULL); |
| | | printf(" DEVICE_MAX_WORK_ITEM_DIMENSIONS = %u\n", (unsigned int)items); |
| | | size_t workitem_size[10]; |
| | | clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, 10*sizeof(workitem_size), workitem_size, NULL); |
| | | printf(" DEVICE_MAX_WORK_ITEM_SIZES = %u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]); |
| | | |
| | | } |
| | | int index = getpid()%num_devices; |
| | | printf("%d rand, %d devices, %d index\n", getpid(), num_devices, index); |
| | | //info.device = devices[index]; |
| | | info.device = devices[1]; |
| | | info.device = devices[0]; |
| | | fprintf(stderr, "Found %d device(s)\n", num_devices); |
| | | check_error(info); |
| | | |
| | |
| | | cl_program cl_fprog(char *filename, char *options, cl_info info) |
| | | { |
| | | size_t srcsize; |
| | | char src[8192]; |
| | | memset(src, 0, 8192); |
| | | char src[64*1024]; |
| | | memset(src, 0, 64*1024); |
| | | FILE *fil=fopen(filename,"r"); |
| | | srcsize=fread(src, 1, sizeof(src)-1, fil); |
| | | fclose(fil); |
| | |
| | | // Submit the source code of the example kernel to OpenCL |
| | | cl_program prog=clCreateProgramWithSource(info.context,1, srcptr, &srcsize, &info.error); |
| | | check_error(info); |
| | | char build_c[4096]; |
| | | char build_c[1024*64]; |
| | | // and compile it (after this we could extract the compiled version) |
| | | info.error=clBuildProgram(prog, 0, 0, options, 0, 0); |
| | | if ( info.error != CL_SUCCESS ) { |
| | | fprintf(stderr, "Error Building Program: %d\n", info.error); |
| | | clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 4096, build_c, 0); |
| | | clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 1024*64, build_c, 0); |
| | | fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c); |
| | | } |
| | | check_error(info); |
| | |
| | | cl_buffer_region r; |
| | | r.origin = offset*sizeof(float); |
| | | r.size = size*sizeof(float); |
| | | cl_mem sub = clCreateSubBuffer(src, CL_MEM_USE_HOST_PTR, CL_BUFFER_CREATE_TYPE_REGION, &r, 0); |
| | | cl_mem sub = clCreateSubBuffer(src, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &r, &cl.error); |
| | | check_error(cl); |
| | | return sub; |
| | | } |
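One portability caveat for cl_sub_array, not enforced by this commit: clCreateSubBuffer requires the region origin to be a multiple of the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN (reported in bits), otherwise it fails with CL_MISALIGNED_SUB_BUFFER_OFFSET. A sketch of the check, assuming the cl_info struct exposes the chosen device as cl.device:

    cl_uint align_bits;
    clGetDeviceInfo(cl.device, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
            sizeof(align_bits), &align_bits, 0);
    size_t origin = offset*sizeof(float);
    if(origin % (align_bits/8)){
        fprintf(stderr, "sub-buffer origin %zu not %u-byte aligned\n",
                origin, align_bits/8);
    }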
| | | |