18 files modified
1 file added
CC=gcc
GPU=1
COMMON=-Wall -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/ -I/usr/local/clblas/include/
ifeq ($(GPU), 1)
COMMON+=-DGPU
else
...
else
OPTS+= -march=native
ifeq ($(GPU), 1)
LDFLAGS= -lOpenCL -lclBLAS
endif
endif
CFLAGS= $(COMMON) $(OPTS)
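# Note: building with GPU=1 adds -DGPU to COMMON and links OpenCL and clBLAS.
# The include paths above assume OpenCL headers from a CUDA install and a
# clBLAS build installed under /usr/local/clblas; adjust both for your system.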
...

#include "mini_blas.h"

/* y[i*INCY] += alpha * x[i*INCX] */
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int i;
    for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX];
}

/* x[i*INCX] *= alpha */
void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    int i;
    for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA;
}

/* y[i*INCY] = x[i*INCX] */
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    int i;
    for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];
}

/* strided dot product; the hunk was cut here, body restored to match the siblings above */
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    int i;
    float dot = 0;
    for(i = 0; i < N; ++i) dot += X[i*INCX]*Y[i*INCY];
    return dot;
}

...
srand(2222222);
int i = 0;
char *labels[] = {"cat","dog"};
clock_t time;
while(1){
    i += 1000;
    time=clock();
    data train = load_data_image_pathfile_random("data/assira/train.list", imgs*net.batch, labels, 2, 256, 256);
    normalize_data_rows(train);
    printf("Loaded: %lf seconds\n", sec(clock()-time));
    time=clock();
    float loss = train_network_sgd(net, train, imgs);
    printf("%d: %f, Time: %lf seconds\n", i, loss, sec(clock()-time));
    free_data(train);
    if(i%10000==0){
        char buff[256];
...
    }
}

void train_imagenet()
{
    network net = parse_network_cfg("cfg/imagenet_backup_710.cfg");
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = 1000/net.batch+1;
    //imgs=1;
    srand(888888);
    int i = 0;
    char **labels = get_labels("/home/pjreddie/data/imagenet/cls.labels.list");
    list *plist = get_paths("/home/pjreddie/data/imagenet/cls.cropped.list");
    char **paths = (char **)list_to_array(plist);
    clock_t time;
    while(1){
        i += 1;
        time=clock();
        data train = load_data_random(imgs*net.batch, paths, plist->size, labels, 1000, 256, 256);
        normalize_data_rows(train);
        printf("Loaded: %lf seconds\n", sec(clock()-time));
        time=clock();
#ifdef GPU
        float loss = train_network_sgd_gpu(net, train, imgs);
        printf("%d: %f, %lf seconds, %d images\n", i, loss, sec(clock()-time), i*imgs*net.batch);
#endif
        free_data(train);
        if(i%10==0){
            char buff[256];
            sprintf(buff, "/home/pjreddie/imagenet_backup/imagenet_backup_%d.cfg", i);
            save_network(net, buff);
        }
    }
}

void test_imagenet()
{
    network net = parse_network_cfg("cfg/imagenet_test.cfg");
    //imgs=1;
    srand(2222222);
    int i = 0;
    char **names = get_labels("cfg/shortnames.txt");
    clock_t time;
    char filename[256];
    int indexes[10];
    while(1){
        fgets(filename, 256, stdin);  /* gets() is unsafe and unbounded; read at most 256 bytes */
        strtok(filename, "\n");       /* strip the trailing newline */
        image im = load_image_color(filename, 256, 256);
        normalize_image(im);
        printf("%d %d %d\n", im.h, im.w, im.c);
        float *X = im.data;
        time=clock();
        float *predictions = network_predict(net, X);
        top_predictions(net, 10, indexes);
        printf("%s: Predicted in %f seconds.\n", filename, sec(clock()-time));
        for(i = 0; i < 10; ++i){
            int index = indexes[i];
            printf("%s: %f\n", names[index], predictions[index]);
        }
        free_image(im);
    }
}

void test_visualize()
{
    network net = parse_network_cfg("cfg/assira_backup_740000.cfg");
    srand(2222222);
    visualize_network(net);
    cvWaitKey(0);
...
    for(i = 0; i < total; ++i){
        visualize_network(net);
        cvWaitKey(100);
        data test = load_data_image_pathfile_part("data/assira/test.list", i, total, labels, 2, 256, 256);
        image im = float_to_image(256, 256, 3, test.X.vals[0]);
        show_image(im, "input");
        cvWaitKey(100);
...
    int iters = 10000/net.batch;
    while(++count <= 2000){
        clock_t start = clock(), end;
        float loss = train_network_sgd(net, train, iters);
        end = clock();
        float test_acc = network_accuracy(net, test);
        //float test_acc = 0;
...

int main(int argc, char *argv[])
{
    test_gpu_blas();
    //test_blas();
    //train_assira();
    //test_visualize();
    //test_distribution();
    //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
    //train_imagenet();
    //test_imagenet();
...
    cl_read_array(layer.biases_cl, layer.biases, layer.outputs);
}

void push_connected_layer(connected_layer layer)
{
    cl_write_array(layer.weights_cl, layer.weights, layer.inputs*layer.outputs);
    cl_write_array(layer.biases_cl, layer.biases, layer.outputs);
}

void update_connected_layer_gpu(connected_layer layer)
{
    axpy_ongpu(layer.outputs, layer.learning_rate, layer.bias_updates_cl, 1, layer.biases_cl, 1);
...

void forward_connected_layer_gpu(connected_layer layer, cl_mem input);
void backward_connected_layer_gpu(connected_layer layer, cl_mem input, cl_mem delta);
void update_connected_layer_gpu(connected_layer layer);
void push_connected_layer(connected_layer layer);
#endif

#endif
...
{
    int size = layer.size*layer.size*layer.c*layer.n;
    axpy_cpu(layer.n, layer.learning_rate, layer.bias_updates, 1, layer.biases, 1);
    scal_cpu(layer.n, layer.momentum, layer.bias_updates, 1);

    scal_cpu(size, 1.-layer.learning_rate*layer.decay, layer.filters, 1);
    axpy_cpu(size, layer.learning_rate, layer.filter_updates, 1, layer.filters, 1);
...
    cl_read_array(layer.biases_cl, layer.biases, layer.n);
}

void push_convolutional_layer(convolutional_layer layer)
{
    cl_write_array(layer.filters_cl, layer.filters, layer.c*layer.n*layer.size*layer.size);
    cl_write_array(layer.biases_cl, layer.biases, layer.n);
}

void update_convolutional_layer_gpu(convolutional_layer layer)
{
    int size = layer.size*layer.size*layer.c*layer.n;
...
void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in);
void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl);
void update_convolutional_layer_gpu(convolutional_layer layer);
void push_convolutional_layer(convolutional_layer layer);
#endif

convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay);
...
    d.y = make_matrix(n, k);

    for(i = 0; i < n; ++i){
        image im = load_image_color(paths[i], h, w);
        d.X.vals[i] = im.data;
        d.X.cols = im.h*im.w*im.c;
    }
    for(i = 0; i < n; ++i){
        fill_truth(paths[i], labels, k, d.y.vals[i]);
    }
    return d;
...
    return d;
}

char **get_labels(char *filename)
{
    list *plist = get_paths(filename);
    char **labels = (char **)list_to_array(plist);
    free_list(plist);
    return labels;
}

void free_data(data d)
{
    if(!d.shallow){
...
    return d;
}

data load_data_random(int n, char **paths, int m, char **labels, int k, int h, int w)
{
    char **random_paths = calloc(n, sizeof(char*));
    int i;
    for(i = 0; i < n; ++i){
        int index = rand()%m;
        random_paths[i] = paths[index];
        if(i == 0) printf("%s\n", paths[index]);
    }
    data d = load_data_image_paths(random_paths, n, labels, k, h, w);
    free(random_paths);
    return d;
}
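
/* Note: paths are sampled uniformly with replacement via rand()%m, so an
 * image can appear more than once in a batch (and rand()%m carries a slight
 * modulo bias whenever m does not evenly divide RAND_MAX+1). */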

data load_data_image_pathfile_random(char *filename, int n, char **labels, int k, int h, int w)
{
    int i;
...

void free_data(data d);
data load_data_random(int n, char **paths, int m, char **labels, int k, int h, int w);
data load_data_image_pathfile(char *filename, char **labels, int k, int h, int w);
data load_data_image_pathfile_part(char *filename, int part, int total,
        char **labels, int k, int h, int w);
...
data load_cifar10_data(char *filename);
data load_all_cifar10();
list *get_paths(char *filename);
char **get_labels(char *filename);
void get_batch(data d, int n, float *X, float *y);
data load_categorical_data_csv(char *filename, int target, int k);
void normalize_data_rows(data d);
...
#include "mini_blas.h"
#include <clBLAS.h>
#include "utils.h"

void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
...

#include "opencl.h"
#include <math.h>
#include <clBLAS.h>

#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
...
#ifdef __APPLE__
#define BLOCK 1
#else
#define BLOCK 16
#endif

cl_kernel get_gemm_kernel()
...
    return gemm_kernel;
}

cl_kernel get_gemm_nt_kernel()
{
    static int init = 0;
    static cl_kernel gemm_kernel;
    if(!init){
        gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_nt", "-D BLOCK=" STR(BLOCK) );
        init = 1;
    }
    return gemm_kernel;
}

cl_kernel get_gemm_tn_kernel()
{
    static int init = 0;
    static cl_kernel gemm_kernel;
    if(!init){
        gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_tn", "-D BLOCK=" STR(BLOCK) );
        init = 1;
    }
    return gemm_kernel;
}

cl_kernel get_gemm_nn_kernel()
{
    static int init = 0;
    static cl_kernel gemm_kernel;
    if(!init){
        gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_nn", "-D BLOCK=" STR(BLOCK) );
        init = 1;
    }
    return gemm_kernel;
}
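
/* Each getter compiles its kernel once and caches it in a static variable,
 * so repeated gemm calls reuse the compiled program. The lazy init is not
 * thread-safe, which is fine as long as gemm is driven from one thread. */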

void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA,
        cl_mem A_gpu, int lda,
        cl_mem B_gpu, int ldb,
        float BETA,
        cl_mem C_gpu, int ldc);
void gemm_ongpu_old(int TA, int TB, int M, int N, int K, float ALPHA,
        cl_mem A_gpu, int lda,
        cl_mem B_gpu, int ldb,
...
        float BETA,
        cl_mem C_gpu, int ldc)
{
    /*
    cl_setup();
    //cl.error = clblasSgemm(clblasRowMajor, TA?clblasTrans:clblasNoTrans, TB?clblasTrans:clblasNoTrans,M, N, K,ALPHA, A_gpu, 0, lda,B_gpu, 0, ldb,BETA, C_gpu, 0, ldc,1, &queue, 0, NULL, &event);
    //check_error(cl);
    gemm_ongpu_old(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
    cl_command_queue queue = cl.queue;
    cl_event event;
    cl.error = clblasSgemm(clblasRowMajor, TA?clblasTrans:clblasNoTrans, TB?clblasTrans:clblasNoTrans,M, N, K,ALPHA, A_gpu, 0, lda,B_gpu, 0, ldb,BETA, C_gpu, 0, ldc,1, &queue, 0, NULL, &event);
    */

    gemm_ongpu_new(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
}

void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA,
        cl_mem A_gpu, int lda,
        cl_mem B_gpu, int ldb,
        float BETA,
        cl_mem C_gpu, int ldc)
{
    //printf("gpu: %d %d %d %d %d\n",TA, TB, M, N, K);
    cl_setup();
    cl_kernel gemm_kernel = get_gemm_kernel();
    if(!TA && !TB) gemm_kernel = get_gemm_nn_kernel();
    if(!TA && TB) gemm_kernel = get_gemm_nt_kernel();
    if(TA && !TB) gemm_kernel = get_gemm_tn_kernel();
    cl_command_queue queue = cl.queue;

    cl_uint i = 0;
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu);
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc);
    check_error(cl);

    const size_t global_size[] = {ceil((float)N/BLOCK)*BLOCK, ceil((float)M/BLOCK)*BLOCK};
    const size_t local_size[] = {BLOCK, BLOCK};

    clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0);
    check_error(cl);
}
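
/* The global work size is rounded up to a multiple of BLOCK in both
 * dimensions, so every work-group is a full BLOCK x BLOCK tile; the kernels
 * clamp out-of-range rows and columns instead of masking them, so edge
 * elements may be computed redundantly (with identical results). */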

void gemm_ongpu_old(int TA, int TB, int M, int N, int K, float ALPHA,
...
    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc);
    check_error(cl);

    const size_t global_size[] = {ceil((float)N/BLOCK)*BLOCK, ceil((float)M/BLOCK)*BLOCK};
    const size_t local_size[] = {BLOCK, BLOCK};

    clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0);
...
    float *c = random_matrix(m,n);
    int i;
    clock_t start = clock(), end;
    for(i = 0; i<32; ++i){
        gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
    }
    end = clock();
...
    free(c);
}

void time_ongpu(int TA, int TB, int m, int k, int n)
{
    int iter = 100;
    float *a = random_matrix(m,k);
    float *b = random_matrix(k,n);

    int lda = (!TA)?k:m;
    int ldb = (!TB)?n:k;

    float *c = random_matrix(m,n);

    cl_mem a_cl = cl_make_array(a, m*k);
    cl_mem b_cl = cl_make_array(b, k*n);
    cl_mem c_cl = cl_make_array(c, m*n);

    int i;
    clock_t start = clock(), end;
    for(i = 0; i<iter; ++i){
        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
    }
    long flop = (long)m*n*(2*k+3)*iter;   /* was int, which overflows at these sizes */
    float gflop = flop/pow(10., 9);
    end = clock();
    float seconds = sec(end-start);
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
    clReleaseMemObject(a_cl);
    clReleaseMemObject(b_cl);
    clReleaseMemObject(c_cl);
    free(a);
    free(b);
    free(c);
}
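
/* Two timing caveats: clock() measures CPU time, and clEnqueueNDRangeKernel
 * is asynchronous, so unless gemm_ongpu blocks internally a clFinish on the
 * queue before end = clock() would give a more faithful measurement. The
 * flop count m*n*(2k+3) is 2k multiply-adds per output element plus the
 * ALPHA/BETA scaling. */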

void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
{
    srand(0);
...
        //printf("%f %f\n", c[i], c_gpu[i]);
        sse += pow(c[i]-c_gpu[i], 2);
    }
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g MSE\n",m,k,k,n, TA, TB, sse/(m*n));   /* sse/(m*n) is a mean, so it is labeled MSE */
    free(a);
    free(b);
    free(c);
    free(c_gpu);
}

void test_gpu_blas()
{
    /*
    test_gpu_accuracy(0,0,10,576,75);

    test_gpu_accuracy(0,0,17,10,10);
...
    test_gpu_accuracy(1,0,1000,10,100);
    test_gpu_accuracy(0,1,1000,10,100);
    test_gpu_accuracy(1,1,1000,10,100);
    */
    test_gpu_accuracy(0,0,131,4093,1199);
    test_gpu_accuracy(0,1,131,4093,1199);
    test_gpu_accuracy(1,0,131,4093,1199);
    test_gpu_accuracy(1,1,131,4093,1199);

    time_ongpu(0,0,1024,1024,1024);
    time_ongpu(0,1,1024,1024,1024);
    time_ongpu(1,0,1024,1024,1024);
    time_ongpu(1,1,1024,1024,1024);

    time_ongpu(0,0,128,4096,1200);
    time_ongpu(0,1,128,4096,1200);
    time_ongpu(1,0,128,4096,1200);
    time_ongpu(1,1,128,4096,1200);

    /*
    time_gpu_random_matrix(0,0,1000,1000,100);
...

    float val = 0;

    int row_block = get_group_id(1);
    int col_block = get_group_id(0);

    int sub_row = get_local_id(1);
    int sub_col = get_local_id(0);

    int row = row_block*BLOCK + sub_row;
    int col = col_block*BLOCK + sub_col;

New file: src/gemm_new.cl

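/* The three kernels below share one tiling scheme: each BLOCK x BLOCK
 * work-group stages tiles of A and B in __local memory, synchronizes with
 * barrier(CLK_LOCAL_MEM_FENCE), and accumulates BLOCK partial products per
 * pass over K. Out-of-range indices are clamped rather than masked so the
 * local loads stay uniform across the work-group. */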
__kernel void gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA,
        __global float *A, int lda,
        __global float *B, int ldb,
        float BETA,
        __global float *C, int ldc)
{
    __local float Asub[BLOCK][BLOCK];
    __local float Bsub[BLOCK][BLOCK];

    int col = get_global_id(0);
    int row = get_global_id(1);

    int col_block = get_group_id(0);
    int row_block = get_group_id(1);

    col = (col < N) ? col : N - 1;
    row = (row < M) ? row : M - 1;

    int x = get_local_id(0);
    int y = get_local_id(1);

    int i,j;

    float val = 0;
    float orig = C[row*ldc + col];

    for(i = 0; i < K; i += BLOCK){

        int arow = y + i;
        int acol = x + row_block*BLOCK;

        int brow = y + i;
        int bcol = col;

        arow = (arow < K) ? arow : K-1;
        acol = (acol < M) ? acol : M-1;
        brow = (brow < K) ? brow : K-1;

        int aind = arow*lda + acol;
        int bind = brow*ldb + bcol;

        Asub[x][y] = A[aind];
        Bsub[y][x] = B[bind];

        barrier(CLK_LOCAL_MEM_FENCE);

        for(j = 0; j < BLOCK && i+j<K; ++j){
            val += Asub[y][j]*Bsub[j][x];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    C[row*ldc+col] = ALPHA*val + BETA*orig;
}

__kernel void gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA,
        __global float *A, int lda,
        __global float *B, int ldb,
        float BETA,
        __global float *C, int ldc)
{
    __local float Asub[BLOCK][BLOCK];
    __local float Bsub[BLOCK][BLOCK];

    int col = get_global_id(0);
    int row = get_global_id(1);

    int col_block = get_group_id(0);
    int row_block = get_group_id(1);

    col = (col < N) ? col : N - 1;
    row = (row < M) ? row : M - 1;

    int x = get_local_id(0);
    int y = get_local_id(1);

    int i,j;

    float val = 0;
    float orig = C[row*ldc + col];

    for(i = 0; i < K; i += BLOCK){

        int arow = row;
        int acol = x + i;

        int brow = col_block*BLOCK + y;
        int bcol = x + i;

        brow = (brow < N) ? brow : N-1;
        acol = (acol < K) ? acol : K-1;
        bcol = (bcol < K) ? bcol : K-1;

        int aind = arow*lda + acol;
        int bind = brow*ldb + bcol;

        Asub[y][x] = A[aind];
        Bsub[x][y] = B[bind];

        barrier(CLK_LOCAL_MEM_FENCE);

        for(j = 0; j < BLOCK && i+j<K; ++j){
            val += Asub[y][j]*Bsub[j][x];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    C[row*ldc+col] = ALPHA*val + BETA*orig;
}

__kernel void gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA,
        __global float *A, int lda,
        __global float *B, int ldb,
        float BETA,
        __global float *C, int ldc)
{
    __local float Asub[BLOCK][BLOCK];
    __local float Bsub[BLOCK][BLOCK];

    int col = get_global_id(0);
    int row = get_global_id(1);

    col = (col < N) ? col : N - 1;
    row = (row < M) ? row : M - 1;

    int x = get_local_id(0);
    int y = get_local_id(1);

    int i,j;

    float orig = C[row*ldc+col];
    float val = 0;

    for(i = 0; i < K; i += BLOCK){

        int arow = row;
        int acol = x + i;

        int brow = y + i;
        int bcol = col;

        acol = (acol < K) ? acol : K-1;
        brow = (brow < K) ? brow : K-1;

        int aind = arow*lda + acol;
        int bind = brow*ldb + bcol;

        Asub[y][x] = A[aind];
        Bsub[y][x] = B[bind];

        barrier(CLK_LOCAL_MEM_FENCE);

        for(j = 0; j < BLOCK && i+j<K; ++j){
            val += Asub[y][j]*Bsub[j][x];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    C[row*ldc+col] = ALPHA*val + BETA*orig;
}
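
/* Only the NN, NT, and TN cases get dedicated kernels in this file;
 * gemm_ongpu_new falls back to the generic kernel from get_gemm_kernel()
 * when both TA and TB are set. */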

...
        // Will do a scaled image resize with the correct aspect ratio.
        outImg = resizeImage(croppedImg, newHeight, newWidth, 0);
        cvReleaseImage( &croppedImg );

    }
    else {
...
    return out;
}

image load_image_color(char *filename, int h, int w)
{
    IplImage* src = 0;
    if( (src = cvLoadImage(filename, 1)) == 0 )
    {
        printf("Cannot load image file %s\n", filename);
        exit(0);
    }
    if(h && w && (src->height != h || src->width != w)){
        printf("Resized!\n");
        IplImage *resized = resizeImage(src, h, w, 1);
        cvReleaseImage(&src);
        src = resized;
    }
    image out = ipl_to_image(src);
    cvReleaseImage(&src);
    return out;
}
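
/* cvLoadImage(filename, 1) requests a 3-channel color load regardless of the
 * file's depth, so load_image_color always yields c == 3; the resize branch
 * only fires when explicit target dimensions are given and differ from the
 * source image. */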

image load_image(char *filename, int h, int w)
{
    IplImage* src = 0;
...
image float_to_image(int h, int w, int c, float *data);
image copy_image(image p);
image load_image(char *filename, int h, int w);
image load_image_color(char *filename, int h, int w);
image ipl_to_image(IplImage* src);

float get_pixel(image m, int x, int y, int c);
...
        float *B, int ldb,
        float BETA,
        float *C, int ldc);
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
void scal_cpu(int N, float ALPHA, float *X, int INCX);
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
void test_gpu_blas();
...
    image *prev = 0;
    int i;
    char buff[256];
    //show_image(get_network_image_layer(net, 0), "Crop");
    for(i = 0; i < net.n; ++i){
        sprintf(buff, "Layer %d", i);
        if(net.types[i] == CONVOLUTIONAL){
...
    }
}

void top_predictions(network net, int n, int *index)
{
    int i,j;
    int k = get_network_output_size(net);
    float *out = get_network_output(net);
    float thresh = FLT_MAX;
    for(i = 0; i < n; ++i){
        float max = -FLT_MAX;
        int max_i = -1;
        for(j = 0; j < k; ++j){
            float val = out[j];
            if(val > max && val < thresh){
                max = val;
                max_i = j;
            }
        }
        index[i] = max_i;
        thresh = max;
    }
}
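
/* top_predictions makes n passes over the k network outputs, each pass
 * taking the largest value strictly below the previous pass's maximum: a
 * simple O(n*k) top-n selection. Ties are skipped by the strict comparison,
 * so duplicate output values yield only one index. */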

float *network_predict(network net, float *input)
{
    forward_network(net, input, 0, 0);
...
float train_network_batch(network net, data d, int n);
void train_network(network net, data d);
matrix network_predict_data(network net, data test);
float *network_predict(network net, float *input);
float network_accuracy(network net, data d);
float network_accuracy_multi(network net, data d, int n);
void top_predictions(network net, int n, int *index);
float *get_network_output(network net);
float *get_network_output_layer(network net, int i);
float *get_network_delta_layer(network net, int i);
...
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <clBLAS.h>

#include "opencl.h"
#include "utils.h"
...

}
int index = getpid()%num_devices;
index = 1;
printf("%d rand, %d devices, %d index\n", getpid(), num_devices, index);
info.device = devices[index];
fprintf(stderr, "Found %d device(s)\n", num_devices);
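/* Hard-coding index = 1 overrides the getpid()-based pick above and pins all
 * work to the second OpenCL device; this reads like a temporary debugging
 * override rather than intended device-selection logic. */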
...
        info.queues[i] = clCreateCommandQueue(info.context, info.device, 0, &info.error);
        check_error(info);
    }
    info.error = clblasSetup();
    check_error(info);
    info.initialized = 1;
    return info;
...

convolutional_layer *parse_convolutional(list *options, network *net, int count)
{
    int i;
    int h,w,c;
    float learning_rate, momentum, decay;
    int n = option_find_int(options, "filters",1);
...
        if(h == 0) error("Layer before convolutional layer must output image.");
    }
    convolutional_layer *layer = make_convolutional_layer(net->batch,h,w,c,n,size,stride,pad,activation,learning_rate,momentum,decay);
    char *data = option_find_str(options, "data", 0);
    if(data){
        char *curr = data;
        char *next = data;
        for(i = 0; i < n; ++i){
            while(*++next !='\0' && *next != ',');
            *next = '\0';
            sscanf(curr, "%g", &layer->biases[i]);
            curr = next+1;
        }
        for(i = 0; i < c*n*size*size; ++i){
            while(*++next !='\0' && *next != ',');
            *next = '\0';
            sscanf(curr, "%g", &layer->filters[i]);
            curr = next+1;
        }
    }
    char *weights = option_find_str(options, "weights", 0);
    char *biases = option_find_str(options, "biases", 0);
    parse_data(weights, layer->filters, c*n*size*size);
    parse_data(biases, layer->biases, n);
#ifdef GPU
    push_convolutional_layer(*layer);
#endif
    option_unused(options);
    return layer;
}
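
/* The new "weights" and "biases" options replace the older inline "data"
 * blob: parse_data (a helper not shown in this diff, presumably walking a
 * comma-separated float list like the loops above) fills the arrays, and on
 * GPU builds the freshly loaded layer is pushed to device memory. */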

connected_layer *parse_connected(list *options, network *net, int count)
{
    int i;
    int input;
    float learning_rate, momentum, decay;
    int output = option_find_int(options, "output",1);
...
        input = get_network_output_size_layer(*net, count-1);
    }
    connected_layer *layer = make_connected_layer(net->batch, input, output, activation,learning_rate,momentum,decay);
    char *data = option_find_str(options, "data", 0);
    if(data){
        char *curr = data;
        char *next = data;
        for(i = 0; i < output; ++i){
            while(*++next !='\0' && *next != ',');
            *next = '\0';
            sscanf(curr, "%g", &layer->biases[i]);
            curr = next+1;
        }
        for(i = 0; i < input*output; ++i){
            while(*++next !='\0' && *next != ',');
            *next = '\0';
            sscanf(curr, "%g", &layer->weights[i]);
            curr = next+1;
        }
    }
    char *weights = option_find_str(options, "weights", 0);
    char *biases = option_find_str(options, "biases", 0);
    parse_data(biases, layer->biases, output);
    parse_data(weights, layer->weights, input*output);
#ifdef GPU
    push_connected_layer(*layer);
#endif
    option_unused(options);
    return layer;
}