getting rid of sub_arrays, nvidia driver memory leak
12 files modified
1 files deleted
| | |
| | | CC=gcc |
| | | GPU=1 |
| | | COMMON=-Wall -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/ -I/usr/local/clblas/include/ |
| | | COMMON=-Wall -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/ |
| | | ifeq ($(GPU), 1) |
| | | COMMON+=-DGPU |
| | | else |
| | |
| | | else |
| | | OPTS+= -march=native |
| | | ifeq ($(GPU), 1) |
| | | LDFLAGS= -lOpenCL -lclBLAS |
| | | LDFLAGS= -lOpenCL |
| | | endif |
| | | endif |
| | | CFLAGS= $(COMMON) $(OPTS) |
| | |
| | | |
| | | void train_imagenet() |
| | | { |
| | | network net = parse_network_cfg("cfg/imagenet_backup_710.cfg"); |
| | | network net = parse_network_cfg("/home/pjreddie/imagenet_backup/imagenet_backup_slower_larger_870.cfg"); |
| | | printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); |
| | | int imgs = 1000/net.batch+1; |
| | | //imgs=1; |
| | | srand(888888); |
| | | srand(986987); |
| | | int i = 0; |
| | | char **labels = get_labels("/home/pjreddie/data/imagenet/cls.labels.list"); |
| | | list *plist = get_paths("/home/pjreddie/data/imagenet/cls.cropped.list"); |
| | | list *plist = get_paths("/data/imagenet/cls.train.list"); |
| | | char **paths = (char **)list_to_array(plist); |
| | | printf("%d\n", plist->size); |
| | | clock_t time; |
| | | while(1){ |
| | | i += 1; |
| | |
| | | printf("Loaded: %lf seconds\n", sec(clock()-time)); |
| | | time=clock(); |
| | | #ifdef GPU |
| | | float loss = train_network_sgd_gpu(net, train, imgs); |
| | | float loss = train_network_data_gpu(net, train, imgs); |
| | | printf("%d: %f, %lf seconds, %d images\n", i, loss, sec(clock()-time), i*imgs*net.batch); |
| | | #endif |
| | | free_data(train); |
| | | if(i%10==0){ |
| | | char buff[256]; |
| | | sprintf(buff, "/home/pjreddie/imagenet_backup/imagenet_backup_%d.cfg", i); |
| | | sprintf(buff, "/home/pjreddie/imagenet_backup/imagenet_backup_larger_%d.cfg", i); |
| | | save_network(net, buff); |
| | | } |
| | | } |
| | | } |
| | | |
| | | void train_imagenet_small() |
| | | { |
| | | network net = parse_network_cfg("cfg/imagenet_small.cfg"); |
| | | printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); |
| | | int imgs=1; |
| | | srand(111222); |
| | | int i = 0; |
| | | char **labels = get_labels("/home/pjreddie/data/imagenet/cls.labels.list"); |
| | | list *plist = get_paths("/data/imagenet/cls.train.list"); |
| | | char **paths = (char **)list_to_array(plist); |
| | | printf("%d\n", plist->size); |
| | | clock_t time; |
| | | |
| | | i += 1; |
| | | time=clock(); |
| | | data train = load_data_random(imgs*net.batch, paths, plist->size, labels, 1000, 256, 256); |
| | | normalize_data_rows(train); |
| | | printf("Loaded: %lf seconds\n", sec(clock()-time)); |
| | | time=clock(); |
| | | #ifdef GPU |
| | | float loss = train_network_data_gpu(net, train, imgs); |
| | | printf("%d: %f, %lf seconds, %d images\n", i, loss, sec(clock()-time), i*imgs*net.batch); |
| | | #endif |
| | | free_data(train); |
| | | char buff[256]; |
| | | sprintf(buff, "/home/pjreddie/imagenet_backup/imagenet_backup_slower_larger_%d.cfg", i); |
| | | save_network(net, buff); |
| | | } |
| | | |
| | | void test_imagenet() |
| | | { |
| | | network net = parse_network_cfg("cfg/imagenet_test.cfg"); |
| | | network net = parse_network_cfg("cfg/imagenet_test.cfg"); |
| | | //imgs=1; |
| | | srand(2222222); |
| | | int i = 0; |
| | | srand(2222222); |
| | | int i = 0; |
| | | char **names = get_labels("cfg/shortnames.txt"); |
| | | clock_t time; |
| | | char filename[256]; |
| | | int indexes[10]; |
| | | while(1){ |
| | | while(1){ |
| | | gets(filename); |
| | | image im = load_image_color(filename, 256, 256); |
| | | normalize_image(im); |
| | |
| | | time=clock(); |
| | | float *predictions = network_predict(net, X); |
| | | top_predictions(net, 10, indexes); |
| | | printf("%s: Predicted in %f seconds.\n", filename, sec(clock()-time)); |
| | | printf("%s: Predicted in %f seconds.\n", filename, sec(clock()-time)); |
| | | for(i = 0; i < 10; ++i){ |
| | | int index = indexes[i]; |
| | | printf("%s: %f\n", names[index], predictions[index]); |
| | | } |
| | | free_image(im); |
| | | } |
| | | free_image(im); |
| | | } |
| | | } |
| | | |
| | | void test_visualize() |
| | | { |
| | | network net = parse_network_cfg("cfg/assira_backup_740000.cfg"); |
| | | srand(2222222); |
| | | visualize_network(net); |
| | | cvWaitKey(0); |
| | | network net = parse_network_cfg("cfg/imagenet_test.cfg"); |
| | | visualize_network(net); |
| | | cvWaitKey(0); |
| | | } |
| | | void test_full() |
| | | { |
| | | network net = parse_network_cfg("cfg/backup_1300.cfg"); |
| | | srand(2222222); |
| | | int i,j; |
| | | int total = 100; |
| | | char *labels[] = {"cat","dog"}; |
| | | FILE *fp = fopen("preds.txt","w"); |
| | | for(i = 0; i < total; ++i){ |
| | | visualize_network(net); |
| | | cvWaitKey(100); |
| | | data test = load_data_image_pathfile_part("data/assira/test.list", i, total, labels, 2, 256, 256); |
| | | image im = float_to_image(256, 256, 3,test.X.vals[0]); |
| | | show_image(im, "input"); |
| | | cvWaitKey(100); |
| | | normalize_data_rows(test); |
| | | for(j = 0; j < test.X.rows; ++j){ |
| | | float *x = test.X.vals[j]; |
| | | forward_network(net, x, 0, 0); |
| | | int class = get_predicted_class_network(net); |
| | | fprintf(fp, "%d\n", class); |
| | | } |
| | | free_data(test); |
| | | } |
| | | fclose(fp); |
| | | network net = parse_network_cfg("cfg/backup_1300.cfg"); |
| | | srand(2222222); |
| | | int i,j; |
| | | int total = 100; |
| | | char *labels[] = {"cat","dog"}; |
| | | FILE *fp = fopen("preds.txt","w"); |
| | | for(i = 0; i < total; ++i){ |
| | | visualize_network(net); |
| | | cvWaitKey(100); |
| | | data test = load_data_image_pathfile_part("data/assira/test.list", i, total, labels, 2, 256, 256); |
| | | image im = float_to_image(256, 256, 3,test.X.vals[0]); |
| | | show_image(im, "input"); |
| | | cvWaitKey(100); |
| | | normalize_data_rows(test); |
| | | for(j = 0; j < test.X.rows; ++j){ |
| | | float *x = test.X.vals[j]; |
| | | forward_network(net, x, 0, 0); |
| | | int class = get_predicted_class_network(net); |
| | | fprintf(fp, "%d\n", class); |
| | | } |
| | | free_data(test); |
| | | } |
| | | fclose(fp); |
| | | } |
| | | |
| | | void test_cifar10() |
| | | { |
| | | network net = parse_network_cfg("cfg/cifar10_part5.cfg"); |
| | | data test = load_cifar10_data("data/cifar10/test_batch.bin"); |
| | | clock_t start = clock(), end; |
| | | clock_t start = clock(), end; |
| | | float test_acc = network_accuracy(net, test); |
| | | end = clock(); |
| | | end = clock(); |
| | | printf("%f in %f Sec\n", test_acc, (float)(end-start)/CLOCKS_PER_SEC); |
| | | visualize_network(net); |
| | | cvWaitKey(0); |
| | |
| | | int iters = 10000/net.batch; |
| | | while(++count <= 2000){ |
| | | clock_t start = clock(), end; |
| | | float loss = train_network_sgd_gpu(net, train, iters); |
| | | float loss = train_network_sgd(net, train, iters); |
| | | end = clock(); |
| | | float test_acc = network_accuracy(net, test); |
| | | //float test_acc = 0; |
| | |
| | | cvWaitKey(0); |
| | | } |
| | | |
| | | void test_gpu_net() |
| | | { |
| | | srand(222222); |
| | | network net = parse_network_cfg("cfg/nist.cfg"); |
| | | data train = load_categorical_data_csv("data/mnist/mnist_train.csv", 0, 10); |
| | | data test = load_categorical_data_csv("data/mnist/mnist_test.csv",0,10); |
| | | translate_data_rows(train, -144); |
| | | translate_data_rows(test, -144); |
| | | int count = 0; |
| | | int iters = 10000/net.batch; |
| | | while(++count <= 5){ |
| | | clock_t start = clock(), end; |
| | | float loss = train_network_sgd(net, train, iters); |
| | | end = clock(); |
| | | float test_acc = network_accuracy(net, test); |
| | | printf("%d: Loss: %f, Test Acc: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, test_acc,(float)(end-start)/CLOCKS_PER_SEC, net.learning_rate, net.momentum, net.decay); |
| | | } |
| | | count = 0; |
| | | srand(222222); |
| | | net = parse_network_cfg("cfg/nist.cfg"); |
| | | while(++count <= 5){ |
| | | clock_t start = clock(), end; |
| | | float loss = train_network_sgd_gpu(net, train, iters); |
| | | end = clock(); |
| | | float test_acc = network_accuracy(net, test); |
| | | printf("%d: Loss: %f, Test Acc: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, test_acc,(float)(end-start)/CLOCKS_PER_SEC, net.learning_rate, net.momentum, net.decay); |
| | | } |
| | | } |
| | | |
| | | |
| | | int main(int argc, char *argv[]) |
| | | { |
| | | test_gpu_blas(); |
| | | //train_imagenet(); |
| | | if(argc != 2){ |
| | | fprintf(stderr, "usage: %s <function>\n", argv[0]); |
| | | return 0; |
| | | } |
| | | if(0==strcmp(argv[1], "train")) train_imagenet(); |
| | | else if(0==strcmp(argv[1], "train_small")) train_imagenet_small(); |
| | | else if(0==strcmp(argv[1], "test_gpu")) test_gpu_blas(); |
| | | else if(0==strcmp(argv[1], "test")) test_gpu_net(); |
| | | //test_gpu_blas(); |
| | | //train_imagenet_small(); |
| | | //test_imagenet(); |
| | | //train_nist(); |
| | | //test_visualize(); |
| | | fprintf(stderr, "Success!\n"); |
| | | return 0; |
| | | } |
| | |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | cl_mem a = layer.filters_cl; |
| | | cl_mem b = cl_sub_array(layer.col_image_cl, i*k*n, k*n); |
| | | cl_mem c = cl_sub_array(layer.output_cl, i*m*n, m*n); |
| | | gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c,n); |
| | | clReleaseMemObject(b); |
| | | clReleaseMemObject(c); |
| | | cl_mem b = layer.col_image_cl; |
| | | cl_mem c = layer.output_cl; |
| | | gemm_ongpu_offset(0,0,m,n,k,1.,a,0,k,b,i*k*n,n,1.,c,i*m*n,n); |
| | | } |
| | | #ifdef TIMEIT |
| | | clFinish(cl.queue); |
| | |
| | | learn_bias_convolutional_layer_ongpu(layer); |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | cl_mem a = cl_sub_array(layer.delta_cl,i*m*k, m*k); |
| | | cl_mem b = cl_sub_array(layer.col_image_cl,i*k*n, k*n); |
| | | cl_mem a = layer.delta_cl; |
| | | cl_mem b = layer.col_image_cl; |
| | | cl_mem c = layer.filter_updates_cl; |
| | | |
| | | gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n); |
| | | |
| | | clReleaseMemObject(a); |
| | | clReleaseMemObject(b); |
| | | gemm_ongpu_offset(0,1,m,n,k,1,a,i*m*k,k,b,i*k*n,k,1,c,0,n); |
| | | } |
| | | //cl_read_array(layer.delta_cl, layer.delta, m*k*layer.batch); |
| | | |
| | |
| | | |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | cl_mem a = layer.filters_cl; |
| | | cl_mem b = cl_sub_array(layer.delta_cl, i*k*n, k*n); |
| | | cl_mem c = cl_sub_array(layer.col_image_cl, i*m*n, m*n); |
| | | cl_mem b = layer.delta_cl; |
| | | cl_mem c = layer.col_image_cl; |
| | | |
| | | gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n); |
| | | clReleaseMemObject(b); |
| | | clReleaseMemObject(c); |
| | | gemm_ongpu_offset(1,0,m,n,k,1,a,0,m,b,i*k*n,n,0,c,i*m*n,n); |
| | | } |
| | | |
| | | scal_ongpu(layer.batch*layer.h*layer.w*layer.c,0,delta_cl, 1); |
| | |
| | | return d; |
| | | } |
| | | |
| | | void get_batch(data d, int n, float *X, float *y) |
| | | void get_random_batch(data d, int n, float *X, float *y) |
| | | { |
| | | int j; |
| | | for(j = 0; j < n; ++j){ |
| | |
| | | } |
| | | } |
| | | |
| | | void get_next_batch(data d, int n, int offset, float *X, float *y) |
| | | { |
| | | int j; |
| | | for(j = 0; j < n; ++j){ |
| | | int index = offset + j; |
| | | memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float)); |
| | | memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float)); |
| | | } |
| | | } |
| | | |
| | | |
| | | data load_all_cifar10() |
| | | { |
| | | data d; |
| | |
| | | data load_all_cifar10(); |
| | | list *get_paths(char *filename); |
| | | char **get_labels(char *filename); |
| | | void get_batch(data d, int n, float *X, float *y); |
| | | void get_random_batch(data d, int n, float *X, float *y); |
| | | void get_next_batch(data d, int n, int offset, float *X, float *y); |
| | | data load_categorical_data_csv(char *filename, int target, int k); |
| | | void normalize_data_rows(data d); |
| | | void scale_data_rows(data d, float s); |
| | |
| | | |
| | | #include "opencl.h" |
| | | #include <math.h> |
| | | #include <clBLAS.h> |
| | | //#include <clBLAS.h> |
| | | |
| | | #define STR_HELPER(x) #x |
| | | #define STR(x) STR_HELPER(x) |
| | |
| | | static int init = 0; |
| | | static cl_kernel gemm_kernel; |
| | | if(!init){ |
| | | gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_nt", "-D BLOCK=" STR(BLOCK) ); |
| | | gemm_kernel = get_kernel("src/gemm.cl", "gemm_nt", "-D BLOCK=" STR(BLOCK) ); |
| | | init = 1; |
| | | } |
| | | return gemm_kernel; |
| | |
| | | static int init = 0; |
| | | static cl_kernel gemm_kernel; |
| | | if(!init){ |
| | | gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_tn", "-D BLOCK=" STR(BLOCK) ); |
| | | gemm_kernel = get_kernel("src/gemm.cl", "gemm_tn", "-D BLOCK=" STR(BLOCK) ); |
| | | init = 1; |
| | | } |
| | | return gemm_kernel; |
| | |
| | | static int init = 0; |
| | | static cl_kernel gemm_kernel; |
| | | if(!init){ |
| | | gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_nn", "-D BLOCK=" STR(BLOCK) ); |
| | | gemm_kernel = get_kernel("src/gemm.cl", "gemm_nn", "-D BLOCK=" STR(BLOCK) ); |
| | | init = 1; |
| | | } |
| | | return gemm_kernel; |
| | | } |
| | | |
| | | void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int lda, |
| | | cl_mem B_gpu, int ldb, |
| | | float BETA, |
| | | cl_mem C_gpu, int ldc); |
| | | void gemm_ongpu_old(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int lda, |
| | | cl_mem B_gpu, int ldb, |
| | | float BETA, |
| | | cl_mem C_gpu, int ldc); |
| | | |
| | | void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int lda, |
| | | cl_mem B_gpu, int ldb, |
| | |
| | | cl_command_queue queue = cl.queue; |
| | | cl_event event; |
| | | cl.error = clblasSgemm(clblasRowMajor, TA?clblasTrans:clblasNoTrans, TB?clblasTrans:clblasNoTrans,M, N, K,ALPHA, A_gpu, 0, lda,B_gpu, 0, ldb,BETA, C_gpu, 0, ldc,1, &queue, 0, NULL, &event); |
| | | */ |
| | | |
| | | */ |
| | | gemm_ongpu_new(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc); |
| | | gemm_ongpu_offset(TA, TB, M, N, K, ALPHA, A_gpu, 0, lda, B_gpu, 0, ldb, BETA, C_gpu, 0, ldc); |
| | | } |
| | | |
| | | void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int lda, |
| | | cl_mem B_gpu, int ldb, |
| | | void gemm_ongpu_offset(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int a_off, int lda, |
| | | cl_mem B_gpu, int b_off, int ldb, |
| | | float BETA, |
| | | cl_mem C_gpu, int ldc) |
| | | cl_mem C_gpu, int c_off, int ldc) |
| | | { |
| | | //printf("gpu: %d %d %d %d %d\n",TA, TB, M, N, K); |
| | | cl_setup(); |
| | |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(a_off), (void*) &a_off); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(b_off), (void*) &b_off); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(c_off), (void*) &c_off); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); |
| | | check_error(cl); |
| | | |
| | |
| | | check_error(cl); |
| | | } |
| | | |
| | | void gemm_ongpu_old(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int lda, |
| | | cl_mem B_gpu, int ldb, |
| | | float BETA, |
| | | cl_mem C_gpu, int ldc) |
| | | { |
| | | //printf("gpu: %d %d %d %d %d\n",TA, TB, M, N, K); |
| | | cl_setup(); |
| | | cl_kernel gemm_kernel = get_gemm_kernel(); |
| | | cl_command_queue queue = cl.queue; |
| | | |
| | | cl_uint i = 0; |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); |
| | | cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); |
| | | check_error(cl); |
| | | |
| | | const size_t global_size[] = {ceil((float)N/BLOCK)*BLOCK, ceil((float)M/BLOCK)*BLOCK}; |
| | | const size_t local_size[] = {BLOCK, BLOCK}; |
| | | |
| | | clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0); |
| | | check_error(cl); |
| | | } |
| | | |
| | | |
| | | void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | float *A, int lda, |
| | | float *B, int ldb, |
| | |
| | | __kernel void gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | __global float *A, int a_off, int lda, |
| | | __global float *B, int b_off, int ldb, |
| | | float BETA, |
| | | __global float *C, int c_off, int ldc) |
| | | { |
| | | A += a_off; |
| | | B += b_off; |
| | | C += c_off; |
| | | __local float Asub[BLOCK][BLOCK]; |
| | | __local float Bsub[BLOCK][BLOCK]; |
| | | |
| | | int col = get_global_id(0); |
| | | int row = get_global_id(1); |
| | | |
| | | int col_block = get_group_id(0); |
| | | int row_block = get_group_id(1); |
| | | |
| | | col = (col < N) ? col : N - 1; |
| | | row = (row < M) ? row : M - 1; |
| | | |
| | | int x = get_local_id(0); |
| | | int y = get_local_id(1); |
| | | |
| | | int i,j; |
| | | |
| | | float val = 0; |
| | | float orig = C[row*ldc + col]; |
| | | |
| | | for(i = 0; i < K; i += BLOCK){ |
| | | |
| | | int arow = y + i; |
| | | int acol = x + row_block*BLOCK; |
| | | |
| | | int brow = y + i; |
| | | int bcol = col; |
| | | |
| | | arow = (arow < K) ? arow : K-1; |
| | | acol = (acol < M) ? acol : M-1; |
| | | brow = (brow < K) ? brow : K-1; |
| | | |
| | | int aind = arow*lda + acol; |
| | | int bind = brow*ldb + bcol; |
| | | |
| | | Asub[x][y] = A[aind]; |
| | | Bsub[y][x] = B[bind]; |
| | | |
| | | barrier(CLK_LOCAL_MEM_FENCE); |
| | | |
| | | for(j = 0; j < BLOCK && i+j<K; ++j){ |
| | | val += Asub[y][j]*Bsub[j][x]; |
| | | } |
| | | barrier(CLK_LOCAL_MEM_FENCE); |
| | | } |
| | | |
| | | C[row*ldc+col] = ALPHA*val + BETA*orig; |
| | | } |
| | | |
| | | __kernel void gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | __global float *A, int a_off, int lda, |
| | | __global float *B, int b_off, int ldb, |
| | | float BETA, |
| | | __global float *C, int c_off, int ldc) |
| | | { |
| | | A += a_off; |
| | | B += b_off; |
| | | C += c_off; |
| | | __local float Asub[BLOCK][BLOCK]; |
| | | __local float Bsub[BLOCK][BLOCK]; |
| | | |
| | | |
| | | int col = get_global_id(0); |
| | | int row = get_global_id(1); |
| | | |
| | | int col_block = get_group_id(0); |
| | | int row_block = get_group_id(1); |
| | | |
| | | col = (col < N) ? col : N - 1; |
| | | row = (row < M) ? row : M - 1; |
| | | |
| | | int x = get_local_id(0); |
| | | int y = get_local_id(1); |
| | | |
| | | int i,j; |
| | | |
| | | float val = 0; |
| | | float orig = C[row*ldc + col]; |
| | | |
| | | for(i = 0; i < K; i += BLOCK){ |
| | | |
| | | int arow = row; |
| | | int acol = x + i; |
| | | |
| | | int brow = col_block*BLOCK + y; |
| | | int bcol = x + i; |
| | | |
| | | brow = (brow < N) ? brow : N-1; |
| | | acol = (acol < K) ? acol : K-1; |
| | | bcol = (bcol < K) ? bcol : K-1; |
| | | |
| | | int aind = arow*lda + acol; |
| | | int bind = brow*ldb + bcol; |
| | | |
| | | Asub[y][x] = A[aind]; |
| | | Bsub[x][y] = B[bind]; |
| | | |
| | | barrier(CLK_LOCAL_MEM_FENCE); |
| | | |
| | | for(j = 0; j < BLOCK && i+j<K; ++j){ |
| | | val += Asub[y][j]*Bsub[j][x]; |
| | | } |
| | | barrier(CLK_LOCAL_MEM_FENCE); |
| | | } |
| | | |
| | | C[row*ldc+col] = ALPHA*val + BETA*orig; |
| | | } |
| | | |
| | | __kernel void gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | __global float *A, int a_off, int lda, |
| | | __global float *B, int b_off, int ldb, |
| | | float BETA, |
| | | __global float *C, int c_off, int ldc) |
| | | { |
| | | A += a_off; |
| | | B += b_off; |
| | | C += c_off; |
| | | __local float Asub[BLOCK][BLOCK]; |
| | | __local float Bsub[BLOCK][BLOCK]; |
| | | |
| | | int col = get_global_id(0); |
| | | int row = get_global_id(1); |
| | | |
| | | col = (col < N) ? col : N - 1; |
| | | row = (row < M) ? row : M - 1; |
| | | |
| | | int x = get_local_id(0); |
| | | int y = get_local_id(1); |
| | | |
| | | int i,j; |
| | | |
| | | float orig = C[row*ldc+col]; |
| | | float val = 0; |
| | | |
| | | for(i = 0; i < K; i += BLOCK){ |
| | | |
| | | int arow = row; |
| | | int acol = x + i; |
| | | |
| | | int brow = y + i; |
| | | int bcol = col; |
| | | |
| | | acol = (acol < K) ? acol : K-1; |
| | | brow = (brow < K) ? brow : K-1; |
| | | |
| | | int aind = arow*lda + acol; |
| | | int bind = brow*ldb + bcol; |
| | | |
| | | Asub[y][x] = A[aind]; |
| | | Bsub[y][x] = B[bind]; |
| | | |
| | | barrier(CLK_LOCAL_MEM_FENCE); |
| | | |
| | | for(j = 0; j < BLOCK && i+j<K; ++j){ |
| | | val += Asub[y][j]*Bsub[j][x]; |
| | | } |
| | | barrier(CLK_LOCAL_MEM_FENCE); |
| | | } |
| | | |
| | | C[row*ldc+col] = ALPHA*val + BETA*orig; |
| | | } |
| | | |
| | | __kernel void gemm(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | __global float *A, int lda, |
| | | __global float *B, int ldb, |
| | | __global float *A, int a_off, int lda, |
| | | __global float *B, int b_off, int ldb, |
| | | float BETA, |
| | | __global float *C, int ldc) |
| | | __global float *C, int c_off, int ldc) |
| | | { |
| | | A += a_off; |
| | | B += b_off; |
| | | C += c_off; |
| | | __local float Asub[BLOCK][BLOCK]; |
| | | __local float Bsub[BLOCK][BLOCK]; |
| | | |
| | |
| | | int channels, int height, int width, |
| | | int ksize, int stride, int pad, float *data_col); |
| | | |
| | | void gemm_ongpu_offset(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int a_off, int lda, |
| | | cl_mem B_gpu, int b_off, int ldb, |
| | | float BETA, |
| | | cl_mem C_gpu, int c_off, int ldc); |
| | | |
| | | void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, |
| | | cl_mem A_gpu, int lda, |
| | | cl_mem B_gpu, int ldb, |
| | |
| | | int i; |
| | | float sum = 0; |
| | | for(i = 0; i < n; ++i){ |
| | | get_batch(d, batch, X, y); |
| | | get_random_batch(d, batch, X, y); |
| | | float err = train_network_datum_gpu(net, X, y); |
| | | sum += err; |
| | | } |
| | | free(X); |
| | | free(y); |
| | | return (float)sum/(n*batch); |
| | | } |
| | | |
| | | float train_network_data_gpu(network net, data d, int n) |
| | | { |
| | | int batch = net.batch; |
| | | float *X = calloc(batch*d.X.cols, sizeof(float)); |
| | | float *y = calloc(batch*d.y.cols, sizeof(float)); |
| | | |
| | | int i; |
| | | float sum = 0; |
| | | for(i = 0; i < n; ++i){ |
| | | get_next_batch(d, batch, i*batch, X, y); |
| | | float err = train_network_datum_gpu(net, X, y); |
| | | sum += err; |
| | | } |
| | |
| | | int i; |
| | | float sum = 0; |
| | | for(i = 0; i < n; ++i){ |
| | | get_batch(d, batch, X, y); |
| | | get_random_batch(d, batch, X, y); |
| | | float err = train_network_datum(net, X, y); |
| | | sum += err; |
| | | } |
| | |
| | | cl_mem get_network_output_cl_layer(network net, int i); |
| | | cl_mem get_network_delta_cl_layer(network net, int i); |
| | | float train_network_sgd_gpu(network net, data d, int n); |
| | | float train_network_data_gpu(network net, data d, int n); |
| | | #endif |
| | | |
| | | network make_network(int n, int batch); |
| | |
| | | #include <string.h> |
| | | #include <time.h> |
| | | #include <unistd.h> |
| | | #include <clBLAS.h> |
| | | //#include <clBLAS.h> |
| | | |
| | | #include "opencl.h" |
| | | #include "utils.h" |
| | |
| | | info.queues[i] = clCreateCommandQueue(info.context, info.device, 0, &info.error); |
| | | check_error(info); |
| | | } |
| | | info.error = clblasSetup(); |
| | | //info.error = clblasSetup(); |
| | | check_error(info); |
| | | info.initialized = 1; |
| | | return info; |
| | |
| | | void cl_setup() |
| | | { |
| | | if(!cl.initialized){ |
| | | printf("initializing\n"); |
| | | cl = cl_init(); |
| | | } |
| | | } |
| | |
| | | char *fgetl(FILE *fp) |
| | | { |
| | | if(feof(fp)) return 0; |
| | | int size = 512; |
| | | unsigned long size = 512; |
| | | char *line = malloc(size*sizeof(char)); |
| | | if(!fgets(line, size, fp)){ |
| | | free(line); |
| | |
| | | while(line[curr-1]!='\n'){ |
| | | size *= 2; |
| | | line = realloc(line, size*sizeof(char)); |
| | | if(!line) malloc_error(); |
| | | if(!line) { |
| | | printf("%ld\n", size); |
| | | malloc_error(); |
| | | } |
| | | fgets(&line[curr], size-curr, fp); |
| | | curr = strlen(line); |
| | | } |