int iters = 10000/net.batch;
while(++count <= 2000){
    clock_t start = clock(), end;
    float loss = train_network_sgd_gpu(net, train, iters);
    end = clock();
    float test_acc = network_accuracy(net, test);
    //float test_acc = 0;
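    /* A plausible way to consume the timing and accuracy above -- an
     * illustrative sketch, not the original code; the format string is
     * an assumption. */
    printf("%d: loss %f, test acc %f, %lf sec\n",
            count, loss, test_acc, (double)(end-start)/CLOCKS_PER_SEC);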

int main(int argc, char *argv[])
{
    //test_gpu_blas();
    train_imagenet();
    //train_nist();
    fprintf(stderr, "Success!\n");
    return 0;
}

// output columns whose ksize-wide, stride-spaced windows cover input column w
int w_start = (w < ksize) ? 0 : (w - ksize)/stride + 1;
int w_end = w/stride + 1;
w_end = (width_col < w_end) ? width_col : w_end;

// and likewise for rows
int h_start = (h < ksize) ? 0 : (h - ksize)/stride + 1;
int h_end = h/stride + 1;
h_end = (height_col < h_end) ? height_col : h_end;

int rows = channels * ksize * ksize;
int cols = height_col*width_col;

check_error(cl);
}
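/* A minimal CPU-side check (not from the original source) of the bound math
 * above: for an input column w, the output columns whose ksize-wide,
 * stride-spaced windows cover w are exactly [w_start, w_end). */
#include <assert.h>

int main()
{
    int ksize = 3, stride = 2, width = 9;
    int width_col = (width - ksize)/stride + 1;
    int w;
    for(w = 0; w < width; ++w){
        int w_start = (w < ksize) ? 0 : (w - ksize)/stride + 1;
        int w_end = w/stride + 1;
        w_end = (width_col < w_end) ? width_col : w_end;
        int wc;
        for(wc = w_start; wc < w_end; ++wc){
            // column wc reads input columns [wc*stride, wc*stride + ksize)
            assert(wc*stride <= w && w < wc*stride + ksize);
        }
    }
    return 0;
}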

//#define TIMEIT

void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in)
{

void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
        cl_mem A_gpu, int lda,
        cl_mem B_gpu, int ldb,
        float BETA,
        cl_mem C_gpu, int ldc)
{
    /*
    cl_setup();
    cl_command_queue queue = cl.queue;
    cl_event event;
    cl.error = clblasSgemm(clblasRowMajor,
            TA ? clblasTrans : clblasNoTrans,
            TB ? clblasTrans : clblasNoTrans,
            M, N, K,
            ALPHA, A_gpu, 0, lda,
            B_gpu, 0, ldb,
            BETA, C_gpu, 0, ldc,
            1, &queue, 0, NULL, &event);
    */
    gemm_ongpu_new(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
}
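/* A minimal CPU reference (not part of the original source) for the
 * row-major C = ALPHA*op(A)*op(B) + BETA*C that gemm_ongpu dispatches;
 * handy for validating GPU results on small matrices. */
void gemm_cpu_ref(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda, float *B, int ldb,
        float BETA, float *C, int ldc)
{
    int i, j, k;
    for(i = 0; i < M; ++i){
        for(j = 0; j < N; ++j){
            float sum = 0;
            for(k = 0; k < K; ++k){
                float a = TA ? A[k*lda + i] : A[i*lda + k];
                float b = TB ? B[j*ldb + k] : B[k*ldb + j];
                sum += a*b;
            }
            C[i*ldc + j] = ALPHA*sum + BETA*C[i*ldc + j];
        }
    }
}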

void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA,
        cl_mem A_gpu, int lda,
        cl_mem B_gpu, int ldb,
        float BETA,
        cl_mem C_gpu, int ldc)

    width_col = 1 + (width-1) / stride;
}

    // one work-item per output element, flattened over (b, c, h, w)
    size_t global_size = batch*channels_col*height_col*width_col;

    clEnqueueNDRangeKernel(queue, im2col_kernel, 1, 0,
            &global_size, 0, 0, 0, 0);
    check_error(cl);
}
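/* Host-side sanity sketch (an illustration, not original code): the 1-D
 * global size flattens (b, c, h, w) in row-major order, and the kernel
 * below recovers them by repeated modulo/divide in the same order. */
#include <assert.h>

int main()
{
    int batch = 2, channels_col = 27, height_col = 4, width_col = 5;
    int b, c, h, w;
    for(b = 0; b < batch; ++b)
    for(c = 0; c < channels_col; ++c)
    for(h = 0; h < height_col; ++h)
    for(w = 0; w < width_col; ++w){
        int id = ((b*channels_col + c)*height_col + h)*width_col + w;
        assert(id % width_col == w);    id /= width_col;
        assert(id % height_col == h);   id /= height_col;
        assert(id % channels_col == c); id /= channels_col;
        assert(id % batch == b);
    }
    return 0;
}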

int c, h, w, b;
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
if(pad){
    // "same" padding: output keeps the input size, pad by half the kernel
    height_col = 1 + (height-1) / stride;
    width_col = 1 + (width-1) / stride;
    pad = ksize/2;
}
// unflatten the 1-D global id back into (b, c, h, w)
int id = get_global_id(0);
w = id % width_col;
id /= width_col;
h = id % height_col;
id /= height_col;
c = id % channels_col;
id /= channels_col;
b = id % batch;
id /= batch;

int col_size = height_col*width_col*channels_col;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
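/* Sketch (an illustration, not original code): each column-channel c packs
 * (c_im, h_offset, w_offset) -- the source image channel plus the position
 * inside the ksize x ksize window -- and the packing round-trips as below. */
#include <assert.h>

void check_channel_packing(int channels, int ksize)
{
    int c;
    for(c = 0; c < channels*ksize*ksize; ++c){
        int w_offset = c % ksize;
        int h_offset = (c / ksize) % ksize;
        int c_im = c / ksize / ksize;
        assert((c_im*ksize + h_offset)*ksize + w_offset == c);
    }
}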

//printf("start\n");
int i;
for(i = 0; i < net.n; ++i){
    //clock_t time = clock();
    if(net.types[i] == CONVOLUTIONAL){
        convolutional_layer layer = *(convolutional_layer *)net.layers[i];
        forward_convolutional_layer_gpu(layer, input);
        input = layer.output_cl;
    }
    else if(net.types[i] == SOFTMAX){
        softmax_layer layer = *(softmax_layer *)net.layers[i];
        forward_softmax_layer_gpu(layer, input);
        input = layer.output_cl;
    }
    //printf("%d %f\n", i, sec(clock()-time));
    /*
    else if(net.types[i] == CROP){
        crop_layer layer = *(crop_layer *)net.layers[i];

float train_network_datum_gpu(network net, float *x, float *y)
{
    int x_size = get_network_input_size(net)*net.batch;
    int y_size = get_network_output_size(net)*net.batch;
    clock_t time = clock();
    // first call allocates the GPU buffers; later calls just refill them
    if(!*net.input_cl){
        *net.input_cl = cl_make_array(x, x_size);
        *net.truth_cl = cl_make_array(y, y_size);
    }else{
        cl_write_array(*net.input_cl, x, x_size);
        cl_write_array(*net.truth_cl, y, y_size);
    }
    //printf("trans %f\n", sec(clock()-time));
    time = clock();
    forward_network_gpu(net, *net.input_cl, *net.truth_cl, 1);
    //printf("forw %f\n", sec(clock()-time));
    time = clock();
    backward_network_gpu(net, *net.input_cl);
    //printf("back %f\n", sec(clock()-time));
    time = clock();
    float error = get_network_cost(net);
    update_network_gpu(net);
    //printf("updt %f\n", sec(clock()-time));
    return error;
}
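/* Hedged usage sketch (not original code): average the cost over a number of
 * SGD steps; a real caller would refill x and y with a fresh batch each
 * iteration (batch loading is elided here). */
float train_steps_gpu(network net, float *x, float *y, int iters)
{
    float avg = 0;
    int i;
    for(i = 0; i < iters; ++i){
        // x, y: net.batch contiguous input/truth vectors for this step
        avg += train_network_datum_gpu(net, x, y);
    }
    return avg/iters;
}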