Fixed bug in Tensor Cores V100 (1. Desc in Batch norm, 2. Manually selected algo).
Also fixed time measure on Linux for multi-threading.
| | |
| | | layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs); |
| | | #ifdef CUDNN |
| | | cudnnCreateTensorDescriptor(&layer.normTensorDesc); |
| | | cudnnCreateTensorDescriptor(&layer.dstTensorDesc); |
| | | cudnnSetTensor4dDescriptor(layer.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w); |
| | | cudnnCreateTensorDescriptor(&layer.normDstTensorDesc); |
| | | cudnnSetTensor4dDescriptor(layer.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w); |
| | | cudnnSetTensor4dDescriptor(layer.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, layer.out_c, 1, 1); |
| | | #endif |
| | | #endif |
| | |
| | | CUDNN_BATCHNORM_SPATIAL, |
| | | &one, |
| | | &zero, |
| | | l.dstTensorDesc, |
| | | l.normDstTensorDesc, |
| | | l.x_gpu, |
| | | l.dstTensorDesc, |
| | | l.normDstTensorDesc, |
| | | l.output_gpu, |
| | | l.normTensorDesc, |
| | | l.scales_gpu, |
| | |
| | | &zero, |
| | | &one, |
| | | &one, |
| | | l.dstTensorDesc, |
| | | l.normDstTensorDesc, |
| | | l.x_gpu, |
| | | l.dstTensorDesc, |
| | | l.normDstTensorDesc, |
| | | l.delta_gpu, |
| | | l.dstTensorDesc, |
| | | l.normDstTensorDesc, |
| | | l.x_norm_gpu, |
| | | l.normTensorDesc, |
| | | l.scales_gpu, |
| | |
| | | |
| | | // batch norm |
| | | cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); |
| | | cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); |
| | | #if(CUDNN_MAJOR >= 6) |
| | | cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0 |
| | | #else |
| | |
| | | forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; |
| | | backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE; |
| | | backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE; |
| | | printf(" CUDNN-slow "); |
| | | } |
| | | |
| | | cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), |
| | |
| | | backward_filter, |
| | | 0, |
| | | &l->bf_algo); |
| | | |
| | | if (data_type == CUDNN_DATA_HALF) |
| | | { |
| | | // HALF-16 if(data_type == CUDNN_DATA_HALF) |
| | | l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; |
| | | l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; |
| | | l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; |
| | | |
| | | // FLOAT-32 if(data_type == CUDNN_DATA_FLOAT) |
| | | //l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; |
| | | //l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED; |
| | | //l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED; |
| | | |
| | | int fw = 0, bd = 0, bf = 0; |
| | | if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1; |
| | | //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n"); |
| | | if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2; |
| | | //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n"); |
| | | |
| | | if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1; |
| | | //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n"); |
| | | if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2; |
| | | //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n"); |
| | | |
| | | if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1; |
| | | //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n"); |
| | | if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2; |
| | | //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n"); |
| | | |
| | | if (fw == 2 && bd == 2 && bf == 2) printf("TF "); |
| | | else if (fw >= 1 && bd >= 1 && bf >= 1) printf("TH "); |
| | | } |
| | | } |
| | | #endif |
| | | #endif |
| | |
| | | l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); |
| | | } |
| | | #ifdef CUDNN |
| | | cudnnCreateTensorDescriptor(&l.normDstTensorDesc); |
| | | cudnnCreateTensorDescriptor(&l.normTensorDesc); |
| | | cudnnCreateTensorDescriptor(&l.srcTensorDesc); |
| | | cudnnCreateTensorDescriptor(&l.dstTensorDesc); |
| | |
| | | args.small_object = net.small_object; |
| | | args.d = &buffer; |
| | | args.type = DETECTION_DATA; |
| | | args.threads = 64; // 8 |
| | | args.threads = 16; // 64 |
| | | |
| | | args.angle = net.angle; |
| | | args.exposure = net.exposure; |
| | |
| | | args.hue = net.hue; |
| | | |
| | | #ifdef OPENCV |
| | | args.threads = 7; |
| | | IplImage* img = NULL; |
| | | float max_img_loss = 5; |
| | | int number_of_lines = 100; |
| | |
| | | #endif //OPENCV |
| | | |
| | | pthread_t load_thread = load_data(args); |
| | | clock_t time; |
| | | double time; |
| | | int count = 0; |
| | | //while(i*imgs < N*120){ |
| | | while(get_current_batch(net) < net.max_batches){ |
| | |
| | | } |
| | | net = nets[0]; |
| | | } |
| | | time=clock(); |
| | | time=what_time_is_it_now(); |
| | | pthread_join(load_thread, 0); |
| | | train = buffer; |
| | | load_thread = load_data(args); |
| | |
| | | save_image(im, "truth11"); |
| | | */ |
| | | |
| | | printf("Loaded: %lf seconds\n", sec(clock()-time)); |
| | | printf("Loaded: %lf seconds\n", (what_time_is_it_now()-time)); |
| | | |
| | | time=clock(); |
| | | time=what_time_is_it_now(); |
| | | float loss = 0; |
| | | #ifdef GPU |
| | | if(ngpus == 1){ |
| | |
| | | avg_loss = avg_loss*.9 + loss*.1; |
| | | |
| | | i = get_current_batch(net); |
| | | printf("\n %d: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs); |
| | | printf("\n %d: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), (what_time_is_it_now()-time), i*imgs); |
| | | |
| | | #ifdef OPENCV |
| | | if(!dont_show) |
| | |
| | | int *map = 0; |
| | | if (mapf) map = read_map(mapf); |
| | | |
| | | network net = parse_network_cfg_custom(cfgfile, 1); |
| | | network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1 |
| | | if (weightfile) { |
| | | load_weights(&net, weightfile); |
| | | } |
| | | set_batch_network(&net, 1); |
| | | //set_batch_network(&net, 1); |
| | | fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); |
| | | srand(time(0)); |
| | | |
| | |
| | | |
| | | void validate_detector_recall(char *datacfg, char *cfgfile, char *weightfile) |
| | | { |
| | | network net = parse_network_cfg_custom(cfgfile, 1); |
| | | network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1 |
| | | if (weightfile) { |
| | | load_weights(&net, weightfile); |
| | | } |
| | | set_batch_network(&net, 1); |
| | | //set_batch_network(&net, 1); |
| | | fuse_conv_batchnorm(net); |
| | | srand(time(0)); |
| | | |
| | |
| | | int *map = 0; |
| | | if (mapf) map = read_map(mapf); |
| | | |
| | | network net = parse_network_cfg_custom(cfgfile, 1); |
| | | network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1 |
| | | if (weightfile) { |
| | | load_weights(&net, weightfile); |
| | | } |
| | | set_batch_network(&net, 1); |
| | | //set_batch_network(&net, 1); |
| | | fuse_conv_batchnorm(net); |
| | | srand(time(0)); |
| | | |
| | |
| | | char **names = get_labels(name_list); |
| | | |
| | | image **alphabet = load_alphabet(); |
| | | network net = parse_network_cfg_custom(cfgfile, 1); |
| | | network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1 |
| | | if(weightfile){ |
| | | load_weights(&net, weightfile); |
| | | } |
| | | set_batch_network(&net, 1); |
| | | //set_batch_network(&net, 1); |
| | | fuse_conv_batchnorm(net); |
| | | srand(2222222); |
| | | clock_t time; |
| | | double time; |
| | | char buff[256]; |
| | | char *input = buff; |
| | | int j; |
| | |
| | | //for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); |
| | | |
| | | float *X = sized.data; |
| | | time=clock(); |
| | | time= what_time_is_it_now(); |
| | | network_predict(net, X); |
| | | //network_predict_image(&net, im); |
| | | printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); |
| | | printf("%s: Predicted in %f seconds.\n", input, (what_time_is_it_now()-time)); |
| | | //get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0); |
| | | // if (nms) do_nms_sort_v2(boxes, probs, l.w*l.h*l.n, l.classes, nms); |
| | | //draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes); |
| | |
| | | #ifdef CUDNN |
| | | cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc; |
| | | cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc; |
| | | cudnnTensorDescriptor_t normTensorDesc; |
| | | cudnnTensorDescriptor_t normTensorDesc, normDstTensorDesc; |
| | | cudnnFilterDescriptor_t weightDesc; |
| | | cudnnFilterDescriptor_t dweightDesc; |
| | | cudnnConvolutionDescriptor_t convDesc; |
| | |
| | | #include <limits.h> |
| | | #ifdef WIN32 |
| | | #include "unistd.h" |
| | | #include "gettimeofday.h" |
| | | #else |
| | | #include <unistd.h> |
| | | #include <sys/time.h> |
| | | #endif |
| | | #include "utils.h" |
| | | |
| | | #pragma warning(disable: 4996) |
| | | |
| | | double what_time_is_it_now() |
| | | { |
| | | struct timeval time; |
| | | if (gettimeofday(&time, NULL)) { |
| | | return 0; |
| | | } |
| | | return (double)time.tv_sec + (double)time.tv_usec * .000001; |
| | | } |
| | | |
| | | int *read_map(char *filename) |
| | | { |
| | | int n = 0; |
| | |
| | | #endif |
| | | #endif |
| | | |
| | | double what_time_is_it_now(); |
| | | int *read_map(char *filename); |
| | | void shuffle(void *arr, size_t n, size_t size); |
| | | void sorta_shuffle(void *arr, size_t n, size_t size, size_t sections); |