AlexeyAB
2018-02-21 033e934ce82826c73d851098baf7ce4b1a27c89a
If there is excessive GPU-RAM consumption by CUDNN, then do not use Workspace
4 files modified, 50 lines changed
src/convolutional_layer.c | 33
src/convolutional_layer.h |  2
src/cuda.h                |  1
src/network.c             | 14
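
In short: cudnn_convolutional_setup() gains a cudnn_preference argument (cudnn_fastest or cudnn_smallest), and after the workspace size is computed the layer is reconfigured with the no-workspace cuDNN algorithms whenever the workspace would not fit in free GPU memory, or would take at least half of total GPU memory. Below is a minimal standalone sketch of that fallback pattern; layer_t and configure_layer() are illustrative stand-ins for darknet's real layer struct and cudnn_convolutional_setup(), not code from this commit.

// Sketch only: choose between "fastest" and "no-workspace" cuDNN algorithm
// selection based on free GPU memory, mirroring the pattern in this commit.
// layer_t, configure_layer() and the 512 MiB figure are illustrative assumptions.
#include <stdio.h>
#include <cuda_runtime.h>

enum { cudnn_fastest, cudnn_smallest };   // same unnamed-enum idea as in src/cuda.h

typedef struct { size_t workspace_size; } layer_t;

// Hypothetical stand-in for cudnn_convolutional_setup(): a real version would
// call cudnnGetConvolution*Algorithm() with PREFER_FASTEST or NO_WORKSPACE.
static void configure_layer(layer_t *l, int preference)
{
    l->workspace_size = (preference == cudnn_smallest) ? 0 : (512u << 20); // e.g. 512 MiB
}

static void setup_with_memory_check(layer_t *l)
{
    configure_layer(l, cudnn_fastest);

    size_t free_byte = 0, total_byte = 0;
    if (cudaMemGetInfo(&free_byte, &total_byte) != cudaSuccess) return;

    // Same condition as the commit: workspace larger than free memory,
    // or at least half of total GPU memory.
    if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
        printf(" used slow CUDNN algo without Workspace! \n");
        configure_layer(l, cudnn_smallest);
    }
}

int main(void)
{
    layer_t l;
    setup_with_memory_check(&l);
    printf("workspace bytes: %zu\n", l.workspace_size);
    return 0;
}
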
src/convolutional_layer.c
@@ -137,7 +137,7 @@
#ifdef GPU
#ifdef CUDNN
-void cudnn_convolutional_setup(layer *l)
+void cudnn_convolutional_setup(layer *l, int cudnn_preference)
{
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
@@ -151,12 +151,21 @@
#else
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);  // cudnn 5.1
#endif
+    int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+    int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+    int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+    if (cudnn_preference == cudnn_smallest) {
+        forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+    }
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
-            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+            forward_algo,
            0,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
@@ -164,7 +173,7 @@
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
-            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
+            backward_algo,
            0,
            &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
@@ -172,7 +181,7 @@
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
-            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
+            backward_filter,
            0,
            &l->bf_algo);
}
@@ -306,7 +315,7 @@
        cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
        cudnnCreateFilterDescriptor(&l.dweightDesc);
        cudnnCreateConvolutionDescriptor(&l.convDesc);
-        cudnn_convolutional_setup(&l);
+        cudnn_convolutional_setup(&l, cudnn_fastest);
#endif
    }
#endif
@@ -396,10 +405,22 @@
        }
    }
#ifdef CUDNN
-    cudnn_convolutional_setup(l);
+    cudnn_convolutional_setup(l, cudnn_fastest);
#endif
#endif
    l->workspace_size = get_workspace_size(*l);
+#ifdef CUDNN
+    // check for excessive memory consumption
+    size_t free_byte;
+    size_t total_byte;
+    check_error(cudaMemGetInfo(&free_byte, &total_byte));
+    if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+        printf(" used slow CUDNN algo without Workspace! \n");
+        cudnn_convolutional_setup(l, cudnn_smallest);
+        l->workspace_size = get_workspace_size(*l);
+    }
+#endif
}
void add_bias(float *output, float *biases, int batch, int n, int size)
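
For context on what the cudnn_smallest preference buys: with the legacy algorithm-selection API used above (cudnnGetConvolution*Algorithm, removed in cuDNN 8), requesting the *_NO_WORKSPACE preference returns an algorithm whose workspace requirement is zero bytes, while *_PREFER_FASTEST may pick one that needs a large scratch buffer. The standalone sketch below queries both for one example convolution and prints the difference; the shapes are arbitrary example values, not taken from this commit, and error checking is omitted for brevity.

// Sketch: compare workspace demands of the "fastest" vs "no workspace"
// forward algorithms for one convolution, using the legacy cuDNN 6/7 API
// that this code relies on (removed in cuDNN 8).
#include <stdio.h>
#include <cudnn.h>

int main(void)
{
    cudnnHandle_t h;
    cudnnCreate(&h);

    cudnnTensorDescriptor_t x, y;
    cudnnFilterDescriptor_t w;
    cudnnConvolutionDescriptor_t conv;
    cudnnCreateTensorDescriptor(&x);
    cudnnCreateTensorDescriptor(&y);
    cudnnCreateFilterDescriptor(&w);
    cudnnCreateConvolutionDescriptor(&conv);

    // Example shapes: batch 64, 3x416x416 input, 32 filters of 3x3, pad 1, stride 1,
    // so the output is 64x32x416x416.
    cudnnSetTensor4dDescriptor(x, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 64, 3, 416, 416);
    cudnnSetFilter4dDescriptor(w, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 32, 3, 3, 3);
    cudnnSetConvolution2dDescriptor(conv, 1, 1, 1, 1, 1, 1,
                                    CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cuDNN 6/7 signature
    cudnnSetTensor4dDescriptor(y, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 64, 32, 416, 416);

    cudnnConvolutionFwdAlgo_t fast, small;
    cudnnGetConvolutionForwardAlgorithm(h, x, w, conv, y,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &fast);
    cudnnGetConvolutionForwardAlgorithm(h, x, w, conv, y,
            CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, &small);

    size_t ws_fast = 0, ws_small = 0;
    cudnnGetConvolutionForwardWorkspaceSize(h, x, w, conv, y, fast, &ws_fast);
    cudnnGetConvolutionForwardWorkspaceSize(h, x, w, conv, y, small, &ws_small);

    printf("fastest algo %d needs %zu bytes, no-workspace algo %d needs %zu bytes\n",
           (int)fast, ws_fast, (int)small, ws_small);

    cudnnDestroyConvolutionDescriptor(conv);
    cudnnDestroyFilterDescriptor(w);
    cudnnDestroyTensorDescriptor(y);
    cudnnDestroyTensorDescriptor(x);
    cudnnDestroy(h);
    return 0;
}
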
src/convolutional_layer.h
@@ -20,7 +20,7 @@
void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
#ifdef CUDNN
-void cudnn_convolutional_setup(layer *l);
+void cudnn_convolutional_setup(layer *l, int cudnn_preference);
#endif
#endif
src/cuda.h
@@ -35,6 +35,7 @@
#ifdef CUDNN
cudnnHandle_t cudnn_handle();
+enum {cudnn_fastest, cudnn_smallest};
#endif
#endif
src/network.c
@@ -316,7 +316,17 @@
        net->layers[i].batch = b;
#ifdef CUDNN
        if(net->layers[i].type == CONVOLUTIONAL){
-            cudnn_convolutional_setup(net->layers + i);
+            layer *l = net->layers + i;
+            cudnn_convolutional_setup(l, cudnn_fastest);
+            // check for excessive memory consumption
+            size_t free_byte;
+            size_t total_byte;
+            check_error(cudaMemGetInfo(&free_byte, &total_byte));
+            if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+                printf(" used slow CUDNN algo without Workspace! \n");
+                cudnn_convolutional_setup(l, cudnn_smallest);
+                l->workspace_size = get_workspace_size(*l);
+            }
        }
#endif
    }
@@ -378,7 +388,7 @@
    }
#ifdef GPU
    if(gpu_index >= 0){
        printf(" try to allocate workspace, ");
        printf(" try to allocate workspace = %zu * sizeof(float), ", (workspace_size - 1) / sizeof(float) + 1);
        net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
        printf(" CUDA allocate done! \n");
    }else {