If there is excessive GPU-RAM consumption by CUDNN, then do not use a workspace: fall back to the convolution algorithms that need no workspace.
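In short, the fix boils down to the following logic. This is only a condensed sketch, not the actual patch: the helper name choose_cudnn_algos is made up for illustration, and it assumes darknet's layer struct together with the cudnn_convolutional_setup(), get_workspace_size() and check_error() helpers and the cudnn_fastest / cudnn_smallest values that the diff introduces.

#include <cuda_runtime.h>          /* cudaMemGetInfo() */
#include "convolutional_layer.h"   /* assumed darknet header: layer, cudnn_convolutional_setup(), ... */
#include "cuda.h"                  /* assumed darknet header: check_error(), cudnn_handle() */

/* Hypothetical helper, only to illustrate the order of the fallback. */
void choose_cudnn_algos(layer *l)
{
    /* Ask cuDNN for the fastest algorithms first; they may need a large workspace. */
    cudnn_convolutional_setup(l, cudnn_fastest);
    l->workspace_size = get_workspace_size(*l);

    size_t free_byte, total_byte;
    check_error(cudaMemGetInfo(&free_byte, &total_byte));

    /* If the workspace would not fit into free VRAM, or would take at least half
       of the card, fall back to the slower algorithms that need no workspace. */
    if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
        cudnn_convolutional_setup(l, cudnn_smallest);
        l->workspace_size = get_workspace_size(*l);
    }
}

In the diff that follows, lines prefixed with - are the previous code and lines prefixed with + are their replacements; unprefixed lines are unchanged context.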
#ifdef GPU
#ifdef CUDNN
-void cudnn_convolutional_setup(layer *l)
+void cudnn_convolutional_setup(layer *l, int cudnn_preference)
{
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);

#else
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1
#endif
+    int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+    int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+    int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+    if (cudnn_preference == cudnn_smallest) {
+        forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+    }

    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
-            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+            forward_algo,
            0,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
-            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
+            backward_algo,
            0,
            &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
-            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
+            backward_filter,
            0,
            &l->bf_algo);
}

    cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
    cudnnCreateFilterDescriptor(&l.dweightDesc);
    cudnnCreateConvolutionDescriptor(&l.convDesc);
-    cudnn_convolutional_setup(&l);
+    cudnn_convolutional_setup(&l, cudnn_fastest);
#endif
}
#endif

        }
    }
#ifdef CUDNN
-    cudnn_convolutional_setup(l);
+    cudnn_convolutional_setup(l, cudnn_fastest);
#endif
#endif
    l->workspace_size = get_workspace_size(*l);

+#ifdef CUDNN
+    // check for excessive memory consumption
+    size_t free_byte;
+    size_t total_byte;
+    check_error(cudaMemGetInfo(&free_byte, &total_byte));
+    if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+        printf(" used slow CUDNN algo without Workspace! \n");
+        cudnn_convolutional_setup(l, cudnn_smallest);
+        l->workspace_size = get_workspace_size(*l);
+    }
+#endif
}
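The workspace size compared against free VRAM above comes from cuDNN itself. For reference, here is a sketch of what a darknet-style get_workspace_size() typically does (the exact implementation in your revision may differ): it asks cuDNN how much scratch memory each chosen algorithm needs and returns the maximum, so after switching to the *_NO_WORKSPACE preferences the reported size drops to (near) zero.

/* Sketch only: the workspace requirement is the maximum of what cuDNN reports
   for the three algorithms picked in cudnn_convolutional_setup(). */
static size_t get_workspace_size(layer l)
{
#if defined(GPU) && defined(CUDNN)
    size_t most = 0, s = 0;
    cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(), l.srcTensorDesc, l.weightDesc,
            l.convDesc, l.dstTensorDesc, l.fw_algo, &s);
    if (s > most) most = s;
    cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(), l.srcTensorDesc, l.ddstTensorDesc,
            l.convDesc, l.dweightDesc, l.bf_algo, &s);
    if (s > most) most = s;
    cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(), l.weightDesc, l.ddstTensorDesc,
            l.convDesc, l.dsrcTensorDesc, l.bd_algo, &s);
    if (s > most) most = s;
    return most;
#else
    /* CPU path: size of the im2col buffer. */
    return (size_t)l.out_h * l.out_w * l.size * l.size * l.c * sizeof(float);
#endif
}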

void add_bias(float *output, float *biases, int batch, int n, int size)

void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
#ifdef CUDNN
-void cudnn_convolutional_setup(layer *l);
+void cudnn_convolutional_setup(layer *l, int cudnn_preference);
#endif
#endif

#ifdef CUDNN
cudnnHandle_t cudnn_handle();
+enum {cudnn_fastest, cudnn_smallest};
#endif

#endif

        net->layers[i].batch = b;
#ifdef CUDNN
        if(net->layers[i].type == CONVOLUTIONAL){
-            cudnn_convolutional_setup(net->layers + i);
+            layer *l = net->layers + i;
+            cudnn_convolutional_setup(l, cudnn_fastest);
+            // check for excessive memory consumption
+            size_t free_byte;
+            size_t total_byte;
+            check_error(cudaMemGetInfo(&free_byte, &total_byte));
+            if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+                printf(" used slow CUDNN algo without Workspace! \n");
+                cudnn_convolutional_setup(l, cudnn_smallest);
+                l->workspace_size = get_workspace_size(*l);
+            }
        }
#endif
    }

    }
#ifdef GPU
    if(gpu_index >= 0){
-        printf(" try to allocate workspace, ");
+        printf(" try to allocate workspace = %zu * sizeof(float), ", (workspace_size - 1) / sizeof(float) + 1);
        net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
        printf(" CUDA allocate done! \n");
    }else {