From 028696bf15efeca3acb3db8c42a96f7b9e0f55ff Mon Sep 17 00:00:00 2001
From: iovodov <b@ovdv.ru>
Date: Thu, 03 May 2018 13:33:46 +0000
Subject: [PATCH] Output improvements for detector results: When printing detector results, output was done in random order, obfuscating results for interpreting. Now: 1. Text output includes coordinates of rects in (left,right,top,bottom in pixels) along with label and score 2. Text output is sorted by rect lefts to simplify finding appropriate rects on image 3. If several class probs are > thresh for some detection, the most probable is written first and coordinates for others are not repeated 4. Rects are imprinted in image in order by their best class prob, so most probable rects are always on top and not overlayed by less probable ones 5. Most probable label for rect is always written first Also: 6. Message about low GPU memory include required amount

---
 src/convolutional_layer.c |   95 ++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 801270a..b8065fd 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -139,25 +139,61 @@
 #ifdef CUDNN
 void cudnn_convolutional_setup(layer *l, int cudnn_preference)
 {
-    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
-    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
-    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
 
-    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
-    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
-    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
+#ifdef CUDNN_HALF
+	// TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): 
+	//   Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
+	// PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
+	const cudnnDataType_t data_type = CUDNN_DATA_HALF;
+#else
+	cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#endif
+
+#if(CUDNN_MAJOR >= 7)
+	// Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
+	// For *_ALGO_WINOGRAD_NONFUSED can be used CUDNN_DATA_FLOAT
+	// otherwise Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
+	// Three techniques for training using Mixed-precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
+	// 1. Accumulation into FP32
+	// 2. Loss Scaling - required only for: activation gradients. We do not use.
+	// 3. FP32 Master Copy of Weights
+	// More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
+	cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
+#endif
+
+	// INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported 
+	//   on architectures with DP4A support (compute capability 6.1 and later).
+	//cudnnDataType_t data_type = CUDNN_DATA_INT8;
+
+	// backward delta
+    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
+    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
+    cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+
+	// forward
+    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
+    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
+    cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+
+	// batch norm
+	cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
+	cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
+
+	cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
 #if(CUDNN_MAJOR >= 6)
-	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);	// cudnn 6.0
+	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);	// cudnn >= 6.0
 #else
 	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);	// cudnn 5.1
 #endif
 	int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
 	int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
 	int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
-	if (cudnn_preference == cudnn_smallest) {
+	if (cudnn_preference == cudnn_smallest) 
+	{
 		forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
 		backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
 		backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+		printf(" CUDNN-slow ");
 	}
 
 	cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
@@ -184,6 +220,38 @@
 			backward_filter,
             0,
             &l->bf_algo);
+
+	if (data_type == CUDNN_DATA_HALF) 
+	{
+		// HALF-16 if(data_type == CUDNN_DATA_HALF)
+		l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+		l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+		l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+
+		// FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
+		//l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
+		//l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
+		//l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
+
+		int fw = 0, bd = 0, bf = 0;
+		if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
+			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
+		if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
+			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
+
+		if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
+			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  \n");
+		if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
+			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
+
+		if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
+			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1   \n");
+		if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
+			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
+
+		if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
+		else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
+	}
 }
 #endif
 #endif
@@ -275,6 +343,10 @@
         }
 
         l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+#ifdef CUDNN_HALF
+		l.weights_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weights, c*n*size*size / 2);
+		l.weight_updates_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weight_updates, c*n*size*size / 2);
+#endif
         l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
 
         l.biases_gpu = cuda_make_array(l.biases, n);
@@ -307,7 +379,10 @@
             l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
             l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
         }
-#ifdef CUDNN
+#ifdef CUDNN		
+		cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
+		cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
+		cudnnCreateTensorDescriptor(&l.normTensorDesc);
         cudnnCreateTensorDescriptor(&l.srcTensorDesc);
         cudnnCreateTensorDescriptor(&l.dstTensorDesc);
         cudnnCreateFilterDescriptor(&l.weightDesc);
@@ -416,7 +491,7 @@
 	size_t total_byte;
 	check_error(cudaMemGetInfo(&free_byte, &total_byte));
 	if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
-		printf(" used slow CUDNN algo without Workspace! \n");
+		printf(" used slow CUDNN algo without Workspace! Need memory: %d, available: %d\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
 		cudnn_convolutional_setup(l, cudnn_smallest);
 		l->workspace_size = get_workspace_size(*l);
 	}

--
Gitblit v1.10.0