From cd2bdec09030edf7da79ecdeb38d908c106850b3 Mon Sep 17 00:00:00 2001
From: AlexeyAB <alexeyab84@gmail.com>
Date: Fri, 23 Feb 2018 12:05:31 +0000
Subject: [PATCH] Updated to CUDA 9.1. And fixed no_gpu dependecies.

---
 build/darknet/yolo_cpp_dll.vcxproj   |    4 +-
 Makefile                             |   15 +++++--
 src/convolutional_layer.c            |   18 ++++++--
 src/network_kernels.cu               |    7 +++
 build/darknet/darknet_no_gpu.vcxproj |    2 +
 build/darknet/darknet.vcxproj        |    4 +-
 src/convolutional_kernels.cu         |    5 +-
 README.md                            |    8 ++--
 8 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/Makefile b/Makefile
index f8bd4a5..8898ba8 100644
--- a/Makefile
+++ b/Makefile
@@ -9,18 +9,23 @@
       -gencode arch=compute_35,code=sm_35 \
       -gencode arch=compute_50,code=[sm_50,compute_50] \
       -gencode arch=compute_52,code=[sm_52,compute_52] \
-      -gencode arch=compute_61,code=[sm_61,compute_61]
+	  -gencode arch=compute_61,code=[sm_61,compute_61]
 
+# Tesla V100
+# ARCH= -gencode arch=compute_70,code=[sm_70,compute_70]
+
+# GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4
+# ARCH= -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61
+
+# GP100/Tesla P100 � DGX-1
+# ARCH= -gencode arch=compute_60,code=sm_60
 
 # For Jetson Tx1 uncomment:
 # ARCH= -gencode arch=compute_51,code=[sm_51,compute_51]
 
-# For Jetson Tx2 uncomment:
+# For Jetson Tx2 or Drive-PX2 uncomment:
 # ARCH= -gencode arch=compute_62,code=[sm_62,compute_62]
 
-# This is what I use, uncomment if you know your arch and want to specify
-# ARCH=  -gencode arch=compute_52,code=compute_52
-
 
 VPATH=./src/
 EXEC=darknet
diff --git a/README.md b/README.md
index ec8c19a..b7a4bd5 100644
--- a/README.md
+++ b/README.md
@@ -32,13 +32,13 @@
 
 * both Windows and Linux
 * both OpenCV 3.x and OpenCV 2.4.13
-* both cuDNN 5 and cuDNN 6
+* both cuDNN v5-v7
 * CUDA >= 7.5
 * also create SO-library on Linux and DLL-library on Windows
 
 ##### Requires: 
 * **Linux GCC>=4.9 or Windows MS Visual Studio 2015 (v140)**: https://go.microsoft.com/fwlink/?LinkId=532606&clcid=0x409  (or offline [ISO image](https://go.microsoft.com/fwlink/?LinkId=615448&clcid=0x409))
-* **CUDA 8.0**: https://developer.nvidia.com/cuda-downloads
+* **CUDA 9.1**: https://developer.nvidia.com/cuda-downloads
 * **OpenCV 3.x**: https://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.2.0/opencv-3.2.0-vc14.exe/download
 * **or OpenCV 2.4.13**: https://sourceforge.net/projects/opencvlibrary/files/opencv-win/2.4.13/opencv-2.4.13.2-vc14.exe/download
   - OpenCV allows to show image or video detection in the window and store result to file that specified in command line `-out_filename res.avi`
@@ -117,7 +117,7 @@
 Just do `make` in the darknet directory.
 Before make, you can set such options in the `Makefile`: [link](https://github.com/AlexeyAB/darknet/blob/9c1b9a2cf6363546c152251be578a21f3c3caec6/Makefile#L1)
 * `GPU=1` to build with CUDA to accelerate by using GPU (CUDA should be in `/usr/local/cuda`)
-* `CUDNN=1` to build with cuDNN v5/v6 to accelerate training by using GPU (cuDNN should be in `/usr/local/cudnn`)
+* `CUDNN=1` to build with cuDNN v5-v7 to accelerate training by using GPU (cuDNN should be in `/usr/local/cudnn`)
 * `OPENCV=1` to build with OpenCV 3.x/2.4.x - allows to detect on video files and video streams from network cameras or web-cams
 * `DEBUG=1` to bould debug version of Yolo
 * `OPENMP=1` to build with OpenMP support to accelerate Yolo by using multi-core CPU
@@ -142,7 +142,7 @@
   
 5. If you want to build with CUDNN to speed up then:
       
-    * download and install **cuDNN 6.0 for CUDA 8.0**: https://developer.nvidia.com/cudnn
+    * download and install **cuDNN 7.0 for CUDA 9.1**: https://developer.nvidia.com/cudnn
       
     * add Windows system variable `cudnn` with path to CUDNN: https://hsto.org/files/a49/3dc/fc4/a493dcfc4bd34a1295fd15e0e2e01f26.jpg
       
diff --git a/build/darknet/darknet.vcxproj b/build/darknet/darknet.vcxproj
index 0eba8f8..0ff8799 100644
--- a/build/darknet/darknet.vcxproj
+++ b/build/darknet/darknet.vcxproj
@@ -52,7 +52,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -281,6 +281,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.targets" />
   </ImportGroup>
 </Project>
\ No newline at end of file
diff --git a/build/darknet/darknet_no_gpu.vcxproj b/build/darknet/darknet_no_gpu.vcxproj
index c7d51e3..578a5af 100644
--- a/build/darknet/darknet_no_gpu.vcxproj
+++ b/build/darknet/darknet_no_gpu.vcxproj
@@ -198,6 +198,7 @@
     <ClCompile Include="..\..\src\gettimeofday.c" />
     <ClCompile Include="..\..\src\go.c" />
     <ClCompile Include="..\..\src\gru_layer.c" />
+    <ClCompile Include="..\..\src\http_stream.cpp" />
     <ClCompile Include="..\..\src\im2col.c" />
     <ClCompile Include="..\..\src\image.c" />
     <ClCompile Include="..\..\src\layer.c" />
@@ -251,6 +252,7 @@
     <ClInclude Include="..\..\src\getopt.h" />
     <ClInclude Include="..\..\src\gettimeofday.h" />
     <ClInclude Include="..\..\src\gru_layer.h" />
+    <ClInclude Include="..\..\src\http_stream.h" />
     <ClInclude Include="..\..\src\im2col.h" />
     <ClInclude Include="..\..\src\image.h" />
     <ClInclude Include="..\..\src\layer.h" />
diff --git a/build/darknet/yolo_cpp_dll.vcxproj b/build/darknet/yolo_cpp_dll.vcxproj
index b4a97a3..31699fb 100644
--- a/build/darknet/yolo_cpp_dll.vcxproj
+++ b/build/darknet/yolo_cpp_dll.vcxproj
@@ -52,7 +52,7 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
   <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.props" />
   </ImportGroup>
   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -285,6 +285,6 @@
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.targets" />
   </ImportGroup>
 </Project>
\ No newline at end of file
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index ee9b534..3b2a349 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -78,7 +78,7 @@
 {
 	int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
-	//if (idx < size) *((unsigned int *)output_f16 + idx) = __float2half(input_f32[idx]);
+	//if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
 }
 
 void cuda_convert_f32_to_f16(float* input_f32, size_t size, half *output_f16) {
@@ -89,7 +89,7 @@
 {
 	int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
-	//if (idx < size) output_f32[idx] = __half2float(*((unsigned int *)input_f16 + idx));
+	//if (idx < size) output_f32[idx] = __half2float(*((unsigned short *)input_f16 + idx));
 }
 
 void cuda_convert_f16_to_f32(half* input_f16, size_t size, float *output_f32) {
@@ -247,6 +247,7 @@
 
     if(state.delta){
         if(l.binary || l.xnor) swap_binary(&l);
+		// http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
         cudnnConvolutionBackwardData(cudnn_handle(),
                 &one,
                 l.weightDesc,
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index aeb621a..d35246e 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -141,19 +141,27 @@
 {
 
 #ifdef CUDNN_HALF
-	// TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0):
-	// Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
+	// TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
+	// PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
 	const cudnnDataType_t data_type = CUDNN_DATA_HALF;
 #else
 	cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #endif
-	// Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
+
 #if(CUDNN_MAJOR >= 7)
+	// Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
+	// For *_ALGO_WINOGRAD_NONFUSED can be used CUDNN_DATA_FLOAT
+	// otherwise Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
+	// Three techniques for training using Mixed-precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
+	// 1. Accumulation into FP32
+	// 2. Loss Scaling - required only for: activation gradients. We do not use.
+	// 3. FP32 Master Copy of Weights
+	// More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
 	cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
 #endif
 
 	// INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported 
-	// on architectures with DP4A support (compute capability 6.1 and later).
+	//   on architectures with DP4A support (compute capability 6.1 and later).
 	//cudnnDataType_t data_type = CUDNN_DATA_INT8;
 
     cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
@@ -164,7 +172,7 @@
     cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
     cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
 #if(CUDNN_MAJOR >= 6)
-	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, data_type);	// cudnn >= 6.0
+	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);	// cudnn >= 6.0
 #else
 	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);	// cudnn 5.1
 #endif
diff --git a/src/network_kernels.cu b/src/network_kernels.cu
index 64f4f9b..6090bb0 100644
--- a/src/network_kernels.cu
+++ b/src/network_kernels.cu
@@ -113,6 +113,13 @@
     state.delta = 0;
     state.truth = *net.truth_gpu;
     state.train = 1;
+#ifdef CUDNN_HALF
+	int i;
+	for (i = 0; i < net.n; ++i) {
+		layer l = net.layers[i];
+		cuda_convert_f32_to_f16(l.weights_gpu, l.c*l.n*l.size*l.size, (half *)l.weights_gpu16);
+	}
+#endif
     forward_network_gpu(net, state);
 	cudaStreamSynchronize(get_cuda_stream());
     backward_network_gpu(net, state);

--
Gitblit v1.10.0