Updated to CUDA 9.1 and fixed no_gpu dependencies.

        -gencode arch=compute_35,code=sm_35 \
        -gencode arch=compute_50,code=[sm_50,compute_50] \
        -gencode arch=compute_52,code=[sm_52,compute_52] \
        -gencode arch=compute_61,code=[sm_61,compute_61]

    # Tesla V100
    # ARCH= -gencode arch=compute_70,code=[sm_70,compute_70]

    # GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4
    # ARCH= -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61

    # GP100/Tesla P100 DGX-1
    # ARCH= -gencode arch=compute_60,code=sm_60

    # For Jetson Tx1 uncomment:
    # ARCH= -gencode arch=compute_53,code=[sm_53,compute_53]

    # For Jetson Tx2 or Drive-PX2 uncomment:
    # ARCH= -gencode arch=compute_62,code=[sm_62,compute_62]

    # This is what I use, uncomment if you know your arch and want to specify
    # ARCH= -gencode arch=compute_52,code=compute_52

    VPATH=./src/
    EXEC=darknet

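Which `-gencode` pair to enable depends on the compute capability of your GPU. If you are not sure what your card reports, a small stand-alone query program (an illustrative sketch, not part of this repository) can print it via the CUDA runtime:

```cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Print the compute capability of every visible GPU, so you can pick the
// matching "-gencode arch=compute_XY,code=sm_XY" pair above.
int main(void) {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("GPU %d: %s, compute capability %d.%d -> compute_%d%d / sm_%d%d\n",
               i, prop.name, prop.major, prop.minor,
               prop.major, prop.minor, prop.major, prop.minor);
    }
    return 0;
}
```

Compile it with `nvcc`; for example, a GTX 1080 reports 6.1 and maps to `compute_61` / `sm_61`.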

* both Windows and Linux
* both OpenCV 3.x and OpenCV 2.4.13
* both cuDNN v5-v7
* CUDA >= 7.5
* also creates an SO library on Linux and a DLL library on Windows

##### Requires:
* **Linux GCC>=4.9 or Windows MS Visual Studio 2015 (v140)**: https://go.microsoft.com/fwlink/?LinkId=532606&clcid=0x409 (or offline [ISO image](https://go.microsoft.com/fwlink/?LinkId=615448&clcid=0x409))
* **CUDA 9.1**: https://developer.nvidia.com/cuda-downloads
* **OpenCV 3.x**: https://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.2.0/opencv-3.2.0-vc14.exe/download
* **or OpenCV 2.4.13**: https://sourceforge.net/projects/opencvlibrary/files/opencv-win/2.4.13/opencv-2.4.13.2-vc14.exe/download
  - OpenCV lets you show image or video detection results in a window and store them to a file specified on the command line with `-out_filename res.avi`

Just do `make` in the darknet directory.
Before `make`, you can set such options in the `Makefile` ([link](https://github.com/AlexeyAB/darknet/blob/9c1b9a2cf6363546c152251be578a21f3c3caec6/Makefile#L1)); a short sketch of how these flags reach the code follows the list:
* `GPU=1` to build with CUDA to accelerate by using the GPU (CUDA should be in `/usr/local/cuda`)
* `CUDNN=1` to build with cuDNN v5-v7 to accelerate training by using the GPU (cuDNN should be in `/usr/local/cudnn`)
* `OPENCV=1` to build with OpenCV 3.x/2.4.x - allows detection on video files and video streams from network cameras or web-cams
* `DEBUG=1` to build the debug version of Yolo
* `OPENMP=1` to build with OpenMP support to accelerate Yolo by using a multi-core CPU

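These switches typically end up as preprocessor defines (`GPU=1` adds `-DGPU`, `CUDNN=1` adds `-DCUDNN`, `OPENCV=1` adds `-DOPENCV`), and the sources select code paths with `#ifdef`. A minimal sketch of that pattern (illustrative only, not taken from the repository):

```cuda
#include <stdio.h>

// Compile-time feature selection as driven by the Makefile flags, e.g.:
//   nvcc -DGPU -DCUDNN feature_check.cu   (or gcc for a CPU-only build)
int main(void) {
#ifdef GPU
    printf("built with CUDA (GPU=1)\n");
#else
    printf("CPU-only build (GPU=0)\n");
#endif
#ifdef CUDNN
    printf("built with cuDNN (CUDNN=1)\n");
#endif
#ifdef OPENCV
    printf("built with OpenCV (OPENCV=1)\n");
#endif
    return 0;
}
```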

5. If you want to build with CUDNN to speed up, then:

   * download and install **cuDNN 7.0 for CUDA 9.1**: https://developer.nvidia.com/cudnn (a small version-check sketch follows below)

   * add a Windows system variable `cudnn` with the path to cuDNN: https://hsto.org/files/a49/3dc/fc4/a493dcfc4bd34a1295fd15e0e2e01f26.jpg

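To verify which cuDNN a build actually sees, a small check program (an illustrative sketch, not part of the repository) can print both the header version it was compiled against and the library version it linked:

```cuda
#include <stdio.h>
#include <cudnn.h>

// Compare the cuDNN version from the headers with the loaded library.
int main(void) {
    printf("compiled against cuDNN %d.%d.%d\n",
           CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
    printf("linked against cuDNN %zu\n", (size_t)cudnnGetVersion());
    return 0;
}
```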

    </PropertyGroup>
    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
    <ImportGroup Label="ExtensionSettings">
      <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.props" />
    </ImportGroup>
    <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
      <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />

    </ItemGroup>
    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
    <ImportGroup Label="ExtensionTargets">
      <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.targets" />
    </ImportGroup>
    </Project>

    <ClCompile Include="..\..\src\gettimeofday.c" />
    <ClCompile Include="..\..\src\go.c" />
    <ClCompile Include="..\..\src\gru_layer.c" />
    <ClCompile Include="..\..\src\http_stream.cpp" />
    <ClCompile Include="..\..\src\im2col.c" />
    <ClCompile Include="..\..\src\image.c" />
    <ClCompile Include="..\..\src\layer.c" />

    <ClInclude Include="..\..\src\getopt.h" />
    <ClInclude Include="..\..\src\gettimeofday.h" />
    <ClInclude Include="..\..\src\gru_layer.h" />
    <ClInclude Include="..\..\src\http_stream.h" />
    <ClInclude Include="..\..\src\im2col.h" />
    <ClInclude Include="..\..\src\image.h" />
    <ClInclude Include="..\..\src\layer.h" />

    </PropertyGroup>
    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
    <ImportGroup Label="ExtensionSettings">
      <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.props" />
    </ImportGroup>
    <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
      <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />

    </ItemGroup>
    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
    <ImportGroup Label="ExtensionTargets">
      <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.targets" />
    </ImportGroup>
    </Project>

    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
        //if (idx < size) *((unsigned int *)output_f16 + idx) = __float2half(input_f32[idx]);
        //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
    }

    void cuda_convert_f32_to_f16(float* input_f32, size_t size, half *output_f16) {

    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
        //if (idx < size) output_f32[idx] = __half2float(*((unsigned int *)input_f16 + idx));
        //if (idx < size) output_f32[idx] = __half2float(*((unsigned short *)input_f16 + idx));
    }

    void cuda_convert_f16_to_f32(half* input_f16, size_t size, float *output_f32) {

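The two host wrappers above are cut off before their bodies. As a self-contained illustration of how such FP32/FP16 conversion kernels are typically launched (one thread per element), here is a sketch; the kernel names, the `BLOCK` constant, and the omission of the CUDA stream are assumptions, not the repository's exact code:

```cuda
#include <stdio.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#define BLOCK 512  // assumed block size; the repository defines its own constant

// Kernels equivalent to the bodies shown above (names here are illustrative).
__global__ void f32_to_f16_kernel(float *input_f32, size_t size, half *output_f16) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
}

__global__ void f16_to_f32_kernel(half *input_f16, size_t size, float *output_f32) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
}

// Host wrappers: launch size/BLOCK + 1 blocks of BLOCK threads each.
void convert_f32_to_f16(float *input_f32, size_t size, half *output_f16) {
    f32_to_f16_kernel<<<size / BLOCK + 1, BLOCK>>>(input_f32, size, output_f16);
}

void convert_f16_to_f32(half *input_f16, size_t size, float *output_f32) {
    f16_to_f32_kernel<<<size / BLOCK + 1, BLOCK>>>(input_f16, size, output_f32);
}

int main(void) {
    const size_t n = 1024;
    float *src, *dst;
    half *tmp;
    cudaMallocManaged(&src, n * sizeof(float));
    cudaMallocManaged(&dst, n * sizeof(float));
    cudaMallocManaged(&tmp, n * sizeof(half));
    for (size_t i = 0; i < n; ++i) src[i] = 0.5f * i;

    convert_f32_to_f16(src, n, tmp);   // FP32 -> FP16
    convert_f16_to_f32(tmp, n, dst);   // FP16 -> FP32
    cudaDeviceSynchronize();

    printf("dst[10] = %f (expected 5.0)\n", dst[10]);
    cudaFree(src); cudaFree(dst); cudaFree(tmp);
    return 0;
}
```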

    if(state.delta){
        if(l.binary || l.xnor) swap_binary(&l);
        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
        cudnnConvolutionBackwardData(cudnn_handle(),
            &one,
            l.weightDesc,

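The `cudnnConvolutionBackwardData` call above is truncated after the filter descriptor. For reference, a compile-only sketch of cuDNN's documented argument order (parameter names and the mapping in the comments are illustrative, loosely matched to the darknet variables):

```cuda
#include <cudnn.h>

// Sketch of the full argument order of cudnnConvolutionBackwardData.
static cudnnStatus_t backward_data_sketch(
    cudnnHandle_t handle,
    const void *alpha,                      // scaling factor (e.g. &one)
    cudnnFilterDescriptor_t wDesc,          // l.weightDesc
    const void *w,                          // weights
    cudnnTensorDescriptor_t dyDesc,         // descriptor of the output gradient
    const void *dy,                         // delta coming from the next layer
    cudnnConvolutionDescriptor_t convDesc,  // convolution descriptor
    cudnnConvolutionBwdDataAlgo_t algo,     // backward-data algorithm
    void *workspace, size_t workspace_size, // scratch memory
    const void *beta,                       // scaling factor for the existing dx
    cudnnTensorDescriptor_t dxDesc,         // descriptor of the input gradient
    void *dx)                               // input gradient (e.g. state.delta)
{
    return cudnnConvolutionBackwardData(handle, alpha, wDesc, w, dyDesc, dy,
                                        convDesc, algo, workspace, workspace_size,
                                        beta, dxDesc, dx);
}
```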
    {

    #ifdef CUDNN_HALF
        // TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
        // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
        const cudnnDataType_t data_type = CUDNN_DATA_HALF;
    #else
        cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
    #endif

    #if(CUDNN_MAJOR >= 7)
        // Tensor Cores use CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
        // For *_ALGO_WINOGRAD_NONFUSED, CUDNN_DATA_FLOAT can be used;
        // otherwise the Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
        // Three techniques for training with mixed precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
        // 1. Accumulation into FP32
        // 2. Loss scaling - required only for activation gradients. We do not use it.
        // 3. FP32 master copy of weights
        // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
        cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
    #endif

        // INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported
        // on architectures with DP4A support (compute capability 6.1 and later).
        //cudnnDataType_t data_type = CUDNN_DATA_INT8;

        cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);

        cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
        cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
    #if(CUDNN_MAJOR >= 6)
        cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0
    #else
        cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1
    #endif

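For a stand-alone illustration of what these descriptor calls do, the following sketch (assumed padding/stride values, not the repository's code) creates a convolution descriptor, opts in to Tensor Core math when cuDNN 7+ is present, and sets the 2D convolution parameters in the same way:

```cuda
#include <stdio.h>
#include <cudnn.h>

// Create a convolution descriptor, request Tensor Core math on cuDNN 7+,
// and configure a cross-correlation with assumed pad=1, stride=1, dilation=1.
int main(void) {
    cudnnConvolutionDescriptor_t convDesc;
    cudnnCreateConvolutionDescriptor(&convDesc);
#if CUDNN_MAJOR >= 7
    cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH);
#endif
#if CUDNN_MAJOR >= 6
    // pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, mode, computeType
    cudnnSetConvolution2dDescriptor(convDesc, 1, 1, 1, 1, 1, 1,
                                    CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
#else
    cudnnSetConvolution2dDescriptor(convDesc, 1, 1, 1, 1, 1, 1,
                                    CUDNN_CROSS_CORRELATION);
#endif
    cudnnDestroyConvolutionDescriptor(convDesc);
    printf("convolution descriptor configured\n");
    return 0;
}
```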

    state.delta = 0;
    state.truth = *net.truth_gpu;
    state.train = 1;
    #ifdef CUDNN_HALF
    int i;
    for (i = 0; i < net.n; ++i) {
        layer l = net.layers[i];
        cuda_convert_f32_to_f16(l.weights_gpu, l.c*l.n*l.size*l.size, (half *)l.weights_gpu16);
    }
    #endif
    forward_network_gpu(net, state);
    cudaStreamSynchronize(get_cuda_stream());
    backward_network_gpu(net, state);