Updated to CUDA 9.1 and fixed no_gpu dependencies.

        -gencode arch=compute_35,code=sm_35 \
        -gencode arch=compute_50,code=[sm_50,compute_50] \
        -gencode arch=compute_52,code=[sm_52,compute_52] \
        -gencode arch=compute_61,code=[sm_61,compute_61]

    # Tesla V100
    # ARCH= -gencode arch=compute_70,code=[sm_70,compute_70]

    # GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4
    # ARCH= -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61

    # GP100/Tesla P100 DGX-1
    # ARCH= -gencode arch=compute_60,code=sm_60

    # For Jetson Tx1 uncomment:
    # ARCH= -gencode arch=compute_53,code=[sm_53,compute_53]

    # For Jetson Tx2 or Drive-PX2 uncomment:
    # ARCH= -gencode arch=compute_62,code=[sm_62,compute_62]

    # This is what I use, uncomment if you know your arch and want to specify
    # ARCH= -gencode arch=compute_52,code=compute_52

    VPATH=./src/
    EXEC=darknet

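Which `-gencode` pair to enable depends on the compute capability of your GPU. If you are not sure what your card reports, a small stand-alone query program (an illustrative sketch, not part of this repository) can print it via the CUDA runtime:

```cuda
#include <stdio.h>
#include <cuda_runtime.h>

// Print the compute capability of every visible GPU, so you can pick the
// matching "-gencode arch=compute_XY,code=sm_XY" pair above.
int main(void) {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("GPU %d: %s, compute capability %d.%d -> compute_%d%d / sm_%d%d\n",
               i, prop.name, prop.major, prop.minor,
               prop.major, prop.minor, prop.major, prop.minor);
    }
    return 0;
}
```

Compile it with `nvcc`; for example, a GTX 1080 reports 6.1 and maps to `compute_61` / `sm_61`.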

* both Windows and Linux
* both OpenCV 3.x and OpenCV 2.4.13
* both cuDNN v5-v7
* CUDA >= 7.5
* also creates an SO library on Linux and a DLL library on Windows

##### Requires:
* **Linux GCC>=4.9 or Windows MS Visual Studio 2015 (v140)**: https://go.microsoft.com/fwlink/?LinkId=532606&clcid=0x409 (or offline [ISO image](https://go.microsoft.com/fwlink/?LinkId=615448&clcid=0x409))
* **CUDA 9.1**: https://developer.nvidia.com/cuda-downloads
* **OpenCV 3.x**: https://sourceforge.net/projects/opencvlibrary/files/opencv-win/3.2.0/opencv-3.2.0-vc14.exe/download
* **or OpenCV 2.4.13**: https://sourceforge.net/projects/opencvlibrary/files/opencv-win/2.4.13/opencv-2.4.13.2-vc14.exe/download
  - OpenCV lets you show image or video detection results in a window and store them to a file specified on the command line with `-out_filename res.avi`

Just do `make` in the darknet directory.
Before `make`, you can set such options in the `Makefile` ([link](https://github.com/AlexeyAB/darknet/blob/9c1b9a2cf6363546c152251be578a21f3c3caec6/Makefile#L1)); a short sketch of how these flags reach the code follows the list:
* `GPU=1` to build with CUDA to accelerate by using the GPU (CUDA should be in `/usr/local/cuda`)
* `CUDNN=1` to build with cuDNN v5-v7 to accelerate training by using the GPU (cuDNN should be in `/usr/local/cudnn`)
* `OPENCV=1` to build with OpenCV 3.x/2.4.x - allows detection on video files and video streams from network cameras or web-cams
* `DEBUG=1` to build the debug version of Yolo
* `OPENMP=1` to build with OpenMP support to accelerate Yolo by using a multi-core CPU

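These switches typically end up as preprocessor defines (`GPU=1` adds `-DGPU`, `CUDNN=1` adds `-DCUDNN`, `OPENCV=1` adds `-DOPENCV`), and the sources select code paths with `#ifdef`. A minimal sketch of that pattern (illustrative only, not taken from the repository):

```cuda
#include <stdio.h>

// Compile-time feature selection as driven by the Makefile flags, e.g.:
//   nvcc -DGPU -DCUDNN feature_check.cu   (or gcc for a CPU-only build)
int main(void) {
#ifdef GPU
    printf("built with CUDA (GPU=1)\n");
#else
    printf("CPU-only build (GPU=0)\n");
#endif
#ifdef CUDNN
    printf("built with cuDNN (CUDNN=1)\n");
#endif
#ifdef OPENCV
    printf("built with OpenCV (OPENCV=1)\n");
#endif
    return 0;
}
```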

5. If you want to build with CUDNN to speed up, then:

   * download and install **cuDNN 7.0 for CUDA 9.1**: https://developer.nvidia.com/cudnn (a small version-check sketch follows below)

   * add a Windows system variable `cudnn` with the path to cuDNN: https://hsto.org/files/a49/3dc/fc4/a493dcfc4bd34a1295fd15e0e2e01f26.jpg

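To verify which cuDNN a build actually sees, a small check program (an illustrative sketch, not part of the repository) can print both the header version it was compiled against and the library version it linked:

```cuda
#include <stdio.h>
#include <cudnn.h>

// Compare the cuDNN version from the headers with the loaded library.
int main(void) {
    printf("compiled against cuDNN %d.%d.%d\n",
           CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
    printf("linked against cuDNN %zu\n", (size_t)cudnnGetVersion());
    return 0;
}
```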

    </PropertyGroup>
    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
    <ImportGroup Label="ExtensionSettings">
      <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.props" />
    </ImportGroup>
    <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
      <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />

    </ItemGroup>
    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
    <ImportGroup Label="ExtensionTargets">
      <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.targets" />
    </ImportGroup>
    </Project>

    <ClCompile Include="..\..\src\gettimeofday.c" />
    <ClCompile Include="..\..\src\go.c" />
    <ClCompile Include="..\..\src\gru_layer.c" />
    <ClCompile Include="..\..\src\http_stream.cpp" />
    <ClCompile Include="..\..\src\im2col.c" />
    <ClCompile Include="..\..\src\image.c" />
    <ClCompile Include="..\..\src\layer.c" />

    <ClInclude Include="..\..\src\getopt.h" />
    <ClInclude Include="..\..\src\gettimeofday.h" />
    <ClInclude Include="..\..\src\gru_layer.h" />
    <ClInclude Include="..\..\src\http_stream.h" />
    <ClInclude Include="..\..\src\im2col.h" />
    <ClInclude Include="..\..\src\image.h" />
    <ClInclude Include="..\..\src\layer.h" />

    </PropertyGroup>
    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
    <ImportGroup Label="ExtensionSettings">
      <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.props" />
    </ImportGroup>
    <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
      <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />

    </ItemGroup>
    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
    <ImportGroup Label="ExtensionTargets">
      <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 9.1.targets" />
    </ImportGroup>
    </Project>

    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
        //if (idx < size) *((unsigned int *)output_f16 + idx) = __float2half(input_f32[idx]);
        //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
    }

    void cuda_convert_f32_to_f16(float* input_f32, size_t size, half *output_f16) {

    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
        //if (idx < size) output_f32[idx] = __half2float(*((unsigned int *)input_f16 + idx));
        //if (idx < size) output_f32[idx] = __half2float(*((unsigned short *)input_f16 + idx));
    }

    void cuda_convert_f16_to_f32(half* input_f16, size_t size, float *output_f32) {

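The two host wrappers above are cut off before their bodies. As a self-contained illustration of how such FP32/FP16 conversion kernels are typically launched (one thread per element), here is a sketch; the kernel names, the `BLOCK` constant, and the omission of the CUDA stream are assumptions, not the repository's exact code:

```cuda
#include <stdio.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#define BLOCK 512  // assumed block size; the repository defines its own constant

// Kernels equivalent to the bodies shown above (names here are illustrative).
__global__ void f32_to_f16_kernel(float *input_f32, size_t size, half *output_f16) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
}

__global__ void f16_to_f32_kernel(half *input_f16, size_t size, float *output_f32) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
}

// Host wrappers: launch size/BLOCK + 1 blocks of BLOCK threads each.
void convert_f32_to_f16(float *input_f32, size_t size, half *output_f16) {
    f32_to_f16_kernel<<<size / BLOCK + 1, BLOCK>>>(input_f32, size, output_f16);
}

void convert_f16_to_f32(half *input_f16, size_t size, float *output_f32) {
    f16_to_f32_kernel<<<size / BLOCK + 1, BLOCK>>>(input_f16, size, output_f32);
}

int main(void) {
    const size_t n = 1024;
    float *src, *dst;
    half *tmp;
    cudaMallocManaged(&src, n * sizeof(float));
    cudaMallocManaged(&dst, n * sizeof(float));
    cudaMallocManaged(&tmp, n * sizeof(half));
    for (size_t i = 0; i < n; ++i) src[i] = 0.5f * i;

    convert_f32_to_f16(src, n, tmp);   // FP32 -> FP16
    convert_f16_to_f32(tmp, n, dst);   // FP16 -> FP32
    cudaDeviceSynchronize();

    printf("dst[10] = %f (expected 5.0)\n", dst[10]);
    cudaFree(src); cudaFree(dst); cudaFree(tmp);
    return 0;
}
```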

    if(state.delta){
        if(l.binary || l.xnor) swap_binary(&l);
        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
        cudnnConvolutionBackwardData(cudnn_handle(),
            &one,
            l.weightDesc,

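The `cudnnConvolutionBackwardData` call above is truncated after the filter descriptor. For reference, a compile-only sketch of cuDNN's documented argument order (parameter names and the mapping in the comments are illustrative, loosely matched to the darknet variables):

```cuda
#include <cudnn.h>

// Sketch of the full argument order of cudnnConvolutionBackwardData.
static cudnnStatus_t backward_data_sketch(
    cudnnHandle_t handle,
    const void *alpha,                      // scaling factor (e.g. &one)
    cudnnFilterDescriptor_t wDesc,          // l.weightDesc
    const void *w,                          // weights
    cudnnTensorDescriptor_t dyDesc,         // descriptor of the output gradient
    const void *dy,                         // delta coming from the next layer
    cudnnConvolutionDescriptor_t convDesc,  // convolution descriptor
    cudnnConvolutionBwdDataAlgo_t algo,     // backward-data algorithm
    void *workspace, size_t workspace_size, // scratch memory
    const void *beta,                       // scaling factor for the existing dx
    cudnnTensorDescriptor_t dxDesc,         // descriptor of the input gradient
    void *dx)                               // input gradient (e.g. state.delta)
{
    return cudnnConvolutionBackwardData(handle, alpha, wDesc, w, dyDesc, dy,
                                        convDesc, algo, workspace, workspace_size,
                                        beta, dxDesc, dx);
}
```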
    {

    #ifdef CUDNN_HALF
        // TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
        // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
        const cudnnDataType_t data_type = CUDNN_DATA_HALF;
    #else
        cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
    #endif

    #if(CUDNN_MAJOR >= 7)
        // Tensor Cores use CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
        // For *_ALGO_WINOGRAD_NONFUSED, CUDNN_DATA_FLOAT can be used;
        // otherwise the Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
        // Three techniques for training with mixed precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
        // 1. Accumulation into FP32
        // 2. Loss scaling - required only for activation gradients. We do not use it.
        // 3. FP32 master copy of weights
        // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
        cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
    #endif

        // INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported
        // on architectures with DP4A support (compute capability 6.1 and later).
        //cudnnDataType_t data_type = CUDNN_DATA_INT8;

        cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);

        cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
        cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
    #if(CUDNN_MAJOR >= 6)
        cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0
    #else
        cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1
    #endif

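For a stand-alone illustration of what these descriptor calls do, the following sketch (assumed padding/stride values, not the repository's code) creates a convolution descriptor, opts in to Tensor Core math when cuDNN 7+ is present, and sets the 2D convolution parameters in the same way:

```cuda
#include <stdio.h>
#include <cudnn.h>

// Create a convolution descriptor, request Tensor Core math on cuDNN 7+,
// and configure a cross-correlation with assumed pad=1, stride=1, dilation=1.
int main(void) {
    cudnnConvolutionDescriptor_t convDesc;
    cudnnCreateConvolutionDescriptor(&convDesc);
#if CUDNN_MAJOR >= 7
    cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH);
#endif
#if CUDNN_MAJOR >= 6
    // pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, mode, computeType
    cudnnSetConvolution2dDescriptor(convDesc, 1, 1, 1, 1, 1, 1,
                                    CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
#else
    cudnnSetConvolution2dDescriptor(convDesc, 1, 1, 1, 1, 1, 1,
                                    CUDNN_CROSS_CORRELATION);
#endif
    cudnnDestroyConvolutionDescriptor(convDesc);
    printf("convolution descriptor configured\n");
    return 0;
}
```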

    state.delta = 0;
    state.truth = *net.truth_gpu;
    state.train = 1;
    #ifdef CUDNN_HALF
    int i;
    for (i = 0; i < net.n; ++i) {
        layer l = net.layers[i];
        cuda_convert_f32_to_f16(l.weights_gpu, l.c*l.n*l.size*l.size, (half *)l.weights_gpu16);
    }
    #endif
    forward_network_gpu(net, state);
    cudaStreamSynchronize(get_cuda_stream());
    backward_network_gpu(net, state);