AlexeyAB
2017-07-24 51477ab274bbb40be259844bdaab9685f693b028
Increased Yolo performance on CPU by 2.3x by using OpenMP in both darknet_no_gpu and yolo_cpp_dll_no_gpu
3 files modified
24 ■■■■■ changed files
build/darknet/darknet_no_gpu.vcxproj 2 ●●●●● patch | view | raw | blame | history
build/darknet/yolo_cpp_dll_no_gpu.vcxproj 1 ●●●● patch | view | raw | blame | history
src/gemm.c 21 ●●●●● patch | view | raw | blame | history
build/darknet/darknet_no_gpu.vcxproj
@@ -89,6 +89,7 @@
      <AdditionalIncludeDirectories>C:\opencv_2.4.9\opencv\build\include;..\..\3rdparty\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <PreprocessorDefinitions>_MBCS;OPENCV;_TIMESPEC_DEFINED;_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <UndefinePreprocessorDefinitions>CUDNN</UndefinePreprocessorDefinitions>
      <OpenMPSupport>true</OpenMPSupport>
    </ClCompile>
    <Link>
      <GenerateDebugInformation>true</GenerateDebugInformation>
@@ -133,6 +134,7 @@
      <PrecompiledHeaderCompileAs>CompileAsCpp</PrecompiledHeaderCompileAs>
      <CompileAs>Default</CompileAs>
      <UndefinePreprocessorDefinitions>CUDNN</UndefinePreprocessorDefinitions>
      <OpenMPSupport>true</OpenMPSupport>
    </ClCompile>
    <Link>
      <GenerateDebugInformation>true</GenerateDebugInformation>
build/darknet/yolo_cpp_dll_no_gpu.vcxproj
@@ -139,6 +139,7 @@
      <UndefinePreprocessorDefinitions>
      </UndefinePreprocessorDefinitions>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
      <OpenMPSupport>true</OpenMPSupport>
    </ClCompile>
    <Link>
      <GenerateDebugInformation>true</GenerateDebugInformation>
src/gemm.c
@@ -151,14 +151,19 @@
            C[i*ldc + j] *= BETA;
        }
    }
    if(!TA && !TB)
        gemm_nn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
    else if(TA && !TB)
        gemm_tn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
    else if(!TA && TB)
        gemm_nt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
    else
        gemm_tt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
    int t;
    #pragma omp parallel for
    for (t = 0; t < M; ++t) {
        if (!TA && !TB)
            gemm_nn(1, N, K, ALPHA, A + t*lda, lda, B, ldb, C + t*ldc, ldc);
        else if (TA && !TB)
            gemm_tn(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
        else if (!TA && TB)
            gemm_nt(1, N, K, ALPHA, A + t*lda, lda, B, ldb, C + t*ldc, ldc);
        else
            gemm_tt(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
    }
}
#ifdef GPU