~speedprog/mtg/mtg_card_detector.git

parent: 81d4688b | patch | commit | ignore whitespace

Just used spaces for indents instead of Tabs

AlexeyAB

2018-07-10 cfc5fedbb6df2471493b1ec162d0024485618211

Just used spaces for indents instead of Tabs

30 files modified

1 file added

	src/.editorconfig	8 ●●●●● patch \| view \| raw \| blame \| history
	src/batchnorm_layer.c	152 ●●●●● patch \| view \| raw \| blame \| history
	src/blas.c	32 ●●●●● patch \| view \| raw \| blame \| history
	src/blas_kernels.cu	62 ●●●●● patch \| view \| raw \| blame \| history
	src/box.c	142 ●●●●● patch \| view \| raw \| blame \| history
	src/convolutional_kernels.cu	432 ●●●●● patch \| view \| raw \| blame \| history
	src/convolutional_layer.c	206 ●●●●● patch \| view \| raw \| blame \| history
	src/cuda.c	46 ●●●●● patch \| view \| raw \| blame \| history
	src/data.c	314 ●●●●● patch \| view \| raw \| blame \| history
	src/demo.c	260 ●●●●● patch \| view \| raw \| blame \| history
	src/detector.c	1758 ●●●●● patch \| view \| raw \| blame \| history
	src/gemm.c	208 ●●●●● patch \| view \| raw \| blame \| history
	src/gettimeofday.c	2 ●●●●● patch \| view \| raw \| blame \| history
	src/http_stream.cpp	442 ●●●●● patch \| view \| raw \| blame \| history
	src/http_stream.h	4 ●●●●● patch \| view \| raw \| blame \| history
	src/image.c	1102 ●●●●● patch \| view \| raw \| blame \| history
	src/list.c	74 ●●●●● patch \| view \| raw \| blame \| history
	src/network.c	364 ●●●●● patch \| view \| raw \| blame \| history
	src/network_kernels.cu	46 ●●●●● patch \| view \| raw \| blame \| history
	src/option_list.c	28 ●●●●● patch \| view \| raw \| blame \| history
	src/parser.c	220 ●●●●● patch \| view \| raw \| blame \| history
	src/region_layer.c	284 ●●●●● patch \| view \| raw \| blame \| history
	src/reorg_layer.c	48 ●●●●● patch \| view \| raw \| blame \| history
	src/reorg_old_layer.c	48 ●●●●● patch \| view \| raw \| blame \| history
	src/shortcut_layer.c	24 ●●●●● patch \| view \| raw \| blame \| history
	src/tree.c	54 ●●●●● patch \| view \| raw \| blame \| history
	src/utils.c	144 ●●●●● patch \| view \| raw \| blame \| history
	src/yolo_console_dll.cpp	746 ●●●●● patch \| view \| raw \| blame \| history
	src/yolo_layer.c	140 ●●●●● patch \| view \| raw \| blame \| history
	src/yolo_v2_class.cpp	398 ●●●●● patch \| view \| raw \| blame \| history
	src/yolo_v2_class.hpp	892 ●●●●● patch \| view \| raw \| blame \| history

 src/.editorconfig

New file
@@ -0,0 +1,8 @@
root=true

[*]
trim_trailing_whitespace = true
indent_style = space
indent_size = 4



 src/batchnorm_layer.c

@@ -53,10 +53,10 @@
    layer.x_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
    layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
#ifdef CUDNN
    cudnnCreateTensorDescriptor(&layer.normTensorDesc);
    cudnnCreateTensorDescriptor(&layer.normDstTensorDesc);
    cudnnSetTensor4dDescriptor(layer.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w);
    cudnnSetTensor4dDescriptor(layer.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, layer.out_c, 1, 1);
    cudnnCreateTensorDescriptor(&layer.normTensorDesc);
    cudnnCreateTensorDescriptor(&layer.normDstTensorDesc);
    cudnnSetTensor4dDescriptor(layer.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w);
    cudnnSetTensor4dDescriptor(layer.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, layer.out_c, 1, 1);
#endif
#endif
    return layer;
@@ -179,93 +179,93 @@

// GPU forward pass of batch normalization.
//
// When the layer itself is of type BATCHNORM, state.input is first copied into
// l.output_gpu; l.output_gpu is then snapshotted into l.x_gpu.
//  - state.train set, CUDNN build: cudnnBatchNormalizationForwardTraining reads
//    l.x_gpu and writes l.output_gpu, also receiving the rolling and per-batch
//    mean/variance buffers.
//  - state.train set, non-CUDNN build: computes the batch mean/variance, blends
//    them into the rolling statistics with weights .99 / .01, normalizes
//    l.output_gpu (keeping copies in l.x_gpu and l.x_norm_gpu), then applies
//    scale and bias.
//  - otherwise: normalizes with the rolling mean/variance, then scale and bias.
//
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
void forward_batchnorm_layer_gpu(layer l, network_state state)
{
    if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
    copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
    if (state.train) {
    if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
    copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
    if (state.train) {
#ifdef CUDNN
        float one = 1;
        float zero = 0;
        cudnnBatchNormalizationForwardTraining(cudnn_handle(),
            CUDNN_BATCHNORM_SPATIAL,
            &one,
            &zero,
            l.normDstTensorDesc,
            l.x_gpu,                // input
            l.normDstTensorDesc,
            l.output_gpu,           // output
            l.normTensorDesc,
            l.scales_gpu,
            l.biases_gpu,
            .01,
            l.rolling_mean_gpu,     // output (should be FP32)
            l.rolling_variance_gpu, // output (should be FP32)
            .00001,
            l.mean_gpu,         // output (should be FP32)
            l.variance_gpu);    // output (should be FP32)
        float one = 1;
        float zero = 0;
        cudnnBatchNormalizationForwardTraining(cudnn_handle(),
            CUDNN_BATCHNORM_SPATIAL,
            &one,
            &zero,
            l.normDstTensorDesc,
            l.x_gpu,                // input
            l.normDstTensorDesc,
            l.output_gpu,            // output
            l.normTensorDesc,
            l.scales_gpu,
            l.biases_gpu,
            .01,
            l.rolling_mean_gpu,        // output (should be FP32)
            l.rolling_variance_gpu,    // output (should be FP32)
            .00001,
            l.mean_gpu,            // output (should be FP32)
            l.variance_gpu);    // output (should be FP32)
#else
        fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
        fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
        fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
        fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

        // rolling = .99 * rolling + .01 * batch statistic (assuming BLAS-style scal/axpy semantics)
        scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
        axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
        scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
        axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
        scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
        axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
        scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
        axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
        normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
        normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif
    }
    else {
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    }
    }
    else {
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    }

}

// GPU backward pass of batch normalization.
//
// When state.train is not set, the (by-value) copy of the layer has its
// mean_gpu / variance_gpu pointers redirected to the rolling statistics.
//  - CUDNN build: cudnnBatchNormalizationBackward reads l.x_gpu and l.delta_gpu,
//    writes its result into l.x_norm_gpu plus the scale/bias update buffers, and
//    the result is then copied back into l.delta_gpu.
//  - non-CUDNN build: accumulates bias and scale updates, scales l.delta_gpu by
//    l.scales_gpu, computes the mean/variance deltas and rewrites l.delta_gpu
//    through normalize_delta_gpu.
// Finally, for a layer of type BATCHNORM, l.delta_gpu is copied into state.delta.
//
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
void backward_batchnorm_layer_gpu(layer l, network_state state)
{
    if (!state.train) {
        l.mean_gpu = l.rolling_mean_gpu;
        l.variance_gpu = l.rolling_variance_gpu;
    }
    if (!state.train) {
        l.mean_gpu = l.rolling_mean_gpu;
        l.variance_gpu = l.rolling_variance_gpu;
    }
#ifdef CUDNN
    float one = 1;
    float zero = 0;
    cudnnBatchNormalizationBackward(cudnn_handle(),
        CUDNN_BATCHNORM_SPATIAL,
        &one,
        &zero,
        &one,
        &one,
        l.normDstTensorDesc,
        l.x_gpu,                // input
        l.normDstTensorDesc,
        l.delta_gpu,            // input
        l.normDstTensorDesc,
        l.x_norm_gpu,           // output
        l.normTensorDesc,
        l.scales_gpu,           // output (should be FP32)
        l.scale_updates_gpu,    // output (should be FP32)
        l.bias_updates_gpu,     // output (should be FP32)
        .00001,
        l.mean_gpu,             // input (should be FP32)
        l.variance_gpu);        // input (should be FP32)
    copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
    float one = 1;
    float zero = 0;
    cudnnBatchNormalizationBackward(cudnn_handle(),
        CUDNN_BATCHNORM_SPATIAL,
        &one,
        &zero,
        &one,
        &one,
        l.normDstTensorDesc,
        l.x_gpu,                // input
        l.normDstTensorDesc,
        l.delta_gpu,            // input
        l.normDstTensorDesc,
        l.x_norm_gpu,            // output
        l.normTensorDesc,
        l.scales_gpu,            // output (should be FP32)
        l.scale_updates_gpu,    // output (should be FP32)
        l.bias_updates_gpu,        // output (should be FP32)
        .00001,
        l.mean_gpu,                // input (should be FP32)
        l.variance_gpu);        // input (should be FP32)
    copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
    if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
    if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
}
#endif

 src/blas.c

@@ -11,8 +11,8 @@
    int b,i,j,k;
    int in_c = out_c/(stride*stride);

    //printf("\n out_c = %d, out_w = %d, out_h = %d, stride = %d, forward = %d \n", out_c, out_w, out_h, stride, forward);
    //printf("  in_c = %d,  in_w = %d,  in_h = %d \n", in_c, out_w*stride, out_h*stride);
    //printf("\n out_c = %d, out_w = %d, out_h = %d, stride = %d, forward = %d \n", out_c, out_w, out_h, stride, forward);
    //printf("  in_c = %d,  in_w = %d,  in_h = %d \n", in_c, out_w*stride, out_h*stride);

    for(b = 0; b < batch; ++b){
        for(k = 0; k < out_c; ++k){
@@ -24,7 +24,7 @@
                    int w2 = i*stride + offset % stride;
                    int h2 = j*stride + offset / stride;
                    int out_index = w2 + out_w*stride*(h2 + out_h*stride*(c2 + in_c*b));
                    if(forward) out[out_index] = x[in_index];   // used by default for forward (i.e. forward = 0)
                    if(forward) out[out_index] = x[in_index];    // used by default for forward (i.e. forward = 0)
                    else out[in_index] = x[out_index];
                }
            }
@@ -293,17 +293,17 @@

// CPU upsampling by an integer factor `stride`.
//
// Iterates over every element of the enlarged (h*stride x w*stride) plane for
// each batch item and channel; output position (j, i) maps to input position
// (j / stride, i / stride), so each input element covers a stride x stride patch.
//  - forward != 0: out[out_index] is overwritten with scale * in[in_index].
//  - forward == 0: scale * out[out_index] is accumulated into in[in_index].
//
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
{
    int i, j, k, b;
    for (b = 0; b < batch; ++b) {
        for (k = 0; k < c; ++k) {
            for (j = 0; j < h*stride; ++j) {
                for (i = 0; i < w*stride; ++i) {
                    int in_index = b*w*h*c + k*w*h + (j / stride)*w + i / stride;
                    int out_index = b*w*h*c*stride*stride + k*w*h*stride*stride + j*w*stride + i;
                    if (forward) out[out_index] = scale*in[in_index];
                    else in[in_index] += scale*out[out_index];
                }
            }
        }
    }
    int i, j, k, b;
    for (b = 0; b < batch; ++b) {
        for (k = 0; k < c; ++k) {
            for (j = 0; j < h*stride; ++j) {
                for (i = 0; i < w*stride; ++i) {
                    int in_index = b*w*h*c + k*w*h + (j / stride)*w + i / stride;
                    int out_index = b*w*h*c*stride*stride + k*w*h*stride*stride + j*w*stride + i;
                    if (forward) out[out_index] = scale*in[in_index];
                    else in[in_index] += scale*out[out_index];
                }
            }
        }
    }
}

 src/blas_kernels.cu

@@ -157,16 +157,16 @@

// Host-side driver for one optimizer update over n values, built from the
// *_ongpu helpers (assumed to follow BLAS-style scal/axpy semantics — confirm
// against their definitions):
//   m *= B1;  v *= B2;            scale the two moment buffers
//   d += -decay*batch * w;        fold weight decay into the gradient buffer d
//   m += (1 - B1) * d;
//   d  = d * d;                   presumably element-wise (mul_ongpu of d with itself)
//   v += (1 - B2) * d;
//   adam_gpu(...)                 applies the step to w using m, v, rate, eps, t
//   d  = 0;                       clears the gradient buffer for the next batch
//
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t)
{
    scal_ongpu(n, B1, m, 1);
    scal_ongpu(n, B2, v, 1);
    axpy_ongpu(n, -decay*batch, w, 1, d, 1);
    scal_ongpu(n, B1, m, 1);
    scal_ongpu(n, B2, v, 1);
    axpy_ongpu(n, -decay*batch, w, 1, d, 1);

    axpy_ongpu(n, (1 - B1), d, 1, m, 1);
    mul_ongpu(n, d, 1, d, 1);
    axpy_ongpu(n, (1 - B2), d, 1, v, 1);
    axpy_ongpu(n, (1 - B1), d, 1, m, 1);
    mul_ongpu(n, d, 1, d, 1);
    axpy_ongpu(n, (1 - B2), d, 1, v, 1);

    adam_gpu(n, w, m, v, B1, B2, rate, eps, t);
    fill_ongpu(n, 0, d, 1);
    adam_gpu(n, w, m, v, B1, B2, rate, eps, t);
    fill_ongpu(n, 0, d, 1);
}

__global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial)
@@ -237,7 +237,7 @@
            local[id] += (i+id < spatial) ? delta[index] : 0;
        }
    }
    __syncthreads();
    __syncthreads();

    if(id == 0){
        mean_delta[filter] = 0;
@@ -266,7 +266,7 @@
            local[id] += (i+id < spatial) ? delta[index]*(x[index] - mean[filter]) : 0;
        }
    }
    __syncthreads();
    __syncthreads();

    if(id == 0){
        variance_delta[filter] = 0;
@@ -462,7 +462,7 @@
            local[id] += (i+id < spatial) ? x[index] : 0;
        }
    }
    __syncthreads();
    __syncthreads();

    if(id == 0){
        mean[filter] = 0;
@@ -491,7 +491,7 @@
            local[id] += (i+id < spatial) ? powf((x[index] - mean[filter]), 2) : 0;
        }
    }
    __syncthreads();
    __syncthreads();

    if(id == 0){
        variance[filter] = 0;
@@ -787,31 +787,31 @@

// Upsampling kernel: one thread per element of the enlarged tensor (N elements).
//
// Launch layout: the flat index is built from blockIdx.x, blockIdx.y, gridDim.x,
// blockDim.x and threadIdx.x, i.e. a 2D grid of 1D blocks; threads with index
// >= N exit immediately. The flat index is decomposed into
// (b, out_c, out_h, out_w) and mapped to the source element at
// (out_h / stride, out_w / stride) in the same channel.
//  - forward != 0: out[out_index] += scale * x[in_index]
//  - forward == 0: scale * out[out_index] is atomically added into x[in_index],
//    since several output elements map to the same input element.
//
// NOTE(review): the forward branch accumulates (+=) whereas upsample_cpu assigns
// (=); the caller presumably has to zero `out` first — confirm.
// NOTE(review): the size_t index is narrowed to int in out_index.
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
__global__ void upsample_kernel(size_t N, float *x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
{
    size_t i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (i >= N) return;
    int out_index = i;
    int out_w = i % (w*stride);
    i = i / (w*stride);
    int out_h = i % (h*stride);
    i = i / (h*stride);
    int out_c = i%c;
    i = i / c;
    int b = i%batch;
    size_t i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (i >= N) return;
    int out_index = i;
    int out_w = i % (w*stride);
    i = i / (w*stride);
    int out_h = i % (h*stride);
    i = i / (h*stride);
    int out_c = i%c;
    i = i / c;
    int b = i%batch;

    int in_w = out_w / stride;
    int in_h = out_h / stride;
    int in_c = out_c;
    int in_w = out_w / stride;
    int in_h = out_h / stride;
    int in_c = out_c;

    int in_index = b*w*h*c + in_c*w*h + in_h*w + in_w;
    int in_index = b*w*h*c + in_c*w*h + in_h*w + in_w;


    if (forward) out[out_index] += scale * x[in_index];
    else atomicAdd(x + in_index, scale * out[out_index]);
    if (forward) out[out_index] += scale * x[in_index];
    else atomicAdd(x + in_index, scale * out[out_index]);
}

// Host wrapper that launches upsample_kernel over all w*h*c*batch*stride*stride
// elements of the enlarged tensor, using cuda_gridsize(size) blocks of BLOCK
// threads, and then checks the launch status via cudaPeekAtLastError().
//
// NOTE(review): the element count is multiplied out in int arithmetic and only
// then widened to size_t, so it can overflow for very large tensors.
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
{
    size_t size = w*h*c*batch*stride*stride;
    upsample_kernel << <cuda_gridsize(size), BLOCK >> >(size, in, w, h, c, batch, stride, forward, scale, out);
    check_error(cudaPeekAtLastError());
    size_t size = w*h*c*batch*stride*stride;
    upsample_kernel << <cuda_gridsize(size), BLOCK >> >(size, in, w, h, c, batch, stride, forward, scale, out);
    check_error(cudaPeekAtLastError());
}

 src/box.c

@@ -278,88 +278,88 @@

// qsort comparator over `detection` elements, ordering them from highest to
// lowest score (returns 1 when a scores lower than b, -1 when higher, 0 on a tie).
// The score is prob[sort_class] when b.sort_class >= 0, otherwise objectness;
// note that b's sort_class selects the class for both operands.
// Both detections are copied by value before comparison.
//
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
int nms_comparator_v3(const void *pa, const void *pb)
{
    detection a = *(detection *)pa;
    detection b = *(detection *)pb;
    float diff = 0;
    if (b.sort_class >= 0) {
        diff = a.prob[b.sort_class] - b.prob[b.sort_class];
    }
    else {
        diff = a.objectness - b.objectness;
    }
    if (diff < 0) return 1;
    else if (diff > 0) return -1;
    return 0;
    detection a = *(detection *)pa;
    detection b = *(detection *)pb;
    float diff = 0;
    if (b.sort_class >= 0) {
        diff = a.prob[b.sort_class] - b.prob[b.sort_class];
    }
    else {
        diff = a.objectness - b.objectness;
    }
    if (diff < 0) return 1;
    else if (diff > 0) return -1;
    return 0;
}

// Non-maximum suppression driven by objectness.
//
// 1. Swaps every detection with objectness == 0 to the tail of the array and
//    shrinks `total` to the remaining count (the --i re-examines the element
//    that was swapped into slot i).
// 2. Sets sort_class = -1 on the survivors so nms_comparator_v3 orders them by
//    objectness, then sorts them highest first.
// 3. For each surviving detection, every later detection whose box_iou with it
//    exceeds `thresh` has its objectness and all `classes` probabilities zeroed.
// The array is modified in place; `k` is reused as the class index in step 3.
//
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
void do_nms_obj(detection *dets, int total, int classes, float thresh)
{
    int i, j, k;
    k = total - 1;
    for (i = 0; i <= k; ++i) {
        if (dets[i].objectness == 0) {
            detection swap = dets[i];
            dets[i] = dets[k];
            dets[k] = swap;
            --k;
            --i;
        }
    }
    total = k + 1;
    int i, j, k;
    k = total - 1;
    for (i = 0; i <= k; ++i) {
        if (dets[i].objectness == 0) {
            detection swap = dets[i];
            dets[i] = dets[k];
            dets[k] = swap;
            --k;
            --i;
        }
    }
    total = k + 1;

    for (i = 0; i < total; ++i) {
        dets[i].sort_class = -1;
    }
    for (i = 0; i < total; ++i) {
        dets[i].sort_class = -1;
    }

    qsort(dets, total, sizeof(detection), nms_comparator_v3);
    for (i = 0; i < total; ++i) {
        if (dets[i].objectness == 0) continue;
        box a = dets[i].bbox;
        for (j = i + 1; j < total; ++j) {
            if (dets[j].objectness == 0) continue;
            box b = dets[j].bbox;
            if (box_iou(a, b) > thresh) {
                dets[j].objectness = 0;
                for (k = 0; k < classes; ++k) {
                    dets[j].prob[k] = 0;
                }
            }
        }
    }
    qsort(dets, total, sizeof(detection), nms_comparator_v3);
    for (i = 0; i < total; ++i) {
        if (dets[i].objectness == 0) continue;
        box a = dets[i].bbox;
        for (j = i + 1; j < total; ++j) {
            if (dets[j].objectness == 0) continue;
            box b = dets[j].bbox;
            if (box_iou(a, b) > thresh) {
                dets[j].objectness = 0;
                for (k = 0; k < classes; ++k) {
                    dets[j].prob[k] = 0;
                }
            }
        }
    }
}

// Per-class non-maximum suppression.
//
// 1. Swaps every detection with objectness == 0 to the tail of the array and
//    shrinks `total` to the remaining count.
// 2. For each class k: tags all survivors with sort_class = k, sorts them by
//    prob[k] (highest first) via nms_comparator_v3, then for each detection
//    with non-zero prob[k] zeroes prob[k] of every later detection whose
//    box_iou with it exceeds `thresh`.
// Only the class probabilities are cleared here; objectness is left untouched.
// The array is modified (and re-ordered) in place.
//
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
void do_nms_sort(detection *dets, int total, int classes, float thresh)
{
    int i, j, k;
    k = total - 1;
    for (i = 0; i <= k; ++i) {
        if (dets[i].objectness == 0) {
            detection swap = dets[i];
            dets[i] = dets[k];
            dets[k] = swap;
            --k;
            --i;
        }
    }
    total = k + 1;
    int i, j, k;
    k = total - 1;
    for (i = 0; i <= k; ++i) {
        if (dets[i].objectness == 0) {
            detection swap = dets[i];
            dets[i] = dets[k];
            dets[k] = swap;
            --k;
            --i;
        }
    }
    total = k + 1;

    for (k = 0; k < classes; ++k) {
        for (i = 0; i < total; ++i) {
            dets[i].sort_class = k;
        }
        qsort(dets, total, sizeof(detection), nms_comparator_v3);
        for (i = 0; i < total; ++i) {
            //printf("  k = %d, \t i = %d \n", k, i);
            if (dets[i].prob[k] == 0) continue;
            box a = dets[i].bbox;
            for (j = i + 1; j < total; ++j) {
                box b = dets[j].bbox;
                if (box_iou(a, b) > thresh) {
                    dets[j].prob[k] = 0;
                }
            }
        }
    }
    for (k = 0; k < classes; ++k) {
        for (i = 0; i < total; ++i) {
            dets[i].sort_class = k;
        }
        qsort(dets, total, sizeof(detection), nms_comparator_v3);
        for (i = 0; i < total; ++i) {
            //printf("  k = %d, \t i = %d \n", k, i);
            if (dets[i].prob[k] == 0) continue;
            box a = dets[i].bbox;
            for (j = i + 1; j < total; ++j) {
                box b = dets[j].bbox;
                if (box_iou(a, b) > thresh) {
                    dets[j].prob[k] = 0;
                }
            }
        }
    }
}

void do_nms(box *boxes, float **probs, int total, int classes, float thresh)

 src/convolutional_kernels.cu

@@ -76,36 +76,36 @@

// Element-wise float -> half conversion kernel.
// Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
// index is >= size do nothing.
//
// NOTE(review): idx is a signed int compared against a size_t size, so the index
// is limited to the int range — confirm sizes stay below 2^31.
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
__global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
    //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
    //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
}

// Launches cuda_f32_to_f16 on the stream returned by get_cuda_stream(), using
// size / BLOCK + 1 blocks of BLOCK threads. The destination is declared float*
// but is reinterpreted as half* for the kernel.
//
// NOTE(review): the launch status is not checked here.
// NOTE(review): the launch line is listed twice because this listing interleaves
// the old and new side of a whitespace-only diff (tabs -> spaces).
void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16) {
    cuda_f32_to_f16 <<< size / BLOCK + 1, BLOCK, 0, get_cuda_stream() >>> (input_f32, size, (half *)output_f16);
    cuda_f32_to_f16 <<< size / BLOCK + 1, BLOCK, 0, get_cuda_stream() >>> (input_f32, size, (half *)output_f16);
}

// Element-wise half -> float conversion kernel.
// Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
// index is >= size do nothing.
//
// NOTE(review): idx is a signed int compared against a size_t size, so the index
// is limited to the int range — confirm sizes stay below 2^31.
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
__global__ void cuda_f16_to_f32(half* input_f16, size_t size, float *output_f32)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
    //if (idx < size) output_f32[idx] = __half2float(*((unsigned short *)input_f16 + idx));
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
    //if (idx < size) output_f32[idx] = __half2float(*((unsigned short *)input_f16 + idx));
}

// Launches cuda_f16_to_f32 on the stream returned by get_cuda_stream(), using
// size / BLOCK + 1 blocks of BLOCK threads. The source is declared float* but is
// reinterpreted as half* for the kernel.
//
// NOTE(review): the launch status is not checked here.
// NOTE(review): the launch line is listed twice because this listing interleaves
// the old and new side of a whitespace-only diff (tabs -> spaces).
void cuda_convert_f16_to_f32(float* input_f16, size_t size, float *output_f32) {
    cuda_f16_to_f32 <<< size / BLOCK + 1, BLOCK, 0, get_cuda_stream() >>> ((half *)input_f16, size, output_f32);
    cuda_f16_to_f32 <<< size / BLOCK + 1, BLOCK, 0, get_cuda_stream() >>> ((half *)input_f16, size, output_f32);
}

// Allocates a device buffer of n half values with cudaMalloc and, when `src` is
// non-NULL, fills it by converting n floats from `src` (passed straight to the
// conversion kernel, so presumably a device pointer — confirm).
// Returns the new buffer; calls error() if the pointer is NULL.
//
// NOTE(review): dst16 is not initialized before cudaMalloc, and the NULL check
// runs only after the pointer may already have been handed to the conversion
// launch; it relies on check_error() having caught an allocation failure first.
// NOTE(review): every statement below is listed twice because this listing
// interleaves the old and new side of a whitespace-only diff (tabs -> spaces).
half *cuda_make_f16_from_f32_array(float *src, size_t n)
{
    half *dst16;
    size_t size = sizeof(half)*n;
    check_error(cudaMalloc((void **)&dst16, size));
    if (src) {
        cuda_convert_f32_to_f16(src, n, (float *)dst16);
    }
    if (!dst16) error("Cuda malloc failed\n");
    return dst16;
    half *dst16;
    size_t size = sizeof(half)*n;
    check_error(cudaMalloc((void **)&dst16, size));
    if (src) {
        cuda_convert_f32_to_f16(src, n, (float *)dst16);
    }
    if (!dst16) error("Cuda malloc failed\n");
    return dst16;
}

void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
@@ -124,96 +124,96 @@
    }

#ifdef CUDNN
    float one = 1;  // alpha[0], beta[0] is float for HALF and FLOAT
    float alpha = 1, beta = 0; 
    float one = 1;    // alpha[0], beta[0] is float for HALF and FLOAT
    float alpha = 1, beta = 0; 

#ifdef CUDNN_HALF
    // Note: For improved performance it is advised to use beta[0] = 0.0. 
    // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH;
    // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF
    // 2. or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED
    // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
    // Note: For improved performance it is advised to use beta[0] = 0.0. 
    // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH;
    // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF
    // 2. or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED
    // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops

    const size_t input16_size = l.batch*l.c*l.w*l.h;
    const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w;
    const size_t input16_size = l.batch*l.c*l.w*l.h;
    const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w;

    if (*state.net.max_input16_size < input16_size) {
        //printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size);
        *state.net.max_input16_size = input16_size;
        if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu);
        *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
    }
    float *input16 = *state.net.input16_gpu;
    if (*state.net.max_input16_size < input16_size) {
        //printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size);
        *state.net.max_input16_size = input16_size;
        if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu);
        *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
    }
    float *input16 = *state.net.input16_gpu;

    if (*state.net.max_output16_size < output16_size) {
        *state.net.max_output16_size = output16_size;
        if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
        *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
    }
    float *output16 = *state.net.output16_gpu;
    if (*state.net.max_output16_size < output16_size) {
        *state.net.max_output16_size = output16_size;
        if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
        *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
    }
    float *output16 = *state.net.output16_gpu;

    cuda_convert_f32_to_f16(state.input, input16_size, input16);
    cuda_convert_f32_to_f16(state.input, input16_size, input16);

    //fill_ongpu(output16_size / 2, 0, (float *)output16, 1);
    cudnnConvolutionForward(cudnn_handle(),
        &alpha,
        l.srcTensorDesc,
        input16,
        l.weightDesc,
        l.weights_gpu16,
        l.convDesc,
        l.fw_algo,
        state.workspace,
        l.workspace_size,
        &beta,
        l.dstTensorDesc,
        output16);
	
    //fill_ongpu(output16_size / 2, 0, (float *)output16, 1);
    cudnnConvolutionForward(cudnn_handle(),
        &alpha,
        l.srcTensorDesc,
        input16,
        l.weightDesc,
        l.weights_gpu16,
        l.convDesc,
        l.fw_algo,
        state.workspace,
        l.workspace_size,
        &beta,
        l.dstTensorDesc,
        output16);
    

    if (l.batch_normalize) 
    {		
        if (state.train) // Training
        {
            copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1);
            //cudaMemcpyAsync(l.x_gpu, output16, l.outputs*l.batch*sizeof(half), cudaMemcpyDefault, get_cuda_stream());
            float one = 1;
            float zero = 0;
            // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth
            // compared to FP32, its just that the statistics and value adjustment should be done in FP32.
            cudnnBatchNormalizationForwardTraining(cudnn_handle(),
                CUDNN_BATCHNORM_SPATIAL,
                &one,
                &zero,
                l.normDstTensorDescF16,
                l.x_gpu,            // input
                l.normDstTensorDescF16,
                output16,           // output
                l.normTensorDesc,
                l.scales_gpu,
                l.biases_gpu,
                .01,
                l.rolling_mean_gpu,     // output (should be FP32)
                l.rolling_variance_gpu, // output (should be FP32)
                .00001,
                l.mean_gpu,         // output (should be FP32)
                l.variance_gpu);    // output (should be FP32)
    if (l.batch_normalize) 
    {        
        if (state.train) // Training
        {
            copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1);
            //cudaMemcpyAsync(l.x_gpu, output16, l.outputs*l.batch*sizeof(half), cudaMemcpyDefault, get_cuda_stream());
            float one = 1;
            float zero = 0;
            // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth
            // compared to FP32, its just that the statistics and value adjustment should be done in FP32.
            cudnnBatchNormalizationForwardTraining(cudnn_handle(),
                CUDNN_BATCHNORM_SPATIAL,
                &one,
                &zero,
                l.normDstTensorDescF16,
                l.x_gpu,            // input
                l.normDstTensorDescF16,
                output16,            // output
                l.normTensorDesc,
                l.scales_gpu,
                l.biases_gpu,
                .01,
                l.rolling_mean_gpu,        // output (should be FP32)
                l.rolling_variance_gpu,    // output (should be FP32)
                .00001,
                l.mean_gpu,            // output (should be FP32)
                l.variance_gpu);    // output (should be FP32)

            cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
            //forward_batchnorm_layer_gpu(l, state);
        }
        else // Detection
        {
            cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
            normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
            scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
        }
    }
    else // BIAS only
    {
        cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }	
            cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
            //forward_batchnorm_layer_gpu(l, state);
        }
        else // Detection
        {
            cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
            normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
            scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
        }
    }
    else // BIAS only
    {
        cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }    

#else

@@ -230,7 +230,7 @@
                &one,
                l.dstTensorDesc,
                l.output_gpu);
#endif  // CUDNN_HALF
#endif    // CUDNN_HALF


#else
@@ -250,16 +250,16 @@
#ifndef CUDNN_HALF
    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, state);
    }
    else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
    }
    else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
#endif // no CUDNN_HALF

    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
    //if(l.dot > 0) dot_error_gpu(l);
    if(l.binary || l.xnor) swap_binary(&l);
    //cudaDeviceSynchronize();  // for correct profiling of performance
    //cudaDeviceSynchronize();    // for correct profiling of performance
}

void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
@@ -272,126 +272,126 @@
    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, state);
    } else {
        //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
        //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
#endif // no CUDNN_HALF
    float *original_input = state.input;

    if(l.xnor) state.input = l.binary_input_gpu;
#ifdef CUDNN
    float one = 1;
    float alpha = 1, beta = 0;
    float one = 1;
    float alpha = 1, beta = 0;

#ifdef CUDNN_HALF
		
    const size_t input16_size = l.batch*l.c*l.w*l.h;
    const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h;
	
    if (*state.net.max_input16_size < input16_size) {		
        *state.net.max_input16_size = input16_size;
        if(*state.net.input16_gpu) cuda_free(*state.net.input16_gpu);
        *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
    }
    float *input16 = *state.net.input16_gpu;
        
    const size_t input16_size = l.batch*l.c*l.w*l.h;
    const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h;
    
    if (*state.net.max_input16_size < input16_size) {        
        *state.net.max_input16_size = input16_size;
        if(*state.net.input16_gpu) cuda_free(*state.net.input16_gpu);
        *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
    }
    float *input16 = *state.net.input16_gpu;

    if (*state.net.max_output16_size < delta16_size) {
        *state.net.max_output16_size = delta16_size;
        if(*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
        *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
    }
    float *delta16 = *state.net.output16_gpu;
    if (*state.net.max_output16_size < delta16_size) {
        *state.net.max_output16_size = delta16_size;
        if(*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
        *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
    }
    float *delta16 = *state.net.output16_gpu;

    cuda_convert_f32_to_f16(state.input, input16_size, input16);
    cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16);
    cuda_convert_f32_to_f16(state.input, input16_size, input16);
    cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16);

    if (l.batch_normalize) {
        //if (!state.train) {
        //  l.mean_gpu = l.rolling_mean_gpu;
        //  l.variance_gpu = l.rolling_variance_gpu;
        //}
        float one = 1;
        float zero = 0;
        cudnnBatchNormalizationBackward(cudnn_handle(),
            CUDNN_BATCHNORM_SPATIAL,
            &one,
            &zero,
            &one,
            &one,
            l.normDstTensorDescF16,
            l.x_gpu,                // input
            l.normDstTensorDescF16,
            delta16,                // input
            l.normDstTensorDescF16,
            l.x_norm_gpu,           // output
            l.normTensorDesc,
            l.scales_gpu,           // output (should be FP32)
            l.scale_updates_gpu,    // output (should be FP32)
            l.bias_updates_gpu,     // output (should be FP32)
            .00001,
            l.mean_gpu,             // input (should be FP32)
            l.variance_gpu);        // input (should be FP32)
        copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1);
        //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream());
    }
    else
    {
        //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
    if (l.batch_normalize) {
        //if (!state.train) {
        //    l.mean_gpu = l.rolling_mean_gpu;
        //    l.variance_gpu = l.rolling_variance_gpu;
        //}
        float one = 1;
        float zero = 0;
        cudnnBatchNormalizationBackward(cudnn_handle(),
            CUDNN_BATCHNORM_SPATIAL,
            &one,
            &zero,
            &one,
            &one,
            l.normDstTensorDescF16,
            l.x_gpu,                // input
            l.normDstTensorDescF16,
            delta16,                // input
            l.normDstTensorDescF16,
            l.x_norm_gpu,            // output
            l.normTensorDesc,
            l.scales_gpu,            // output (should be FP32)
            l.scale_updates_gpu,    // output (should be FP32)
            l.bias_updates_gpu,        // output (should be FP32)
            .00001,
            l.mean_gpu,                // input (should be FP32)
            l.variance_gpu);        // input (should be FP32)
        copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1);
        //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream());
    }
    else
    {
        //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
    }

    // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16
    // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16)
    // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16
    // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16)

    // calculate conv weight updates
    // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum
    //   so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m
    cuda_convert_f32_to_f16(l.weight_updates_gpu, l.c*l.n*l.size*l.size, l.weight_updates_gpu16);
    // calculate conv weight updates
    // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum
    //   so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m
    cuda_convert_f32_to_f16(l.weight_updates_gpu, l.c*l.n*l.size*l.size, l.weight_updates_gpu16);

    cudnnConvolutionBackwardFilter(cudnn_handle(),
        &one,
        l.srcTensorDesc,
        input16, //state.input,
        l.ddstTensorDesc,
        delta16, //l.delta_gpu,
        l.convDesc,
        l.bf_algo,
        state.workspace,
        l.workspace_size,
        &one,
        l.dweightDesc,
        l.weight_updates_gpu16);    // l.weight_updates_gpu);
    cudnnConvolutionBackwardFilter(cudnn_handle(),
        &one,
        l.srcTensorDesc,
        input16, //state.input,
        l.ddstTensorDesc,
        delta16, //l.delta_gpu,
        l.convDesc,
        l.bf_algo,
        state.workspace,
        l.workspace_size,
        &one,
        l.dweightDesc,
        l.weight_updates_gpu16);    // l.weight_updates_gpu);

    cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.c*l.n*l.size*l.size, l.weight_updates_gpu);
    cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.c*l.n*l.size*l.size, l.weight_updates_gpu);

    if (state.delta) {
        if (l.binary || l.xnor) swap_binary(&l);
    if (state.delta) {
        if (l.binary || l.xnor) swap_binary(&l);

        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
        // calculate delta for the next layer
        // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16
        // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16)	
        cudnnConvolutionBackwardData(cudnn_handle(),
            &alpha,
            l.weightDesc,
            l.weights_gpu16, //l.weights_gpu,
            l.ddstTensorDesc,
            delta16, //l.delta_gpu,
            l.convDesc,
            l.bd_algo,
            state.workspace,
            l.workspace_size,
            &beta,
            l.dsrcTensorDesc,
            input16);   // state.delta);
        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
        // calculate delta for the next layer
        // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16
        // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16)    
        cudnnConvolutionBackwardData(cudnn_handle(),
            &alpha,
            l.weightDesc,
            l.weights_gpu16, //l.weights_gpu,
            l.ddstTensorDesc,
            delta16, //l.delta_gpu,
            l.convDesc,
            l.bd_algo,
            state.workspace,
            l.workspace_size,
            &beta,
            l.dsrcTensorDesc,
            input16);    // state.delta);

        cuda_convert_f16_to_f32(input16, input16_size, state.delta);
        cuda_convert_f16_to_f32(input16, input16_size, state.delta);

        if (l.binary || l.xnor) swap_binary(&l);
        if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
    }
#else   // CUDNN_HALF
        if (l.binary || l.xnor) swap_binary(&l);
        if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
    }
#else    // CUDNN_HALF

    // calculate conv weight updates
    // if used: beta=1 then loss decreases faster
    // calculate conv weight updates
    // if used: beta=1 then loss decreases faster
    cudnnConvolutionBackwardFilter(cudnn_handle(),
            &one,
            l.srcTensorDesc,
@@ -408,8 +408,8 @@

    if(state.delta){
        if(l.binary || l.xnor) swap_binary(&l);
        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
        // calculate delta for the next layer
        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
        // calculate delta for the next layer
        cudnnConvolutionBackwardData(cudnn_handle(),
                &one,
                l.weightDesc,
@@ -427,9 +427,9 @@
        if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
    }

#endif  // CUDNN_HALF
#endif    // CUDNN_HALF

#else   // CUDNN
#else    // CUDNN
    int m = l.n;
    int n = l.size*l.size*l.c;
    int k = l.out_w*l.out_h;
@@ -482,7 +482,7 @@
{
    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
#ifdef CUDNN_HALF
    cuda_convert_f32_to_f16(layer.weights_gpu, layer.c*layer.n*layer.size*layer.size, layer.weights_gpu16);
    cuda_convert_f32_to_f16(layer.weights_gpu, layer.c*layer.n*layer.size*layer.size, layer.weights_gpu16);
#endif
    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
@@ -522,14 +522,14 @@
        adam_gpu(size, layer.weights_gpu, layer.m_gpu, layer.v_gpu, layer.B1, layer.B2, learning_rate/batch, layer.eps, layer.t+1);
        fill_ongpu(size, 0, layer.weight_updates_gpu, 1);
    }else{
        // update weights:
        // weights_gpu = weights_gpu*(1 - decay*lr) + weight_updates_gpu*lr / (batch*subdivision) =
        //  weights_gpu*(1 - 0.0005*0.001) + weight_updates_gpu*0.001/(64*8) = 
        //  weights_gpu * 0.999 999 5 + weight_updates_gpu * 0.000 001 953125
        // 
        // weight_updates_gpu = (weight_updates_gpu - weights_gpu*decay*batch*subdivision)*momentum = 
        //  (weight_updates_gpu - weights_gpu * 0.0005 * 64 * 8) * 0.9 = 
        //  weight_updates_gpu*0.9 - weights_gpu*0.2304
        // update weights:
        // weights_gpu = weights_gpu*(1 - decay*lr) + weight_updates_gpu*lr / (batch*subdivision) =
        //  weights_gpu*(1 - 0.0005*0.001) + weight_updates_gpu*0.001/(64*8) = 
        //  weights_gpu * 0.999 999 5 + weight_updates_gpu * 0.000 001 953125
        // 
        // weight_updates_gpu = (weight_updates_gpu - weights_gpu*decay*batch*subdivision)*momentum = 
        //  (weight_updates_gpu - weights_gpu * 0.0005 * 64 * 8) * 0.9 = 
        //  weight_updates_gpu*0.9 - weights_gpu*0.2304
        axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
        axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
        scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);

 src/convolutional_layer.c

@@ -141,67 +141,67 @@
{

#ifdef CUDNN_HALF
    // TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): 
    //   Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
    // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
    const cudnnDataType_t data_type = CUDNN_DATA_HALF;
    // TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): 
    //   Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
    // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
    const cudnnDataType_t data_type = CUDNN_DATA_HALF;
#else
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
#endif

#if(CUDNN_MAJOR >= 7)
    // Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
    // For *_ALGO_WINOGRAD_NONFUSED can be used CUDNN_DATA_FLOAT
    // otherwise Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
    // Three techniques for training using Mixed-precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
    // 1. Accumulation into FP32
    // 2. Loss Scaling - required only for: activation gradients. We do not use.
    // 3. FP32 Master Copy of Weights
    // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
    cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
    // Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
    // For *_ALGO_WINOGRAD_NONFUSED can be used CUDNN_DATA_FLOAT
    // otherwise Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
    // Three techniques for training using Mixed-precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
    // 1. Accumulation into FP32
    // 2. Loss Scaling - required only for: activation gradients. We do not use.
    // 3. FP32 Master Copy of Weights
    // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
    cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
#endif

    // INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported 
    //   on architectures with DP4A support (compute capability 6.1 and later).
    //cudnnDataType_t data_type = CUDNN_DATA_INT8;
    // INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported 
    //   on architectures with DP4A support (compute capability 6.1 and later).
    //cudnnDataType_t data_type = CUDNN_DATA_INT8;

    // backward delta
    // backward delta
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

    // forward
    // forward
    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

    // batch norm
    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
    cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    // batch norm
    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
    cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);

    cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
#if(CUDNN_MAJOR >= 6)
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);    // cudnn >= 6.0
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);    // cudnn >= 6.0
#else
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);  // cudnn 5.1
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);    // cudnn 5.1
#endif
    int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
    int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
    int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
    if (cudnn_preference == cudnn_smallest) 
    {
        forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
        printf(" CUDNN-slow ");
    }
    int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
    int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
    int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
    if (cudnn_preference == cudnn_smallest) 
    {
        forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
        printf(" CUDNN-slow ");
    }

    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
            forward_algo,
            forward_algo,
            0,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
@@ -209,7 +209,7 @@
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            backward_algo,
            backward_algo,
            0,
            &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
@@ -217,41 +217,41 @@
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
            backward_filter,
            backward_filter,
            0,
            &l->bf_algo);

    if (data_type == CUDNN_DATA_HALF) 
    {
        // HALF-16 if(data_type == CUDNN_DATA_HALF)
        l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
        l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
        l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    if (data_type == CUDNN_DATA_HALF) 
    {
        // HALF-16 if(data_type == CUDNN_DATA_HALF)
        l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
        l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
        l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;

        // FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
        //l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
        //l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
        //l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
        // FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
        //l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
        //l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
        //l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;

        int fw = 0, bd = 0, bf = 0;
        if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
            //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
        if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
            //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
        int fw = 0, bd = 0, bf = 0;
        if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
            //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
        if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
            //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");

        if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
            //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  \n");
        if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
            //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
        if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
            //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  \n");
        if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
            //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");

        if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
            //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1   \n");
        if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
            //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
        if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
            //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1   \n");
        if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
            //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");

        if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
        else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
    }
        if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
        else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
    }
}
#endif
#endif
@@ -344,8 +344,8 @@

        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
#ifdef CUDNN_HALF
        l.weights_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weights, c*n*size*size / 2);
        l.weight_updates_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weight_updates, c*n*size*size / 2);
        l.weights_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weights, c*n*size*size / 2);
        l.weight_updates_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weight_updates, c*n*size*size / 2);
#endif
        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);

@@ -379,10 +379,10 @@
            l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
            l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
        }
#ifdef CUDNN		
        cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
        cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
        cudnnCreateTensorDescriptor(&l.normTensorDesc);
#ifdef CUDNN        
        cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
        cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
        cudnnCreateTensorDescriptor(&l.normTensorDesc);
        cudnnCreateTensorDescriptor(&l.srcTensorDesc);
        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
        cudnnCreateFilterDescriptor(&l.weightDesc);
@@ -398,8 +398,8 @@
    l.activation = activation;

    //fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
    l.bflops = (2.0 * l.n * l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.;
    fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d %5.3f BF\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
    l.bflops = (2.0 * l.n * l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.;
    fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d %5.3f BF\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);

    return l;
}
@@ -445,8 +445,8 @@

void resize_convolutional_layer(convolutional_layer *l, int w, int h)
{
    int old_w = l->w;
    int old_h = l->h;
    int old_w = l->w;
    int old_h = l->h;
    l->w = w;
    l->h = h;
    int out_w = convolutional_out_width(*l);
@@ -465,31 +465,31 @@
        l->x_norm  = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
    }

    if (l->xnor) {
        //l->binary_input = realloc(l->inputs*l->batch, sizeof(float));
    }
    if (l->xnor) {
        //l->binary_input = realloc(l->inputs*l->batch, sizeof(float));
    }

#ifdef GPU
    if (old_w < w || old_h < h) {
        cuda_free(l->delta_gpu);
        cuda_free(l->output_gpu);
    if (old_w < w || old_h < h) {
        cuda_free(l->delta_gpu);
        cuda_free(l->output_gpu);

        l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
        l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
        l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
        l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);

        if (l->batch_normalize) {
            cuda_free(l->x_gpu);
            cuda_free(l->x_norm_gpu);
        if (l->batch_normalize) {
            cuda_free(l->x_gpu);
            cuda_free(l->x_norm_gpu);

            l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
            l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
        }
            l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
            l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
        }

        if (l->xnor) {
            cuda_free(l->binary_input_gpu);
            l->binary_input_gpu = cuda_make_array(0, l->inputs*l->batch);
        }
    }
        if (l->xnor) {
            cuda_free(l->binary_input_gpu);
            l->binary_input_gpu = cuda_make_array(0, l->inputs*l->batch);
        }
    }
#ifdef CUDNN
    cudnn_convolutional_setup(l, cudnn_fastest);
#endif
@@ -497,15 +497,15 @@
    l->workspace_size = get_workspace_size(*l);

#ifdef CUDNN
    // check for excessive memory consumption 
    size_t free_byte;
    size_t total_byte;
    check_error(cudaMemGetInfo(&free_byte, &total_byte));
    if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
        printf(" used slow CUDNN algo without Workspace! Need memory: %zu, available: %zu\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
        cudnn_convolutional_setup(l, cudnn_smallest);
        l->workspace_size = get_workspace_size(*l);
    }
    // check for excessive memory consumption 
    size_t free_byte;
    size_t total_byte;
    check_error(cudaMemGetInfo(&free_byte, &total_byte));
    if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
        printf(" used slow CUDNN algo without Workspace! Need memory: %zu, available: %zu\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
        cudnn_convolutional_setup(l, cudnn_smallest);
        l->workspace_size = get_workspace_size(*l);
    }
#endif
}


 src/cuda.c

@@ -61,25 +61,25 @@
    return d;
}

// Per-device stream cache used by get_cuda_stream(): streamsArray[i] holds the
// stream for device index i, streamInit[i] is non-zero once it has been created.
static cudaStream_t streamsArray[16];    // cudaStreamSynchronize( get_cuda_stream() );
static int streamInit[16] = { 0 };

cudaStream_t get_cuda_stream() {
    int i = cuda_get_device();
    if (!streamInit[i]) {
        cudaError_t status = cudaStreamCreate(&streamsArray[i]);
        //cudaError_t status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamNonBlocking);
        if (status != cudaSuccess) {
            printf(" cudaStreamCreate error: %d \n", status);
            const char *s = cudaGetErrorString(status);
            char buffer[256];
            printf("CUDA Error: %s\n", s);
            status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamDefault);
            check_error(status);
        }
        streamInit[i] = 1;
    }
    return streamsArray[i];
    int i = cuda_get_device();
    if (!streamInit[i]) {
        cudaError_t status = cudaStreamCreate(&streamsArray[i]);
        //cudaError_t status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamNonBlocking);
        if (status != cudaSuccess) {
            printf(" cudaStreamCreate error: %d \n", status);
            const char *s = cudaGetErrorString(status);
            char buffer[256];
            printf("CUDA Error: %s\n", s);
            status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamDefault);
            check_error(status);
        }
        streamInit[i] = 1;
    }
    return streamsArray[i];
}


@@ -92,7 +92,7 @@
    if(!init[i]) {
        cudnnCreate(&handle[i]);
        init[i] = 1;
        cudnnStatus_t status = cudnnSetStream(handle[i], get_cuda_stream());
        cudnnStatus_t status = cudnnSetStream(handle[i], get_cuda_stream());
    }
    return handle[i];
}
@@ -105,7 +105,7 @@
    int i = cuda_get_device();
    if(!init[i]) {
        cublasCreate(&handle[i]);
        cublasStatus_t status = cublasSetStream(handle[i], get_cuda_stream());
        cublasStatus_t status = cublasSetStream(handle[i], get_cuda_stream());
        init[i] = 1;
    }
    return handle[i];
@@ -119,7 +119,7 @@
    check_error(status);
    if(x){
        //status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
        status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream());
        status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream());
        check_error(status);
    }
    if(!x_gpu) error("Cuda malloc failed\n");
@@ -164,7 +164,7 @@

/* Frees the device pointer x_gpu with cudaFree and passes the resulting
   status to check_error. */
void cuda_free(float *x_gpu)
{
    //cudaStreamSynchronize(get_cuda_stream());
    check_error(cudaFree(x_gpu));
}
@@ -173,7 +173,7 @@
{
    size_t size = sizeof(float)*n;
    //cudaError_t status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
    cudaError_t status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream());
    cudaError_t status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream());
    check_error(status);
}

@@ -181,9 +181,9 @@
{
    size_t size = sizeof(float)*n;
    //cudaError_t status = cudaMemcpy(x, x_gpu, size, cudaMemcpyDeviceToHost);
    cudaError_t status = cudaMemcpyAsync(x, x_gpu, size, cudaMemcpyDeviceToHost, get_cuda_stream());
    cudaError_t status = cudaMemcpyAsync(x, x_gpu, size, cudaMemcpyDeviceToHost, get_cuda_stream());
    check_error(status);
    cudaStreamSynchronize(get_cuda_stream());
    cudaStreamSynchronize(get_cuda_stream());
}

#else // GPU

 src/data.c

@@ -44,15 +44,15 @@
    char **random_paths = calloc(n, sizeof(char*));
    int i;
    pthread_mutex_lock(&mutex);
    //printf("n = %d \n", n);
    //printf("n = %d \n", n);
    for(i = 0; i < n; ++i){
        do {
            int index = random_gen() % m;
            random_paths[i] = paths[index];
            //if(i == 0) printf("%s\n", paths[index]);
            //printf("grp: %s\n", paths[index]);
            if (strlen(random_paths[i]) <= 4) printf(" Very small path to the image: %s \n", random_paths[i]);
        } while (strlen(random_paths[i]) == 0);
        do {
            int index = random_gen() % m;
            random_paths[i] = paths[index];
            //if(i == 0) printf("%s\n", paths[index]);
            //printf("grp: %s\n", paths[index]);
            if (strlen(random_paths[i]) <= 4) printf(" Very small path to the image: %s \n", random_paths[i]);
        } while (strlen(random_paths[i]) == 0);
    }
    pthread_mutex_unlock(&mutex);
    return random_paths;
@@ -140,18 +140,18 @@
{
    box_label *boxes = calloc(1, sizeof(box_label));
    FILE *file = fopen(filename, "r");
    if (!file) {
        printf("Can't open label file. (This can be normal only if you use MSCOCO) \n");
        //file_error(filename);
        FILE* fw = fopen("bad.list", "a");
        fwrite(filename, sizeof(char), strlen(filename), fw);
        char *new_line = "\n";
        fwrite(new_line, sizeof(char), strlen(new_line), fw);
        fclose(fw);
    if (!file) {
        printf("Can't open label file. (This can be normal only if you use MSCOCO) \n");
        //file_error(filename);
        FILE* fw = fopen("bad.list", "a");
        fwrite(filename, sizeof(char), strlen(filename), fw);
        char *new_line = "\n";
        fwrite(new_line, sizeof(char), strlen(new_line), fw);
        fclose(fw);

        *n = 0;
        return boxes;
    }
        *n = 0;
        return boxes;
    }
    float x, y, h, w;
    int id;
    int count = 0;
@@ -224,7 +224,7 @@
void fill_truth_swag(char *path, float *truth, int classes, int flip, float dx, float dy, float sx, float sy)
{
    char labelpath[4096];
    replace_image_to_label(path, labelpath);
    replace_image_to_label(path, labelpath);

    int count = 0;
    box_label *boxes = read_boxes(labelpath, &count);
@@ -258,9 +258,9 @@
void fill_truth_region(char *path, float *truth, int classes, int num_boxes, int flip, float dx, float dy, float sx, float sy)
{
    char labelpath[4096];
    replace_image_to_label(path, labelpath);
    replace_image_to_label(path, labelpath);

    int count = 0;
    int count = 0;
    box_label *boxes = read_boxes(labelpath, &count);
    randomize_boxes(boxes, count);
    correct_boxes(boxes, count, dx, dy, sx, sy, flip);
@@ -299,77 +299,77 @@
}

void fill_truth_detection(char *path, int num_boxes, float *truth, int classes, int flip, float dx, float dy, float sx, float sy,
    int small_object, int net_w, int net_h)
    int small_object, int net_w, int net_h)
{
    char labelpath[4096];
    replace_image_to_label(path, labelpath);
    char labelpath[4096];
    replace_image_to_label(path, labelpath);

    int count = 0;
    int i;
    box_label *boxes = read_boxes(labelpath, &count);
    float lowest_w = 1.F / net_w;
    float lowest_h = 1.F / net_h;
    if (small_object == 1) {
        for (i = 0; i < count; ++i) {
            if (boxes[i].w < lowest_w) boxes[i].w = lowest_w;
            if (boxes[i].h < lowest_h) boxes[i].h = lowest_h;
        }
    }
    randomize_boxes(boxes, count);
    correct_boxes(boxes, count, dx, dy, sx, sy, flip);
    if (count > num_boxes) count = num_boxes;
    float x, y, w, h;
    int id;
    int count = 0;
    int i;
    box_label *boxes = read_boxes(labelpath, &count);
    float lowest_w = 1.F / net_w;
    float lowest_h = 1.F / net_h;
    if (small_object == 1) {
        for (i = 0; i < count; ++i) {
            if (boxes[i].w < lowest_w) boxes[i].w = lowest_w;
            if (boxes[i].h < lowest_h) boxes[i].h = lowest_h;
        }
    }
    randomize_boxes(boxes, count);
    correct_boxes(boxes, count, dx, dy, sx, sy, flip);
    if (count > num_boxes) count = num_boxes;
    float x, y, w, h;
    int id;

    for (i = 0; i < count; ++i) {
        x = boxes[i].x;
        y = boxes[i].y;
        w = boxes[i].w;
        h = boxes[i].h;
        id = boxes[i].id;
    for (i = 0; i < count; ++i) {
        x = boxes[i].x;
        y = boxes[i].y;
        w = boxes[i].w;
        h = boxes[i].h;
        id = boxes[i].id;

        // not detect small objects
        //if ((w < 0.001F || h < 0.001F)) continue;
        // if truth (box for object) is smaller than 1x1 pix
        char buff[256];
        if (id >= classes) {
            printf("\n Wrong annotation: class_id = %d. But class_id should be [from 0 to %d] \n", id, classes);
            sprintf(buff, "echo %s \"Wrong annotation: class_id = %d. But class_id should be [from 0 to %d]\" >> bad_label.list", labelpath, id, classes);
            system(buff);
            getchar();
            continue;
        }
        if ((w < lowest_w || h < lowest_h)) {
            //sprintf(buff, "echo %s \"Very small object: w < lowest_w OR h < lowest_h\" >> bad_label.list", labelpath);
            //system(buff);
            continue;
        }
        if (x == 999999 || y == 999999) {
            printf("\n Wrong annotation: x = 0, y = 0 \n");
            sprintf(buff, "echo %s \"Wrong annotation: x = 0 or y = 0\" >> bad_label.list", labelpath);
            system(buff);
            continue;
        }
        if (x <= 0 || x > 1 || y <= 0 || y > 1) {
            printf("\n Wrong annotation: x = %f, y = %f \n", x, y);
            sprintf(buff, "echo %s \"Wrong annotation: x = %f, y = %f\" >> bad_label.list", labelpath, x, y);
            system(buff);
            continue;
        }
        if (w > 1) {
            printf("\n Wrong annotation: w = %f \n", w);
            sprintf(buff, "echo %s \"Wrong annotation: w = %f\" >> bad_label.list", labelpath, w);
            system(buff);
            w = 1;
        }
        if (h > 1) {
            printf("\n Wrong annotation: h = %f \n", h);
            sprintf(buff, "echo %s \"Wrong annotation: h = %f\" >> bad_label.list", labelpath, h);
            system(buff);
            h = 1;
        }
        if (x == 0) x += lowest_w;
        if (y == 0) y += lowest_h;
        // not detect small objects
        //if ((w < 0.001F || h < 0.001F)) continue;
        // if truth (box for object) is smaller than 1x1 pix
        char buff[256];
        if (id >= classes) {
            printf("\n Wrong annotation: class_id = %d. But class_id should be [from 0 to %d] \n", id, classes);
            sprintf(buff, "echo %s \"Wrong annotation: class_id = %d. But class_id should be [from 0 to %d]\" >> bad_label.list", labelpath, id, classes);
            system(buff);
            getchar();
            continue;
        }
        if ((w < lowest_w || h < lowest_h)) {
            //sprintf(buff, "echo %s \"Very small object: w < lowest_w OR h < lowest_h\" >> bad_label.list", labelpath);
            //system(buff);
            continue;
        }
        if (x == 999999 || y == 999999) {
            printf("\n Wrong annotation: x = 0, y = 0 \n");
            sprintf(buff, "echo %s \"Wrong annotation: x = 0 or y = 0\" >> bad_label.list", labelpath);
            system(buff);
            continue;
        }
        if (x <= 0 || x > 1 || y <= 0 || y > 1) {
            printf("\n Wrong annotation: x = %f, y = %f \n", x, y);
            sprintf(buff, "echo %s \"Wrong annotation: x = %f, y = %f\" >> bad_label.list", labelpath, x, y);
            system(buff);
            continue;
        }
        if (w > 1) {
            printf("\n Wrong annotation: w = %f \n", w);
            sprintf(buff, "echo %s \"Wrong annotation: w = %f\" >> bad_label.list", labelpath, w);
            system(buff);
            w = 1;
        }
        if (h > 1) {
            printf("\n Wrong annotation: h = %f \n", h);
            sprintf(buff, "echo %s \"Wrong annotation: h = %f\" >> bad_label.list", labelpath, h);
            system(buff);
            h = 1;
        }
        if (x == 0) x += lowest_w;
        if (y == 0) y += lowest_h;

        truth[i*5+0] = x;
        truth[i*5+1] = y;
@@ -524,7 +524,7 @@
char **get_labels_custom(char *filename, int *size)
{
    list *plist = get_paths(filename);
    if(size) *size = plist->size;
    if(size) *size = plist->size;
    char **labels = (char **)list_to_array(plist);
    free_list(plist);
    return labels;
@@ -532,7 +532,7 @@

// Convenience wrapper around get_labels_custom(): forwards `filename` and
// passes NULL for the size argument, then returns the resulting array.
char **get_labels(char *filename)
{
    char **labels = get_labels_custom(filename, NULL);
    return labels;
}

void free_data(data d)
@@ -742,22 +742,22 @@

    d.y = make_matrix(n, 5*boxes);
    for(i = 0; i < n; ++i){
        const char *filename = random_paths[i];
        const char *filename = random_paths[i];

        int flag = (c >= 3);
        IplImage *src;
        if ((src = cvLoadImage(filename, flag)) == 0)
        {
            fprintf(stderr, "Cannot load image \"%s\"\n", filename);
            char buff[256];
            sprintf(buff, "echo %s >> bad.list", filename);
            system(buff);
            continue;
            //exit(0);
        }
        int flag = (c >= 3);
        IplImage *src;
        if ((src = cvLoadImage(filename, flag)) == 0)
        {
            fprintf(stderr, "Cannot load image \"%s\"\n", filename);
            char buff[256];
            sprintf(buff, "echo %s >> bad.list", filename);
            system(buff);
            continue;
            //exit(0);
        }

        int oh = src->height;
        int ow = src->width;
        int oh = src->height;
        int ow = src->width;

        int dw = (ow*jitter);
        int dh = (oh*jitter);
@@ -778,81 +778,81 @@
        float dx = ((float)pleft/ow)/sx;
        float dy = ((float)ptop /oh)/sy;

        float dhue = rand_uniform_strong(-hue, hue);
        float dsat = rand_scale(saturation);
        float dexp = rand_scale(exposure);
        float dhue = rand_uniform_strong(-hue, hue);
        float dsat = rand_scale(saturation);
        float dexp = rand_scale(exposure);

        image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, jitter, dhue, dsat, dexp);
        d.X.vals[i] = ai.data;
		
        //show_image(ai, "aug");
        //cvWaitKey(0);
        image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, jitter, dhue, dsat, dexp);
        d.X.vals[i] = ai.data;
        
        //show_image(ai, "aug");
        //cvWaitKey(0);

        fill_truth_detection(filename, boxes, d.y.vals[i], classes, flip, dx, dy, 1./sx, 1./sy, small_object, w, h);

        cvReleaseImage(&src);
        cvReleaseImage(&src);
    }
    free(random_paths);
    return d;
}
#else   // OPENCV
#else    // OPENCV
data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, float jitter, float hue, float saturation, float exposure, int small_object)
{
    c = c ? c : 3;
    char **random_paths = get_random_paths(paths, n, m);
    int i;
    data d = { 0 };
    d.shallow = 0;
    char **random_paths = get_random_paths(paths, n, m);
    int i;
    data d = { 0 };
    d.shallow = 0;

    d.X.rows = n;
    d.X.vals = calloc(d.X.rows, sizeof(float*));
    d.X.cols = h*w*c;
    d.X.rows = n;
    d.X.vals = calloc(d.X.rows, sizeof(float*));
    d.X.cols = h*w*c;

    d.y = make_matrix(n, 5 * boxes);
    for (i = 0; i < n; ++i) {
        image orig = load_image(random_paths[i], 0, 0, c);
    d.y = make_matrix(n, 5 * boxes);
    for (i = 0; i < n; ++i) {
        image orig = load_image(random_paths[i], 0, 0, c);

        int oh = orig.h;
        int ow = orig.w;
        int oh = orig.h;
        int ow = orig.w;

        int dw = (ow*jitter);
        int dh = (oh*jitter);
        int dw = (ow*jitter);
        int dh = (oh*jitter);

        int pleft = rand_uniform_strong(-dw, dw);
        int pright = rand_uniform_strong(-dw, dw);
        int ptop = rand_uniform_strong(-dh, dh);
        int pbot = rand_uniform_strong(-dh, dh);
        int pleft = rand_uniform_strong(-dw, dw);
        int pright = rand_uniform_strong(-dw, dw);
        int ptop = rand_uniform_strong(-dh, dh);
        int pbot = rand_uniform_strong(-dh, dh);

        int swidth = ow - pleft - pright;
        int sheight = oh - ptop - pbot;
        int swidth = ow - pleft - pright;
        int sheight = oh - ptop - pbot;

        float sx = (float)swidth / ow;
        float sy = (float)sheight / oh;
        float sx = (float)swidth / ow;
        float sy = (float)sheight / oh;

        int flip = use_flip ? random_gen() % 2 : 0;
        image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
        int flip = use_flip ? random_gen() % 2 : 0;
        image cropped = crop_image(orig, pleft, ptop, swidth, sheight);

        float dx = ((float)pleft / ow) / sx;
        float dy = ((float)ptop / oh) / sy;
        float dx = ((float)pleft / ow) / sx;
        float dy = ((float)ptop / oh) / sy;

        image sized = resize_image(cropped, w, h);
        if (flip) flip_image(sized);
        random_distort_image(sized, hue, saturation, exposure);
        d.X.vals[i] = sized.data;
        image sized = resize_image(cropped, w, h);
        if (flip) flip_image(sized);
        random_distort_image(sized, hue, saturation, exposure);
        d.X.vals[i] = sized.data;

        fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, dx, dy, 1. / sx, 1. / sy, small_object, w, h);
        fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, dx, dy, 1. / sx, 1. / sy, small_object, w, h);

        free_image(orig);
        free_image(cropped);
    }
    free(random_paths);
    return d;
        free_image(orig);
        free_image(cropped);
    }
    free(random_paths);
    return d;
}
#endif  // OPENCV
#endif    // OPENCV

void *load_thread(void *ptr)
{
    //srand(time(0));
    //srand(time(0));
    //printf("Loading data: %d\n", random_gen());
    load_args a = *(struct load_args*)ptr;
    if(a.exposure == 0) a.exposure = 1;
@@ -878,9 +878,9 @@
    } else if (a.type == IMAGE_DATA){
        *(a.im) = load_image(a.path, 0, 0, a.c);
        *(a.resized) = resize_image(*(a.im), a.w, a.h);
    }else if (a.type == LETTERBOX_DATA) {
        *(a.im) = load_image(a.path, 0, 0, a.c);
        *(a.resized) = letterbox_image(*(a.im), a.w, a.h);
    }else if (a.type == LETTERBOX_DATA) {
        *(a.im) = load_image(a.path, 0, 0, a.c);
        *(a.resized) = letterbox_image(*(a.im), a.w, a.h);
    } else if (a.type == TAG_DATA){
        *a.d = load_data_tag(a.paths, a.n, a.m, a.classes, a.flip, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
    }
@@ -899,7 +899,7 @@

void *load_threads(void *ptr)
{
    //srand(time(0));
    //srand(time(0));
    int i;
    load_args args = *(load_args *)ptr;
    if (args.threads == 0) args.threads = 1;

 src/demo.c

@@ -64,25 +64,25 @@
void *fetch_in_thread(void *ptr)
{
    //in = get_image_from_stream(cap);
    int dont_close_stream = 0;  // set 1 if your IP-camera periodically turns off and turns on video-stream
    if(letter_box) 
        in_s = get_image_from_stream_letterbox(cap, net.w, net.h, net.c, &in_img, cpp_video_capture, dont_close_stream);
    else
        in_s = get_image_from_stream_resize(cap, net.w, net.h, net.c, &in_img, cpp_video_capture, dont_close_stream);
    int dont_close_stream = 0;    // set 1 if your IP-camera periodically turns off and turns on video-stream
    if(letter_box)
        in_s = get_image_from_stream_letterbox(cap, net.w, net.h, net.c, &in_img, cpp_video_capture, dont_close_stream);
    else
        in_s = get_image_from_stream_resize(cap, net.w, net.h, net.c, &in_img, cpp_video_capture, dont_close_stream);
    if(!in_s.data){
        //error("Stream closed.");
        printf("Stream closed.\n");
        flag_exit = 1;
        return EXIT_FAILURE;
        printf("Stream closed.\n");
        flag_exit = 1;
        return EXIT_FAILURE;
    }
    //in_s = resize_image(in, net.w, net.h);
	

    return 0;
}

void *detect_in_thread(void *ptr)
{
    float nms = .45;    // 0.4F
    float nms = .45;    // 0.4F

    layer l = net.layers[net.n-1];
    float *X = det_s.data;
@@ -94,29 +94,29 @@

    free_image(det_s);

    int nboxes = 0;
    detection *dets = NULL;
    if (letter_box)
        dets = get_network_boxes(&net, in_img->width, in_img->height, demo_thresh, demo_thresh, 0, 1, &nboxes, 1); // letter box
    else
        dets = get_network_boxes(&net, det_s.w, det_s.h, demo_thresh, demo_thresh, 0, 1, &nboxes, 0); // resized
    //if (nms) do_nms_obj(dets, nboxes, l.classes, nms);    // bad results
    if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
	
    int nboxes = 0;
    detection *dets = NULL;
    if (letter_box)
        dets = get_network_boxes(&net, in_img->width, in_img->height, demo_thresh, demo_thresh, 0, 1, &nboxes, 1); // letter box
    else
        dets = get_network_boxes(&net, det_s.w, det_s.h, demo_thresh, demo_thresh, 0, 1, &nboxes, 0); // resized
    //if (nms) do_nms_obj(dets, nboxes, l.classes, nms);    // bad results
    if (nms) do_nms_sort(dets, nboxes, l.classes, nms);


    printf("\033[2J");
    printf("\033[1;1H");
    printf("\nFPS:%.1f\n",fps);
    printf("Objects:\n\n");

    ipl_images[demo_index] = det_img;
    det_img = ipl_images[(demo_index + FRAMES / 2 + 1) % FRAMES];
    ipl_images[demo_index] = det_img;
    det_img = ipl_images[(demo_index + FRAMES / 2 + 1) % FRAMES];
    demo_index = (demo_index + 1)%FRAMES;
	    
    draw_detections_cv_v3(det_img, dets, nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes, demo_ext_output);
    free_detections(dets, nboxes);

    return 0;
    draw_detections_cv_v3(det_img, dets, nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes, demo_ext_output);
    free_detections(dets, nboxes);

    return 0;
}

double get_wall_time()
@@ -129,7 +129,7 @@
}

void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes,
    int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show, int ext_output)
    int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show, int ext_output)
{
    //skip = frame_skip;
    image **alphabet = load_alphabet();
@@ -138,40 +138,40 @@
    demo_alphabet = alphabet;
    demo_classes = classes;
    demo_thresh = thresh;
    demo_ext_output = ext_output;
    demo_ext_output = ext_output;
    printf("Demo\n");
    net = parse_network_cfg_custom(cfgfile, 1); // set batch=1
    net = parse_network_cfg_custom(cfgfile, 1);    // set batch=1
    if(weightfile){
        load_weights(&net, weightfile);
    }
    //set_batch_network(&net, 1);
    fuse_conv_batchnorm(net);
    fuse_conv_batchnorm(net);
    srand(2222222);

    if(filename){
        printf("video file: %s\n", filename);
//#ifdef CV_VERSION_EPOCH   // OpenCV 2.x
//      cap = cvCaptureFromFile(filename);
//#else                 // OpenCV 3.x
        cpp_video_capture = 1;
        cap = get_capture_video_stream(filename);
//#ifdef CV_VERSION_EPOCH    // OpenCV 2.x
//        cap = cvCaptureFromFile(filename);
//#else                    // OpenCV 3.x
        cpp_video_capture = 1;
        cap = get_capture_video_stream(filename);
//#endif
    }else{
        printf("Webcam index: %d\n", cam_index);
//#ifdef CV_VERSION_EPOCH   // OpenCV 2.x
        printf("Webcam index: %d\n", cam_index);
//#ifdef CV_VERSION_EPOCH    // OpenCV 2.x
//        cap = cvCaptureFromCAM(cam_index);
//#else                 // OpenCV 3.x
        cpp_video_capture = 1;
        cap = get_capture_webcam(cam_index);
//#else                    // OpenCV 3.x
        cpp_video_capture = 1;
        cap = get_capture_webcam(cam_index);
//#endif
    }

    if (!cap) {
    if (!cap) {
#ifdef WIN32
        printf("Check that you have copied file opencv_ffmpeg340_64.dll to the same directory where is darknet.exe \n");
        printf("Check that you have copied file opencv_ffmpeg340_64.dll to the same directory where is darknet.exe \n");
#endif
        error("Couldn't connect to webcam.\n");
    }
        error("Couldn't connect to webcam.\n");
    }

    layer l = net.layers[net.n-1];
    int j;
@@ -184,51 +184,51 @@
    probs = (float **)calloc(l.w*l.h*l.n, sizeof(float *));
    for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = (float *)calloc(l.classes, sizeof(float *));

    flag_exit = 0;
    flag_exit = 0;

    pthread_t fetch_thread;
    pthread_t detect_thread;

    fetch_in_thread(0);
    det_img = in_img;
    det_img = in_img;
    det_s = in_s;

    fetch_in_thread(0);
    detect_in_thread(0);
    det_img = in_img;
    det_img = in_img;
    det_s = in_s;

    for(j = 0; j < FRAMES/2; ++j){
        fetch_in_thread(0);
        detect_in_thread(0);
        det_img = in_img;
        det_img = in_img;
        det_s = in_s;
    }

    int count = 0;
    if(!prefix && !dont_show){
        cvNamedWindow("Demo", CV_WINDOW_NORMAL); 
        cvNamedWindow("Demo", CV_WINDOW_NORMAL);
        cvMoveWindow("Demo", 0, 0);
        cvResizeWindow("Demo", 1352, 1013);
    }

    CvVideoWriter* output_video_writer = NULL;    // cv::VideoWriter output_video;
    if (out_filename && !flag_exit)
    {
        CvSize size;
        size.width = det_img->width, size.height = det_img->height;
        int src_fps = 25;
        src_fps = get_stream_fps(cap, cpp_video_capture);
    CvVideoWriter* output_video_writer = NULL;    // cv::VideoWriter output_video;
    if (out_filename && !flag_exit)
    {
        CvSize size;
        size.width = det_img->width, size.height = det_img->height;
        int src_fps = 25;
        src_fps = get_stream_fps(cap, cpp_video_capture);

        //const char* output_name = "test_dnn_out.avi";
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('H', '2', '6', '4'), src_fps, size, 1);
        output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('D', 'I', 'V', 'X'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'J', 'P', 'G'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'P', '4', 'V'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'P', '4', '2'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('X', 'V', 'I', 'D'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('W', 'M', 'V', '2'), src_fps, size, 1);
    }
        //const char* output_name = "test_dnn_out.avi";
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('H', '2', '6', '4'), src_fps, size, 1);
        output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('D', 'I', 'V', 'X'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'J', 'P', 'G'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'P', '4', 'V'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'P', '4', '2'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('X', 'V', 'I', 'D'), src_fps, size, 1);
        //output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('W', 'M', 'V', '2'), src_fps, size, 1);
    }

    double before = get_wall_time();

@@ -239,66 +239,66 @@
            if(pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");

            if(!prefix){
                if (!dont_show) {
                    show_image_cv_ipl(show_img, "Demo");
                    int c = cvWaitKey(1);
                    if (c == 10) {
                        if (frame_skip == 0) frame_skip = 60;
                        else if (frame_skip == 4) frame_skip = 0;
                        else if (frame_skip == 60) frame_skip = 4;
                        else frame_skip = 0;
                    }
                    else if (c == 27 || c == 1048603) // ESC - exit (OpenCV 2.x / 3.x)
                    {
                        flag_exit = 1;
                    }
                }
                if (!dont_show) {
                    show_image_cv_ipl(show_img, "Demo");
                    int c = cvWaitKey(1);
                    if (c == 10) {
                        if (frame_skip == 0) frame_skip = 60;
                        else if (frame_skip == 4) frame_skip = 0;
                        else if (frame_skip == 60) frame_skip = 4;
                        else frame_skip = 0;
                    }
                    else if (c == 27 || c == 1048603) // ESC - exit (OpenCV 2.x / 3.x)
                    {
                        flag_exit = 1;
                    }
                }
            }else{
                char buff[256];
                sprintf(buff, "%s_%08d.jpg", prefix, count);
                cvSaveImage(buff, show_img, 0);
                cvSaveImage(buff, show_img, 0);
                //save_image(disp, buff);
            }

            // if you run it with param -http_port 8090  then open URL in your web-browser: http://localhost:8090
            if (http_stream_port > 0 && show_img) {
                //int port = 8090;
                int port = http_stream_port;
                int timeout = 200;
                int jpeg_quality = 30;  // 1 - 100
                send_mjpeg(show_img, port, timeout, jpeg_quality);
            }
            // if you run it with param -http_port 8090  then open URL in your web-browser: http://localhost:8090
            if (http_stream_port > 0 && show_img) {
                //int port = 8090;
                int port = http_stream_port;
                int timeout = 200;
                int jpeg_quality = 30;    // 1 - 100
                send_mjpeg(show_img, port, timeout, jpeg_quality);
            }

            // save video file
            if (output_video_writer && show_img) {
                cvWriteFrame(output_video_writer, show_img);
                printf("\n cvWriteFrame \n");
            }
            // save video file
            if (output_video_writer && show_img) {
                cvWriteFrame(output_video_writer, show_img);
                printf("\n cvWriteFrame \n");
            }

            cvReleaseImage(&show_img);
            cvReleaseImage(&show_img);

            pthread_join(fetch_thread, 0);
            pthread_join(detect_thread, 0);

            if (flag_exit == 1) break;
            if (flag_exit == 1) break;

            if(delay == 0){
                show_img = det_img;
                show_img = det_img;
            }
            det_img = in_img;
            det_img = in_img;
            det_s = in_s;
        }else {
            fetch_in_thread(0);
            det_img = in_img;
            det_img = in_img;
            det_s = in_s;
            detect_in_thread(0);

            show_img = det_img;
            if (!dont_show) {
                show_image_cv_ipl(show_img, "Demo");
                cvWaitKey(1);
            }
            cvReleaseImage(&show_img);
            show_img = det_img;
            if (!dont_show) {
                show_image_cv_ipl(show_img, "Demo");
                cvWaitKey(1);
            }
            cvReleaseImage(&show_img);
        }
        --delay;
        if(delay < 0){
@@ -310,42 +310,42 @@
            before = after;
        }
    }
    printf("input video stream closed. \n");
    if (output_video_writer) {
        cvReleaseVideoWriter(&output_video_writer);
        printf("output_video_writer closed. \n");
    }
    printf("input video stream closed. \n");
    if (output_video_writer) {
        cvReleaseVideoWriter(&output_video_writer);
        printf("output_video_writer closed. \n");
    }

    // free memory
    cvReleaseImage(&show_img);
    cvReleaseImage(&in_img);
    free_image(in_s);
    // free memory
    cvReleaseImage(&show_img);
    cvReleaseImage(&in_img);
    free_image(in_s);

    free(avg);
    for (j = 0; j < FRAMES; ++j) free(predictions[j]);
    for (j = 0; j < FRAMES; ++j) free_image(images[j]);
    free(avg);
    for (j = 0; j < FRAMES; ++j) free(predictions[j]);
    for (j = 0; j < FRAMES; ++j) free_image(images[j]);

    for (j = 0; j < l.w*l.h*l.n; ++j) free(probs[j]);
    free(boxes);
    free(probs);
    for (j = 0; j < l.w*l.h*l.n; ++j) free(probs[j]);
    free(boxes);
    free(probs);

    free_ptrs(names, net.layers[net.n - 1].classes);
    free_ptrs(names, net.layers[net.n - 1].classes);

    int i;
    const int nsize = 8;
    for (j = 0; j < nsize; ++j) {
        for (i = 32; i < 127; ++i) {
            free_image(alphabet[j][i]);
        }
        free(alphabet[j]);
    }
    free(alphabet);
    int i;
    const int nsize = 8;
    for (j = 0; j < nsize; ++j) {
        for (i = 32; i < 127; ++i) {
            free_image(alphabet[j][i]);
        }
        free(alphabet[j]);
    }
    free(alphabet);

    free_network(net);
    free_network(net);
}
#else
// Stub variant of demo() compiled in the #else branch of the conditional that
// guards the full implementation (presumably when OPENCV is not defined).
// All arguments are ignored; it only reports on stderr that the demo needs
// OpenCV.
void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes,
    int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show, int ext_output)
{
    fprintf(stderr, "Demo needs OpenCV for webcam images.\n");
}

 src/detector.c

@@ -27,7 +27,7 @@

IplImage* draw_train_chart(float max_img_loss, int max_batches, int number_of_lines, int img_size);
void draw_train_loss(IplImage* img, int img_size, float avg_loss, float max_img_loss, int current_batch, int max_batches);
#endif  // OPENCV
#endif    // OPENCV

static int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90};

@@ -61,14 +61,14 @@
    srand(time(0));
    network net = nets[0];

    const int actual_batch_size = net.batch * net.subdivisions;
    if (actual_batch_size == 1) {
        printf("\n Error: You set incorrect value batch=1 for Training! You should set batch=64 subdivision=64 \n");
        getchar();
    }
    else if (actual_batch_size < 64) {
            printf("\n Warning: You set batch=%d lower than 64! It is recommended to set batch=64 subdivision=64 \n", actual_batch_size);
    }
    const int actual_batch_size = net.batch * net.subdivisions;
    if (actual_batch_size == 1) {
        printf("\n Error: You set incorrect value batch=1 for Training! You should set batch=64 subdivision=64 \n");
        getchar();
    }
    else if (actual_batch_size < 64) {
            printf("\n Warning: You set batch=%d lower than 64! It is recommended to set batch=64 subdivision=64 \n", actual_batch_size);
    }

    int imgs = net.batch * net.subdivisions * ngpus;
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
@@ -83,26 +83,26 @@
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    int init_w = net.w;
    int init_h = net.h;
    int iter_save;
    iter_save = get_current_batch(net);
    int init_w = net.w;
    int init_h = net.h;
    int iter_save;
    iter_save = get_current_batch(net);

    load_args args = {0};
    args.w = net.w;
    args.h = net.h;
    args.c = net.c;
    args.paths = paths;
    args.c = net.c;
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.classes = classes;
    args.flip = net.flip;
    args.jitter = jitter;
    args.num_boxes = l.max_boxes;
    args.small_object = net.small_object;
    args.small_object = net.small_object;
    args.d = &buffer;
    args.type = DETECTION_DATA;
    args.threads = 16;  // 64
    args.threads = 16;    // 64

    args.angle = net.angle;
    args.exposure = net.exposure;
@@ -110,40 +110,40 @@
    args.hue = net.hue;

#ifdef OPENCV
    args.threads = 3 * ngpus;
    IplImage* img = NULL;
    float max_img_loss = 5;
    int number_of_lines = 100;
    int img_size = 1000;
    if (!dont_show)
        img = draw_train_chart(max_img_loss, net.max_batches, number_of_lines, img_size);
#endif  //OPENCV
    args.threads = 3 * ngpus;
    IplImage* img = NULL;
    float max_img_loss = 5;
    int number_of_lines = 100;
    int img_size = 1000;
    if (!dont_show)
        img = draw_train_chart(max_img_loss, net.max_batches, number_of_lines, img_size);
#endif    //OPENCV

    pthread_t load_thread = load_data(args);
    double time;
    int count = 0;
    //while(i*imgs < N*120){
    while(get_current_batch(net) < net.max_batches){
        if(l.random && count++%10 == 0){
        if(l.random && count++%10 == 0){
            printf("Resizing\n");
            //int dim = (rand() % 12 + (init_w/32 - 5)) * 32;   // +-160
            //int dim = (rand() % 12 + (init_w/32 - 5)) * 32;    // +-160
            //int dim = (rand() % 4 + 16) * 32;
            //if (get_current_batch(net)+100 > net.max_batches) dim = 544;
			
            //int random_val = rand() % 12;
            //int dim_w = (random_val + (init_w / 32 - 5)) * 32;    // +-160
            //int dim_h = (random_val + (init_h / 32 - 5)) * 32;    // +-160
            //if (get_current_batch(net)+100 > net.max_batches) dim = 544;

            float random_val = rand_scale(1.4); // *x or /x
            int dim_w = roundl(random_val*init_w / 32) * 32;
            int dim_h = roundl(random_val*init_h / 32) * 32;
            //int random_val = rand() % 12;
            //int dim_w = (random_val + (init_w / 32 - 5)) * 32;    // +-160
            //int dim_h = (random_val + (init_h / 32 - 5)) * 32;    // +-160

            if (dim_w < 32) dim_w = 32;
            if (dim_h < 32) dim_h = 32;
            float random_val = rand_scale(1.4);    // *x or /x
            int dim_w = roundl(random_val*init_w / 32) * 32;
            int dim_h = roundl(random_val*init_h / 32) * 32;

            printf("%d x %d \n", dim_w, dim_h);
            args.w = dim_w;
            args.h = dim_h;
            if (dim_w < 32) dim_w = 32;
            if (dim_h < 32) dim_h = 32;

            printf("%d x %d \n", dim_w, dim_h);
            args.w = dim_w;
            args.h = dim_h;

            pthread_join(load_thread, 0);
            train = buffer;
@@ -190,28 +190,28 @@
#else
        loss = train_network(net, train);
#endif
        if (avg_loss < 0 || avg_loss != avg_loss) avg_loss = loss;  // if(-inf or nan)
        if (avg_loss < 0 || avg_loss != avg_loss) avg_loss = loss;    // if(-inf or nan)
        avg_loss = avg_loss*.9 + loss*.1;

        i = get_current_batch(net);
        printf("\n %d: %f, %f avg loss, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), (what_time_is_it_now()-time), i*imgs);

#ifdef OPENCV
        if(!dont_show)
            draw_train_loss(img, img_size, avg_loss, max_img_loss, i, net.max_batches);
#endif  // OPENCV
        if(!dont_show)
            draw_train_loss(img, img_size, avg_loss, max_img_loss, i, net.max_batches);
#endif    // OPENCV

        //if (i % 1000 == 0 || (i < 1000 && i % 100 == 0)) {
        //if (i % 100 == 0) {
        if(i >= (iter_save + 100)) {
            iter_save = i;
        //if (i % 1000 == 0 || (i < 1000 && i % 100 == 0)) {
        //if (i % 100 == 0) {
        if(i >= (iter_save + 100)) {
            iter_save = i;
#ifdef GPU
            if (ngpus != 1) sync_nets(nets, ngpus, 0);
            if (ngpus != 1) sync_nets(nets, ngpus, 0);
#endif
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
        }
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
        }
        free_data(train);
    }
#ifdef GPU
@@ -222,870 +222,870 @@
    save_weights(net, buff);

#ifdef OPENCV
    cvReleaseImage(&img);
    cvDestroyAllWindows();
    cvReleaseImage(&img);
    cvDestroyAllWindows();
#endif

    // free memory
    pthread_join(load_thread, 0);
    free_data(buffer);
    // free memory
    pthread_join(load_thread, 0);
    free_data(buffer);

    free(base);
    free(paths);
    free_list_contents(plist);
    free_list(plist);
    free(base);
    free(paths);
    free_list_contents(plist);
    free_list(plist);

    free_list_contents_kvp(options);
    free_list(options);
    free_list_contents_kvp(options);
    free_list(options);

    free(nets);
    free_network(net);
    free(nets);
    free_network(net);
}


/*
 * Extract the numeric image id from an image file name.
 *
 * The id is parsed with atoi() from the text that follows the last '_' in
 * `filename`; when there is no '_', from the text after the last '/'; and
 * when neither character is present, from the start of the string.
 *
 * Returns the parsed integer, or 0 when `filename` is NULL or no number
 * starts at that position.
 */
static int get_coco_image_id(char *filename)
{
    char *start;
    char *underscore;

    if (!filename) return 0;

    start = strrchr(filename, '/');
    underscore = strrchr(filename, '_');
    if (underscore) start = underscore;

    /* strrchr() yields NULL when the character is absent; fall back to the
     * whole name instead of computing NULL + 1. */
    return atoi(start ? start + 1 : filename);
}

/*
 * Append one JSON object per (box, class) pair with a non-zero probability
 * to `fp`:
 *   {"image_id":..., "category_id":..., "bbox":[x, y, w, h], "score":...},
 *
 * fp         - open output stream
 * image_path - image file name; the image id comes from get_coco_image_id()
 * dets       - array of `num_boxes` detections; bbox.x/bbox.y are treated as
 *              the box centre and bbox.w/bbox.h as its size
 * classes    - number of entries read from each dets[i].prob
 * w, h       - upper clamp for the right and bottom box edges
 *
 * The box is clamped to [0, w] x [0, h] and written as left, top, width,
 * height. Every record ends with ",\n"; the enclosing brackets and the
 * trailing comma are left to the caller.
 */
static void print_cocos(FILE *fp, char *image_path, detection *dets, int num_boxes, int classes, int w, int h)
{
    /* Class indices beyond the coco_ids table have no category id to
     * report, so they are skipped instead of reading past the array. */
    const int known_ids = (int)(sizeof(coco_ids) / sizeof(coco_ids[0]));
    const int class_limit = classes < known_ids ? classes : known_ids;
    int i, j;
    int image_id = get_coco_image_id(image_path);

    for (i = 0; i < num_boxes; ++i) {
        float xmin = dets[i].bbox.x - dets[i].bbox.w / 2.;
        float xmax = dets[i].bbox.x + dets[i].bbox.w / 2.;
        float ymin = dets[i].bbox.y - dets[i].bbox.h / 2.;
        float ymax = dets[i].bbox.y + dets[i].bbox.h / 2.;

        if (xmin < 0) xmin = 0;
        if (ymin < 0) ymin = 0;
        if (xmax > w) xmax = w;
        if (ymax > h) ymax = h;

        float bx = xmin;
        float by = ymin;
        float bw = xmax - xmin;
        float bh = ymax - ymin;

        for (j = 0; j < class_limit; ++j) {
            if (dets[i].prob[j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, dets[i].prob[j]);
        }
    }
}

/*
 * Write every detection with a non-zero class probability to the per-class
 * stream fps[class] as one line: "<id> <prob> <xmin> <ymin> <xmax> <ymax>".
 *
 * fps     - array of `classes` output streams, one per class; a NULL entry
 *           (file could not be opened) is skipped
 * id      - image identifier printed in the first column
 * dets    - array of `total` detections; bbox.x/bbox.y are treated as the
 *           box centre and bbox.w/bbox.h as its size
 * classes - number of entries read from each dets[i].prob
 * w, h    - upper clamp for xmax / ymax
 *
 * Each edge is shifted by +1 before clamping; xmin/ymin are clamped to a
 * minimum of 1.
 */
void print_detector_detections(FILE **fps, char *id, detection *dets, int total, int classes, int w, int h)
{
    int i, j;
    for (i = 0; i < total; ++i) {
        float xmin = dets[i].bbox.x - dets[i].bbox.w / 2. + 1;
        float xmax = dets[i].bbox.x + dets[i].bbox.w / 2. + 1;
        float ymin = dets[i].bbox.y - dets[i].bbox.h / 2. + 1;
        float ymax = dets[i].bbox.y + dets[i].bbox.h / 2. + 1;

        if (xmin < 1) xmin = 1;
        if (ymin < 1) ymin = 1;
        if (xmax > w) xmax = w;
        if (ymax > h) ymax = h;

        for (j = 0; j < classes; ++j) {
            /* fprintf() on a NULL stream is undefined; skip unopened files. */
            if (!fps[j]) continue;
            if (dets[i].prob[j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j],
                xmin, ymin, xmax, ymax);
        }
    }
}

/*
 * Write every detection with a non-zero class probability to `fp` as one
 * line: "<id> <class> <prob> <xmin> <ymin> <xmax> <ymax>".
 *
 * fp      - open output stream
 * id      - image number printed in the first column
 * dets    - array of `total` detections; bbox.x/bbox.y are treated as the
 *           box centre and bbox.w/bbox.h as its size
 * classes - number of entries read from each dets[i].prob
 * w, h    - upper clamp for xmax / ymax (xmin / ymin are clamped at 0)
 *
 * The class column is 1-based (class index + 1).
 */
void print_imagenet_detections(FILE *fp, int id, detection *dets, int total, int classes, int w, int h)
{
    int i, j;
    for (i = 0; i < total; ++i) {
        float xmin = dets[i].bbox.x - dets[i].bbox.w / 2.;
        float xmax = dets[i].bbox.x + dets[i].bbox.w / 2.;
        float ymin = dets[i].bbox.y - dets[i].bbox.h / 2.;
        float ymax = dets[i].bbox.y + dets[i].bbox.h / 2.;

        if (xmin < 0) xmin = 0;
        if (ymin < 0) ymin = 0;
        if (xmax > w) xmax = w;
        if (ymax > h) ymax = h;

        for (j = 0; j < classes; ++j) {
            if (dets[i].prob[j]) fprintf(fp, "%d %d %f %f %f %f %f\n", id, j + 1, dets[i].prob[j],
                xmin, ymin, xmax, ymax);
        }
    }
}

void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
{
    int j;
    list *options = read_data_cfg(datacfg);
    char *valid_images = option_find_str(options, "valid", "data/train.list");
    char *name_list = option_find_str(options, "names", "data/names.list");
    char *prefix = option_find_str(options, "results", "results");
    char **names = get_labels(name_list);
    char *mapf = option_find_str(options, "map", 0);
    int *map = 0;
    if (mapf) map = read_map(mapf);
    int j;
    list *options = read_data_cfg(datacfg);
    char *valid_images = option_find_str(options, "valid", "data/train.list");
    char *name_list = option_find_str(options, "names", "data/names.list");
    char *prefix = option_find_str(options, "results", "results");
    char **names = get_labels(name_list);
    char *mapf = option_find_str(options, "map", 0);
    int *map = 0;
    if (mapf) map = read_map(mapf);

    network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1
    if (weightfile) {
        load_weights(&net, weightfile);
    }
    //set_batch_network(&net, 1);
    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    srand(time(0));
    network net = parse_network_cfg_custom(cfgfile, 1);    // set batch=1
    if (weightfile) {
        load_weights(&net, weightfile);
    }
    //set_batch_network(&net, 1);
    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    srand(time(0));

    list *plist = get_paths(valid_images);
    char **paths = (char **)list_to_array(plist);
    list *plist = get_paths(valid_images);
    char **paths = (char **)list_to_array(plist);

    layer l = net.layers[net.n - 1];
    int classes = l.classes;
    layer l = net.layers[net.n - 1];
    int classes = l.classes;

    char buff[1024];
    char *type = option_find_str(options, "eval", "voc");
    FILE *fp = 0;
    FILE **fps = 0;
    int coco = 0;
    int imagenet = 0;
    if (0 == strcmp(type, "coco")) {
        if (!outfile) outfile = "coco_results";
        snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
        fp = fopen(buff, "w");
        fprintf(fp, "[\n");
        coco = 1;
    }
    else if (0 == strcmp(type, "imagenet")) {
        if (!outfile) outfile = "imagenet-detection";
        snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
        fp = fopen(buff, "w");
        imagenet = 1;
        classes = 200;
    }
    else {
        if (!outfile) outfile = "comp4_det_test_";
        fps = calloc(classes, sizeof(FILE *));
        for (j = 0; j < classes; ++j) {
            snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
            fps[j] = fopen(buff, "w");
        }
    }
    char buff[1024];
    char *type = option_find_str(options, "eval", "voc");
    FILE *fp = 0;
    FILE **fps = 0;
    int coco = 0;
    int imagenet = 0;
    if (0 == strcmp(type, "coco")) {
        if (!outfile) outfile = "coco_results";
        snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
        fp = fopen(buff, "w");
        fprintf(fp, "[\n");
        coco = 1;
    }
    else if (0 == strcmp(type, "imagenet")) {
        if (!outfile) outfile = "imagenet-detection";
        snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
        fp = fopen(buff, "w");
        imagenet = 1;
        classes = 200;
    }
    else {
        if (!outfile) outfile = "comp4_det_test_";
        fps = calloc(classes, sizeof(FILE *));
        for (j = 0; j < classes; ++j) {
            snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
            fps[j] = fopen(buff, "w");
        }
    }


    int m = plist->size;
    int i = 0;
    int t;
    int m = plist->size;
    int i = 0;
    int t;

    float thresh = .005;
    float nms = .45;
    float thresh = .005;
    float nms = .45;

    int nthreads = 4;
    image *val = calloc(nthreads, sizeof(image));
    image *val_resized = calloc(nthreads, sizeof(image));
    image *buf = calloc(nthreads, sizeof(image));
    image *buf_resized = calloc(nthreads, sizeof(image));
    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
    int nthreads = 4;
    image *val = calloc(nthreads, sizeof(image));
    image *val_resized = calloc(nthreads, sizeof(image));
    image *buf = calloc(nthreads, sizeof(image));
    image *buf_resized = calloc(nthreads, sizeof(image));
    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));

    load_args args = { 0 };
    args.w = net.w;
    args.h = net.h;
    args.c = net.c;
    args.type = IMAGE_DATA;
    //args.type = LETTERBOX_DATA;
    load_args args = { 0 };
    args.w = net.w;
    args.h = net.h;
    args.c = net.c;
    args.type = IMAGE_DATA;
    //args.type = LETTERBOX_DATA;

    for (t = 0; t < nthreads; ++t) {
        args.path = paths[i + t];
        args.im = &buf[t];
        args.resized = &buf_resized[t];
        thr[t] = load_data_in_thread(args);
    }
    time_t start = time(0);
    for (i = nthreads; i < m + nthreads; i += nthreads) {
        fprintf(stderr, "%d\n", i);
        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
            pthread_join(thr[t], 0);
            val[t] = buf[t];
            val_resized[t] = buf_resized[t];
        }
        for (t = 0; t < nthreads && i + t < m; ++t) {
            args.path = paths[i + t];
            args.im = &buf[t];
            args.resized = &buf_resized[t];
            thr[t] = load_data_in_thread(args);
        }
        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
            char *path = paths[i + t - nthreads];
            char *id = basecfg(path);
            float *X = val_resized[t].data;
            network_predict(net, X);
            int w = val[t].w;
            int h = val[t].h;
            int nboxes = 0;
            int letterbox = (args.type == LETTERBOX_DATA);
            detection *dets = get_network_boxes(&net, w, h, thresh, .5, map, 0, &nboxes, letterbox);
            if (nms) do_nms_sort(dets, nboxes, classes, nms);
            if (coco) {
                print_cocos(fp, path, dets, nboxes, classes, w, h);
            }
            else if (imagenet) {
                print_imagenet_detections(fp, i + t - nthreads + 1, dets, nboxes, classes, w, h);
            }
            else {
                print_detector_detections(fps, id, dets, nboxes, classes, w, h);
            }
            free_detections(dets, nboxes);
            free(id);
            free_image(val[t]);
            free_image(val_resized[t]);
        }
    }
    for (j = 0; j < classes; ++j) {
        if (fps) fclose(fps[j]);
    }
    if (coco) {
        fseek(fp, -2, SEEK_CUR);
        fprintf(fp, "\n]\n");
        fclose(fp);
    }
    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)time(0) - start);
    for (t = 0; t < nthreads; ++t) {
        args.path = paths[i + t];
        args.im = &buf[t];
        args.resized = &buf_resized[t];
        thr[t] = load_data_in_thread(args);
    }
    time_t start = time(0);
    for (i = nthreads; i < m + nthreads; i += nthreads) {
        fprintf(stderr, "%d\n", i);
        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
            pthread_join(thr[t], 0);
            val[t] = buf[t];
            val_resized[t] = buf_resized[t];
        }
        for (t = 0; t < nthreads && i + t < m; ++t) {
            args.path = paths[i + t];
            args.im = &buf[t];
            args.resized = &buf_resized[t];
            thr[t] = load_data_in_thread(args);
        }
        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
            char *path = paths[i + t - nthreads];
            char *id = basecfg(path);
            float *X = val_resized[t].data;
            network_predict(net, X);
            int w = val[t].w;
            int h = val[t].h;
            int nboxes = 0;
            int letterbox = (args.type == LETTERBOX_DATA);
            detection *dets = get_network_boxes(&net, w, h, thresh, .5, map, 0, &nboxes, letterbox);
            if (nms) do_nms_sort(dets, nboxes, classes, nms);
            if (coco) {
                print_cocos(fp, path, dets, nboxes, classes, w, h);
            }
            else if (imagenet) {
                print_imagenet_detections(fp, i + t - nthreads + 1, dets, nboxes, classes, w, h);
            }
            else {
                print_detector_detections(fps, id, dets, nboxes, classes, w, h);
            }
            free_detections(dets, nboxes);
            free(id);
            free_image(val[t]);
            free_image(val_resized[t]);
        }
    }
    for (j = 0; j < classes; ++j) {
        if (fps) fclose(fps[j]);
    }
    if (coco) {
        fseek(fp, -2, SEEK_CUR);
        fprintf(fp, "\n]\n");
        fclose(fp);
    }
    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)time(0) - start);
}

void validate_detector_recall(char *datacfg, char *cfgfile, char *weightfile)
{
    network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1
    if (weightfile) {
        load_weights(&net, weightfile);
    }
    //set_batch_network(&net, 1);
    fuse_conv_batchnorm(net);
    srand(time(0));
    network net = parse_network_cfg_custom(cfgfile, 1);    // set batch=1
    if (weightfile) {
        load_weights(&net, weightfile);
    }
    //set_batch_network(&net, 1);
    fuse_conv_batchnorm(net);
    srand(time(0));

    //list *plist = get_paths("data/coco_val_5k.list");
    list *options = read_data_cfg(datacfg);
    char *valid_images = option_find_str(options, "valid", "data/train.txt");
    list *plist = get_paths(valid_images);
    char **paths = (char **)list_to_array(plist);
    //list *plist = get_paths("data/coco_val_5k.list");
    list *options = read_data_cfg(datacfg);
    char *valid_images = option_find_str(options, "valid", "data/train.txt");
    list *plist = get_paths(valid_images);
    char **paths = (char **)list_to_array(plist);

    layer l = net.layers[net.n - 1];
    layer l = net.layers[net.n - 1];

    int j, k;
    int j, k;

    int m = plist->size;
    int i = 0;
    int m = plist->size;
    int i = 0;

    float thresh = .001;
    float iou_thresh = .5;
    float nms = .4;
    float thresh = .001;
    float iou_thresh = .5;
    float nms = .4;

    int total = 0;
    int correct = 0;
    int proposals = 0;
    float avg_iou = 0;
    int total = 0;
    int correct = 0;
    int proposals = 0;
    float avg_iou = 0;

    for (i = 0; i < m; ++i) {
        char *path = paths[i];
        image orig = load_image(path, 0, 0, net.c);
        image sized = resize_image(orig, net.w, net.h);
        char *id = basecfg(path);
        network_predict(net, sized.data);
        int nboxes = 0;
        int letterbox = 0;
        detection *dets = get_network_boxes(&net, sized.w, sized.h, thresh, .5, 0, 1, &nboxes, letterbox);
        if (nms) do_nms_obj(dets, nboxes, 1, nms);
    for (i = 0; i < m; ++i) {
        char *path = paths[i];
        image orig = load_image(path, 0, 0, net.c);
        image sized = resize_image(orig, net.w, net.h);
        char *id = basecfg(path);
        network_predict(net, sized.data);
        int nboxes = 0;
        int letterbox = 0;
        detection *dets = get_network_boxes(&net, sized.w, sized.h, thresh, .5, 0, 1, &nboxes, letterbox);
        if (nms) do_nms_obj(dets, nboxes, 1, nms);

        char labelpath[4096];
        replace_image_to_label(path, labelpath);
        char labelpath[4096];
        replace_image_to_label(path, labelpath);

        int num_labels = 0;
        box_label *truth = read_boxes(labelpath, &num_labels);
        for (k = 0; k < nboxes; ++k) {
            if (dets[k].objectness > thresh) {
                ++proposals;
            }
        }
        for (j = 0; j < num_labels; ++j) {
            ++total;
            box t = { truth[j].x, truth[j].y, truth[j].w, truth[j].h };
            float best_iou = 0;
            for (k = 0; k < nboxes; ++k) {
                float iou = box_iou(dets[k].bbox, t);
                if (dets[k].objectness > thresh && iou > best_iou) {
                    best_iou = iou;
                }
            }
            avg_iou += best_iou;
            if (best_iou > iou_thresh) {
                ++correct;
            }
        }
        //fprintf(stderr, " %s - %s - ", paths[i], labelpath);
        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals / (i + 1), avg_iou * 100 / total, 100.*correct / total);
        free(id);
        free_image(orig);
        free_image(sized);
    }
        int num_labels = 0;
        box_label *truth = read_boxes(labelpath, &num_labels);
        for (k = 0; k < nboxes; ++k) {
            if (dets[k].objectness > thresh) {
                ++proposals;
            }
        }
        for (j = 0; j < num_labels; ++j) {
            ++total;
            box t = { truth[j].x, truth[j].y, truth[j].w, truth[j].h };
            float best_iou = 0;
            for (k = 0; k < nboxes; ++k) {
                float iou = box_iou(dets[k].bbox, t);
                if (dets[k].objectness > thresh && iou > best_iou) {
                    best_iou = iou;
                }
            }
            avg_iou += best_iou;
            if (best_iou > iou_thresh) {
                ++correct;
            }
        }
        //fprintf(stderr, " %s - %s - ", paths[i], labelpath);
        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals / (i + 1), avg_iou * 100 / total, 100.*correct / total);
        free(id);
        free_image(orig);
        free_image(sized);
    }
}

/* One (detection box, class) candidate collected by validate_detector_map(). */
typedef struct {
    box b;                   /* bounding box copied from the detection */
    float p;                 /* class probability; sort key of detections_comparator() */
    int class_id;            /* class index the probability belongs to */
    int image_index;         /* index of the source image in the validation list */
    int truth_flag;          /* 1 when matched to a ground-truth box, else 0 */
    int unique_truth_index;  /* global index of the matched truth box, -1 if none */
} box_prob;

/*
 * qsort() comparator for box_prob elements: orders by probability `p`,
 * highest first.
 *
 * Returns 1 when *pa has the lower probability, -1 when it has the higher
 * one, and 0 otherwise.
 */
int detections_comparator(const void *pa, const void *pb)
{
    /* Compare through pointers; no need to copy both structs per call. */
    const box_prob *lhs = (const box_prob *)pa;
    const box_prob *rhs = (const box_prob *)pb;

    if (lhs->p < rhs->p) return 1;
    if (lhs->p > rhs->p) return -1;
    return 0;
}

void validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, float thresh_calc_avg_iou)
{
    int j;
    list *options = read_data_cfg(datacfg);
    char *valid_images = option_find_str(options, "valid", "data/train.txt");
    char *difficult_valid_images = option_find_str(options, "difficult", NULL);
    char *name_list = option_find_str(options, "names", "data/names.list");
    char **names = get_labels(name_list);
    char *mapf = option_find_str(options, "map", 0);
    int *map = 0;
    if (mapf) map = read_map(mapf);
    FILE* reinforcement_fd = NULL;
    int j;
    list *options = read_data_cfg(datacfg);
    char *valid_images = option_find_str(options, "valid", "data/train.txt");
    char *difficult_valid_images = option_find_str(options, "difficult", NULL);
    char *name_list = option_find_str(options, "names", "data/names.list");
    char **names = get_labels(name_list);
    char *mapf = option_find_str(options, "map", 0);
    int *map = 0;
    if (mapf) map = read_map(mapf);
    FILE* reinforcement_fd = NULL;

    network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1
    if (weightfile) {
        load_weights(&net, weightfile);
    }
    //set_batch_network(&net, 1);
    fuse_conv_batchnorm(net);
    srand(time(0));
    network net = parse_network_cfg_custom(cfgfile, 1);    // set batch=1
    if (weightfile) {
        load_weights(&net, weightfile);
    }
    //set_batch_network(&net, 1);
    fuse_conv_batchnorm(net);
    srand(time(0));

    list *plist = get_paths(valid_images);
    char **paths = (char **)list_to_array(plist);
    list *plist = get_paths(valid_images);
    char **paths = (char **)list_to_array(plist);

    char **paths_dif = NULL;
    if (difficult_valid_images) {
        list *plist_dif = get_paths(difficult_valid_images);
        paths_dif = (char **)list_to_array(plist_dif);
    }
	

    layer l = net.layers[net.n - 1];
    int classes = l.classes;

    int m = plist->size;
    int i = 0;
    int t;

    const float thresh = .005;
    const float nms = .45;
    const float iou_thresh = 0.5;

    int nthreads = 4;
    image *val = calloc(nthreads, sizeof(image));
    image *val_resized = calloc(nthreads, sizeof(image));
    image *buf = calloc(nthreads, sizeof(image));
    image *buf_resized = calloc(nthreads, sizeof(image));
    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));

    load_args args = { 0 };
    args.w = net.w;
    args.h = net.h;
    args.c = net.c;
    args.type = IMAGE_DATA;
    //args.type = LETTERBOX_DATA;

    //const float thresh_calc_avg_iou = 0.24;
    float avg_iou = 0;
    int tp_for_thresh = 0;
    int fp_for_thresh = 0;

    box_prob *detections = calloc(1, sizeof(box_prob));
    int detections_count = 0;
    int unique_truth_count = 0;

    int *truth_classes_count = calloc(classes, sizeof(int));

    for (t = 0; t < nthreads; ++t) {
        args.path = paths[i + t];
        args.im = &buf[t];
        args.resized = &buf_resized[t];
        thr[t] = load_data_in_thread(args);
    }
    time_t start = time(0);
    for (i = nthreads; i < m + nthreads; i += nthreads) {
        fprintf(stderr, "%d\n", i);
        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
            pthread_join(thr[t], 0);
            val[t] = buf[t];
            val_resized[t] = buf_resized[t];
        }
        for (t = 0; t < nthreads && i + t < m; ++t) {
            args.path = paths[i + t];
            args.im = &buf[t];
            args.resized = &buf_resized[t];
            thr[t] = load_data_in_thread(args);
        }
        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
            const int image_index = i + t - nthreads;
            char *path = paths[image_index];
            char *id = basecfg(path);
            float *X = val_resized[t].data;
            network_predict(net, X);

            int nboxes = 0;
            int letterbox = (args.type == LETTERBOX_DATA);
            float hier_thresh = 0;
            detection *dets = get_network_boxes(&net, 1, 1, thresh, hier_thresh, 0, 0, &nboxes, letterbox);
            //detection *dets = get_network_boxes(&net, val[t].w, val[t].h, thresh, hier_thresh, 0, 1, &nboxes, letterbox); // for letterbox=1
            if (nms) do_nms_sort(dets, nboxes, l.classes, nms);

            char labelpath[4096];
            replace_image_to_label(path, labelpath);
            int num_labels = 0;
            box_label *truth = read_boxes(labelpath, &num_labels);
            int i, j;
            for (j = 0; j < num_labels; ++j) {
                truth_classes_count[truth[j].id]++;
            }

            // difficult
            box_label *truth_dif = NULL;
            int num_labels_dif = 0;
            if (paths_dif)
            {
                char *path_dif = paths_dif[image_index];

                char labelpath_dif[4096];
                replace_image_to_label(path_dif, labelpath_dif);
			
                truth_dif = read_boxes(labelpath_dif, &num_labels_dif);
            }

            const int checkpoint_detections_count = detections_count;

            for (i = 0; i < nboxes; ++i) {

                int class_id;
                for (class_id = 0; class_id < classes; ++class_id) {
                    float prob = dets[i].prob[class_id];
                    if (prob > 0) {
                        detections_count++;
                        detections = realloc(detections, detections_count * sizeof(box_prob));
                        detections[detections_count - 1].b = dets[i].bbox;
                        detections[detections_count - 1].p = prob;
                        detections[detections_count - 1].image_index = image_index;
                        detections[detections_count - 1].class_id = class_id;
                        detections[detections_count - 1].truth_flag = 0;
                        detections[detections_count - 1].unique_truth_index = -1;

                        int truth_index = -1;
                        float max_iou = 0;
                        for (j = 0; j < num_labels; ++j)
                        {
                            box t = { truth[j].x, truth[j].y, truth[j].w, truth[j].h };
                            //printf(" IoU = %f, prob = %f, class_id = %d, truth[j].id = %d \n", 
                            //  box_iou(dets[i].bbox, t), prob, class_id, truth[j].id);
                            float current_iou = box_iou(dets[i].bbox, t);
                            if (current_iou > iou_thresh && class_id == truth[j].id) {
                                if (current_iou > max_iou) {
                                    max_iou = current_iou;
                                    truth_index = unique_truth_count + j;
                                }
                            }
                        }

                        // best IoU
                        if (truth_index > -1) {
                            detections[detections_count - 1].truth_flag = 1;
                            detections[detections_count - 1].unique_truth_index = truth_index;
                        }
                        else {
                            // if object is difficult then remove detection
                            for (j = 0; j < num_labels_dif; ++j) {
                                box t = { truth_dif[j].x, truth_dif[j].y, truth_dif[j].w, truth_dif[j].h };
                                float current_iou = box_iou(dets[i].bbox, t);
                                if (current_iou > iou_thresh && class_id == truth_dif[j].id) {
                                    --detections_count;
                                    break;
                                }
                            }
                        }

                        // calc avg IoU, true-positives, false-positives for required Threshold
                        if (prob > thresh_calc_avg_iou) {
                            int z, found = 0;
                            for (z = checkpoint_detections_count; z < detections_count-1; ++z)
                                if (detections[z].unique_truth_index == truth_index) {
                                    found = 1; break;
                                }

                            if(truth_index > -1 && found == 0) {
                                avg_iou += max_iou;
                                ++tp_for_thresh;
                            }
                            else
                                fp_for_thresh++;
                        }
                    }
                }
            }
				
            unique_truth_count += num_labels;

            //static int previous_errors = 0;
            //int total_errors = fp_for_thresh + (unique_truth_count - tp_for_thresh);
            //int errors_in_this_image = total_errors - previous_errors;
            //previous_errors = total_errors;
            //if(reinforcement_fd == NULL) reinforcement_fd = fopen("reinforcement.txt", "wb");
            //char buff[1000];
            //sprintf(buff, "%s\n", path);
            //if(errors_in_this_image > 0) fwrite(buff, sizeof(char), strlen(buff), reinforcement_fd);

            free_detections(dets, nboxes);
            free(id);
            free_image(val[t]);
            free_image(val_resized[t]);
        }
    }

    if((tp_for_thresh + fp_for_thresh) > 0)
        avg_iou = avg_iou / (tp_for_thresh + fp_for_thresh);

	
    // SORT(detections)
    qsort(detections, detections_count, sizeof(box_prob), detections_comparator);
	
    typedef struct {
        double precision;
        double recall;
        int tp, fp, fn;
    } pr_t;

    // for PR-curve
    pr_t **pr = calloc(classes, sizeof(pr_t*));
    for (i = 0; i < classes; ++i) {
        pr[i] = calloc(detections_count, sizeof(pr_t));
    }
    printf("detections_count = %d, unique_truth_count = %d  \n", detections_count, unique_truth_count);
    char **paths_dif = NULL;
    if (difficult_valid_images) {
        list *plist_dif = get_paths(difficult_valid_images);
        paths_dif = (char **)list_to_array(plist_dif);
    }


    int *truth_flags = calloc(unique_truth_count, sizeof(int));
    layer l = net.layers[net.n - 1];
    int classes = l.classes;

    int rank;
    for (rank = 0; rank < detections_count; ++rank) {
        if(rank % 100 == 0)
            printf(" rank = %d of ranks = %d \r", rank, detections_count);
    int m = plist->size;
    int i = 0;
    int t;

        if (rank > 0) {
            int class_id;
            for (class_id = 0; class_id < classes; ++class_id) {
                pr[class_id][rank].tp = pr[class_id][rank - 1].tp;
                pr[class_id][rank].fp = pr[class_id][rank - 1].fp;
            }
        }
    const float thresh = .005;
    const float nms = .45;
    const float iou_thresh = 0.5;

        box_prob d = detections[rank];
        // if (detected && isn't detected before)
        if (d.truth_flag == 1) {
            if (truth_flags[d.unique_truth_index] == 0) 
            {
                truth_flags[d.unique_truth_index] = 1;
                pr[d.class_id][rank].tp++;  // true-positive
            }
        }
        else {
            pr[d.class_id][rank].fp++;  // false-positive
        }
    int nthreads = 4;
    image *val = calloc(nthreads, sizeof(image));
    image *val_resized = calloc(nthreads, sizeof(image));
    image *buf = calloc(nthreads, sizeof(image));
    image *buf_resized = calloc(nthreads, sizeof(image));
    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));

        for (i = 0; i < classes; ++i) 
        {
            const int tp = pr[i][rank].tp;
            const int fp = pr[i][rank].fp;
            const int fn = truth_classes_count[i] - tp; // false-negative = objects - true-positive
            pr[i][rank].fn = fn;
    load_args args = { 0 };
    args.w = net.w;
    args.h = net.h;
    args.c = net.c;
    args.type = IMAGE_DATA;
    //args.type = LETTERBOX_DATA;

            if ((tp + fp) > 0) pr[i][rank].precision = (double)tp / (double)(tp + fp);
            else pr[i][rank].precision = 0;
    //const float thresh_calc_avg_iou = 0.24;
    float avg_iou = 0;
    int tp_for_thresh = 0;
    int fp_for_thresh = 0;

            if ((tp + fn) > 0) pr[i][rank].recall = (double)tp / (double)(tp + fn);
            else pr[i][rank].recall = 0;
        }
    }
    box_prob *detections = calloc(1, sizeof(box_prob));
    int detections_count = 0;
    int unique_truth_count = 0;

    free(truth_flags);
	
	
    double mean_average_precision = 0;
    int *truth_classes_count = calloc(classes, sizeof(int));

    for (i = 0; i < classes; ++i) {
        double avg_precision = 0;
        int point;
        for (point = 0; point < 11; ++point) {
            double cur_recall = point * 0.1;
            double cur_precision = 0;
            for (rank = 0; rank < detections_count; ++rank)
            {
                if (pr[i][rank].recall >= cur_recall) { // > or >=
                    if (pr[i][rank].precision > cur_precision) {
                        cur_precision = pr[i][rank].precision;
                    }
                }
            }
            //printf("class_id = %d, point = %d, cur_recall = %.4f, cur_precision = %.4f \n", i, point, cur_recall, cur_precision);
    for (t = 0; t < nthreads; ++t) {
        args.path = paths[i + t];
        args.im = &buf[t];
        args.resized = &buf_resized[t];
        thr[t] = load_data_in_thread(args);
    }
    time_t start = time(0);
    for (i = nthreads; i < m + nthreads; i += nthreads) {
        fprintf(stderr, "%d\n", i);
        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
            pthread_join(thr[t], 0);
            val[t] = buf[t];
            val_resized[t] = buf_resized[t];
        }
        for (t = 0; t < nthreads && i + t < m; ++t) {
            args.path = paths[i + t];
            args.im = &buf[t];
            args.resized = &buf_resized[t];
            thr[t] = load_data_in_thread(args);
        }
        for (t = 0; t < nthreads && i + t - nthreads < m; ++t) {
            const int image_index = i + t - nthreads;
            char *path = paths[image_index];
            char *id = basecfg(path);
            float *X = val_resized[t].data;
            network_predict(net, X);

            avg_precision += cur_precision;
        }
        avg_precision = avg_precision / 11;
        printf("class_id = %d, name = %s, \t ap = %2.2f %% \n", i, names[i], avg_precision*100);
        mean_average_precision += avg_precision;
    }
	
    const float cur_precision = (float)tp_for_thresh / ((float)tp_for_thresh + (float)fp_for_thresh);
    const float cur_recall = (float)tp_for_thresh / ((float)tp_for_thresh + (float)(unique_truth_count - tp_for_thresh));
    const float f1_score = 2.F * cur_precision * cur_recall / (cur_precision + cur_recall);
    printf(" for thresh = %1.2f, precision = %1.2f, recall = %1.2f, F1-score = %1.2f \n",
        thresh_calc_avg_iou, cur_precision, cur_recall, f1_score);
            int nboxes = 0;
            int letterbox = (args.type == LETTERBOX_DATA);
            float hier_thresh = 0;
            detection *dets = get_network_boxes(&net, 1, 1, thresh, hier_thresh, 0, 0, &nboxes, letterbox);
            //detection *dets = get_network_boxes(&net, val[t].w, val[t].h, thresh, hier_thresh, 0, 1, &nboxes, letterbox); // for letterbox=1
            if (nms) do_nms_sort(dets, nboxes, l.classes, nms);

    printf(" for thresh = %0.2f, TP = %d, FP = %d, FN = %d, average IoU = %2.2f %% \n", 
        thresh_calc_avg_iou, tp_for_thresh, fp_for_thresh, unique_truth_count - tp_for_thresh, avg_iou * 100);
            char labelpath[4096];
            replace_image_to_label(path, labelpath);
            int num_labels = 0;
            box_label *truth = read_boxes(labelpath, &num_labels);
            int i, j;
            for (j = 0; j < num_labels; ++j) {
                truth_classes_count[truth[j].id]++;
            }

    mean_average_precision = mean_average_precision / classes;
    printf("\n mean average precision (mAP) = %f, or %2.2f %% \n", mean_average_precision, mean_average_precision*100);
            // difficult
            box_label *truth_dif = NULL;
            int num_labels_dif = 0;
            if (paths_dif)
            {
                char *path_dif = paths_dif[image_index];

                char labelpath_dif[4096];
                replace_image_to_label(path_dif, labelpath_dif);

                truth_dif = read_boxes(labelpath_dif, &num_labels_dif);
            }

            const int checkpoint_detections_count = detections_count;

            for (i = 0; i < nboxes; ++i) {

                int class_id;
                for (class_id = 0; class_id < classes; ++class_id) {
                    float prob = dets[i].prob[class_id];
                    if (prob > 0) {
                        detections_count++;
                        detections = realloc(detections, detections_count * sizeof(box_prob));
                        detections[detections_count - 1].b = dets[i].bbox;
                        detections[detections_count - 1].p = prob;
                        detections[detections_count - 1].image_index = image_index;
                        detections[detections_count - 1].class_id = class_id;
                        detections[detections_count - 1].truth_flag = 0;
                        detections[detections_count - 1].unique_truth_index = -1;

                        int truth_index = -1;
                        float max_iou = 0;
                        for (j = 0; j < num_labels; ++j)
                        {
                            box t = { truth[j].x, truth[j].y, truth[j].w, truth[j].h };
                            //printf(" IoU = %f, prob = %f, class_id = %d, truth[j].id = %d \n",
                            //    box_iou(dets[i].bbox, t), prob, class_id, truth[j].id);
                            float current_iou = box_iou(dets[i].bbox, t);
                            if (current_iou > iou_thresh && class_id == truth[j].id) {
                                if (current_iou > max_iou) {
                                    max_iou = current_iou;
                                    truth_index = unique_truth_count + j;
                                }
                            }
                        }

                        // best IoU
                        if (truth_index > -1) {
                            detections[detections_count - 1].truth_flag = 1;
                            detections[detections_count - 1].unique_truth_index = truth_index;
                        }
                        else {
                            // if object is difficult then remove detection
                            for (j = 0; j < num_labels_dif; ++j) {
                                box t = { truth_dif[j].x, truth_dif[j].y, truth_dif[j].w, truth_dif[j].h };
                                float current_iou = box_iou(dets[i].bbox, t);
                                if (current_iou > iou_thresh && class_id == truth_dif[j].id) {
                                    --detections_count;
                                    break;
                                }
                            }
                        }

                        // calc avg IoU, true-positives, false-positives for required Threshold
                        if (prob > thresh_calc_avg_iou) {
                            int z, found = 0;
                            for (z = checkpoint_detections_count; z < detections_count-1; ++z)
                                if (detections[z].unique_truth_index == truth_index) {
                                    found = 1; break;
                                }

                            if(truth_index > -1 && found == 0) {
                                avg_iou += max_iou;
                                ++tp_for_thresh;
                            }
                            else
                                fp_for_thresh++;
                        }
                    }
                }
            }

            unique_truth_count += num_labels;

            //static int previous_errors = 0;
            //int total_errors = fp_for_thresh + (unique_truth_count - tp_for_thresh);
            //int errors_in_this_image = total_errors - previous_errors;
            //previous_errors = total_errors;
            //if(reinforcement_fd == NULL) reinforcement_fd = fopen("reinforcement.txt", "wb");
            //char buff[1000];
            //sprintf(buff, "%s\n", path);
            //if(errors_in_this_image > 0) fwrite(buff, sizeof(char), strlen(buff), reinforcement_fd);

            free_detections(dets, nboxes);
            free(id);
            free_image(val[t]);
            free_image(val_resized[t]);
        }
    }

    if((tp_for_thresh + fp_for_thresh) > 0)
        avg_iou = avg_iou / (tp_for_thresh + fp_for_thresh);


    for (i = 0; i < classes; ++i) {
        free(pr[i]);
    }
    free(pr);
    free(detections);
    free(truth_classes_count);
    // SORT(detections)
    qsort(detections, detections_count, sizeof(box_prob), detections_comparator);

    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
    if (reinforcement_fd != NULL) fclose(reinforcement_fd);
    typedef struct {
        double precision;
        double recall;
        int tp, fp, fn;
    } pr_t;

    // for PR-curve
    pr_t **pr = calloc(classes, sizeof(pr_t*));
    for (i = 0; i < classes; ++i) {
        pr[i] = calloc(detections_count, sizeof(pr_t));
    }
    printf("detections_count = %d, unique_truth_count = %d  \n", detections_count, unique_truth_count);


    int *truth_flags = calloc(unique_truth_count, sizeof(int));

    int rank;
    for (rank = 0; rank < detections_count; ++rank) {
        if(rank % 100 == 0)
            printf(" rank = %d of ranks = %d \r", rank, detections_count);

        if (rank > 0) {
            int class_id;
            for (class_id = 0; class_id < classes; ++class_id) {
                pr[class_id][rank].tp = pr[class_id][rank - 1].tp;
                pr[class_id][rank].fp = pr[class_id][rank - 1].fp;
            }
        }

        box_prob d = detections[rank];
        // if (detected && isn't detected before)
        if (d.truth_flag == 1) {
            if (truth_flags[d.unique_truth_index] == 0)
            {
                truth_flags[d.unique_truth_index] = 1;
                pr[d.class_id][rank].tp++;    // true-positive
            }
        }
        else {
            pr[d.class_id][rank].fp++;    // false-positive
        }

        for (i = 0; i < classes; ++i)
        {
            const int tp = pr[i][rank].tp;
            const int fp = pr[i][rank].fp;
            const int fn = truth_classes_count[i] - tp;    // false-negative = objects - true-positive
            pr[i][rank].fn = fn;

            if ((tp + fp) > 0) pr[i][rank].precision = (double)tp / (double)(tp + fp);
            else pr[i][rank].precision = 0;

            if ((tp + fn) > 0) pr[i][rank].recall = (double)tp / (double)(tp + fn);
            else pr[i][rank].recall = 0;
        }
    }

    free(truth_flags);


    double mean_average_precision = 0;

    for (i = 0; i < classes; ++i) {
        double avg_precision = 0;
        int point;
        for (point = 0; point < 11; ++point) {
            double cur_recall = point * 0.1;
            double cur_precision = 0;
            for (rank = 0; rank < detections_count; ++rank)
            {
                if (pr[i][rank].recall >= cur_recall) {    // > or >=
                    if (pr[i][rank].precision > cur_precision) {
                        cur_precision = pr[i][rank].precision;
                    }
                }
            }
            //printf("class_id = %d, point = %d, cur_recall = %.4f, cur_precision = %.4f \n", i, point, cur_recall, cur_precision);

            avg_precision += cur_precision;
        }
        avg_precision = avg_precision / 11;
        printf("class_id = %d, name = %s, \t ap = %2.2f %% \n", i, names[i], avg_precision*100);
        mean_average_precision += avg_precision;
    }

    const float cur_precision = (float)tp_for_thresh / ((float)tp_for_thresh + (float)fp_for_thresh);
    const float cur_recall = (float)tp_for_thresh / ((float)tp_for_thresh + (float)(unique_truth_count - tp_for_thresh));
    const float f1_score = 2.F * cur_precision * cur_recall / (cur_precision + cur_recall);
    printf(" for thresh = %1.2f, precision = %1.2f, recall = %1.2f, F1-score = %1.2f \n",
        thresh_calc_avg_iou, cur_precision, cur_recall, f1_score);

    printf(" for thresh = %0.2f, TP = %d, FP = %d, FN = %d, average IoU = %2.2f %% \n",
        thresh_calc_avg_iou, tp_for_thresh, fp_for_thresh, unique_truth_count - tp_for_thresh, avg_iou * 100);

    mean_average_precision = mean_average_precision / classes;
    printf("\n mean average precision (mAP) = %f, or %2.2f %% \n", mean_average_precision, mean_average_precision*100);


    for (i = 0; i < classes; ++i) {
        free(pr[i]);
    }
    free(pr);
    free(detections);
    free(truth_classes_count);

    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
    if (reinforcement_fd != NULL) fclose(reinforcement_fd);
}

#ifdef OPENCV
/* Width/height pair describing one anchor box; laid out as two consecutive
 * floats so an array of float pairs can be viewed as an array of anchors_t. */
typedef struct {
    float w, h;
} anchors_t;

/*
 * qsort() comparator for anchors_t elements, ordering by box area (w * h).
 *
 * Returns 1 when *pa has the larger area, -1 when *pb has the larger area,
 * and 0 when the areas compare equal, i.e. it sorts in ascending area order.
 */
int anchors_comparator(const void *pa, const void *pb)
{
    const anchors_t *a = (const anchors_t *)pa;
    const anchors_t *b = (const anchors_t *)pb;
    float diff = b->w * b->h - a->w * a->h;
    if (diff < 0) return 1;
    else if (diff > 0) return -1;
    return 0;
}

void calc_anchors(char *datacfg, int num_of_clusters, int width, int height, int show)
{
    printf("\n num_of_clusters = %d, width = %d, height = %d \n", num_of_clusters, width, height);
    if (width < 0 || height < 0) {
        printf("Usage: darknet detector calc_anchors data/voc.data -num_of_clusters 9 -width 416 -height 416 \n");
        printf("Error: set width and height \n");
        return;
    }
    printf("\n num_of_clusters = %d, width = %d, height = %d \n", num_of_clusters, width, height);
    if (width < 0 || height < 0) {
        printf("Usage: darknet detector calc_anchors data/voc.data -num_of_clusters 9 -width 416 -height 416 \n");
        printf("Error: set width and height \n");
        return;
    }

    //float pointsdata[] = { 1,1, 2,2, 6,6, 5,5, 10,10 };
    float *rel_width_height_array = calloc(1000, sizeof(float));
    //float pointsdata[] = { 1,1, 2,2, 6,6, 5,5, 10,10 };
    float *rel_width_height_array = calloc(1000, sizeof(float));

    list *options = read_data_cfg(datacfg);
    char *train_images = option_find_str(options, "train", "data/train.list");
    list *plist = get_paths(train_images);
    int number_of_images = plist->size;
    char **paths = (char **)list_to_array(plist);
    list *options = read_data_cfg(datacfg);
    char *train_images = option_find_str(options, "train", "data/train.list");
    list *plist = get_paths(train_images);
    int number_of_images = plist->size;
    char **paths = (char **)list_to_array(plist);

    int number_of_boxes = 0;
    printf(" read labels from %d images \n", number_of_images);
    int number_of_boxes = 0;
    printf(" read labels from %d images \n", number_of_images);

    int i, j;
    for (i = 0; i < number_of_images; ++i) {
        char *path = paths[i];
        char labelpath[4096];
        replace_image_to_label(path, labelpath);
    int i, j;
    for (i = 0; i < number_of_images; ++i) {
        char *path = paths[i];
        char labelpath[4096];
        replace_image_to_label(path, labelpath);

        int num_labels = 0;
        box_label *truth = read_boxes(labelpath, &num_labels);
        //printf(" new path: %s \n", labelpath);
        char buff[1024];
        for (j = 0; j < num_labels; ++j)
        {
            if (truth[j].x > 1 || truth[j].x <= 0 || truth[j].y > 1 || truth[j].y <= 0 ||
                truth[j].w > 1 || truth[j].w <= 0 || truth[j].h > 1 || truth[j].h <= 0) 
            {				
                printf("\n\nWrong label: %s - j = %d, x = %f, y = %f, width = %f, height = %f \n",
                    labelpath, j, truth[j].x, truth[j].y, truth[j].w, truth[j].h);
                sprintf(buff, "echo \"Wrong label: %s - j = %d, x = %f, y = %f, width = %f, height = %f\" >> bad_label.list", 
                    labelpath, j, truth[j].x, truth[j].y, truth[j].w, truth[j].h);
                system(buff);				
            }
            number_of_boxes++;
            rel_width_height_array = realloc(rel_width_height_array, 2 * number_of_boxes * sizeof(float));
            rel_width_height_array[number_of_boxes * 2 - 2] = truth[j].w * width;
            rel_width_height_array[number_of_boxes * 2 - 1] = truth[j].h * height;
            printf("\r loaded \t image: %d \t box: %d", i+1, number_of_boxes);
        }
    }
    printf("\n all loaded. \n");
        int num_labels = 0;
        box_label *truth = read_boxes(labelpath, &num_labels);
        //printf(" new path: %s \n", labelpath);
        char buff[1024];
        for (j = 0; j < num_labels; ++j)
        {
            if (truth[j].x > 1 || truth[j].x <= 0 || truth[j].y > 1 || truth[j].y <= 0 ||
                truth[j].w > 1 || truth[j].w <= 0 || truth[j].h > 1 || truth[j].h <= 0)
            {
                printf("\n\nWrong label: %s - j = %d, x = %f, y = %f, width = %f, height = %f \n",
                    labelpath, j, truth[j].x, truth[j].y, truth[j].w, truth[j].h);
                sprintf(buff, "echo \"Wrong label: %s - j = %d, x = %f, y = %f, width = %f, height = %f\" >> bad_label.list",
                    labelpath, j, truth[j].x, truth[j].y, truth[j].w, truth[j].h);
                system(buff);
            }
            number_of_boxes++;
            rel_width_height_array = realloc(rel_width_height_array, 2 * number_of_boxes * sizeof(float));
            rel_width_height_array[number_of_boxes * 2 - 2] = truth[j].w * width;
            rel_width_height_array[number_of_boxes * 2 - 1] = truth[j].h * height;
            printf("\r loaded \t image: %d \t box: %d", i+1, number_of_boxes);
        }
    }
    printf("\n all loaded. \n");

    CvMat* points = cvCreateMat(number_of_boxes, 2, CV_32FC1);
    CvMat* centers = cvCreateMat(num_of_clusters, 2, CV_32FC1);
    CvMat* labels = cvCreateMat(number_of_boxes, 1, CV_32SC1);
    CvMat* points = cvCreateMat(number_of_boxes, 2, CV_32FC1);
    CvMat* centers = cvCreateMat(num_of_clusters, 2, CV_32FC1);
    CvMat* labels = cvCreateMat(number_of_boxes, 1, CV_32SC1);

    for (i = 0; i < number_of_boxes; ++i) {
        points->data.fl[i * 2] = rel_width_height_array[i * 2];
        points->data.fl[i * 2 + 1] = rel_width_height_array[i * 2 + 1];
        //cvSet1D(points, i * 2, cvScalar(rel_width_height_array[i * 2], 0, 0, 0));
        //cvSet1D(points, i * 2 + 1, cvScalar(rel_width_height_array[i * 2 + 1], 0, 0, 0));
    }
    for (i = 0; i < number_of_boxes; ++i) {
        points->data.fl[i * 2] = rel_width_height_array[i * 2];
        points->data.fl[i * 2 + 1] = rel_width_height_array[i * 2 + 1];
        //cvSet1D(points, i * 2, cvScalar(rel_width_height_array[i * 2], 0, 0, 0));
        //cvSet1D(points, i * 2 + 1, cvScalar(rel_width_height_array[i * 2 + 1], 0, 0, 0));
    }


    const int attemps = 10;
    double compactness;
    const int attemps = 10;
    double compactness;

    enum {
        KMEANS_RANDOM_CENTERS = 0,
        KMEANS_USE_INITIAL_LABELS = 1,
        KMEANS_PP_CENTERS = 2
    };
	
    printf("\n calculating k-means++ ...");
    // Should be used: distance(box, centroid) = 1 - IoU(box, centroid)
    cvKMeans2(points, num_of_clusters, labels, 
        cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10000, 0), attemps, 
        0, KMEANS_PP_CENTERS,
        centers, &compactness);
    enum {
        KMEANS_RANDOM_CENTERS = 0,
        KMEANS_USE_INITIAL_LABELS = 1,
        KMEANS_PP_CENTERS = 2
    };

    // sort anchors
    qsort(centers->data.fl, num_of_clusters, 2*sizeof(float), anchors_comparator);
    printf("\n calculating k-means++ ...");
    // Should be used: distance(box, centroid) = 1 - IoU(box, centroid)
    cvKMeans2(points, num_of_clusters, labels,
        cvTermCriteria(CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10000, 0), attemps,
        0, KMEANS_PP_CENTERS,
        centers, &compactness);

    //orig 2.0 anchors = 1.08,1.19,  3.42,4.41,  6.63,11.38,  9.42,5.11,  16.62,10.52
    //float orig_anch[] = { 1.08,1.19,  3.42,4.41,  6.63,11.38,  9.42,5.11,  16.62,10.52 };
    // worse than ours (even for 19x19 final size - for input size 608x608)
    // sort anchors
    qsort(centers->data.fl, num_of_clusters, 2*sizeof(float), anchors_comparator);

    //orig anchors = 1.3221,1.73145, 3.19275,4.00944, 5.05587,8.09892, 9.47112,4.84053, 11.2364,10.0071
    //float orig_anch[] = { 1.3221,1.73145, 3.19275,4.00944, 5.05587,8.09892, 9.47112,4.84053, 11.2364,10.0071 };
    // orig (IoU=59.90%) better than ours (59.75%)
    //orig 2.0 anchors = 1.08,1.19,  3.42,4.41,  6.63,11.38,  9.42,5.11,  16.62,10.52
    //float orig_anch[] = { 1.08,1.19,  3.42,4.41,  6.63,11.38,  9.42,5.11,  16.62,10.52 };
    // worse than ours (even for 19x19 final size - for input size 608x608)

    //gen_anchors.py = 1.19, 1.99, 2.79, 4.60, 4.53, 8.92, 8.06, 5.29, 10.32, 10.66
    //float orig_anch[] = { 1.19, 1.99, 2.79, 4.60, 4.53, 8.92, 8.06, 5.29, 10.32, 10.66 };
    //orig anchors = 1.3221,1.73145, 3.19275,4.00944, 5.05587,8.09892, 9.47112,4.84053, 11.2364,10.0071
    //float orig_anch[] = { 1.3221,1.73145, 3.19275,4.00944, 5.05587,8.09892, 9.47112,4.84053, 11.2364,10.0071 };
    // orig (IoU=59.90%) better than ours (59.75%)

    // ours: anchors = 9.3813,6.0095, 3.3999,5.3505, 10.9476,11.1992, 5.0161,9.8314, 1.5003,2.1595
    //float orig_anch[] = { 9.3813,6.0095, 3.3999,5.3505, 10.9476,11.1992, 5.0161,9.8314, 1.5003,2.1595 };
    //for (i = 0; i < num_of_clusters * 2; ++i) centers->data.fl[i] = orig_anch[i];
	
    //for (i = 0; i < number_of_boxes; ++i)
    //  printf("%2.2f,%2.2f, ", points->data.fl[i * 2], points->data.fl[i * 2 + 1]);
    //gen_anchors.py = 1.19, 1.99, 2.79, 4.60, 4.53, 8.92, 8.06, 5.29, 10.32, 10.66
    //float orig_anch[] = { 1.19, 1.99, 2.79, 4.60, 4.53, 8.92, 8.06, 5.29, 10.32, 10.66 };

    printf("\n");
    float avg_iou = 0;
    for (i = 0; i < number_of_boxes; ++i) {
        float box_w = points->data.fl[i * 2];
        float box_h = points->data.fl[i * 2 + 1];
        //int cluster_idx = labels->data.i[i];		
        int cluster_idx = 0;
        float min_dist = FLT_MAX;
        for (j = 0; j < num_of_clusters; ++j) {
            float anchor_w = centers->data.fl[j * 2];
            float anchor_h = centers->data.fl[j * 2 + 1];
            float w_diff = anchor_w - box_w;
            float h_diff = anchor_h - box_h;
            float distance = sqrt(w_diff*w_diff + h_diff*h_diff);
            if (distance < min_dist) min_dist = distance, cluster_idx = j;
        }
		
        float anchor_w = centers->data.fl[cluster_idx * 2];
        float anchor_h = centers->data.fl[cluster_idx * 2 + 1];
        float min_w = (box_w < anchor_w) ? box_w : anchor_w;
        float min_h = (box_h < anchor_h) ? box_h : anchor_h;
        float box_intersect = min_w*min_h;
        float box_union = box_w*box_h + anchor_w*anchor_h - box_intersect;
        float iou = box_intersect / box_union;
        if (iou > 1 || iou < 0) { // || box_w > width || box_h > height) {
            printf(" Wrong label: i = %d, box_w = %d, box_h = %d, anchor_w = %d, anchor_h = %d, iou = %f \n",
                i, box_w, box_h, anchor_w, anchor_h, iou);
        }
        else avg_iou += iou;
    }
    avg_iou = 100 * avg_iou / number_of_boxes;
    printf("\n avg IoU = %2.2f %% \n", avg_iou);
    // ours: anchors = 9.3813,6.0095, 3.3999,5.3505, 10.9476,11.1992, 5.0161,9.8314, 1.5003,2.1595
    //float orig_anch[] = { 9.3813,6.0095, 3.3999,5.3505, 10.9476,11.1992, 5.0161,9.8314, 1.5003,2.1595 };
    //for (i = 0; i < num_of_clusters * 2; ++i) centers->data.fl[i] = orig_anch[i];

    char buff[1024];
    FILE* fw = fopen("anchors.txt", "wb");
    printf("\nSaving anchors to the file: anchors.txt \n");
    printf("anchors = ");
    for (i = 0; i < num_of_clusters; ++i) {
        sprintf(buff, "%2.4f,%2.4f", centers->data.fl[i * 2], centers->data.fl[i * 2 + 1]);
        printf("%s", buff);
        fwrite(buff, sizeof(char), strlen(buff), fw);
        if (i + 1 < num_of_clusters) {
            fwrite(", ", sizeof(char), 2, fw);
            printf(", ");
        }
    }
    printf("\n");
    fclose(fw);
    //for (i = 0; i < number_of_boxes; ++i)
    //    printf("%2.2f,%2.2f, ", points->data.fl[i * 2], points->data.fl[i * 2 + 1]);

    if (show) {
        size_t img_size = 700;
        IplImage* img = cvCreateImage(cvSize(img_size, img_size), 8, 3);
        cvZero(img);
        for (j = 0; j < num_of_clusters; ++j) {
            CvPoint pt1, pt2;
            pt1.x = pt1.y = 0;
            pt2.x = centers->data.fl[j * 2] * img_size / width;
            pt2.y = centers->data.fl[j * 2 + 1] * img_size / height;
            cvRectangle(img, pt1, pt2, CV_RGB(255, 255, 255), 1, 8, 0);
        }
    printf("\n");
    float avg_iou = 0;
    for (i = 0; i < number_of_boxes; ++i) {
        float box_w = points->data.fl[i * 2];
        float box_h = points->data.fl[i * 2 + 1];
        //int cluster_idx = labels->data.i[i];
        int cluster_idx = 0;
        float min_dist = FLT_MAX;
        for (j = 0; j < num_of_clusters; ++j) {
            float anchor_w = centers->data.fl[j * 2];
            float anchor_h = centers->data.fl[j * 2 + 1];
            float w_diff = anchor_w - box_w;
            float h_diff = anchor_h - box_h;
            float distance = sqrt(w_diff*w_diff + h_diff*h_diff);
            if (distance < min_dist) min_dist = distance, cluster_idx = j;
        }

        for (i = 0; i < number_of_boxes; ++i) {
            CvPoint pt;
            pt.x = points->data.fl[i * 2] * img_size / width;
            pt.y = points->data.fl[i * 2 + 1] * img_size / height;
            int cluster_idx = labels->data.i[i];
            int red_id = (cluster_idx * (uint64_t)123 + 55) % 255;
            int green_id = (cluster_idx * (uint64_t)321 + 33) % 255;
            int blue_id = (cluster_idx * (uint64_t)11 + 99) % 255;
            cvCircle(img, pt, 1, CV_RGB(red_id, green_id, blue_id), CV_FILLED, 8, 0);
            //if(pt.x > img_size || pt.y > img_size) printf("\n pt.x = %d, pt.y = %d \n", pt.x, pt.y);
        }
        cvShowImage("clusters", img);
        cvWaitKey(0);
        cvReleaseImage(&img);
        cvDestroyAllWindows();
    }
        float anchor_w = centers->data.fl[cluster_idx * 2];
        float anchor_h = centers->data.fl[cluster_idx * 2 + 1];
        float min_w = (box_w < anchor_w) ? box_w : anchor_w;
        float min_h = (box_h < anchor_h) ? box_h : anchor_h;
        float box_intersect = min_w*min_h;
        float box_union = box_w*box_h + anchor_w*anchor_h - box_intersect;
        float iou = box_intersect / box_union;
        if (iou > 1 || iou < 0) { // || box_w > width || box_h > height) {
            printf(" Wrong label: i = %d, box_w = %d, box_h = %d, anchor_w = %d, anchor_h = %d, iou = %f \n",
                i, box_w, box_h, anchor_w, anchor_h, iou);
        }
        else avg_iou += iou;
    }
    avg_iou = 100 * avg_iou / number_of_boxes;
    printf("\n avg IoU = %2.2f %% \n", avg_iou);

    free(rel_width_height_array);
    cvReleaseMat(&points);
    cvReleaseMat(&centers);
    cvReleaseMat(&labels);
    char buff[1024];
    FILE* fw = fopen("anchors.txt", "wb");
    printf("\nSaving anchors to the file: anchors.txt \n");
    printf("anchors = ");
    for (i = 0; i < num_of_clusters; ++i) {
        sprintf(buff, "%2.4f,%2.4f", centers->data.fl[i * 2], centers->data.fl[i * 2 + 1]);
        printf("%s", buff);
        fwrite(buff, sizeof(char), strlen(buff), fw);
        if (i + 1 < num_of_clusters) {
            fwrite(", ", sizeof(char), 2, fw);
            printf(", ");
        }
    }
    printf("\n");
    fclose(fw);

    if (show) {
        size_t img_size = 700;
        IplImage* img = cvCreateImage(cvSize(img_size, img_size), 8, 3);
        cvZero(img);
        for (j = 0; j < num_of_clusters; ++j) {
            CvPoint pt1, pt2;
            pt1.x = pt1.y = 0;
            pt2.x = centers->data.fl[j * 2] * img_size / width;
            pt2.y = centers->data.fl[j * 2 + 1] * img_size / height;
            cvRectangle(img, pt1, pt2, CV_RGB(255, 255, 255), 1, 8, 0);
        }

        for (i = 0; i < number_of_boxes; ++i) {
            CvPoint pt;
            pt.x = points->data.fl[i * 2] * img_size / width;
            pt.y = points->data.fl[i * 2 + 1] * img_size / height;
            int cluster_idx = labels->data.i[i];
            int red_id = (cluster_idx * (uint64_t)123 + 55) % 255;
            int green_id = (cluster_idx * (uint64_t)321 + 33) % 255;
            int blue_id = (cluster_idx * (uint64_t)11 + 99) % 255;
            cvCircle(img, pt, 1, CV_RGB(red_id, green_id, blue_id), CV_FILLED, 8, 0);
            //if(pt.x > img_size || pt.y > img_size) printf("\n pt.x = %d, pt.y = %d \n", pt.x, pt.y);
        }
        cvShowImage("clusters", img);
        cvWaitKey(0);
        cvReleaseImage(&img);
        cvDestroyAllWindows();
    }

    free(rel_width_height_array);
    cvReleaseMat(&points);
    cvReleaseMat(&centers);
    cvReleaseMat(&labels);
}
#else
// Stub for builds without OpenCV (this is the #else branch closed by "#endif // OPENCV").
// It performs no clustering: every parameter is ignored and the function only reports
// that k-means++ is unavailable because it relies on cvKMeans2.
// NOTE(review): the message is emitted twice in this listing; the two identical printf
// lines look like the old/new sides of a whitespace-only diff hunk - confirm before
// relying on the double print.
void calc_anchors(char *datacfg, int num_of_clusters, int width, int height, int show) {
    printf(" k-means++ can't be used without OpenCV, because there is used cvKMeans2 implementation \n");
    printf(" k-means++ can't be used without OpenCV, because there is used cvKMeans2 implementation \n");
}
#endif // OPENCV

void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh,
                   float hier_thresh, int dont_show, int ext_output, int save_labels)
                   float hier_thresh, int dont_show, int ext_output, int save_labels)
{
    list *options = read_data_cfg(datacfg);
    char *name_list = option_find_str(options, "names", "data/names.list");
    int names_size = 0;
    char **names = get_labels_custom(name_list, &names_size); //get_labels(name_list);
    int names_size = 0;
    char **names = get_labels_custom(name_list, &names_size); //get_labels(name_list);

    image **alphabet = load_alphabet();
    network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1
@@ -1093,23 +1093,23 @@
        load_weights(&net, weightfile);
    }
    //set_batch_network(&net, 1);
    fuse_conv_batchnorm(net);
    if (net.layers[net.n - 1].classes != names_size) {
        printf(" Error: in the file %s number of names %d that isn't equal to classes=%d in the file %s \n", 
            name_list, names_size, net.layers[net.n - 1].classes, cfgfile);
        if(net.layers[net.n - 1].classes > names_size) getchar();
    }
    fuse_conv_batchnorm(net);
    if (net.layers[net.n - 1].classes != names_size) {
        printf(" Error: in the file %s number of names %d that isn't equal to classes=%d in the file %s \n",
            name_list, names_size, net.layers[net.n - 1].classes, cfgfile);
        if(net.layers[net.n - 1].classes > names_size) getchar();
    }
    srand(2222222);
    double time;
    char buff[256];
    char *input = buff;
    int j;
    float nms=.45;  // 0.4F
    float nms=.45;    // 0.4F
    while(1){
        if(filename){
            strncpy(input, filename, 256);
            if(strlen(input) > 0)
                if (input[strlen(input) - 1] == 0x0d) input[strlen(input) - 1] = 0;
            if(strlen(input) > 0)
                if (input[strlen(input) - 1] == 0x0d) input[strlen(input) - 1] = 0;
        } else {
            printf("Enter Image Path: ");
            fflush(stdout);
@@ -1118,9 +1118,9 @@
            strtok(input, "\n");
        }
        image im = load_image(input,0,0,net.c);
        int letterbox = 0;
        int letterbox = 0;
        image sized = resize_image(im, net.w, net.h);
        //image sized = letterbox_image(im, net.w, net.h); letterbox = 1;
        //image sized = letterbox_image(im, net.w, net.h); letterbox = 1;
        layer l = net.layers[net.n-1];

        //box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
@@ -1130,97 +1130,97 @@
        float *X = sized.data;
        time= what_time_is_it_now();
        network_predict(net, X);
        //network_predict_image(&net, im); letterbox = 1;
        //network_predict_image(&net, im); letterbox = 1;
        printf("%s: Predicted in %f seconds.\n", input, (what_time_is_it_now()-time));
        //get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0);
        // if (nms) do_nms_sort_v2(boxes, probs, l.w*l.h*l.n, l.classes, nms);
        //draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes);
        int nboxes = 0;
        detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letterbox);
        if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
        draw_detections_v3(im, dets, nboxes, thresh, names, alphabet, l.classes, ext_output);
        // if (nms) do_nms_sort_v2(boxes, probs, l.w*l.h*l.n, l.classes, nms);
        //draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes);
        int nboxes = 0;
        detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letterbox);
        if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
        draw_detections_v3(im, dets, nboxes, thresh, names, alphabet, l.classes, ext_output);
        save_image(im, "predictions");
        if (!dont_show) {
            show_image(im, "predictions");
        }
        if (!dont_show) {
            show_image(im, "predictions");
        }

        // pseudo labeling concept - fast.ai
        if(save_labels)
        {
            char labelpath[4096];
            replace_image_to_label(input, labelpath);
        // pseudo labeling concept - fast.ai
        if(save_labels)
        {
            char labelpath[4096];
            replace_image_to_label(input, labelpath);

            FILE* fw = fopen(labelpath, "wb");
            int i;
            for (i = 0; i < nboxes; ++i) {
                char buff[1024];
                int class_id = -1;
                float prob = 0;
                for (j = 0; j < l.classes; ++j) {
                    if (dets[i].prob[j] > thresh && dets[i].prob[j] > prob) {
                        prob = dets[i].prob[j];
                        class_id = j;
                    }
                }
                if (class_id >= 0) {
                    sprintf(buff, "%d %2.4f %2.4f %2.4f %2.4f\n", class_id, dets[i].bbox.x, dets[i].bbox.y, dets[i].bbox.w, dets[i].bbox.h);
                    fwrite(buff, sizeof(char), strlen(buff), fw);
                }
            }
            fclose(fw);
        }
            FILE* fw = fopen(labelpath, "wb");
            int i;
            for (i = 0; i < nboxes; ++i) {
                char buff[1024];
                int class_id = -1;
                float prob = 0;
                for (j = 0; j < l.classes; ++j) {
                    if (dets[i].prob[j] > thresh && dets[i].prob[j] > prob) {
                        prob = dets[i].prob[j];
                        class_id = j;
                    }
                }
                if (class_id >= 0) {
                    sprintf(buff, "%d %2.4f %2.4f %2.4f %2.4f\n", class_id, dets[i].bbox.x, dets[i].bbox.y, dets[i].bbox.w, dets[i].bbox.h);
                    fwrite(buff, sizeof(char), strlen(buff), fw);
                }
            }
            fclose(fw);
        }

        free_detections(dets, nboxes);
        free_detections(dets, nboxes);
        free_image(im);
        free_image(sized);
        //free(boxes);
        //free_ptrs((void **)probs, l.w*l.h*l.n);
#ifdef OPENCV
        if (!dont_show) {
            cvWaitKey(0);
            cvDestroyAllWindows();
        }
        if (!dont_show) {
            cvWaitKey(0);
            cvDestroyAllWindows();
        }
#endif
        if (filename) break;
    }

    // free memory
    free_ptrs(names, net.layers[net.n - 1].classes);
    free_list_contents_kvp(options);
    free_list(options);
    // free memory
    free_ptrs(names, net.layers[net.n - 1].classes);
    free_list_contents_kvp(options);
    free_list(options);

    int i;
    const int nsize = 8;
    for (j = 0; j < nsize; ++j) {
        for (i = 32; i < 127; ++i) {
            free_image(alphabet[j][i]);
        }
        free(alphabet[j]);
    }
    free(alphabet);
    int i;
    const int nsize = 8;
    for (j = 0; j < nsize; ++j) {
        for (i = 32; i < 127; ++i) {
            free_image(alphabet[j][i]);
        }
        free(alphabet[j]);
    }
    free(alphabet);

    free_network(net);
    free_network(net);
}

void run_detector(int argc, char **argv)
{
    int dont_show = find_arg(argc, argv, "-dont_show");
    int show = find_arg(argc, argv, "-show");
    int http_stream_port = find_int_arg(argc, argv, "-http_port", -1);
    char *out_filename = find_char_arg(argc, argv, "-out_filename", 0);
    char *outfile = find_char_arg(argc, argv, "-out", 0);
    int dont_show = find_arg(argc, argv, "-dont_show");
    int show = find_arg(argc, argv, "-show");
    int http_stream_port = find_int_arg(argc, argv, "-http_port", -1);
    char *out_filename = find_char_arg(argc, argv, "-out_filename", 0);
    char *outfile = find_char_arg(argc, argv, "-out", 0);
    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
    float thresh = find_float_arg(argc, argv, "-thresh", .25);  // 0.24
    float hier_thresh = find_float_arg(argc, argv, "-hier", .5);
    float thresh = find_float_arg(argc, argv, "-thresh", .25);    // 0.24
    float hier_thresh = find_float_arg(argc, argv, "-hier", .5);
    int cam_index = find_int_arg(argc, argv, "-c", 0);
    int frame_skip = find_int_arg(argc, argv, "-s", 0);
    int num_of_clusters = find_int_arg(argc, argv, "-num_of_clusters", 5);
    int width = find_int_arg(argc, argv, "-width", -1);
    int height = find_int_arg(argc, argv, "-height", -1);
    // extended output in test mode (output of rect bound coords)
    // and for recall mode (extended output table-like format with results for best_class fit)
    int ext_output = find_arg(argc, argv, "-ext_output");
    int save_labels = find_arg(argc, argv, "-save_labels");
    int num_of_clusters = find_int_arg(argc, argv, "-num_of_clusters", 5);
    int width = find_int_arg(argc, argv, "-width", -1);
    int height = find_int_arg(argc, argv, "-height", -1);
    // extended output in test mode (output of rect bound coords)
    // and for recall mode (extended output table-like format with results for best_class fit)
    int ext_output = find_arg(argc, argv, "-ext_output");
    int save_labels = find_arg(argc, argv, "-save_labels");
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
@@ -1253,29 +1253,29 @@
    char *datacfg = argv[3];
    char *cfg = argv[4];
    char *weights = (argc > 5) ? argv[5] : 0;
    if(weights)
        if(strlen(weights) > 0)
            if (weights[strlen(weights) - 1] == 0x0d) weights[strlen(weights) - 1] = 0;
    if(weights)
        if(strlen(weights) > 0)
            if (weights[strlen(weights) - 1] == 0x0d) weights[strlen(weights) - 1] = 0;
    char *filename = (argc > 6) ? argv[6]: 0;
    if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, dont_show, ext_output, save_labels);
    else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear, dont_show);
    else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
    else if(0==strcmp(argv[2], "recall")) validate_detector_recall(datacfg, cfg, weights);
    else if(0==strcmp(argv[2], "map")) validate_detector_map(datacfg, cfg, weights, thresh);
    else if(0==strcmp(argv[2], "calc_anchors")) calc_anchors(datacfg, num_of_clusters, width, height, show);
    else if(0==strcmp(argv[2], "map")) validate_detector_map(datacfg, cfg, weights, thresh);
    else if(0==strcmp(argv[2], "calc_anchors")) calc_anchors(datacfg, num_of_clusters, width, height, show);
    else if(0==strcmp(argv[2], "demo")) {
        list *options = read_data_cfg(datacfg);
        int classes = option_find_int(options, "classes", 20);
        char *name_list = option_find_str(options, "names", "data/names.list");
        char **names = get_labels(name_list);
        if(filename)
            if(strlen(filename) > 0)
                if (filename[strlen(filename) - 1] == 0x0d) filename[strlen(filename) - 1] = 0;
        if(filename)
            if(strlen(filename) > 0)
                if (filename[strlen(filename) - 1] == 0x0d) filename[strlen(filename) - 1] = 0;
        demo(cfg, weights, thresh, hier_thresh, cam_index, filename, names, classes, frame_skip, prefix, out_filename,
            http_stream_port, dont_show, ext_output);
            http_stream_port, dont_show, ext_output);

        free_list_contents_kvp(options);
        free_list(options);
        free_list_contents_kvp(options);
        free_list(options);
    }
    else printf(" There isn't such command: %s", argv[2]);
    else printf(" There isn't such command: %s", argv[2]);
}

 src/gemm.c

@@ -87,7 +87,7 @@
#include <immintrin.h>
#include <smmintrin.h>

#else   // Linux GCC/Clang
#else    // Linux GCC/Clang
#include <x86intrin.h>
#include <ammintrin.h>
#include <immintrin.h>
@@ -96,124 +96,124 @@

// Runs the x86 CPUID instruction through GCC/Clang inline assembly.
//   abcd - output array of 4 values, filled as {EAX, EBX, ECX, EDX}
//   eax  - value placed in EAX before CPUID executes (ECX is passed in as 0)
// NOTE(review): in this listing each statement group appears twice in a row (it looks
// like the old and new sides of a whitespace-only diff rendered without +/- markers);
// the duplicates are kept verbatim.
void asm_cpuid(uint32_t* abcd, uint32_t eax)
{
    uint32_t ebx = 0, edx = 0, ecx = 0;
    uint32_t ebx = 0, edx = 0, ecx = 0;

    // EBX is saved to EDI and later restored
    // (the xchgl leaves CPUID's EBX result in EDI, which the "=D" constraint reads
    //  back into the C variable `ebx`, while the original EBX register is restored)
    __asm__("movl %%ebx, %%edi;"
        "cpuid;"
        "xchgl %%ebx, %%edi;"
        : "=D"(ebx),
        "+a"(eax), "+c"(ecx), "=d"(edx));
    // EBX is saved to EDI and later restored
    __asm__("movl %%ebx, %%edi;"
        "cpuid;"
        "xchgl %%ebx, %%edi;"
        : "=D"(ebx),
        "+a"(eax), "+c"(ecx), "=d"(edx));

    // Publish the four result registers in EAX, EBX, ECX, EDX order.
    abcd[0] = eax;
    abcd[1] = ebx;
    abcd[2] = ecx;
    abcd[3] = edx;
    abcd[0] = eax;
    abcd[1] = ebx;
    abcd[2] = ecx;
    abcd[3] = edx;
}

#endif

// Tests whether every bit of `idFeature` is set in regs[2] (the ECX slot) after
// querying CPUID.
//   idFeature - bit mask to look for in ECX
//   returns   - 1 if (ECX & idFeature) == idFeature, otherwise 0
// CPUID is first queried with 0; it is queried again with 1 only when the EAX value
// returned by the first query is greater than 1.
// NOTE(review): if that condition is false, regs[2] still holds the ECX value of the
// first query when the mask is tested - confirm this is acceptable.
// NOTE(review): statements appear twice in this listing (old/new sides of a
// whitespace-only diff, apparently); they are kept verbatim.
int simd_detect_x86(unsigned int idFeature)
{
    uint32_t regs[4];   // EAX, EBX, ECX, EDX;
    uint32_t regs[4];    // EAX, EBX, ECX, EDX;
#ifdef _WIN32
    // Windows build: uses the __cpuid intrinsic.
    __cpuid(regs, 0);
    if (regs[0] > 1U) __cpuid(regs, 1);
    __cpuid(regs, 0);
    if (regs[0] > 1U) __cpuid(regs, 1);
#else
    // Non-Windows build: uses __get_cpuid with one output pointer per register.
    __get_cpuid(0, &regs[0], &regs[1], &regs[2], &regs[3]);
    if(regs[0] > 1U) __get_cpuid(1, &regs[0], &regs[1], &regs[2], &regs[3]);
    __get_cpuid(0, &regs[0], &regs[1], &regs[2], &regs[3]);
    if(regs[0] > 1U) __get_cpuid(1, &regs[0], &regs[1], &regs[2], &regs[3]);
#endif

    // All requested bits must be present, not just any of them.
    if ((regs[2] & idFeature) != idFeature)
        return 0;
    return 1;
    if ((regs[2] & idFeature) != idFeature)
        return 0;
    return 1;
}

// Returns the result of simd_detect_x86(AVXFlag), computed once and cached in a
// function-local static (-1 means "not yet probed"). The first call also prints
// whether AVX is used.
// NOTE(review): despite the "fma" in the name, only AVXFlag is tested here.
// NOTE(review): the body appears twice in this listing (old/new sides of a
// whitespace-only diff, apparently); it is kept verbatim.
int is_fma_avx() {
    static int result = -1;
    if (result == -1) {
        result = simd_detect_x86(AVXFlag);
        if (result == 1) printf(" Used AVX \n");
        else printf(" Not used AVX \n");
    }
    return result;
    static int result = -1;
    if (result == -1) {
        result = simd_detect_x86(AVXFlag);
        if (result == 1) printf(" Used AVX \n");
        else printf(" Not used AVX \n");
    }
    return result;
}

// https://software.intel.com/sites/landingpage/IntrinsicsGuide
// Accumulates C += ALPHA * A * B with neither operand transposed.
//   M, N, K  - loop bounds: i in [0,M), j in [0,N), k in [0,K)
//   A, lda   - read as A[i*lda + k]
//   B, ldb   - read as B[k*ldb + j]
//   C, ldc   - updated in place as C[i*ldc + j]
// When is_fma_avx() returns 1 the inner j-loop handles 8 floats per step using
// unaligned 256-bit loads/stores with a separate multiply and add (the fused
// _mm256_fmadd_ps call is commented out), and a scalar loop starting at prev_end
// finishes the row. Because the vector loop runs while j < N - 8, a row whose length
// is a multiple of 8 leaves its last 8 elements to that scalar loop
// (prev_end = N - 8 in that case). Otherwise a plain scalar triple loop is used.
// NOTE(review): in this listing the old and new sides of a whitespace-only diff are
// interleaved without +/- markers, so statement groups (and the commented-out SSE
// variant) appear twice and out of order; everything is kept verbatim.
void gemm_nn(int M, int N, int K, float ALPHA,
    float *A, int lda,
    float *B, int ldb,
    float *C, int ldc)
    float *A, int lda,
    float *B, int ldb,
    float *C, int ldc)
{
    int i, j, k;
    if (is_fma_avx() == 1) {    // AVX
        for (i = 0; i < M; ++i) {
            for (k = 0; k < K; ++k) {
                float A_PART = ALPHA*A[i*lda + k];
                __m256 a256, b256, c256, result256; // AVX
                a256 = _mm256_set1_ps(A_PART);
                for (j = 0; j < N - 8; j += 8) {
                    b256 = _mm256_loadu_ps(&B[k*ldb + j]);
                    c256 = _mm256_loadu_ps(&C[i*ldc + j]);
                    // FMA - Intel Haswell (2013), AMD Piledriver (2012)
                    //result256 = _mm256_fmadd_ps(a256, b256, c256);
                    result256 = _mm256_mul_ps(a256, b256);
                    result256 = _mm256_add_ps(result256, c256);
                    _mm256_storeu_ps(&C[i*ldc + j], result256);
                }
    int i, j, k;
    if (is_fma_avx() == 1) {    // AVX
        for (i = 0; i < M; ++i) {
            for (k = 0; k < K; ++k) {
                float A_PART = ALPHA*A[i*lda + k];
                __m256 a256, b256, c256, result256;    // AVX
                a256 = _mm256_set1_ps(A_PART);
                for (j = 0; j < N - 8; j += 8) {
                    b256 = _mm256_loadu_ps(&B[k*ldb + j]);
                    c256 = _mm256_loadu_ps(&C[i*ldc + j]);
                    // FMA - Intel Haswell (2013), AMD Piledriver (2012)
                    //result256 = _mm256_fmadd_ps(a256, b256, c256);
                    result256 = _mm256_mul_ps(a256, b256);
                    result256 = _mm256_add_ps(result256, c256);
                    _mm256_storeu_ps(&C[i*ldc + j], result256);
                }

                int prev_end = (N % 8 == 0) ? (N - 8) : (N / 8) * 8;
                for (j = prev_end; j < N; ++j)
                    C[i*ldc + j] += A_PART*B[k*ldb + j];
            }
        }
    }
    else {
        for (i = 0; i < M; ++i) {
            for (k = 0; k < K; ++k) {
                register float A_PART = ALPHA*A[i*lda + k];
                for (j = 0; j < N; ++j) {
                    C[i*ldc + j] += A_PART*B[k*ldb + j];
                }
                /* // SSE
                __m128 a128, b128, c128, result128; // SSE
                a128 = _mm_set1_ps(A_PART);
                for (j = 0; j < N - 4; j += 4) {
                b128 = _mm_loadu_ps(&B[k*ldb + j]);
                c128 = _mm_loadu_ps(&C[i*ldc + j]);
                //result128 = _mm_fmadd_ps(a128, b128, c128);
                result128 = _mm_mul_ps(a128, b128);
                result128 = _mm_add_ps(result128, c128);
                _mm_storeu_ps(&C[i*ldc + j], result128);
                }
                int prev_end = (N % 8 == 0) ? (N - 8) : (N / 8) * 8;
                for (j = prev_end; j < N; ++j)
                    C[i*ldc + j] += A_PART*B[k*ldb + j];
            }
        }
    }
    else {
        for (i = 0; i < M; ++i) {
            for (k = 0; k < K; ++k) {
                register float A_PART = ALPHA*A[i*lda + k];
                for (j = 0; j < N; ++j) {
                    C[i*ldc + j] += A_PART*B[k*ldb + j];
                }
                /* // SSE
                __m128 a128, b128, c128, result128;    // SSE
                a128 = _mm_set1_ps(A_PART);
                for (j = 0; j < N - 4; j += 4) {
                b128 = _mm_loadu_ps(&B[k*ldb + j]);
                c128 = _mm_loadu_ps(&C[i*ldc + j]);
                //result128 = _mm_fmadd_ps(a128, b128, c128);
                result128 = _mm_mul_ps(a128, b128);
                result128 = _mm_add_ps(result128, c128);
                _mm_storeu_ps(&C[i*ldc + j], result128);
                }

                int prev_end = (N % 4 == 0) ? (N - 4) : (N / 4) * 4;
                for (j = prev_end; j < N; ++j){
                C[i*ldc + j] += A_PART*B[k*ldb + j];
                }
                */
            }
        }
    }
                int prev_end = (N % 4 == 0) ? (N - 4) : (N / 4) * 4;
                for (j = prev_end; j < N; ++j){
                C[i*ldc + j] += A_PART*B[k*ldb + j];
                }
                */
            }
        }
    }
}
#else

// Portable scalar variant of gemm_nn, compiled in the #else branch of the
// architecture guard that ends with "#endif // __x86_64".
// Accumulates C[i*ldc + j] += ALPHA * A[i*lda + k] * B[k*ldb + j]
// for i in [0,M), k in [0,K), j in [0,N). C is updated in place; nothing is returned.
// The i-k-j loop order lets ALPHA*A[i*lda + k] be computed once per (i, k) pair and
// reused across the whole inner j-loop.
// NOTE(review): the parameter list and the loop nest appear twice in this listing
// (old/new sides of a whitespace-only diff, apparently); they are kept verbatim.
void gemm_nn(int M, int N, int K, float ALPHA,
    float *A, int lda,
    float *B, int ldb,
    float *C, int ldc)
    float *A, int lda,
    float *B, int ldb,
    float *C, int ldc)
{
    int i, j, k;
    for (i = 0; i < M; ++i) {
        for (k = 0; k < K; ++k) {
            register float A_PART = ALPHA*A[i*lda + k];
            for (j = 0; j < N; ++j) {
                C[i*ldc + j] += A_PART*B[k*ldb + j];
            }
        }
    }
    int i, j, k;
    for (i = 0; i < M; ++i) {
        for (k = 0; k < K; ++k) {
            register float A_PART = ALPHA*A[i*lda + k];
            for (j = 0; j < N; ++j) {
                C[i*ldc + j] += A_PART*B[k*ldb + j];
            }
        }
    }
}
#endif  // __x86_64
#endif    // __x86_64

void gemm_nt(int M, int N, int K, float ALPHA, 
        float *A, int lda, 
@@ -282,18 +282,18 @@
        }
    }

    int t;
    #pragma omp parallel for
    for (t = 0; t < M; ++t) {
        if (!TA && !TB)
            gemm_nn(1, N, K, ALPHA, A + t*lda, lda, B, ldb, C + t*ldc, ldc);
        else if (TA && !TB)
            gemm_tn(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
        else if (!TA && TB)
            gemm_nt(1, N, K, ALPHA, A + t*lda, lda, B, ldb, C + t*ldc, ldc);
        else
            gemm_tt(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
    }
    int t;
    #pragma omp parallel for
    for (t = 0; t < M; ++t) {
        if (!TA && !TB)
            gemm_nn(1, N, K, ALPHA, A + t*lda, lda, B, ldb, C + t*ldc, ldc);
        else if (TA && !TB)
            gemm_tn(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
        else if (!TA && TB)
            gemm_nt(1, N, K, ALPHA, A + t*lda, lda, B, ldb, C + t*ldc, ldc);
        else
            gemm_tt(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
    }
}

#ifdef GPU
@@ -307,7 +307,7 @@
        float *C_gpu, int ldc)
{
    cublasHandle_t handle = blas_handle();
    cudaError_t stream_status = cublasSetStream(handle, get_cuda_stream());
    cudaError_t stream_status = cublasSetStream(handle, get_cuda_stream());
    cudaError_t status = cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N), 
            (TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
    check_error(status);

 src/gettimeofday.c

@@ -1,5 +1,5 @@
#include "gettimeofday.h"
 

int gettimeofday(struct timeval *tv, struct timezone *tz)
{
  FILETIME ft;

 src/http_stream.cpp

@@ -7,7 +7,7 @@
//
// socket related abstractions:
//
#ifdef _WIN32  
#ifdef _WIN32
#pragma comment(lib, "ws2_32.lib")
#include <winsock.h>
#include <windows.h>
@@ -16,8 +16,8 @@
#define ADDRPOINTER   int*
// Windows-only helper (this sits before the "#else /* ! win32 */" branch): the
// constructor of the object `_init_once` calls WSAStartup(MAKEWORD(2, 1), &w) and
// stores the returned WSADATA in `w`. The return value of WSAStartup is not checked.
// NOTE(review): the two members appear twice in this listing (old/new sides of a
// whitespace-only diff, apparently); they are kept verbatim.
struct _INIT_W32DATA
{
    WSADATA w;
    _INIT_W32DATA() { WSAStartup(MAKEWORD(2, 1), &w); }
    WSADATA w;
    _INIT_W32DATA() { WSAStartup(MAKEWORD(2, 1), &w); }
} _init_once;
#else       /* ! win32 */
#include <unistd.h>
@@ -58,274 +58,274 @@

// Minimal HTTP server that pushes JPEG-encoded frames to connected clients as a
// multipart/x-mixed-replace stream (boundary "mjpegstream").
// One listening socket (`sock`) plus all accepted client sockets are tracked in the
// fd_set `master`; write() polls them with select() and serves whichever are ready.
// NOTE(review): in this listing the old and new sides of a whitespace-only diff are
// interleaved without +/- markers, so every member appears twice (and open()/write()
// are split mid-body); everything is kept verbatim.
class MJPGWriter
{
    SOCKET sock;
    SOCKET maxfd;
    fd_set master;
    int timeout; // master sock timeout, shutdown after timeout millis.
    int quality; // jpeg compression [1..100]
    SOCKET sock;
    SOCKET maxfd;
    fd_set master;
    int timeout; // master sock timeout, shutdown after timeout millis.
    int quality; // jpeg compression [1..100]

    // Sends `len` bytes of `s` on `sock`; when len < 1 the length is taken with
    // strlen(s), so that form is only suitable for NUL-terminated text.
    // Returns whatever ::send returns.
    int _write(int sock, char const*const s, int len)
    {
        if (len < 1) { len = strlen(s); }
        return ::send(sock, s, len, 0);
    }
    int _write(int sock, char const*const s, int len)
    {
        if (len < 1) { len = strlen(s); }
        return ::send(sock, s, len, 0);
    }

public:

    // Stores timeout/quality and, when `port` is non-zero, immediately tries to
    // open the listening socket on that port. The result of open() is not inspected
    // here; callers can use isOpened().
    MJPGWriter(int port = 0, int _timeout = 200000, int _quality = 30)
        : sock(INVALID_SOCKET)
        , timeout(_timeout)
        , quality(_quality)
    {
        FD_ZERO(&master);
        if (port)
            open(port);
    }
    MJPGWriter(int port = 0, int _timeout = 200000, int _quality = 30)
        : sock(INVALID_SOCKET)
        , timeout(_timeout)
        , quality(_quality)
    {
        FD_ZERO(&master);
        if (port)
            open(port);
    }

    // Shuts the listening socket down via release().
    ~MJPGWriter()
    {
        release();
    }
    ~MJPGWriter()
    {
        release();
    }

    // Calls ::shutdown(sock, 2) on the listening socket if one is open and marks it
    // invalid. Always returns false, which lets open() write `return release();`
    // on its failure paths.
    // NOTE(review): only ::shutdown is called; no close/closesocket call appears in
    // this class, and client sockets in `master` are not touched here.
    bool release()
    {
        if (sock != INVALID_SOCKET)
            ::shutdown(sock, 2);
        sock = (INVALID_SOCKET);
        return false;
    }
    bool release()
    {
        if (sock != INVALID_SOCKET)
            ::shutdown(sock, 2);
        sock = (INVALID_SOCKET);
        return false;
    }

    // Creates a TCP socket, binds it to INADDR_ANY:`port` and starts listening
    // (backlog 10). On success `master` is reset to contain only the listening
    // socket and true is returned; on bind/listen failure an error is logged to
    // cerr and false is returned via release().
    // NOTE(review): the return value of ::socket is not checked before ::bind.
    bool open(int port)
    {
        sock = ::socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    bool open(int port)
    {
        sock = ::socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

        SOCKADDR_IN address;
        address.sin_addr.s_addr = INADDR_ANY;
        address.sin_family = AF_INET;
        address.sin_port = htons(port); // ::htons(port);
        if (::bind(sock, (SOCKADDR*)&address, sizeof(SOCKADDR_IN)) == SOCKET_ERROR)
        {
            cerr << "error : couldn't bind sock " << sock << " to port " << port << "!" << endl;
            return release();
        }
        if (::listen(sock, 10) == SOCKET_ERROR)
        {
            cerr << "error : couldn't listen on sock " << sock << " on port " << port << " !" << endl;
            return release();
        }
        FD_ZERO(&master);
        FD_SET(sock, &master);
        maxfd = sock;
        return true;
    }
        SOCKADDR_IN address;
        address.sin_addr.s_addr = INADDR_ANY;
        address.sin_family = AF_INET;
        address.sin_port = htons(port);    // ::htons(port);
        if (::bind(sock, (SOCKADDR*)&address, sizeof(SOCKADDR_IN)) == SOCKET_ERROR)
        {
            cerr << "error : couldn't bind sock " << sock << " to port " << port << "!" << endl;
            return release();
        }
        if (::listen(sock, 10) == SOCKET_ERROR)
        {
            cerr << "error : couldn't listen on sock " << sock << " on port " << port << " !" << endl;
            return release();
        }
        FD_ZERO(&master);
        FD_SET(sock, &master);
        maxfd = sock;
        return true;
    }

    // True while a listening socket is held (sock != INVALID_SOCKET).
    bool isOpened()
    {
        return sock != INVALID_SOCKET;
    }
    bool isOpened()
    {
        return sock != INVALID_SOCKET;
    }

    // Serves one frame:
    //  1. select() on a copy of `master`; if nothing is ready (or select fails)
    //     the frame is dropped and true is returned.
    //  2. The frame is JPEG-encoded once with the configured quality.
    //  3. For every ready socket: the listening socket accepts a new client, adds
    //     it to `master` and sends the HTTP response header; any other socket gets
    //     a part header plus the JPEG bytes.
    // Returns false only when ::accept fails, true otherwise.
    // NOTE(review): `timeout` is passed as the second timeval initializer
    // (presumably tv_usec, i.e. microseconds), while the member comment says
    // "millis" - confirm the intended unit.
    bool write(const Mat & frame)
    {
        fd_set rread = master;
        struct timeval to = { 0,timeout };
        if (::select(maxfd+1, &rread, NULL, NULL, &to) <= 0)
            return true; // nothing broken, there's just noone listening
    bool write(const Mat & frame)
    {
        fd_set rread = master;
        struct timeval to = { 0,timeout };
        if (::select(maxfd+1, &rread, NULL, NULL, &to) <= 0)
            return true; // nothing broken, there's just noone listening

        std::vector<uchar> outbuf;
        std::vector<int> params;
        params.push_back(IMWRITE_JPEG_QUALITY);
        params.push_back(quality);
        cv::imencode(".jpg", frame, outbuf, params);
        size_t outlen = outbuf.size();
        std::vector<uchar> outbuf;
        std::vector<int> params;
        params.push_back(IMWRITE_JPEG_QUALITY);
        params.push_back(quality);
        cv::imencode(".jpg", frame, outbuf, params);
        size_t outlen = outbuf.size();

#ifdef _WIN32 
        for (unsigned i = 0; i<rread.fd_count; i++)
        {
            int addrlen = sizeof(SOCKADDR);
            SOCKET s = rread.fd_array[i];    // fd_set on win is an array, while ...
#else         
        for (int s = 0; s<=maxfd; s++)
        {
            socklen_t addrlen = sizeof(SOCKADDR);
            if (!FD_ISSET(s, &rread))      // ... on linux it's a bitmask ;)
                continue;
#endif                   
            if (s == sock) // request on master socket, accept and send main header.
            {
                SOCKADDR_IN address = { 0 };
                SOCKET      client = ::accept(sock, (SOCKADDR*)&address, &addrlen);
                if (client == SOCKET_ERROR)
                {
                    cerr << "error : couldn't accept connection on sock " << sock << " !" << endl;
                    return false;
                }
                maxfd = (maxfd>client ? maxfd : client);
                FD_SET(client, &master);
                _write(client, "HTTP/1.0 200 OK\r\n", 0);
                _write(client,
                    "Server: Mozarella/2.2\r\n"
                    "Accept-Range: bytes\r\n"
                    "Connection: close\r\n"
                    "Max-Age: 0\r\n"
                    "Expires: 0\r\n"
                    "Cache-Control: no-cache, private\r\n"
                    "Pragma: no-cache\r\n"
                    "Content-Type: multipart/x-mixed-replace; boundary=mjpegstream\r\n"
                    "\r\n", 0);
                cerr << "new client " << client << endl;
            }
            else // existing client, just stream pix
            {
                char head[400];
                sprintf(head, "--mjpegstream\r\nContent-Type: image/jpeg\r\nContent-Length: %zu\r\n\r\n", outlen);
                _write(s, head, 0);
                int n = _write(s, (char*)(&outbuf[0]), outlen);
                //cerr << "known client " << s << " " << n << endl;
                // NOTE(review): `n` is int and `outlen` is size_t, so a negative
                // send result is converted to a large unsigned value before the
                // comparison and would not enter this branch - confirm.
                if (n < outlen)
                {
                    cerr << "kill client " << s << endl;
                    ::shutdown(s, 2);
                    FD_CLR(s, &master);
                }
            }
        }
        return true;
    }
#ifdef _WIN32
        for (unsigned i = 0; i<rread.fd_count; i++)
        {
            int addrlen = sizeof(SOCKADDR);
            SOCKET s = rread.fd_array[i];    // fd_set on win is an array, while ...
#else
        for (int s = 0; s<=maxfd; s++)
        {
            socklen_t addrlen = sizeof(SOCKADDR);
            if (!FD_ISSET(s, &rread))      // ... on linux it's a bitmask ;)
                continue;
#endif
            if (s == sock) // request on master socket, accept and send main header.
            {
                SOCKADDR_IN address = { 0 };
                SOCKET      client = ::accept(sock, (SOCKADDR*)&address, &addrlen);
                if (client == SOCKET_ERROR)
                {
                    cerr << "error : couldn't accept connection on sock " << sock << " !" << endl;
                    return false;
                }
                maxfd = (maxfd>client ? maxfd : client);
                FD_SET(client, &master);
                _write(client, "HTTP/1.0 200 OK\r\n", 0);
                _write(client,
                    "Server: Mozarella/2.2\r\n"
                    "Accept-Range: bytes\r\n"
                    "Connection: close\r\n"
                    "Max-Age: 0\r\n"
                    "Expires: 0\r\n"
                    "Cache-Control: no-cache, private\r\n"
                    "Pragma: no-cache\r\n"
                    "Content-Type: multipart/x-mixed-replace; boundary=mjpegstream\r\n"
                    "\r\n", 0);
                cerr << "new client " << client << endl;
            }
            else // existing client, just stream pix
            {
                char head[400];
                sprintf(head, "--mjpegstream\r\nContent-Type: image/jpeg\r\nContent-Length: %zu\r\n\r\n", outlen);
                _write(s, head, 0);
                int n = _write(s, (char*)(&outbuf[0]), outlen);
                //cerr << "known client " << s << " " << n << endl;
                if (n < outlen)
                {
                    cerr << "kill client " << s << endl;
                    ::shutdown(s, 2);
                    FD_CLR(s, &master);
                }
            }
        }
        return true;
    }
};
// ----------------------------------------

// Wraps `ipl` in a cv::Mat (cv::cvarrToMat) and hands it to a function-local static
// MJPGWriter, then prints a confirmation line.
// The writer is a static local, so it is constructed from port/timeout/quality on
// the first call only; the values passed on later calls do not reconfigure it.
// The boolean result of wri.write() is ignored, so the message is printed even when
// no client received the frame.
// NOTE(review): the body appears twice in this listing (old/new sides of a
// whitespace-only diff, apparently); it is kept verbatim.
void send_mjpeg(IplImage* ipl, int port, int timeout, int quality) {
    static MJPGWriter wri(port, timeout, quality);
    cv::Mat mat = cv::cvarrToMat(ipl);
    wri.write(mat);
    std::cout << " MJPEG-stream sent. \n";
    static MJPGWriter wri(port, timeout, quality);
    cv::Mat mat = cv::cvarrToMat(ipl);
    wri.write(mat);
    std::cout << " MJPEG-stream sent. \n";
}
// ----------------------------------------

// Allocates a cv::VideoCapture for `path` and returns it cast to CvCapture*.
// The returned pointer is really a cv::VideoCapture*, so it must only be used by
// code that casts it back (as the other helpers in this file do).
// Returns NULL if constructing the capture throws; an error line is printed then.
// NOTE(review): a capture that constructs without throwing but fails to open is
// still returned as non-NULL - callers presumably need isOpened(); confirm.
// NOTE(review): the body appears twice in this listing (old/new sides of a
// whitespace-only diff, apparently); it is kept verbatim.
CvCapture* get_capture_video_stream(char *path) {
    CvCapture* cap = NULL;
    try {
        cap = (CvCapture*)new cv::VideoCapture(path);
    }
    catch (...) {
        std::cout << " Error: video-stream " << path << " can't be opened! \n";
    }
    return cap;
    CvCapture* cap = NULL;
    try {
        cap = (CvCapture*)new cv::VideoCapture(path);
    }
    catch (...) {
        std::cout << " Error: video-stream " << path << " can't be opened! \n";
    }
    return cap;
}
// ----------------------------------------

// Allocates a cv::VideoCapture for camera `index` and returns it cast to CvCapture*.
// The returned pointer is really a cv::VideoCapture*, so it must only be used by
// code that casts it back.
// Returns NULL if constructing the capture throws; an error line is printed then.
// The commented-out lines show how a fixed 1280x960 capture size could be requested.
// NOTE(review): the body appears twice in this listing (old/new sides of a
// whitespace-only diff, apparently); it is kept verbatim.
CvCapture* get_capture_webcam(int index) {
    CvCapture* cap = NULL;
    try {
        cap = (CvCapture*)new cv::VideoCapture(index);
        //((cv::VideoCapture*)cap)->set(CV_CAP_PROP_FRAME_WIDTH, 1280);
        //((cv::VideoCapture*)cap)->set(CV_CAP_PROP_FRAME_HEIGHT, 960);
    }
    catch (...) {
        std::cout << " Error: Web-camera " << index << " can't be opened! \n";
    }
    return cap;
    CvCapture* cap = NULL;
    try {
        cap = (CvCapture*)new cv::VideoCapture(index);
        //((cv::VideoCapture*)cap)->set(CV_CAP_PROP_FRAME_WIDTH, 1280);
        //((cv::VideoCapture*)cap)->set(CV_CAP_PROP_FRAME_HEIGHT, 960);
    }
    catch (...) {
        std::cout << " Error: Web-camera " << index << " can't be opened! \n";
    }
    return cap;
}
// ----------------------------------------

IplImage* get_webcam_frame(CvCapture *cap) {
    IplImage* src = NULL;
    try {
        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;
        cv::Mat frame;
        if (cpp_cap.isOpened()) 
        {
            cpp_cap >> frame;
            IplImage tmp = frame;
            src = cvCloneImage(&tmp);
        }
        else {
            std::cout << " Video-stream stoped! \n";
        }
    }
    catch (...) {
        std::cout << " Video-stream stoped! \n";
    }
    return src;
    IplImage* src = NULL;
    try {
        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;
        cv::Mat frame;
        if (cpp_cap.isOpened())
        {
            cpp_cap >> frame;
            IplImage tmp = frame;
            src = cvCloneImage(&tmp);
        }
        else {
            std::cout << " Video-stream stoped! \n";
        }
    }
    catch (...) {
        std::cout << " Video-stream stoped! \n";
    }
    return src;
}

int get_stream_fps_cpp(CvCapture *cap) {
    int fps = 25;
    try {
        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;
#ifndef CV_VERSION_EPOCH    // OpenCV 3.x
        fps = cpp_cap.get(CAP_PROP_FPS);
#else                       // OpenCV 2.x
        fps = cpp_cap.get(CV_CAP_PROP_FPS);
#endif		
    }
    catch (...) {
        std::cout << " Can't get FPS of source videofile. For output video FPS = 25 by default. \n";
    }
    return fps;
    int fps = 25;
    try {
        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;
#ifndef CV_VERSION_EPOCH    // OpenCV 3.x
        fps = cpp_cap.get(CAP_PROP_FPS);
#else                        // OpenCV 2.x
        fps = cpp_cap.get(CV_CAP_PROP_FPS);
#endif
    }
    catch (...) {
        std::cout << " Can't get FPS of source videofile. For output video FPS = 25 by default. \n";
    }
    return fps;
}
// ----------------------------------------
extern "C" {
    image ipl_to_image(IplImage* src);  // image.c
    image ipl_to_image(IplImage* src);    // image.c
}

image image_data_augmentation(IplImage* ipl, int w, int h,
    int pleft, int ptop, int swidth, int sheight, int flip,
    float jitter, float dhue, float dsat, float dexp)
    int pleft, int ptop, int swidth, int sheight, int flip,
    float jitter, float dhue, float dsat, float dexp)
{
    cv::Mat img = cv::cvarrToMat(ipl);
    cv::Mat img = cv::cvarrToMat(ipl);

    // crop
    cv::Rect src_rect(pleft, ptop, swidth, sheight);
    cv::Rect img_rect(cv::Point2i(0, 0), img.size());
    cv::Rect new_src_rect = src_rect & img_rect;
    // crop
    cv::Rect src_rect(pleft, ptop, swidth, sheight);
    cv::Rect img_rect(cv::Point2i(0, 0), img.size());
    cv::Rect new_src_rect = src_rect & img_rect;

    cv::Rect dst_rect(cv::Point2i(std::max(0, -pleft), std::max(0, -ptop)), new_src_rect.size());
    cv::Rect dst_rect(cv::Point2i(std::max(0, -pleft), std::max(0, -ptop)), new_src_rect.size());

    cv::Mat cropped(cv::Size(src_rect.width, src_rect.height), img.type());
    cropped.setTo(cv::Scalar::all(0));
    cv::Mat cropped(cv::Size(src_rect.width, src_rect.height), img.type());
    cropped.setTo(cv::Scalar::all(0));

    img(new_src_rect).copyTo(cropped(dst_rect));
    img(new_src_rect).copyTo(cropped(dst_rect));

    // resize
    cv::Mat sized;
    cv::resize(cropped, sized, cv::Size(w, h), 0, 0, INTER_LINEAR);
    // resize
    cv::Mat sized;
    cv::resize(cropped, sized, cv::Size(w, h), 0, 0, INTER_LINEAR);

    // flip
    if (flip) {
        cv::flip(sized, cropped, 1);    // 0 - x-axis, 1 - y-axis, -1 - both axes (x & y)
        sized = cropped.clone();
    }
    // flip
    if (flip) {
        cv::flip(sized, cropped, 1);    // 0 - x-axis, 1 - y-axis, -1 - both axes (x & y)
        sized = cropped.clone();
    }

    // HSV augmentation
    // CV_BGR2HSV, CV_RGB2HSV, CV_HSV2BGR, CV_HSV2RGB
    if (ipl->nChannels >= 3)
    {
        cv::Mat hsv_src;
        cvtColor(sized, hsv_src, CV_BGR2HSV);   // also BGR -> RGB
	
        std::vector<cv::Mat> hsv;
        cv::split(hsv_src, hsv);
    // HSV augmentation
    // CV_BGR2HSV, CV_RGB2HSV, CV_HSV2BGR, CV_HSV2RGB
    if (ipl->nChannels >= 3)
    {
        cv::Mat hsv_src;
        cvtColor(sized, hsv_src, CV_BGR2HSV);    // also BGR -> RGB

        hsv[1] *= dsat;
        hsv[2] *= dexp;
        hsv[0] += 179 * dhue;
        std::vector<cv::Mat> hsv;
        cv::split(hsv_src, hsv);

        cv::merge(hsv, hsv_src);
        hsv[1] *= dsat;
        hsv[2] *= dexp;
        hsv[0] += 179 * dhue;

        cvtColor(hsv_src, sized, CV_HSV2RGB);   // now RGB instead of BGR
    }
    else
    {
        sized *= dexp;
    }
        cv::merge(hsv, hsv_src);

    // Mat -> IplImage -> image
    IplImage src = sized;
    image out = ipl_to_image(&src);
        cvtColor(hsv_src, sized, CV_HSV2RGB);    // now RGB instead of BGR
    }
    else
    {
        sized *= dexp;
    }

    return out;
    // Mat -> IplImage -> image
    IplImage src = sized;
    image out = ipl_to_image(&src);

    return out;
}


#endif  // OPENCV
#endif    // OPENCV

 src/http_stream.h

@@ -14,8 +14,8 @@
int get_stream_fps_cpp(CvCapture *cap);

// Crops/resizes/flips/colour-jitters `ipl` into a w x h darknet image.
image image_data_augmentation(IplImage* ipl, int w, int h,
    int pleft, int ptop, int swidth, int sheight, int flip,
    float jitter, float dhue, float dsat, float dexp);

#ifdef __cplusplus
}

 src/image.c

@@ -40,31 +40,31 @@

// Returns the value at column x, row y, channel c of a planar image
// (index = c*h*w + y*w + x). Only the upper bounds are asserted; callers
// must not pass negative coordinates.
static float get_pixel(image m, int x, int y, int c)
{
    assert(x < m.w && y < m.h && c < m.c);
    return m.data[c*m.h*m.w + y*m.w + x];
}
// Bounds-tolerant pixel read: returns 0 for any x, y or c outside the image,
// otherwise the stored value.
static float get_pixel_extend(image m, int x, int y, int c)
{
    if (x < 0 || x >= m.w || y < 0 || y >= m.h) return 0;
    /*
    Alternative (disabled): clamp to the nearest edge pixel instead of returning 0.
    if(x < 0) x = 0;
    if(x >= m.w) x = m.w-1;
    if(y < 0) y = 0;
    if(y >= m.h) y = m.h-1;
    */
    if (c < 0 || c >= m.c) return 0;
    return get_pixel(m, x, y, c);
}
// Stores val at column x, row y, channel c. Out-of-range coordinates are
// silently ignored.
static void set_pixel(image m, int x, int y, int c, float val)
{
    if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return;
    assert(x < m.w && y < m.h && c < m.c);
    m.data[c*m.h*m.w + y*m.w + x] = val;
}
// Adds val (once) to the value at column x, row y, channel c. Only the upper
// bounds are asserted; callers must not pass negative coordinates.
static void add_pixel(image m, int x, int y, int c, float val)
{
    assert(x < m.w && y < m.h && c < m.c);
    m.data[c*m.h*m.w + y*m.w + x] += val;
}

void composite_image(image source, image dest, int dx, int dy)
@@ -125,19 +125,19 @@

// Builds a label image for `string` by tiling glyph images side by side and
// adding a border of a quarter of the label height.
//   characters - glyph table indexed as characters[size_index][character code]
//   size       - requested size; size/10 selects the glyph row, limited to 0..7
// Returns a new image.
image get_label_v3(image **characters, char *string, int size)
{
    size = size / 10;
    if (size > 7) size = 7;
    if (size < 0) size = 0;     // a negative row index would read before the table
    image label = make_empty_image(0, 0, 0);
    while (*string) {
        // NOTE(review): characters with a code outside the glyph table (e.g. negative
        // values of a signed char) are not filtered here - confirm against the loader.
        image l = characters[size][(int)*string];
        // Negative spacing makes neighbouring glyphs overlap slightly.
        image n = tile_images(label, l, -size - 1 + (size + 1) / 2);
        free_image(label);
        label = n;
        ++string;
    }
    image b = border_image(label, label.h*.25);
    free_image(label);
    return b;
}

void draw_label(image a, int r, int c, image label, const float *rgb)
@@ -235,143 +235,143 @@
// Creates array of detections with prob > thresh and fills best_class for them.
//   dets, dets_num          - input detections
//   thresh                  - a class must score strictly above this to be selected
//   selected_detections_num - optional out-parameter: number of entries filled
// Returns a calloc'ed array (capacity dets_num) that the caller must free(),
// or NULL if the allocation failed (the count is then reported as 0).
detection_with_class* get_actual_detections(detection *dets, int dets_num, float thresh, int* selected_detections_num)
{
    int selected_num = 0;
    detection_with_class* result_arr = calloc(dets_num, sizeof(detection_with_class));
    if (!result_arr) {
        // Nothing can be stored; avoid writing through a NULL pointer below.
        if (selected_detections_num)
            *selected_detections_num = 0;
        return NULL;
    }
    int i;
    for (i = 0; i < dets_num; ++i) {
        int best_class = -1;
        float best_class_prob = thresh;
        int j;
        // Highest-scoring class above thresh; the first one wins on ties.
        for (j = 0; j < dets[i].classes; ++j) {
            if (dets[i].prob[j] > best_class_prob) {
                best_class = j;
                best_class_prob = dets[i].prob[j];
            }
        }
        if (best_class >= 0) {
            result_arr[selected_num].det = dets[i];
            result_arr[selected_num].best_class = best_class;
            ++selected_num;
        }
    }
    if (selected_detections_num)
        *selected_detections_num = selected_num;
    return result_arr;
}

// qsort comparator for detection_with_class: ascending by the left edge of
// the bounding box (bbox.x - bbox.w/2).
int compare_by_lefts(const void *a_ptr, const void *b_ptr) {
    const detection_with_class* a = (const detection_with_class*)a_ptr;
    const detection_with_class* b = (const detection_with_class*)b_ptr;
    const float delta = (a->det.bbox.x - a->det.bbox.w/2) - (b->det.bbox.x - b->det.bbox.w/2);
    return delta < 0 ? -1 : delta > 0 ? 1 : 0;
}

// qsort comparator for detection_with_class: ascending by the probability of
// each entry's best_class.
int compare_by_probs(const void *a_ptr, const void *b_ptr) {
    const detection_with_class* a = (const detection_with_class*)a_ptr;
    const detection_with_class* b = (const detection_with_class*)b_ptr;
    float delta = a->det.prob[a->best_class] - b->det.prob[b->best_class];
    return delta < 0 ? -1 : delta > 0 ? 1 : 0;
}

// Prints and draws all detections that have at least one class above thresh.
//   im         - image to draw into
//   dets, num  - detections to consider
//   thresh     - probability threshold
//   names      - class names, indexed by class id
//   alphabet   - glyph table for text labels; pass 0 to skip labels
//   classes    - number of classes
//   ext_output - non-zero to also print the box geometry in pixels
// Text is printed in left-to-right box order; boxes are drawn in ascending
// probability order, so higher-probability boxes are drawn last.
void draw_detections_v3(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output)
{
    int selected_detections_num;
    detection_with_class* selected_detections = get_actual_detections(dets, num, thresh, &selected_detections_num);
    if (!selected_detections) return;   // nothing to sort or draw

    // text output
    qsort(selected_detections, selected_detections_num, sizeof(*selected_detections), compare_by_lefts);
    int i;
    for (i = 0; i < selected_detections_num; ++i) {
        const int best_class = selected_detections[i].best_class;
        printf("%s: %.0f%%", names[best_class], selected_detections[i].det.prob[best_class] * 100);
        if (ext_output)
            printf("\t(left_x: %4.0f   top_y: %4.0f   width: %4.0f   height: %4.0f)\n",
                (selected_detections[i].det.bbox.x - selected_detections[i].det.bbox.w / 2)*im.w,
                (selected_detections[i].det.bbox.y - selected_detections[i].det.bbox.h / 2)*im.h,
                selected_detections[i].det.bbox.w*im.w, selected_detections[i].det.bbox.h*im.h);
        else
            printf("\n");
        int j;
        // Any further classes that also passed the threshold.
        for (j = 0; j < classes; ++j) {
            if (selected_detections[i].det.prob[j] > thresh && j != best_class) {
                printf("%s: %.0f%%\n", names[j], selected_detections[i].det.prob[j] * 100);
            }
        }
    }

    // image output
    qsort(selected_detections, selected_detections_num, sizeof(*selected_detections), compare_by_probs);
    for (i = 0; i < selected_detections_num; ++i) {
        int width = im.h * .006;
        if (width < 1)
            width = 1;

        // Colour derived from the class id.
        int offset = selected_detections[i].best_class * 123457 % classes;
        float red = get_color(2, offset, classes);
        float green = get_color(1, offset, classes);
        float blue = get_color(0, offset, classes);
        float rgb[3];

        rgb[0] = red;
        rgb[1] = green;
        rgb[2] = blue;
        box b = selected_detections[i].det.bbox;

        // Box corners in pixels (bbox values are multiplied by the image size),
        // clamped to the image.
        int left = (b.x - b.w / 2.)*im.w;
        int right = (b.x + b.w / 2.)*im.w;
        int top = (b.y - b.h / 2.)*im.h;
        int bot = (b.y + b.h / 2.)*im.h;

        if (left < 0) left = 0;
        if (right > im.w - 1) right = im.w - 1;
        if (top < 0) top = 0;
        if (bot > im.h - 1) bot = im.h - 1;

        draw_box_width(im, left, top, right, bot, width, red, green, blue);
        if (alphabet) {
            // Label is "best, other, other..."; every append is bounded by the
            // space left so a long class list cannot overflow the buffer.
            char labelstr[4096] = { 0 };
            strncat(labelstr, names[selected_detections[i].best_class], sizeof(labelstr) - strlen(labelstr) - 1);
            int j;
            for (j = 0; j < classes; ++j) {
                if (selected_detections[i].det.prob[j] > thresh && j != selected_detections[i].best_class) {
                    strncat(labelstr, ", ", sizeof(labelstr) - strlen(labelstr) - 1);
                    strncat(labelstr, names[j], sizeof(labelstr) - strlen(labelstr) - 1);
                }
            }
            image label = get_label_v3(alphabet, labelstr, (im.h*.03));
            draw_label(im, top + width, left, label, rgb);
            free_image(label);
        }
        if (selected_detections[i].det.mask) {
            // The mask is wrapped as a 14x14 single-channel image, resized to the
            // box size, thresholded at .5 and embedded at the box corner.
            image mask = float_to_image(14, 14, 1, selected_detections[i].det.mask);
            image resized_mask = resize_image(mask, b.w*im.w, b.h*im.h);
            image tmask = threshold_image(resized_mask, .5);
            embed_image(tmask, im, left, top);
            free_image(mask);
            free_image(resized_mask);
            free_image(tmask);
        }
    }
    free(selected_detections);
}

void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes)
@@ -383,13 +383,13 @@
        float prob = probs[i][class_id];
        if(prob > thresh){

            //// for comparison with OpenCV version of DNN Darknet Yolo v2
            //printf("\n %f, %f, %f, %f, ", boxes[i].x, boxes[i].y, boxes[i].w, boxes[i].h);
            // int k;
            //for (k = 0; k < classes; ++k) {
            //  printf("%f, ", probs[i][k]);
            //}
            //printf("\n");
            //// for comparison with OpenCV version of DNN Darknet Yolo v2
            //printf("\n %f, %f, %f, %f, ", boxes[i].x, boxes[i].y, boxes[i].w, boxes[i].h);
            // int k;
            //for (k = 0; k < classes; ++k) {
            //    printf("%f, ", probs[i][k]);
            //}
            //printf("\n");

            int width = im.h * .012;

@@ -420,12 +420,12 @@
            if(right > im.w-1) right = im.w-1;
            if(top < 0) top = 0;
            if(bot > im.h-1) bot = im.h-1;
            printf("%s: %.0f%%", names[class_id], prob * 100);
			
            //printf(" - id: %d, x_center: %d, y_center: %d, width: %d, height: %d",
            //  class_id, (right + left) / 2, (bot - top) / 2, right - left, bot - top);
            printf("%s: %.0f%%", names[class_id], prob * 100);
            
            //printf(" - id: %d, x_center: %d, y_center: %d, width: %d, height: %d",
            //    class_id, (right + left) / 2, (bot - top) / 2, right - left, bot - top);

            printf("\n");
            printf("\n");
            draw_box_width(im, left, top, right, bot, width, red, green, blue);
            if (alphabet) {
                image label = get_label(alphabet, names[class_id], (im.h*.03)/10);
@@ -439,257 +439,257 @@

// Draws detections directly into an OpenCV IplImage and prints them.
//   show_img   - image to draw into; the function returns immediately if NULL
//   dets, num  - detections to consider
//   thresh     - probability threshold
//   names      - class names, indexed by class id
//   alphabet   - not used by this function
//   classes    - number of classes
//   ext_output - non-zero to also print the box geometry in pixels
// For every detection all classes above thresh are printed and joined into
// one label; the box colour comes from the first such class.
void draw_detections_cv_v3(IplImage* show_img, detection *dets, int num, float thresh, char **names, image **alphabet, int classes, int ext_output)
{
    int i, j;
    if (!show_img) return;
    static int frame_id = 0;    // only referenced by the disabled crop-saving code below
    frame_id++;

    for (i = 0; i < num; ++i) {
        char labelstr[4096] = { 0 };
        int class_id = -1;
        for (j = 0; j < classes; ++j) {
            if (dets[i].prob[j] > thresh) {
                // Every append is bounded by the space left in labelstr so a long
                // class list cannot overflow the buffer.
                if (class_id < 0) {
                    strncat(labelstr, names[j], sizeof(labelstr) - strlen(labelstr) - 1);
                    class_id = j;
                }
                else {
                    strncat(labelstr, ", ", sizeof(labelstr) - strlen(labelstr) - 1);
                    strncat(labelstr, names[j], sizeof(labelstr) - strlen(labelstr) - 1);
                }
                printf("%s: %.0f%% ", names[j], dets[i].prob[j] * 100);
            }
        }
        if (class_id >= 0) {
            int width = show_img->height * .006;
            if (width < 1)
                width = 1;      // same minimum as draw_detections_v3

            // Colour derived from the class id.
            int offset = class_id * 123457 % classes;
            float red = get_color(2, offset, classes);
            float green = get_color(1, offset, classes);
            float blue = get_color(0, offset, classes);

            box b = dets[i].bbox;

            // Box corners in pixels (bbox values are multiplied by the image size),
            // clamped to the image.
            int left = (b.x - b.w / 2.)*show_img->width;
            int right = (b.x + b.w / 2.)*show_img->width;
            int top = (b.y - b.h / 2.)*show_img->height;
            int bot = (b.y + b.h / 2.)*show_img->height;

            if (left < 0) left = 0;
            if (right > show_img->width - 1) right = show_img->width - 1;
            if (top < 0) top = 0;
            if (bot > show_img->height - 1) bot = show_img->height - 1;

            float const font_size = show_img->height / 1000.F;
            CvPoint pt1, pt2, pt_text, pt_text_bg1, pt_text_bg2;
            pt1.x = left;
            pt1.y = top;
            pt2.x = right;
            pt2.y = bot;
            pt_text.x = left;
            pt_text.y = top - 12;
            // Label background: a strip directly above the box, as wide as the box.
            pt_text_bg1.x = left;
            pt_text_bg1.y = top - (10 + 25 * font_size);
            pt_text_bg2.x = right;
            pt_text_bg2.y = top;
            CvScalar color;
            color.val[0] = red * 256;
            color.val[1] = green * 256;
            color.val[2] = blue * 256;
            color.val[3] = 0;   // previously left uninitialised

            // Disabled: save every detected object as a separate image.
            // you should create directory: result_img
            //static int copied_frame_id = -1;
            //static IplImage* copy_img = NULL;
            //if (copied_frame_id != frame_id) {
            //    copied_frame_id = frame_id;
            //    if(copy_img == NULL) copy_img = cvCreateImage(cvSize(show_img->width, show_img->height), show_img->depth, show_img->nChannels);
            //    cvCopy(show_img, copy_img, 0);
            //}
            //static int img_id = 0;
            //img_id++;
            //char image_name[1024];
            //sprintf(image_name, "result_img/img_%d_%d_%d.jpg", frame_id, img_id, class_id);
            //CvRect rect = cvRect(pt1.x, pt1.y, pt2.x - pt1.x, pt2.y - pt1.y);
            //cvSetImageROI(copy_img, rect);
            //cvSaveImage(image_name, copy_img, 0);
            //cvResetImageROI(copy_img);

            cvRectangle(show_img, pt1, pt2, color, width, 8, 0);
            if (ext_output)
                printf("\t(left_x: %4.0f   top_y: %4.0f   width: %4.0f   height: %4.0f)\n",
                    (float)left, (float)top, b.w*show_img->width, b.h*show_img->height);
            else
                printf("\n");
            cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, width, 8, 0);
            cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, CV_FILLED, 8, 0);    // filled
            // All channels are set explicitly; previously only val[0] was, leaving
            // the remaining text-colour channels indeterminate.
            CvScalar black_color;
            black_color.val[0] = 0;
            black_color.val[1] = 0;
            black_color.val[2] = 0;
            black_color.val[3] = 0;
            CvFont font;
            cvInitFont(&font, CV_FONT_HERSHEY_SIMPLEX, font_size, font_size, 0, font_size * 3, 8);
            cvPutText(show_img, labelstr, pt_text, &font, black_color);
        }
    }
}

void draw_detections_cv(IplImage* show_img, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes)
{
    int i;
    int i;

    for (i = 0; i < num; ++i) {
        int class_id = max_index(probs[i], classes);
        float prob = probs[i][class_id];
        if (prob > thresh) {
    for (i = 0; i < num; ++i) {
        int class_id = max_index(probs[i], classes);
        float prob = probs[i][class_id];
        if (prob > thresh) {

            int width = show_img->height * .012;
            int width = show_img->height * .012;

            if (0) {
                width = pow(prob, 1. / 2.) * 10 + 1;
                alphabet = 0;
            }
            if (0) {
                width = pow(prob, 1. / 2.) * 10 + 1;
                alphabet = 0;
            }

            printf("%s: %.0f%%\n", names[class_id], prob * 100);
            int offset = class_id * 123457 % classes;
            float red = get_color(2, offset, classes);
            float green = get_color(1, offset, classes);
            float blue = get_color(0, offset, classes);
            float rgb[3];
            printf("%s: %.0f%%\n", names[class_id], prob * 100);
            int offset = class_id * 123457 % classes;
            float red = get_color(2, offset, classes);
            float green = get_color(1, offset, classes);
            float blue = get_color(0, offset, classes);
            float rgb[3];

            //width = prob*20+2;
            //width = prob*20+2;

            rgb[0] = red;
            rgb[1] = green;
            rgb[2] = blue;
            box b = boxes[i];
            rgb[0] = red;
            rgb[1] = green;
            rgb[2] = blue;
            box b = boxes[i];

            int left = (b.x - b.w / 2.)*show_img->width;
            int right = (b.x + b.w / 2.)*show_img->width;
            int top = (b.y - b.h / 2.)*show_img->height;
            int bot = (b.y + b.h / 2.)*show_img->height;
            int left = (b.x - b.w / 2.)*show_img->width;
            int right = (b.x + b.w / 2.)*show_img->width;
            int top = (b.y - b.h / 2.)*show_img->height;
            int bot = (b.y + b.h / 2.)*show_img->height;

            if (left < 0) left = 0;
            if (right > show_img->width - 1) right = show_img->width - 1;
            if (top < 0) top = 0;
            if (bot > show_img->height - 1) bot = show_img->height - 1;
            if (left < 0) left = 0;
            if (right > show_img->width - 1) right = show_img->width - 1;
            if (top < 0) top = 0;
            if (bot > show_img->height - 1) bot = show_img->height - 1;

            float const font_size = show_img->height / 1000.F;
            CvPoint pt1, pt2, pt_text, pt_text_bg1, pt_text_bg2;
            pt1.x = left;
            pt1.y = top;
            pt2.x = right;
            pt2.y = bot;
            pt_text.x = left;
            pt_text.y = top - 12;
            pt_text_bg1.x = left;
            pt_text_bg1.y = top - (10+25*font_size);
            pt_text_bg2.x = right;
            pt_text_bg2.y = top;
            CvScalar color;
            color.val[0] = red * 256;
            color.val[1] = green * 256;
            color.val[2] = blue * 256;
            float const font_size = show_img->height / 1000.F;
            CvPoint pt1, pt2, pt_text, pt_text_bg1, pt_text_bg2;
            pt1.x = left;
            pt1.y = top;
            pt2.x = right;
            pt2.y = bot;
            pt_text.x = left;
            pt_text.y = top - 12;
            pt_text_bg1.x = left;
            pt_text_bg1.y = top - (10+25*font_size);
            pt_text_bg2.x = right;
            pt_text_bg2.y = top;
            CvScalar color;
            color.val[0] = red * 256;
            color.val[1] = green * 256;
            color.val[2] = blue * 256;

            cvRectangle(show_img, pt1, pt2, color, width, 8, 0);
            //printf("left=%d, right=%d, top=%d, bottom=%d, obj_id=%d, obj=%s \n", left, right, top, bot, class_id, names[class_id]);
            cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, width, 8, 0);
            cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, CV_FILLED, 8, 0);    // filled
            CvScalar black_color;
            black_color.val[0] = 0;
            CvFont font;
            cvInitFont(&font, CV_FONT_HERSHEY_SIMPLEX, font_size, font_size, 0, font_size * 3, 8);	
            cvPutText(show_img, names[class_id], pt_text, &font, black_color);
        }
    }
            cvRectangle(show_img, pt1, pt2, color, width, 8, 0);
            //printf("left=%d, right=%d, top=%d, bottom=%d, obj_id=%d, obj=%s \n", left, right, top, bot, class_id, names[class_id]);
            cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, width, 8, 0);
            cvRectangle(show_img, pt_text_bg1, pt_text_bg2, color, CV_FILLED, 8, 0);    // filled
            CvScalar black_color;
            black_color.val[0] = 0;
            CvFont font;
            cvInitFont(&font, CV_FONT_HERSHEY_SIMPLEX, font_size, font_size, 0, font_size * 3, 8);    
            cvPutText(show_img, names[class_id], pt_text, &font, black_color);
        }
    }
}

IplImage* draw_train_chart(float max_img_loss, int max_batches, int number_of_lines, int img_size)
{
    int img_offset = 50;
    int draw_size = img_size - img_offset;
    IplImage* img = cvCreateImage(cvSize(img_size, img_size), 8, 3);
    cvSet(img, CV_RGB(255, 255, 255), 0);
    CvPoint pt1, pt2, pt_text;
    CvFont font;
    cvInitFont(&font, CV_FONT_HERSHEY_COMPLEX_SMALL, 0.7, 0.7, 0, 1, CV_AA);
    char char_buff[100];
    int i;
    // vertical lines
    pt1.x = img_offset; pt2.x = img_size, pt_text.x = 10;
    for (i = 1; i <= number_of_lines; ++i) {
        pt1.y = pt2.y = (float)i * draw_size / number_of_lines;
        cvLine(img, pt1, pt2, CV_RGB(224, 224, 224), 1, 8, 0);
        if (i % 10 == 0) {
            sprintf(char_buff, "%2.1f", max_img_loss*(number_of_lines - i) / number_of_lines);
            pt_text.y = pt1.y + 5;
            cvPutText(img, char_buff, pt_text, &font, CV_RGB(0, 0, 0));
            cvLine(img, pt1, pt2, CV_RGB(128, 128, 128), 1, 8, 0);
        }
    }
    // horizontal lines
    pt1.y = draw_size; pt2.y = 0, pt_text.y = draw_size + 15;
    for (i = 0; i <= number_of_lines; ++i) {
        pt1.x = pt2.x = img_offset + (float)i * draw_size / number_of_lines;
        cvLine(img, pt1, pt2, CV_RGB(224, 224, 224), 1, 8, 0);
        if (i % 10 == 0) {
            sprintf(char_buff, "%d", max_batches * i / number_of_lines);
            pt_text.x = pt1.x - 20;
            cvPutText(img, char_buff, pt_text, &font, CV_RGB(0, 0, 0));
            cvLine(img, pt1, pt2, CV_RGB(128, 128, 128), 1, 8, 0);
        }
    }
    cvPutText(img, "Iteration number", cvPoint(draw_size / 2, img_size - 10), &font, CV_RGB(0, 0, 0));
    cvPutText(img, "Press 's' to save: chart.jpg", cvPoint(5, img_size - 10), &font, CV_RGB(0, 0, 0));
    printf(" If error occurs - run training with flag: -dont_show \n");
    cvNamedWindow("average loss", CV_WINDOW_NORMAL);
    cvMoveWindow("average loss", 0, 0);
    cvResizeWindow("average loss", img_size, img_size);
    cvShowImage("average loss", img);
    cvWaitKey(20);
    return img;
    int img_offset = 50;
    int draw_size = img_size - img_offset;
    IplImage* img = cvCreateImage(cvSize(img_size, img_size), 8, 3);
    cvSet(img, CV_RGB(255, 255, 255), 0);
    CvPoint pt1, pt2, pt_text;
    CvFont font;
    cvInitFont(&font, CV_FONT_HERSHEY_COMPLEX_SMALL, 0.7, 0.7, 0, 1, CV_AA);
    char char_buff[100];
    int i;
    // vertical lines
    pt1.x = img_offset; pt2.x = img_size, pt_text.x = 10;
    for (i = 1; i <= number_of_lines; ++i) {
        pt1.y = pt2.y = (float)i * draw_size / number_of_lines;
        cvLine(img, pt1, pt2, CV_RGB(224, 224, 224), 1, 8, 0);
        if (i % 10 == 0) {
            sprintf(char_buff, "%2.1f", max_img_loss*(number_of_lines - i) / number_of_lines);
            pt_text.y = pt1.y + 5;
            cvPutText(img, char_buff, pt_text, &font, CV_RGB(0, 0, 0));
            cvLine(img, pt1, pt2, CV_RGB(128, 128, 128), 1, 8, 0);
        }
    }
    // horizontal lines
    pt1.y = draw_size; pt2.y = 0, pt_text.y = draw_size + 15;
    for (i = 0; i <= number_of_lines; ++i) {
        pt1.x = pt2.x = img_offset + (float)i * draw_size / number_of_lines;
        cvLine(img, pt1, pt2, CV_RGB(224, 224, 224), 1, 8, 0);
        if (i % 10 == 0) {
            sprintf(char_buff, "%d", max_batches * i / number_of_lines);
            pt_text.x = pt1.x - 20;
            cvPutText(img, char_buff, pt_text, &font, CV_RGB(0, 0, 0));
            cvLine(img, pt1, pt2, CV_RGB(128, 128, 128), 1, 8, 0);
        }
    }
    cvPutText(img, "Iteration number", cvPoint(draw_size / 2, img_size - 10), &font, CV_RGB(0, 0, 0));
    cvPutText(img, "Press 's' to save: chart.jpg", cvPoint(5, img_size - 10), &font, CV_RGB(0, 0, 0));
    printf(" If error occurs - run training with flag: -dont_show \n");
    cvNamedWindow("average loss", CV_WINDOW_NORMAL);
    cvMoveWindow("average loss", 0, 0);
    cvResizeWindow("average loss", img_size, img_size);
    cvShowImage("average loss", img);
    cvWaitKey(20);
    return img;
}

/*
 * Plots one average-loss sample on the chart image and refreshes the
 * "average loss" window.
 *
 * img           - chart image to draw on
 * img_size      - width and height of that image, in pixels
 * avg_loss      - loss value to plot and to print as text
 * max_img_loss  - loss value corresponding to the top of the chart
 * current_batch - current iteration, mapped onto the horizontal axis
 * max_batches   - iteration count corresponding to the right edge
 *
 * The chart is written to "chart.jpg" when the user presses 's' or when
 * current_batch reaches max_batches - 1.
 */
void draw_train_loss(IplImage* img, int img_size, float avg_loss, float max_img_loss, int current_batch, int max_batches)
{
    int img_offset = 50;
    int draw_size = img_size - img_offset;
    CvFont font;
    cvInitFont(&font, CV_FONT_HERSHEY_COMPLEX_SMALL, 0.7, 0.7, 0, 1, CV_AA);
    char char_buff[100];
    CvPoint pt1, pt2;
    pt1.x = img_offset + draw_size * (float)current_batch / max_batches;
    pt1.y = draw_size * (1 - avg_loss / max_img_loss);
    // a loss above max_img_loss would land above the image; pin it to the top row
    if (pt1.y < 0) pt1.y = 1;
    cvCircle(img, pt1, 1, CV_RGB(0, 0, 255), CV_FILLED, 8, 0);

    sprintf(char_buff, "current avg loss = %2.4f", avg_loss);
    pt1.x = img_size / 2, pt1.y = 30;
    pt2.x = pt1.x + 250, pt2.y = pt1.y + 20;
    // blank the text area first so the previous value does not show through
    cvRectangle(img, pt1, pt2, CV_RGB(255, 255, 255), CV_FILLED, 8, 0);
    pt1.y += 15;
    cvPutText(img, char_buff, pt1, &font, CV_RGB(0, 0, 0));
    cvShowImage("average loss", img);
    int k = cvWaitKey(20);
    if (k == 's' || current_batch == (max_batches-1)) cvSaveImage("chart.jpg", img, 0);
}
#endif  // OPENCV
#endif    // OPENCV

void transpose_image(image im)
{
@@ -909,15 +909,15 @@

/*
 * Shows an IplImage in a resizable window titled `name`.
 * Does nothing when `disp` is NULL. The image is not released here.
 */
void show_image_cv_ipl(IplImage *disp, const char *name)
{
    if (disp == NULL) return;
    char buff[256];
    //sprintf(buff, "%s (%d)", name, windows);
    // bounded copy: a title longer than the buffer is truncated instead of overflowing it
    snprintf(buff, sizeof(buff), "%s", name);
    cvNamedWindow(buff, CV_WINDOW_NORMAL);
    //cvMoveWindow(buff, 100*(windows%10) + 200*(windows/10), 100*(windows%10));
    ++windows;
    cvShowImage(buff, disp);
    //cvReleaseImage(&disp);
}
#endif

@@ -966,22 +966,22 @@

    if( (src = cvLoadImage(filename, flag)) == 0 )
    {
        char shrinked_filename[1024];
        if (strlen(filename) >= 1024) sprintf(shrinked_filename, "name is too long");
        else sprintf(shrinked_filename, "%s", filename);
        fprintf(stderr, "Cannot load image \"%s\"\n", shrinked_filename);
        FILE* fw = fopen("bad.list", "a");
        fwrite(shrinked_filename, sizeof(char), strlen(shrinked_filename), fw);
        char *new_line = "\n";
        fwrite(new_line, sizeof(char), strlen(new_line), fw);
        fclose(fw);
        char shrinked_filename[1024];
        if (strlen(filename) >= 1024) sprintf(shrinked_filename, "name is too long");
        else sprintf(shrinked_filename, "%s", filename);
        fprintf(stderr, "Cannot load image \"%s\"\n", shrinked_filename);
        FILE* fw = fopen("bad.list", "a");
        fwrite(shrinked_filename, sizeof(char), strlen(shrinked_filename), fw);
        char *new_line = "\n";
        fwrite(new_line, sizeof(char), strlen(new_line), fw);
        fclose(fw);
        return make_image(10,10,3);
        //exit(EXIT_FAILURE);
    }
    image out = ipl_to_image(src);
    cvReleaseImage(&src);
    if (out.c > 1)
        rgbgr_image(out);
    if (out.c > 1)
        rgbgr_image(out);
    return out;
}

@@ -996,119 +996,119 @@

/*
 * Grabs one frame through get_webcam_frame() and converts it to an `image`.
 *
 * On the very first call it keeps reading until a frame with a positive
 * width, height and channel count arrives, then prints the stream size.
 * Returns make_empty_image(0, 0, 0) when no frame could be read.
 */
image get_image_from_stream_cpp(CvCapture *cap)
{
    //IplImage* src = cvQueryFrame(cap);
    IplImage* src;
    static int once = 1;
    if (once) {
        once = 0;
        do {
            src = get_webcam_frame(cap);
            if (!src) return make_empty_image(0, 0, 0);
        } while (src->width < 1 || src->height < 1 || src->nChannels < 1);
        printf("Video stream: %d x %d \n", src->width, src->height);
    }
    else
        src = get_webcam_frame(cap);

    if (!src) return make_empty_image(0, 0, 0);
    image im = ipl_to_image(src);
    // the sibling stream readers release frames obtained from get_webcam_frame();
    // do the same here so a frame is not leaked on every call
    cvReleaseImage(&src);
    rgbgr_image(im);
    return im;
}

/*
 * Decides whether capturing should continue after a frame was read.
 *
 * cap        - capture handle passed to get_webcam_frame()
 * src        - the frame that was just read (may be NULL)
 * dont_close - non-zero: keep going even when the frame is missing or invalid
 *
 * Returns 0 when the frame is missing/invalid and dont_close is 0, otherwise 1.
 *
 * NOTE(review): `src` is received by value, so nothing done to it here is
 * visible to the caller. When an invalid frame is released below, the caller's
 * pointer is left dangling; fixing that needs an IplImage** parameter, which
 * would change the interface.
 */
int wait_for_stream(CvCapture *cap, IplImage* src, int dont_close) {
    if (!src) {
        // a placeholder image allocated here could never reach the caller, so none is created
        return dont_close ? 1 : 0;
    }
    if (src->width < 1 || src->height < 1 || src->nChannels < 1) {
        if (!dont_close) return 0;
        cvReleaseImage(&src);
        int z = 0;
        for (z = 0; z < 20; ++z) {
            // read and drop a few frames; each one has to be released, not just discarded
            IplImage* skipped = get_webcam_frame(cap);
            if (skipped) cvReleaseImage(&skipped);
        }
    }
    return 1;
}

/*
 * Reads one frame from `cap` and returns it resized to w x h as an `image`.
 *
 * c                 - channel count for the created images; 0 means 3
 * in_img            - receives a newly created IplImage holding the frame at
 *                     its source size (created with cvCreateImage, not released here)
 * cpp_video_capture - non-zero: read via get_webcam_frame(), otherwise cvQueryFrame()
 * dont_close        - forwarded to wait_for_stream()
 *
 * Returns make_empty_image(0, 0, 0) when no frame is available; *in_img is
 * left untouched in that case.
 */
image get_image_from_stream_resize(CvCapture *cap, int w, int h, int c, IplImage** in_img, int cpp_video_capture, int dont_close)
{
    c = c ? c : 3;
    IplImage* src;
    if (cpp_video_capture) {
        static int once = 1;
        if (once) {
            once = 0;
            // first call: wait for a frame with a valid size before reporting it
            do {
                src = get_webcam_frame(cap);
                if (!src) return make_empty_image(0, 0, 0);
            } while (src->width < 1 || src->height < 1 || src->nChannels < 1);
            printf("Video stream: %d x %d \n", src->width, src->height);
        } else
            src = get_webcam_frame(cap);
    }
    else src = cvQueryFrame(cap);

    if (cpp_video_capture)
        if(!wait_for_stream(cap, src, dont_close)) return make_empty_image(0, 0, 0);
    // cvQueryFrame() can return NULL, and wait_for_stream() cannot hand back a
    // replacement frame, so never dereference a missing frame below
    if (!src) return make_empty_image(0, 0, 0);
    IplImage* new_img = cvCreateImage(cvSize(w, h), IPL_DEPTH_8U, c);
    *in_img = cvCreateImage(cvSize(src->width, src->height), IPL_DEPTH_8U, c);
    cvResize(src, *in_img, CV_INTER_LINEAR);
    cvResize(src, new_img, CV_INTER_LINEAR);
    image im = ipl_to_image(new_img);
    cvReleaseImage(&new_img);
    // only frames from get_webcam_frame() are released here
    if (cpp_video_capture) cvReleaseImage(&src);
    if (c>1)
        rgbgr_image(im);
    return im;
}

/*
 * Reads one frame from `cap` and returns it letterboxed into w x h as an `image`.
 *
 * c                 - channel count for the created image; 0 means 3
 * in_img            - receives a newly created IplImage holding the frame at
 *                     its source size (created with cvCreateImage, not released here)
 * cpp_video_capture - non-zero: read via get_webcam_frame(), otherwise cvQueryFrame()
 * dont_close        - forwarded to wait_for_stream()
 *
 * Returns make_empty_image(0, 0, 0) when no frame is available; *in_img is
 * left untouched in that case.
 */
image get_image_from_stream_letterbox(CvCapture *cap, int w, int h, int c, IplImage** in_img, int cpp_video_capture, int dont_close)
{
    c = c ? c : 3;
    IplImage* src;
    if (cpp_video_capture) {
        static int once = 1;
        if (once) {
            once = 0;
            // first call: wait for a frame with a valid size before reporting it
            do {
                src = get_webcam_frame(cap);
                if (!src) return make_empty_image(0, 0, 0);
            } while (src->width < 1 || src->height < 1 || src->nChannels < 1);
            printf("Video stream: %d x %d \n", src->width, src->height);
        }
        else
            src = get_webcam_frame(cap);
    }
    else src = cvQueryFrame(cap);

    if (cpp_video_capture)
        if (!wait_for_stream(cap, src, dont_close)) return make_empty_image(0, 0, 0);
    // cvQueryFrame() can return NULL, and wait_for_stream() cannot hand back a
    // replacement frame, so never dereference a missing frame below
    if (!src) return make_empty_image(0, 0, 0);
    *in_img = cvCreateImage(cvSize(src->width, src->height), IPL_DEPTH_8U, c);
    cvResize(src, *in_img, CV_INTER_LINEAR);
    image tmp = ipl_to_image(src);
    image im = letterbox_image(tmp, w, h);
    free_image(tmp);
    // only frames from get_webcam_frame() are released here
    if (cpp_video_capture) cvReleaseImage(&src);
    if (c>1) rgbgr_image(im);
    return im;
}

/*
 * Returns the frame rate reported for `cap`, truncated to an int.
 * Uses get_stream_fps_cpp() when cpp_video_capture is non-zero, otherwise
 * the CV_CAP_PROP_FPS capture property.
 */
int get_stream_fps(CvCapture *cap, int cpp_video_capture)
{
    int fps = 25;
    if (cpp_video_capture) {
        fps = get_stream_fps_cpp(cap);
    }
    else {
        fps = cvGetCaptureProperty(cap, CV_CAP_PROP_FPS);
    }
    return fps;
}

void save_image_jpg(image p, const char *name)
@@ -1358,47 +1358,47 @@

/* Sets every one of the m.h * m.w * m.c values in m.data to s. */
void fill_image(image m, float s)
{
    int i;
    for (i = 0; i < m.h*m.w*m.c; ++i) m.data[i] = s;
}

/*
 * Resizes `im` to fit inside w x h while keeping its aspect ratio and embeds
 * the result, centred, into the existing image `boxed`.
 * Pixels of `boxed` outside the embedded area are not written here.
 */
void letterbox_image_into(image im, int w, int h, image boxed)
{
    int new_w = im.w;
    int new_h = im.h;
    // scale by whichever axis needs the smaller factor so both dimensions fit
    if (((float)w / im.w) < ((float)h / im.h)) {
        new_w = w;
        new_h = (im.h * w) / im.w;
    }
    else {
        new_h = h;
        new_w = (im.w * h) / im.h;
    }
    image resized = resize_image(im, new_w, new_h);
    embed_image(resized, boxed, (w - new_w) / 2, (h - new_h) / 2);
    free_image(resized);
}

/*
 * Returns a new w x h image (same channel count as `im`) filled with 0.5,
 * with `im` resized to fit while keeping its aspect ratio and placed in the
 * centre. The input image is not freed.
 */
image letterbox_image(image im, int w, int h)
{
    image boxed = make_image(w, h, im.c);
    fill_image(boxed, .5);
    // the scaling and centring logic lives in letterbox_image_into(); reuse it
    // instead of keeping a second copy here
    letterbox_image_into(im, w, h, boxed);
    return boxed;
}

image resize_max(image im, int max)
@@ -1660,23 +1660,23 @@

/*
 * Applies a colour distortion to `im` in place.
 *
 * With 3 or more channels the image is converted with rgb_to_hsv(), channel 1
 * is scaled by `sat`, channel 2 by `val`, `hue` is added to the first
 * im.w * im.h values (wrapping into [0, 1]), and the image is converted back
 * with hsv_to_rgb(). With fewer channels only channel 0 is scaled by `val`.
 * constrain_image() is applied at the end in both cases.
 */
void distort_image(image im, float hue, float sat, float val)
{
    if (im.c >= 3)
    {
        rgb_to_hsv(im);
        scale_image_channel(im, 1, sat);
        scale_image_channel(im, 2, val);
        int i;
        for(i = 0; i < im.w*im.h; ++i){
            im.data[i] = im.data[i] + hue;
            // wrap around instead of clamping
            if (im.data[i] > 1) im.data[i] -= 1;
            if (im.data[i] < 0) im.data[i] += 1;
        }
        hsv_to_rgb(im);
    }
    else
    {
        scale_image_channel(im, 0, val);
    }
    constrain_image(im);
}

@@ -1812,16 +1812,16 @@
    int w, h, c;
    unsigned char *data = stbi_load(filename, &w, &h, &c, channels);
    if (!data) {
        char shrinked_filename[1024];
        if (strlen(filename) >= 1024) sprintf(shrinked_filename, "name is too long");
        else sprintf(shrinked_filename, "%s", filename);
        fprintf(stderr, "Cannot load image \"%s\"\nSTB Reason: %s\n", shrinked_filename, stbi_failure_reason());
        FILE* fw = fopen("bad.list", "a");
        fwrite(shrinked_filename, sizeof(char), strlen(shrinked_filename), fw);
        char *new_line = "\n";
        fwrite(new_line, sizeof(char), strlen(new_line), fw);
        fclose(fw);
        return make_image(10, 10, 3);
        char shrinked_filename[1024];
        if (strlen(filename) >= 1024) sprintf(shrinked_filename, "name is too long");
        else sprintf(shrinked_filename, "%s", filename);
        fprintf(stderr, "Cannot load image \"%s\"\nSTB Reason: %s\n", shrinked_filename, stbi_failure_reason());
        FILE* fw = fopen("bad.list", "a");
        fwrite(shrinked_filename, sizeof(char), strlen(shrinked_filename), fw);
        char *new_line = "\n";
        fwrite(new_line, sizeof(char), strlen(new_line), fw);
        fclose(fw);
        return make_image(10, 10, 3);
        //exit(EXIT_FAILURE);
    }
    if(channels) c = channels;
@@ -1845,14 +1845,14 @@
#ifdef OPENCV

#ifndef CV_VERSION_EPOCH
    //image out = load_image_stb(filename, c);  // OpenCV 3.x
    image out = load_image_cv(filename, c);
    //image out = load_image_stb(filename, c);    // OpenCV 3.x
    image out = load_image_cv(filename, c);
#else
    image out = load_image_cv(filename, c);     // OpenCV 2.4.x
    image out = load_image_cv(filename, c);        // OpenCV 2.4.x
#endif

#else
    image out = load_image_stb(filename, c);    // without OpenCV
    image out = load_image_stb(filename, c);    // without OpenCV
#endif

    if((h && w) && (h != out.h || w != out.w)){

 src/list.c

@@ -5,11 +5,11 @@

/* Allocates an empty list (size 0, no front or back node) and returns it. */
list *make_list()
{
    list *l = malloc(sizeof(list));
    l->size = 0;
    l->front = 0;
    l->back = 0;
    return l;
}

/*
@@ -40,55 +40,55 @@

/*
 * Appends `val` to the back of list `l` in a newly allocated node and
 * increments l->size. The list stores the pointer; it does not copy `val`.
 */
void list_insert(list *l, void *val)
{
    node *new = malloc(sizeof(node));
    new->val = val;
    new->next = 0;

    if(!l->back){
        // empty list: the new node is also the front
        l->front = new;
        new->prev = 0;
    }else{
        l->back->next = new;
        new->prev = l->back;
    }
    l->back = new;
    ++l->size;
}

/*
 * Frees node `n` and every node reachable through ->next.
 * The values the nodes point to are not freed.
 */
void free_node(node *n)
{
    node *next;
    while(n) {
        // read the successor before the node is freed
        next = n->next;
        free(n);
        n = next;
    }
}

/*
 * Frees all nodes of `l` and then the list itself.
 * The values stored in the nodes are not freed.
 */
void free_list(list *l)
{
    free_node(l->front);
    free(l);
}

/*
 * Frees the value held by every node of `l`.
 * The nodes and the list itself are left allocated.
 */
void free_list_contents(list *l)
{
    node *n = l->front;
    while(n){
        free(n->val);
        n = n->next;
    }
}

/*
 * For a list whose values are kvp records: frees each record's key and then
 * the record itself. The nodes and the list itself are left allocated.
 */
void free_list_contents_kvp(list *l)
{
    node *n = l->front;
    while (n) {
        kvp *p = n->val;
        free(p->key);
        free(n->val);
        n = n->next;
    }
}

void **list_to_array(list *l)

 src/network.c

@@ -33,19 +33,19 @@

network *load_network_custom(char *cfg, char *weights, int clear, int batch)
{
    printf(" Try to load cfg: %s, weights: %s, clear = %d \n", cfg, weights, clear);
    network *net = calloc(1, sizeof(network));
    *net = parse_network_cfg_custom(cfg, batch);
    if (weights && weights[0] != 0) {
        load_weights(net, weights);
    }
    if (clear) (*net->seen) = 0;
    return net;
    printf(" Try to load cfg: %s, weights: %s, clear = %d \n", cfg, weights, clear);
    network *net = calloc(1, sizeof(network));
    *net = parse_network_cfg_custom(cfg, batch);
    if (weights && weights[0] != 0) {
        load_weights(net, weights);
    }
    if (clear) (*net->seen) = 0;
    return net;
}

/* Same as load_network_custom() with batch = 0. */
network *load_network(char *cfg, char *weights, int clear)
{
    return load_network_custom(cfg, weights, clear, 0);
}

int get_current_batch(network net)
@@ -67,23 +67,23 @@

/*
 * In GPU builds, zero-fills l.outputs values at offset l.outputs * b in the
 * state_gpu and h_gpu buffers of every layer that has them.
 * Without GPU the loop body is empty and the function does nothing.
 */
void reset_network_state(network *net, int b)
{
    int i;
    for (i = 0; i < net->n; ++i) {
#ifdef GPU
        layer l = net->layers[i];
        if (l.state_gpu) {
            fill_ongpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
        }
        if (l.h_gpu) {
            fill_ongpu(l.outputs, 0, l.h_gpu + l.outputs*b, 1);
        }
#endif
    }
}

/* Calls reset_network_state() with b = 0. */
void reset_rnn(network *net)
{
    reset_network_state(net, 0);
}

float get_current_rate(network net)
@@ -91,7 +91,7 @@
    int batch_num = get_current_batch(net);
    int i;
    float rate;
    if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
    if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
    switch (net.policy) {
        case CONSTANT:
            return net.learning_rate;
@@ -108,7 +108,7 @@
        case EXP:
            return net.learning_rate * pow(net.gamma, batch_num);
        case POLY:
            return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
            return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
            //if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
            //return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
        case RANDOM:
@@ -182,10 +182,10 @@
    net.input_gpu = calloc(1, sizeof(float *));
    net.truth_gpu = calloc(1, sizeof(float *));

    net.input16_gpu = calloc(1, sizeof(float *));
    net.output16_gpu = calloc(1, sizeof(float *));
    net.max_input16_size = calloc(1, sizeof(size_t));
    net.max_output16_size = calloc(1, sizeof(size_t));
    net.input16_gpu = calloc(1, sizeof(float *));
    net.output16_gpu = calloc(1, sizeof(float *));
    net.max_input16_size = calloc(1, sizeof(size_t));
    net.max_output16_size = calloc(1, sizeof(size_t));
#endif
    return net;
}
@@ -362,20 +362,20 @@
        net->layers[i].batch = b;
#ifdef CUDNN
        if(net->layers[i].type == CONVOLUTIONAL){
            cudnn_convolutional_setup(net->layers + i, cudnn_fastest);
            /*
            layer *l = net->layers + i;
            cudnn_convolutional_setup(net->layers + i, cudnn_fastest);
            /*
            layer *l = net->layers + i;
            cudnn_convolutional_setup(l, cudnn_fastest);
            // check for excessive memory consumption 
            size_t free_byte;
            size_t total_byte;
            check_error(cudaMemGetInfo(&free_byte, &total_byte));
            if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
                printf(" used slow CUDNN algo without Workspace! \n");
                cudnn_convolutional_setup(l, cudnn_smallest);
                l->workspace_size = get_workspace_size(*l);
            }
            */
            // check for excessive memory consumption 
            size_t free_byte;
            size_t total_byte;
            check_error(cudaMemGetInfo(&free_byte, &total_byte));
            if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
                printf(" used slow CUDNN algo without Workspace! \n");
                cudnn_convolutional_setup(l, cudnn_smallest);
                l->workspace_size = get_workspace_size(*l);
            }
            */
        }
#endif
    }
@@ -387,12 +387,12 @@
    cuda_set_device(net->gpu_index);
    if(gpu_index >= 0){
        cuda_free(net->workspace);
        if (net->input_gpu) {
            cuda_free(*net->input_gpu);
            *net->input_gpu = 0;
            cuda_free(*net->truth_gpu);
            *net->truth_gpu = 0;
        }
        if (net->input_gpu) {
            cuda_free(*net->input_gpu);
            *net->input_gpu = 0;
            cuda_free(*net->truth_gpu);
            *net->truth_gpu = 0;
        }
    }
#endif
    int i;
@@ -405,7 +405,7 @@
    //fflush(stderr);
    for (i = 0; i < net->n; ++i){
        layer l = net->layers[i];
        //printf(" %d: layer = %d,", i, l.type);
        //printf(" %d: layer = %d,", i, l.type);
        if(l.type == CONVOLUTIONAL){
            resize_convolutional_layer(&l, w, h);
        }else if(l.type == CROP){
@@ -414,14 +414,14 @@
            resize_maxpool_layer(&l, w, h);
        }else if(l.type == REGION){
            resize_region_layer(&l, w, h);
        }else if (l.type == YOLO) {
            resize_yolo_layer(&l, w, h);
        }else if (l.type == YOLO) {
            resize_yolo_layer(&l, w, h);
        }else if(l.type == ROUTE){
            resize_route_layer(&l, net);
        }else if (l.type == SHORTCUT) {
            resize_shortcut_layer(&l, w, h);
        }else if (l.type == UPSAMPLE) {
            resize_upsample_layer(&l, w, h);
        }else if (l.type == SHORTCUT) {
            resize_shortcut_layer(&l, w, h);
        }else if (l.type == UPSAMPLE) {
            resize_upsample_layer(&l, w, h);
        }else if(l.type == REORG){
            resize_reorg_layer(&l, w, h);
        }else if(l.type == AVGPOOL){
@@ -431,7 +431,7 @@
        }else if(l.type == COST){
            resize_cost_layer(&l, inputs);
        }else{
            fprintf(stderr, "Resizing type %d \n", (int)l.type);
            fprintf(stderr, "Resizing type %d \n", (int)l.type);
            error("Cannot resize this type of layer");
        }
        if(l.workspace_size > workspace_size) workspace_size = l.workspace_size;
@@ -443,9 +443,9 @@
    }
#ifdef GPU
    if(gpu_index >= 0){
        printf(" try to allocate workspace = %zu * sizeof(float), ", workspace_size / sizeof(float) + 1);
        printf(" try to allocate workspace = %zu * sizeof(float), ", workspace_size / sizeof(float) + 1);
        net->workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
        printf(" CUDA allocate done! \n");
        printf(" CUDA allocate done! \n");
    }else {
        free(net->workspace);
        net->workspace = calloc(1, workspace_size);
@@ -551,112 +551,112 @@

// Count how many detection slots are needed for all output layers of `net`.
//   net    - network whose net->n layers are scanned.
//   thresh - forwarded to yolo_num_detections() for YOLO layers.
// YOLO layers contribute yolo_num_detections(l, thresh); DETECTION and REGION
// layers contribute l.w*l.h*l.n. Returns the sum over all layers.
int num_detections(network *net, float thresh)
{
    int i;
    int s = 0;
    for (i = 0; i < net->n; ++i) {
        layer l = net->layers[i];
        if (l.type == YOLO) {
            s += yolo_num_detections(l, thresh);
        }
        if (l.type == DETECTION || l.type == REGION) {
            s += l.w*l.h*l.n;
        }
    }
    return s;
}

// Allocate a zeroed array of detections sized by num_detections(net, thresh).
//   net    - network to size the array for.
//   thresh - forwarded to num_detections().
//   num    - optional out-parameter; receives the number of detections.
// Each entry gets a `prob` array of l.classes floats and, when l.coords > 4,
// a `mask` array of l.coords - 4 floats, where l is the LAST layer of the
// network. Returns the array (see free_detections for the matching release).
detection *make_network_boxes(network *net, float thresh, int *num)
{
    layer l = net->layers[net->n - 1];
    int i;
    int nboxes = num_detections(net, thresh);
    if (num) *num = nboxes;
    detection *dets = calloc(nboxes, sizeof(detection));
    for (i = 0; i < nboxes; ++i) {
        dets[i].prob = calloc(l.classes, sizeof(float));
        if (l.coords > 4) {
            dets[i].mask = calloc(l.coords - 4, sizeof(float));
        }
    }
    return dets;
}


// Fill `dets` with the boxes and class probabilities of region layer `l`.
//   l            - region layer; l.w*l.h*l.n entries of `dets` are written.
//   w, h         - passed through to correct_yolo_boxes().
//   net_w, net_h - passed through to correct_yolo_boxes().
//   thresh, map  - passed through to get_region_boxes().
//   hier         - unused by this function.
//   relative, letter - passed through to correct_yolo_boxes().
//   dets         - destination; each entry must already own a `prob` array
//                  with at least l.classes elements.
// Temporary box/probability buffers are allocated here and released before
// the boxes are corrected. objectness is set to 1 for every entry.
void custom_get_region_detections(layer l, int w, int h, int net_w, int net_h, float thresh, int *map, float hier, int relative, detection *dets, int letter)
{
    const int count = l.w*l.h*l.n;
    box *boxes = calloc(count, sizeof(box));
    float **probs = calloc(count, sizeof(float *));
    int i, j;
    for (j = 0; j < count; ++j) probs[j] = calloc(l.classes, sizeof(float));
    get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, map);
    for (j = 0; j < count; ++j) {
        dets[j].classes = l.classes;
        dets[j].bbox = boxes[j];
        dets[j].objectness = 1;
        for (i = 0; i < l.classes; ++i) {
            dets[j].prob[i] = probs[j][i];
        }
    }

    // Release the temporaries exactly once.
    free(boxes);
    free_ptrs((void **)probs, count);

    //correct_region_boxes(dets, l.w*l.h*l.n, w, h, net_w, net_h, relative);
    correct_yolo_boxes(dets, count, w, h, net_w, net_h, relative, letter);
}

// Walk all layers of `net` and write their detections consecutively into `dets`.
//   net              - network whose layers are scanned.
//   w, h             - forwarded to the per-layer detection getters.
//   thresh, hier, map, relative, letter - forwarded to the per-layer getters.
//   dets             - destination array; advanced past each layer's output.
// YOLO layers advance `dets` by the count returned from get_yolo_detections();
// REGION and DETECTION layers advance it by l.w*l.h*l.n. A message is printed
// when two YOLO layers report a different number of classes.
void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets, int letter)
{
    int prev_classes = -1;
    int j;
    for (j = 0; j < net->n; ++j) {
        layer l = net->layers[j];
        if (l.type == YOLO) {
            int count = get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets, letter);
            dets += count;
            if (prev_classes < 0) prev_classes = l.classes;
            else if (prev_classes != l.classes) {
                printf(" Error: Different [yolo] layers have different number of classes = %d and %d - check your cfg-file! \n",
                    prev_classes, l.classes);
            }
        }
        if (l.type == REGION) {
            custom_get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets, letter);
            //get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets);
            dets += l.w*l.h*l.n;
        }
        if (l.type == DETECTION) {
            get_detection_detections(l, w, h, thresh, dets);
            dets += l.w*l.h*l.n;
        }
    }
}

// Allocate a detection array with make_network_boxes() and populate it with
// fill_network_boxes().
//   num - optional out-parameter; receives the number of detections.
// All other arguments are forwarded unchanged to the two helpers.
// Returns the populated array.
detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter)
{
    detection *dets = make_network_boxes(net, thresh, num);
    fill_network_boxes(net, w, h, thresh, hier, map, relative, dets, letter);
    return dets;
}

// Release an array of `n` detections together with each entry's `prob` and
// `mask` buffers.
//   dets - array to release; a NULL pointer is accepted and ignored.
//   n    - number of entries in `dets`.
void free_detections(detection *dets, int n)
{
    int i;
    if (!dets) return;
    for (i = 0; i < n; ++i) {
        free(dets[i].prob);
        if (dets[i].mask) free(dets[i].mask);
    }
    // Each buffer is released exactly once; the array itself goes last.
    free(dets);
}

// Run the network on image `im`.
// The image is resized to net->w x net->h (plain resize; the letterbox
// variant is left commented out), the batch size is set to 1, and the
// resized data is passed to network_predict(). The temporary resized image
// is freed before returning.
// Returns the pointer produced by network_predict().
float *network_predict_image(network *net, image im)
{
    //image imr = letterbox_image(im, net->w, net->h);
    image imr = resize_image(im, net->w, net->h);
    set_batch_network(net, 1);
    float *p = network_predict(*net, imr.data);
    free_image(imr);
    return p;
}

// Accessor: returns the `w` field of the network.
int network_width(network *net)
{
    const int width = net->w;
    return width;
}
@@ -780,70 +780,70 @@

// Release everything owned by `net`: every layer (via free_layer), the layer
// array, the scales/steps/seen arrays and the workspace.
// In GPU builds the workspace is released with cuda_free() when
// gpu_index >= 0 (plain free() otherwise), and the input/truth and
// 16-bit input/output holders are released as well.
void free_network(network net)
{
    int i;
    for (i = 0; i < net.n; ++i) {
        free_layer(net.layers[i]);
    }
    free(net.layers);

    free(net.scales);
    free(net.steps);
    free(net.seen);

#ifdef GPU
    if (gpu_index >= 0) cuda_free(net.workspace);
    else free(net.workspace);

    // Each holder is a host pointer to a device pointer: check the holder
    // before dereferencing it, then release the device buffer and the holder.
    if (net.input_gpu) {
        if (*net.input_gpu) cuda_free(*net.input_gpu);
        free(net.input_gpu);
    }
    if (net.truth_gpu) {
        if (*net.truth_gpu) cuda_free(*net.truth_gpu);
        free(net.truth_gpu);
    }

    if (net.input16_gpu) {
        if (*net.input16_gpu) cuda_free(*net.input16_gpu);
        free(net.input16_gpu);
    }
    if (net.output16_gpu) {
        if (*net.output16_gpu) cuda_free(*net.output16_gpu);
        free(net.output16_gpu);
    }
    if (net.max_input16_size) free(net.max_input16_size);
    if (net.max_output16_size) free(net.max_output16_size);
#else
    free(net.workspace);
#endif
}


// Fold batch-normalization parameters into the biases and weights of every
// CONVOLUTIONAL layer that has batch_normalize set, then clear the flag.
// For each filter f:
//   bias[f]   -= scales[f] * rolling_mean[f] / (sqrt(rolling_variance[f]) + 1e-6)
//   weight[k] *= scales[f] / (sqrt(rolling_variance[f]) + 1e-6)   for k in filter f
// In GPU builds with gpu_index >= 0 the updated layer is pushed with
// push_convolutional_layer(). Other layer types are left untouched.
void fuse_conv_batchnorm(network net)
{
    int j;
    for (j = 0; j < net.n; ++j) {
        layer *l = &net.layers[j];

        if (l->type == CONVOLUTIONAL) {
            //printf(" Merges Convolutional-%d and batch_norm \n", j);

            if (l->batch_normalize) {
                const size_t filter_size = l->size*l->size*l->c;
                int f;
                for (f = 0; f < l->n; ++f)
                {
                    // Same denominator for the bias and every weight of this filter.
                    const double denom = sqrt((double)l->rolling_variance[f]) + .000001f;

                    l->biases[f] = l->biases[f] - (double)l->scales[f] * l->rolling_mean[f] / denom;

                    size_t i;
                    for (i = 0; i < filter_size; ++i) {
                        // Index kept in size_t so it is not truncated to int.
                        const size_t w_index = (size_t)f*filter_size + i;

                        l->weights[w_index] = (double)l->weights[w_index] * l->scales[f] / denom;
                    }
                }

                l->batch_normalize = 0;
#ifdef GPU
                if (gpu_index >= 0) {
                    push_convolutional_layer(*l);
                }
#endif
            }
        }
        else {
            //printf(" Fusion skip layer type: %d \n", l->type);
        }
    }
}

 src/network_kernels.cu

@@ -55,23 +55,23 @@
            fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
        }
        l.forward_gpu(l, state);
        if(net.wait_stream)
            cudaStreamSynchronize(get_cuda_stream());
        if(net.wait_stream)
            cudaStreamSynchronize(get_cuda_stream());
        state.input = l.output_gpu;
/*
        cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
        if (l.out_w >= 0 && l.out_h >= 1 && l.c >= 3) {
            int j;
            for (j = 0; j < l.out_c; ++j) {
                image img = make_image(l.out_w, l.out_h, 3);
                memcpy(img.data, l.output+ l.out_w*l.out_h*j, l.out_w*l.out_h * 1 * sizeof(float));
                char buff[256];
                sprintf(buff, "layer-%d slice-%d", i, j);
                show_image(img, buff);
            }
            cvWaitKey(0); // wait press-key in console
            cvDestroyAllWindows();
        }
        cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
        if (l.out_w >= 0 && l.out_h >= 1 && l.c >= 3) {
            int j;
            for (j = 0; j < l.out_c; ++j) {
                image img = make_image(l.out_w, l.out_h, 3);
                memcpy(img.data, l.output+ l.out_w*l.out_h*j, l.out_w*l.out_h * 1 * sizeof(float));
                char buff[256];
                sprintf(buff, "layer-%d slice-%d", i, j);
                show_image(img, buff);
            }
            cvWaitKey(0); // wait press-key in console
            cvDestroyAllWindows();
        }
*/
    }
}
@@ -133,14 +133,14 @@
    state.truth = *net.truth_gpu;
    state.train = 1;
#ifdef CUDNN_HALF
    int i;
    for (i = 0; i < net.n; ++i) {
        layer l = net.layers[i];
        cuda_convert_f32_to_f16(l.weights_gpu, l.c*l.n*l.size*l.size, l.weights_gpu16);
    }
    int i;
    for (i = 0; i < net.n; ++i) {
        layer l = net.layers[i];
        cuda_convert_f32_to_f16(l.weights_gpu, l.c*l.n*l.size*l.size, l.weights_gpu16);
    }
#endif
    forward_network_gpu(net, state);
    //cudaStreamSynchronize(get_cuda_stream());
    //cudaStreamSynchronize(get_cuda_stream());
    backward_network_gpu(net, state);
}

@@ -421,8 +421,8 @@

float *network_predict_gpu(network net, float *input)
{
    if (net.gpu_index != cuda_get_device())
        cuda_set_device(net.gpu_index);
    if (net.gpu_index != cuda_get_device())
        cuda_set_device(net.gpu_index);
    int size = get_network_input_size(net) * net.batch;
    network_state state;
    state.index = 0;

 src/option_list.c

@@ -34,21 +34,21 @@

metadata get_metadata(char *file)
{
    metadata m = { 0 };
    list *options = read_data_cfg(file);
    metadata m = { 0 };
    list *options = read_data_cfg(file);

    char *name_list = option_find_str(options, "names", 0);
    if (!name_list) name_list = option_find_str(options, "labels", 0);
    if (!name_list) {
        fprintf(stderr, "No names or labels found\n");
    }
    else {
        m.names = get_labels(name_list);
    }
    m.classes = option_find_int(options, "classes", 2);
    free_list(options);
    printf("Loaded - names_list: %s, classes = %d \n", name_list, m.classes);
    return m;
    char *name_list = option_find_str(options, "names", 0);
    if (!name_list) name_list = option_find_str(options, "labels", 0);
    if (!name_list) {
        fprintf(stderr, "No names or labels found\n");
    }
    else {
        m.names = get_labels(name_list);
    }
    m.classes = option_find_int(options, "classes", 2);
    free_list(options);
    printf("Loaded - names_list: %s, classes = %d \n", name_list, m.classes);
    return m;
}

int read_option(char *s, list *options)

 src/parser.c

@@ -49,7 +49,7 @@
    if (strcmp(type, "[cost]")==0) return COST;
    if (strcmp(type, "[detection]")==0) return DETECTION;
    if (strcmp(type, "[region]")==0) return REGION;
    if (strcmp(type, "[yolo]") == 0) return YOLO;
    if (strcmp(type, "[yolo]") == 0) return YOLO;
    if (strcmp(type, "[local]")==0) return LOCAL;
    if (strcmp(type, "[conv]")==0
            || strcmp(type, "[convolutional]")==0) return CONVOLUTIONAL;
@@ -64,7 +64,7 @@
    if (strcmp(type, "[max]")==0
            || strcmp(type, "[maxpool]")==0) return MAXPOOL;
    if (strcmp(type, "[reorg]")==0) return REORG;
    if (strcmp(type, "[reorg_old]") == 0) return REORG_OLD;
    if (strcmp(type, "[reorg_old]") == 0) return REORG_OLD;
    if (strcmp(type, "[avg]")==0
            || strcmp(type, "[avgpool]")==0) return AVGPOOL;
    if (strcmp(type, "[dropout]")==0) return DROPOUT;
@@ -74,7 +74,7 @@
    if (strcmp(type, "[soft]")==0
            || strcmp(type, "[softmax]")==0) return SOFTMAX;
    if (strcmp(type, "[route]")==0) return ROUTE;
    if (strcmp(type, "[upsample]") == 0) return UPSAMPLE;
    if (strcmp(type, "[upsample]") == 0) return UPSAMPLE;
    return BLANK;
}

@@ -241,68 +241,68 @@

// Parse a comma-separated list of integers such as "0,1,2".
//   a   - string to parse; when NULL nothing is parsed.
//   num - receives the number of parsed values; left untouched when a is NULL.
// Returns a calloc'ed array with one int per comma-separated token (the
// caller owns it), or NULL when `a` is NULL.
int *parse_yolo_mask(char *a, int *num)
{
    int *mask = 0;
    if (a) {
        int len = strlen(a);
        int n = 1;
        int i;
        for (i = 0; i < len; ++i) {
            if (a[i] == ',') ++n;
        }
        mask = calloc(n, sizeof(int));
        for (i = 0; i < n; ++i) {
            char *comma;
            mask[i] = atoi(a);
            // The last token has no trailing comma; stop instead of doing
            // pointer arithmetic on the NULL returned by strchr().
            comma = strchr(a, ',');
            if (!comma) break;
            a = comma + 1;
        }
        *num = n;
    }
    return mask;
}

// Build a [yolo] layer from its cfg section.
//   options - key/value options of the section.
//   params  - batch, w, h and inputs of the preceding layer.
// Options read: classes (20), num (1), mask, max (90), jitter (.2),
// focal_loss (0), ignore_thresh (.5), truth_thresh (1), random (0), map,
// anchors. The process exits when the layer's outputs do not match
// params.inputs. At most total*2 anchor values are copied into l.biases.
// Returns the configured layer.
layer parse_yolo(list *options, size_params params)
{
    int classes = option_find_int(options, "classes", 20);
    int total = option_find_int(options, "num", 1);
    int num = total;

    char *a = option_find_str(options, "mask", 0);
    int *mask = parse_yolo_mask(a, &num);
    int max_boxes = option_find_int_quiet(options, "max", 90);
    layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes, max_boxes);
    if (l.outputs != params.inputs) {
        printf("Error: l.outputs == params.inputs \n");
        printf("filters= in the [convolutional]-layer doesn't correspond to classes= or mask= in [yolo]-layer \n");
        exit(EXIT_FAILURE);
    }
    //assert(l.outputs == params.inputs);

    //l.max_boxes = option_find_int_quiet(options, "max", 90);
    l.jitter = option_find_float(options, "jitter", .2);
    l.focal_loss = option_find_int_quiet(options, "focal_loss", 0);

    l.ignore_thresh = option_find_float(options, "ignore_thresh", .5);
    l.truth_thresh = option_find_float(options, "truth_thresh", 1);
    l.random = option_find_int_quiet(options, "random", 0);

    char *map_file = option_find_str(options, "map", 0);
    if (map_file) l.map = read_map(map_file);

    a = option_find_str(options, "anchors", 0);
    if (a) {
        const int limit = total*2;
        int i;
        // One iteration per comma-separated token, capped at `limit`.
        // The walk ends on the token that has no trailing comma, so no
        // pointer arithmetic is ever done on a NULL strchr() result.
        for (i = 0; a && i < limit; ++i) {
            char *comma;
            float bias = atof(a);
            l.biases[i] = bias;
            comma = strchr(a, ',');
            a = comma ? comma + 1 : 0;
        }
    }
    return l;
}

layer parse_region(list *options, size_params params)
@@ -310,21 +310,21 @@
    int coords = option_find_int(options, "coords", 4);
    int classes = option_find_int(options, "classes", 20);
    int num = option_find_int(options, "num", 1);
    int max_boxes = option_find_int_quiet(options, "max", 90);
    int max_boxes = option_find_int_quiet(options, "max", 90);

    layer l = make_region_layer(params.batch, params.w, params.h, num, classes, coords, max_boxes);
    if (l.outputs != params.inputs) {
        printf("Error: l.outputs == params.inputs \n");
        printf("filters= in the [convolutional]-layer doesn't correspond to classes= or num= in [region]-layer \n");
        exit(EXIT_FAILURE);
    }
    if (l.outputs != params.inputs) {
        printf("Error: l.outputs == params.inputs \n");
        printf("filters= in the [convolutional]-layer doesn't correspond to classes= or num= in [region]-layer \n");
        exit(EXIT_FAILURE);
    }
    //assert(l.outputs == params.inputs);

    l.log = option_find_int_quiet(options, "log", 0);
    l.sqrt = option_find_int_quiet(options, "sqrt", 0);

    l.softmax = option_find_int(options, "softmax", 0);
    l.focal_loss = option_find_int_quiet(options, "focal_loss", 0);
    l.focal_loss = option_find_int_quiet(options, "focal_loss", 0);
    //l.max_boxes = option_find_int_quiet(options, "max",30);
    l.jitter = option_find_float(options, "jitter", .2);
    l.rescore = option_find_int_quiet(options, "rescore",0);
@@ -337,7 +337,7 @@
    l.coord_scale = option_find_float(options, "coord_scale", 1);
    l.object_scale = option_find_float(options, "object_scale", 1);
    l.noobject_scale = option_find_float(options, "noobject_scale", 1);
    l.mask_scale = option_find_float(options, "mask_scale", 1);
    l.mask_scale = option_find_float(options, "mask_scale", 1);
    l.class_scale = option_find_float(options, "class_scale", 1);
    l.bias_match = option_find_int_quiet(options, "bias_match",0);

@@ -438,19 +438,19 @@

// Build a [reorg_old] layer from its cfg section.
//   options - key/value options; reads stride (1) and reverse (0).
//   params  - batch, w, h, c of the preceding layer; h, w and c must all be
//             non-zero, otherwise error() is called.
// Returns the layer produced by make_reorg_old_layer().
layer parse_reorg_old(list *options, size_params params)
{
    printf("\n reorg_old \n");
    int stride = option_find_int(options, "stride", 1);
    int reverse = option_find_int_quiet(options, "reverse", 0);

    int batch, h, w, c;
    h = params.h;
    w = params.w;
    c = params.c;
    batch = params.batch;
    if (!(h && w && c)) error("Layer before reorg layer must output image.");

    // Named `l` rather than `layer` so the local does not shadow the type.
    layer l = make_reorg_old_layer(batch, w, h, c, stride, reverse);
    return l;
}

maxpool_layer parse_maxpool(list *options, size_params params)
@@ -547,10 +547,10 @@
// Build an [upsample] layer from its cfg section.
//   options - key/value options; reads stride (2) and scale (1).
//   params  - batch, w, h, c of the preceding layer.
//   net     - not used by this function.
// Returns the configured layer.
layer parse_upsample(list *options, size_params params, network net)
{

    int stride = option_find_int(options, "stride", 2);
    layer l = make_upsample_layer(params.batch, params.w, params.h, params.c, stride);
    l.scale = option_find_float_quiet(options, "scale", 1);
    return l;
}

route_layer parse_route(list *options, size_params params, network net)
@@ -632,15 +632,15 @@
    net->inputs = option_find_int_quiet(options, "inputs", net->h * net->w * net->c);
    net->max_crop = option_find_int_quiet(options, "max_crop",net->w*2);
    net->min_crop = option_find_int_quiet(options, "min_crop",net->w);
    net->flip = option_find_int_quiet(options, "flip", 1);
    net->flip = option_find_int_quiet(options, "flip", 1);

    net->small_object = option_find_int_quiet(options, "small_object", 0);
    net->small_object = option_find_int_quiet(options, "small_object", 0);
    net->angle = option_find_float_quiet(options, "angle", 0);
    net->aspect = option_find_float_quiet(options, "aspect", 1);
    net->saturation = option_find_float_quiet(options, "saturation", 1);
    net->exposure = option_find_float_quiet(options, "exposure", 1);
    net->hue = option_find_float_quiet(options, "hue", 0);
    net->power = option_find_float_quiet(options, "power", 4);
    net->power = option_find_float_quiet(options, "power", 4);

    if(!net->inputs && !(net->h && net->w && net->c)) error("No input parameters supplied");

@@ -648,7 +648,7 @@
    net->policy = get_policy(policy_s);
    net->burn_in = option_find_int_quiet(options, "burn_in", 0);
#ifdef CUDNN_HALF
    net->burn_in = 0;
    net->burn_in = 0;
#endif
    if(net->policy == STEP){
        net->step = option_find_int(options, "step", 1);
@@ -696,7 +696,7 @@

// Parse the network cfg `filename` by delegating to
// parse_network_cfg_custom() with a batch argument of 0.
network parse_network_cfg(char *filename)
{
    return parse_network_cfg_custom(filename, 0);
}

network parse_network_cfg_custom(char *filename, int batch)
@@ -717,12 +717,12 @@
    params.w = net.w;
    params.c = net.c;
    params.inputs = net.inputs;
    if (batch > 0) net.batch = batch;
    if (batch > 0) net.batch = batch;
    params.batch = net.batch;
    params.time_steps = net.time_steps;
    params.net = net;

    float bflops = 0;
    float bflops = 0;
    size_t workspace_size = 0;
    n = n->next;
    int count = 0;
@@ -755,8 +755,8 @@
            l = parse_cost(options, params);
        }else if(lt == REGION){
            l = parse_region(options, params);
        }else if (lt == YOLO) {
            l = parse_yolo(options, params);
        }else if (lt == YOLO) {
            l = parse_yolo(options, params);
        }else if(lt == DETECTION){
            l = parse_detection(options, params);
        }else if(lt == SOFTMAX){
@@ -769,15 +769,15 @@
        }else if(lt == MAXPOOL){
            l = parse_maxpool(options, params);
        }else if(lt == REORG){
            l = parse_reorg(options, params);       }
        else if (lt == REORG_OLD) {
            l = parse_reorg_old(options, params);
            l = parse_reorg(options, params);        }
        else if (lt == REORG_OLD) {
            l = parse_reorg_old(options, params);
        }else if(lt == AVGPOOL){
            l = parse_avgpool(options, params);
        }else if(lt == ROUTE){
            l = parse_route(options, params, net);
        }else if (lt == UPSAMPLE) {
            l = parse_upsample(options, params, net);
        }else if (lt == UPSAMPLE) {
            l = parse_upsample(options, params, net);
        }else if(lt == SHORTCUT){
            l = parse_shortcut(options, params, net);
        }else if(lt == DROPOUT){
@@ -807,12 +807,12 @@
            params.c = l.out_c;
            params.inputs = l.outputs;
        }
        if (l.bflops > 0) bflops += l.bflops;
        if (l.bflops > 0) bflops += l.bflops;
    }   
    free_list(sections);
    net.outputs = get_network_output_size(net);
    net.output = get_network_output(net);
    printf("Total BFLOPS %5.3f \n", bflops);
    printf("Total BFLOPS %5.3f \n", bflops);
    if(workspace_size){
        //printf("%ld\n", workspace_size);
#ifdef GPU
@@ -825,11 +825,11 @@
        net.workspace = calloc(1, workspace_size);
#endif
    }
    LAYER_TYPE lt = net.layers[net.n - 1].type;
    if ((net.w % 32 != 0 || net.h % 32 != 0) && (lt == YOLO || lt == REGION || lt == DETECTION)) {
        printf("\n Warning: width=%d and height=%d in cfg-file must be divisible by 32 for default networks Yolo v1/v2/v3!!! \n\n",
            net.w, net.h);
    }
    LAYER_TYPE lt = net.layers[net.n - 1].type;
    if ((net.w % 32 != 0 || net.h % 32 != 0) && (lt == YOLO || lt == REGION || lt == DETECTION)) {
        printf("\n Warning: width=%d and height=%d in cfg-file must be divisible by 32 for default networks Yolo v1/v2/v3!!! \n\n",
            net.w, net.h);
    }
    return net;
}

@@ -1160,16 +1160,16 @@
    fread(&major, sizeof(int), 1, fp);
    fread(&minor, sizeof(int), 1, fp);
    fread(&revision, sizeof(int), 1, fp);
    if ((major * 10 + minor) >= 2) {
        printf("\n seen 64 \n");
        uint64_t iseen = 0;
        fread(&iseen, sizeof(uint64_t), 1, fp);
        *net->seen = iseen;
    }
    else {
        printf("\n seen 32 \n");
        fread(net->seen, sizeof(int), 1, fp);
    }
    if ((major * 10 + minor) >= 2) {
        printf("\n seen 64 \n");
        uint64_t iseen = 0;
        fread(&iseen, sizeof(uint64_t), 1, fp);
        *net->seen = iseen;
    }
    else {
        printf("\n seen 32 \n");
        fread(net->seen, sizeof(int), 1, fp);
    }
    int transpose = (major > 1000) || (minor > 1000);

    int i;

 src/region_layer.c

@@ -27,7 +27,7 @@
    l.bias_updates = calloc(n*2, sizeof(float));
    l.outputs = h*w*n*(classes + coords + 1);
    l.inputs = l.outputs;
    l.max_boxes = max_boxes;
    l.max_boxes = max_boxes;
    l.truths = max_boxes*(5);
    l.delta = calloc(batch*l.outputs, sizeof(float));
    l.output = calloc(batch*l.outputs, sizeof(float));
@@ -53,8 +53,8 @@

void resize_region_layer(layer *l, int w, int h)
{
    int old_w = l->w;
    int old_h = l->h;
    int old_w = l->w;
    int old_h = l->h;
    l->w = w;
    l->h = h;

@@ -65,13 +65,13 @@
    l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));

#ifdef GPU
    if (old_w < w || old_h < h) {
        cuda_free(l->delta_gpu);
        cuda_free(l->output_gpu);
    if (old_w < w || old_h < h) {
        cuda_free(l->delta_gpu);
        cuda_free(l->output_gpu);

        l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
        l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
    }
        l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
        l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
    }
#endif
}

@@ -127,34 +127,34 @@
            class_id = hier->parent[class_id];
        }
        *avg_cat += pred;
    } else {		
        // Focal loss
        if (focal_loss) {
            // Focal Loss
            float alpha = 0.5;  // 0.25 or 0.5
            //float gamma = 2;  // hardcoded in many places of the grad-formula	
    } else {        
        // Focal loss
        if (focal_loss) {
            // Focal Loss
            float alpha = 0.5;    // 0.25 or 0.5
            //float gamma = 2;    // hardcoded in many places of the grad-formula    

            int ti = index + class_id;
            float pt = output[ti] + 0.000000000000001F;
            // http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
            float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1);    // http://blog.csdn.net/linmingan/article/details/77885832	
            //float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1);   // https://github.com/unsky/focal-loss
            int ti = index + class_id;
            float pt = output[ti] + 0.000000000000001F;
            // http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
            float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1);    // http://blog.csdn.net/linmingan/article/details/77885832    
            //float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1);    // https://github.com/unsky/focal-loss

            for (n = 0; n < classes; ++n) {
                delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);
            for (n = 0; n < classes; ++n) {
                delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);

                delta[index + n] *= alpha*grad;
                delta[index + n] *= alpha*grad;

                if (n == class_id) *avg_cat += output[index + n];
            }
        }
        else {
            // default
            for (n = 0; n < classes; ++n) {
                delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);
                if (n == class_id) *avg_cat += output[index + n];
            }
        }
                if (n == class_id) *avg_cat += output[index + n];
            }
        }
        else {
            // default
            for (n = 0; n < classes; ++n) {
                delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);
                if (n == class_id) *avg_cat += output[index + n];
            }
        }
    }
}

@@ -170,9 +170,9 @@

/*
 * Flat offset of one value inside the layer's output buffer.
 *
 * `location` is split into a plane number (location / (w*h)) and a cell
 * offset inside that plane (location % (w*h)).  Every plane number owns
 * (coords + classes + 1) consecutive w*h planes; `entry` selects one of them.
 * `batch` advances by l.outputs values per batch item.
 */
static int entry_index(layer l, int batch, int location, int entry)
{
    const int plane = l.w*l.h;          // cells in one w x h plane
    const int n = location / plane;     // which block of planes
    const int loc = location % plane;   // cell offset inside the plane
    return batch*l.outputs + n*plane*(l.coords + l.classes + 1) + entry*plane + loc;
}

void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output);
@@ -256,8 +256,8 @@
                    int best_class_id = -1;
                    for(t = 0; t < l.max_boxes; ++t){
                        box truth = float_to_box(state.truth + t*5 + b*l.truths);
                        int class_id = state.truth[t * 5 + b*l.truths + 4];
                        if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
                        int class_id = state.truth[t * 5 + b*l.truths + 4];
                        if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
                        if(!truth.x) break;
                        float iou = box_iou(pred, truth);
                        if (iou > best_iou) {
@@ -295,12 +295,12 @@
        }
        for(t = 0; t < l.max_boxes; ++t){
            box truth = float_to_box(state.truth + t*5 + b*l.truths);
            int class_id = state.truth[t * 5 + b*l.truths + 4];
            if (class_id >= l.classes) {
                printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes-1);
                getchar();
                continue; // if label contains class_id more than number of classes in the cfg-file
            }
            int class_id = state.truth[t * 5 + b*l.truths + 4];
            if (class_id >= l.classes) {
                printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes-1);
                getchar();
                continue; // if label contains class_id more than number of classes in the cfg-file
            }

            if(!truth.x) break;
            float best_iou = 0;
@@ -450,7 +450,7 @@
        cuda_pull_array(state.truth, truth_cpu, num_truth);
    }
    cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
    //cudaStreamSynchronize(get_cuda_stream());
    //cudaStreamSynchronize(get_cuda_stream());
    network_state cpu_state = state;
    cpu_state.train = state.train;
    cpu_state.truth = truth_cpu;
@@ -460,7 +460,7 @@
    free(cpu_state.input);
    if(!state.train) return;
    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
    //cudaStreamSynchronize(get_cuda_stream());
    //cudaStreamSynchronize(get_cuda_stream());
    if(cpu_state.truth) free(cpu_state.truth);
}

@@ -473,107 +473,107 @@

/*
 * Rescale the boxes of `n` detections from network-input coordinates to the
 * coordinates of a w x h image.
 *
 * The function first works out the size (new_w x new_h) the image takes when
 * it is fitted inside netw x neth with its aspect ratio preserved, then
 * removes the centred padding and the scale from each box.  When `relative`
 * is zero the result is additionally multiplied by the image size; otherwise
 * the boxes stay normalised.
 *
 * dets      detections whose bbox is rewritten in place
 * n         number of entries in dets
 * w, h      image size (must be non-zero: both are used as divisors)
 * netw/neth network input size
 */
void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
{
    int i;
    int new_w = 0;
    int new_h = 0;
    // Pick the side that limits an aspect-preserving fit.
    if (((float)netw / w) < ((float)neth / h)) {
        new_w = netw;
        new_h = (h * netw) / w;
    }
    else {
        new_h = neth;
        new_w = (w * neth) / h;
    }
    for (i = 0; i < n; ++i) {
        box b = dets[i].bbox;
        // Subtract the padding on each side, then undo the scale.
        b.x = (b.x - (netw - new_w) / 2. / netw) / ((float)new_w / netw);
        b.y = (b.y - (neth - new_h) / 2. / neth) / ((float)new_h / neth);
        b.w *= (float)netw / new_w;
        b.h *= (float)neth / new_h;
        if (!relative) {
            b.x *= w;
            b.w *= w;
            b.y *= h;
            b.h *= h;
        }
        dets[i].bbox = b;
    }
}


/*
 * Fill `dets` (l.w*l.h*l.n entries) from the region layer's output.
 *
 * Steps, in order:
 *  1. If l.batch == 2 the second batch item is mirrored along the width,
 *     entry 0 is negated, and the result is averaged into the first item.
 *  2. For every cell and every one of the l.n boxes per cell: class
 *     probabilities are cleared, the box is decoded with get_region_box(),
 *     objectness is kept only if it exceeds `thresh`, optional mask values
 *     are copied, and class probabilities are filled either through the
 *     softmax tree or as objectness * class score (thresholded).
 *  3. Boxes are passed through correct_region_boxes().
 *
 * map          optional class remapping used only with a softmax tree;
 *              exactly 200 entries are read from it
 * tree_thresh  threshold forwarded to hierarchy_top_prediction()
 */
void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets)
{
    int i, j, n, z;
    float *predictions = l.output;
    if (l.batch == 2) {
        float *flip = l.output + l.outputs;
        // NOTE(review): this indexing puts the entry `z` outermost, whereas
        // entry_index() puts the box number outermost -- confirm which layout
        // the output buffer really has.
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w / 2; ++i) {
                for (n = 0; n < l.n; ++n) {
                    for (z = 0; z < l.classes + l.coords + 1; ++z) {
                        int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i;
                        int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1);
                        float swap = flip[i1];
                        flip[i1] = flip[i2];
                        flip[i2] = swap;
                        if (z == 0) {
                            flip[i1] = -flip[i1];
                            flip[i2] = -flip[i2];
                        }
                    }
                }
            }
        }
        for (i = 0; i < l.outputs; ++i) {
            l.output[i] = (l.output[i] + flip[i]) / 2.;
        }
    }
    for (i = 0; i < l.w*l.h; ++i) {
        int row = i / l.w;
        int col = i % l.w;
        for (n = 0; n < l.n; ++n) {
            int index = n*l.w*l.h + i;
            for (j = 0; j < l.classes; ++j) {
                dets[index].prob[j] = 0;
            }
            int obj_index = entry_index(l, 0, n*l.w*l.h + i, l.coords);
            int box_index = entry_index(l, 0, n*l.w*l.h + i, 0);
            int mask_index = entry_index(l, 0, n*l.w*l.h + i, 4);
            // With a background class the objectness is taken as 1.
            float scale = l.background ? 1 : predictions[obj_index];
            dets[index].bbox = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h);// , l.w*l.h);
            dets[index].objectness = scale > thresh ? scale : 0;
            if (dets[index].mask) {
                // Entries 4 .. coords-1 are copied as mask values.
                for (j = 0; j < l.coords - 4; ++j) {
                    dets[index].mask[j] = l.output[mask_index + j*l.w*l.h];
                }
            }

            int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + !l.background);
            if (l.softmax_tree) {

                hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0);// , l.w*l.h);
                if (map) {
                    // NOTE(review): 200 is hard-coded; `map` must hold at
                    // least that many entries.
                    for (j = 0; j < 200; ++j) {
                        int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + map[j]);
                        float prob = scale*predictions[class_index];
                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
                    }
                }
                else {
                    int j = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h);
                    dets[index].prob[j] = (scale > thresh) ? scale : 0;
                }
            }
            else {
                // Class scores are only filled for boxes that passed the
                // objectness threshold; the rest stay at 0.
                if (dets[index].objectness) {
                    for (j = 0; j < l.classes; ++j) {
                        int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + j);
                        float prob = scale*predictions[class_index];
                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
                    }
                }
            }
        }
    }
    correct_region_boxes(dets, l.w*l.h*l.n, w, h, netw, neth, relative);
}

 src/reorg_layer.c

@@ -77,42 +77,42 @@

/*
 * CPU forward pass: reorganise state.input into l.output with reorg_cpu().
 * The direction flag passed to reorg_cpu() is 1 when l.reverse is set and 0
 * otherwise.  The call is issued exactly once.
 */
void forward_reorg_layer(const layer l, network_state state)
{
    reorg_cpu(state.input, l.out_w, l.out_h, l.out_c, l.batch, l.stride, l.reverse ? 1 : 0, l.output);
}

/*
 * CPU backward pass: reorganise l.delta into state.delta with reorg_cpu(),
 * using the opposite direction flag to the forward pass (0 when l.reverse is
 * set, 1 otherwise).  The call is issued exactly once.
 */
void backward_reorg_layer(const layer l, network_state state)
{
    reorg_cpu(l.delta, l.out_w, l.out_h, l.out_c, l.batch, l.stride, l.reverse ? 0 : 1, state.delta);
}

#ifdef GPU
/*
 * GPU forward pass: reorganise state.input into l.output_gpu with
 * reorg_ongpu().  The direction flag is 1 when l.reverse is set and 0
 * otherwise.  The call is issued exactly once.
 */
void forward_reorg_layer_gpu(layer l, network_state state)
{
    reorg_ongpu(state.input, l.out_w, l.out_h, l.out_c, l.batch, l.stride, l.reverse ? 1 : 0, l.output_gpu);
}

/*
 * GPU backward pass: reorganise l.delta_gpu into state.delta with
 * reorg_ongpu(), using the opposite direction flag to the forward pass
 * (0 when l.reverse is set, 1 otherwise).  The call is issued exactly once.
 */
void backward_reorg_layer_gpu(layer l, network_state state)
{
    reorg_ongpu(l.delta_gpu, l.out_w, l.out_h, l.out_c, l.batch, l.stride, l.reverse ? 0 : 1, state.delta);
}
#endif

 src/reorg_old_layer.c

@@ -77,42 +77,42 @@

/*
 * CPU forward pass of the "old" reorg layer: reorganise state.input into
 * l.output with reorg_cpu(), passing the layer's input dimensions
 * (l.w, l.h, l.c).  The direction flag is 1 when l.reverse is set and 0
 * otherwise.  The call is issued exactly once.
 */
void forward_reorg_old_layer(const layer l, network_state state)
{
    reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, l.reverse ? 1 : 0, l.output);
}

/*
 * CPU backward pass of the "old" reorg layer: reorganise l.delta into
 * state.delta with reorg_cpu(), using the opposite direction flag to the
 * forward pass (0 when l.reverse is set, 1 otherwise).  The call is issued
 * exactly once.
 */
void backward_reorg_old_layer(const layer l, network_state state)
{
    reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, l.reverse ? 0 : 1, state.delta);
}

#ifdef GPU
/*
 * GPU forward pass of the "old" reorg layer: reorganise state.input into
 * l.output_gpu with reorg_ongpu(), passing the layer's input dimensions
 * (l.w, l.h, l.c).  The direction flag is 1 when l.reverse is set and 0
 * otherwise.  The call is issued exactly once.
 */
void forward_reorg_old_layer_gpu(layer l, network_state state)
{
    reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, l.reverse ? 1 : 0, l.output_gpu);
}

/*
 * GPU backward pass of the "old" reorg layer: reorganise l.delta_gpu into
 * state.delta with reorg_ongpu(), using the opposite direction flag to the
 * forward pass (0 when l.reverse is set, 1 otherwise).  The call is issued
 * exactly once.
 */
void backward_reorg_old_layer_gpu(layer l, network_state state)
{
    reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, l.reverse ? 0 : 1, state.delta);
}
#endif

 src/shortcut_layer.c

@@ -38,20 +38,20 @@

/*
 * Resize a shortcut layer to a new spatial size.
 *
 * Input and output dimensions are both set to w x h, the element counts are
 * recomputed from l->out_c, and the host buffers (and, under GPU, the device
 * buffers) are reallocated to outputs * batch floats.  Each buffer is
 * reallocated exactly once.
 *
 * NOTE(review): the realloc() results are not checked; on failure the old
 * pointers are lost.
 */
void resize_shortcut_layer(layer *l, int w, int h)
{
    //assert(l->w == l->out_w);
    //assert(l->h == l->out_h);
    l->w = l->out_w = w;
    l->h = l->out_h = h;
    l->outputs = w*h*l->out_c;
    l->inputs = l->outputs;
    l->delta = realloc(l->delta, l->outputs*l->batch * sizeof(float));
    l->output = realloc(l->output, l->outputs*l->batch * sizeof(float));

#ifdef GPU
    // Device buffers are dropped and recreated from the resized host buffers.
    cuda_free(l->output_gpu);
    cuda_free(l->delta_gpu);
    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
    l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch);
#endif

}

 src/tree.c

@@ -52,34 +52,34 @@

/*
 * Descend the class hierarchy and return the index of the deepest node whose
 * cumulative probability still exceeds `thresh`.
 *
 * Starting at group 0, the highest-scoring member of the current group is
 * selected.  While (running product * best score) > thresh the walk moves to
 * that member's child group; it returns the member itself once it has no
 * child (child < 0).  When the threshold is not met, the best member is
 * returned if still in group 0, otherwise the parent of the current group.
 *
 * predictions  scores; element k is read at predictions[k*stride]
 * hier         tree providing group_size, group_offset, child and parent
 * stride       distance between consecutive class scores in `predictions`
 */
int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride)
{
    float p = 1;
    int group = 0;
    int i;
    while (1) {
        float max = 0;
        int max_i = 0;

        // Best-scoring member of the current group.
        for (i = 0; i < hier->group_size[group]; ++i) {
            int index = i + hier->group_offset[group];
            float val = predictions[index*stride];
            if (val > max) {
                max_i = index;
                max = val;
            }
        }
        if (p*max > thresh) {
            p = p*max;
            group = hier->child[max_i];
            if (hier->child[max_i] < 0) return max_i;   // leaf reached
        }
        else if (group == 0) {
            return max_i;
        }
        else {
            return hier->parent[hier->group_offset[group]];
        }
    }
    return 0;   // not reached; the loop only exits through a return
}

tree *read_tree(char *filename)

 src/utils.c

@@ -18,11 +18,11 @@

/*
 * Current time from gettimeofday() as seconds (with microsecond fraction).
 * Returns 0 if gettimeofday() reports an error.
 */
double what_time_is_it_now()
{
    struct timeval now;
    if (gettimeofday(&now, NULL)) {
        return 0;
    }
    return (double)now.tv_sec + (double)now.tv_usec * .000001;
}

int *read_map(char *filename)
@@ -57,7 +57,7 @@
    void *swp = calloc(1, size);
    for(i = 0; i < n-1; ++i){
        size_t j = i + rand()/(RAND_MAX / (n-i)+1);
        memcpy(swp,         (char*)arr+(j*size), size);
        memcpy(swp,            (char*)arr+(j*size), size);
        memcpy((char*)arr+(j*size), (char*)arr+(i*size), size);
        memcpy((char*)arr+(i*size), swp,          size);
    }
@@ -137,7 +137,7 @@
    {
        c = next+1;
    }
    if(!next) while ((next = strchr(c, '\\'))) { c = next + 1; }
    if(!next) while ((next = strchr(c, '\\'))) { c = next + 1; }
    c = copy_string(c);
    next = strchr(c, '.');
    if (next) *next = 0;
@@ -169,63 +169,63 @@

/*
 * Write `str` to `output` with the first occurrence of `orig` replaced by
 * `rep`.  If `orig` does not occur, `output` receives an unchanged copy.
 *
 * `output` may be the same pointer as `str` (in-place use) or a disjoint
 * buffer; it must be large enough for the result.  No scratch buffer is
 * used, so there is no length limit on `str` and nothing to free.
 */
void find_replace(char *str, char *orig, char *rep, char *output)
{
    size_t str_len = strlen(str);
    char *p = strstr(str, orig);

    if(!p){  // Is 'orig' even in 'str'?
        if (output != str) memmove(output, str, str_len + 1);
        return;
    }

    size_t head_len = (size_t)(p - str);
    size_t orig_len = strlen(orig);
    size_t rep_len = strlen(rep);
    size_t tail_len = str_len - head_len - orig_len;

    // Move the tail (with its terminator) first: memmove is overlap-safe, so
    // this works in place whether `rep` is longer or shorter than `orig`.
    memmove(output + head_len + rep_len, p + orig_len, tail_len + 1);
    if (output != str) memmove(output, str, head_len);
    memcpy(output + head_len, rep, rep_len);
}

/*
 * Write `str` to `output`, replacing `orig` with `rep` only when `str` ends
 * with `orig`.  Otherwise `output` receives an unchanged copy.
 *
 * The suffix is compared directly, so an earlier occurrence of `orig` inside
 * `str` (e.g. "set.jpg.d/img.jpg") does not hide the match at the end.
 * `output` may be the same pointer as `str` or a disjoint buffer large
 * enough for the result.  An empty `orig` matches only an empty `str`.
 */
void find_replace_extension(char *str, char *orig, char *rep, char *output)
{
    size_t str_len = strlen(str);
    size_t orig_len = strlen(orig);

    // Is 'orig' found at the very end of 'str'?
    int at_end = orig_len <= str_len
        && (orig_len != 0 || str_len == 0)
        && memcmp(str + (str_len - orig_len), orig, orig_len) == 0;

    if (!at_end) {
        if (output != str) memmove(output, str, str_len + 1);
        return;
    }

    size_t stem_len = str_len - orig_len;
    if (output != str) memmove(output, str, stem_len);
    strcpy(output + stem_len, rep);   // overwrites the old suffix and terminates
}

/*
 * Derive a label-file path from an image path.
 *
 * The directory component is rewritten first (COCO train2014/val2014 and
 * PascalVOC JPEGImages are mapped to "labels"), then a trailing image
 * extension is replaced with ".txt".  After the first call every step reads
 * from and writes to `output_path`, so that buffer is used in place and must
 * be large enough for the result.  Each substitution is applied once.
 */
void replace_image_to_label(char *input_path, char *output_path) {
    //find_replace(input_path, "/images/", "/labels/", output_path);    // COCO
    find_replace(input_path, "/images/train2014/", "/labels/train2014/", output_path);    // COCO
    find_replace(output_path, "/images/val2014/", "/labels/val2014/", output_path);        // COCO
    find_replace(output_path, "/JPEGImages/", "/labels/", output_path);    // PascalVOC
    //find_replace(output_path, "/VOC2007/JPEGImages/", "/VOC2007/labels/", output_path);        // PascalVOC
    //find_replace(output_path, "/VOC2012/JPEGImages/", "/VOC2012/labels/", output_path);        // PascalVOC

    //find_replace(output_path, "/raw/", "/labels/", output_path);

    // replace only ext of files
    find_replace_extension(output_path, ".jpg", ".txt", output_path);
    find_replace_extension(output_path, ".JPG", ".txt", output_path); // error
    find_replace_extension(output_path, ".jpeg", ".txt", output_path);
    find_replace_extension(output_path, ".JPEG", ".txt", output_path);
    find_replace_extension(output_path, ".png", ".txt", output_path);
    find_replace_extension(output_path, ".PNG", ".txt", output_path);
    find_replace_extension(output_path, ".bmp", ".txt", output_path);
    find_replace_extension(output_path, ".BMP", ".txt", output_path);
    find_replace_extension(output_path, ".ppm", ".txt", output_path);
    find_replace_extension(output_path, ".PPM", ".txt", output_path);
}

float sec(clock_t clocks)
@@ -299,15 +299,15 @@

/*
 * Remove every tab, line feed and carriage return from `s` in place,
 * shifting the remaining characters left.  Spaces are kept.
 */
void strip_args(char *s)
{
    size_t i;
    size_t len = strlen(s);
    size_t offset = 0;   // number of characters dropped so far
    for (i = 0; i < len; ++i) {
        char c = s[i];
        // '\n' and '\r' are the same characters as 0x0a and 0x0d in ASCII,
        // so the separate hex comparisons were redundant.
        if (c == '\t' || c == '\n' || c == '\r') ++offset;
        else s[i - offset] = c;
    }
    s[len - offset] = '\0';
}

void strip_char(char *s, char bad)
@@ -356,11 +356,11 @@
        fgets(&line[curr], readsize, fp);
        curr = strlen(line);
    }
    if(curr >= 2)
        if(line[curr-2] == 0x0d) line[curr-2] = 0x00;
    if(curr >= 2)
        if(line[curr-2] == 0x0d) line[curr-2] = 0x00;

    if(curr >= 1)
        if(line[curr-1] == 0x0a) line[curr-1] = 0x00;
    if(curr >= 1)
        if(line[curr-1] == 0x0a) line[curr-1] = 0x00;

    return line;
}
@@ -620,11 +620,11 @@

/*
 * Linear search: index of the first element of a[0..n-1] equal to `val`,
 * or -1 if there is none (including when n <= 0).
 */
int int_index(int *a, int val, int n)
{
    int i;
    for (i = 0; i < n; ++i) {
        if (a[i] == val) return i;
    }
    return -1;
}

int rand_int(int min, int max)
@@ -691,7 +691,7 @@
        max = swap;
    }
    return ((float)rand()/RAND_MAX * (max - min)) + min;
    //return (random_float() * (max - min)) + min;
    //return (random_float() * (max - min)) + min;
}

float rand_scale(float s)
@@ -715,30 +715,30 @@

/*
 * One pseudo-random unsigned value.
 * On WIN32 it comes from rand_s(), elsewhere from rand().
 */
unsigned int random_gen()
{
    unsigned int rnd = 0;
#ifdef WIN32
    rand_s(&rnd);
#else
    rnd = rand();
#endif
    return rnd;
}

/*
 * random_gen() scaled to a float by dividing by the largest value the
 * platform's generator is normalised against: UINT_MAX on WIN32, RAND_MAX
 * elsewhere.
 */
float random_float()
{
#ifdef WIN32
    return ((float)random_gen() / (float)UINT_MAX);
#else
    return ((float)random_gen() / (float)RAND_MAX);
#endif
}

/*
 * Random float between `min` and `max`, built from random_float().
 * The bounds are swapped first if they are given in the wrong order.
 */
float rand_uniform_strong(float min, float max)
{
    if (max < min) {
        float swap = min;
        min = max;
        max = swap;
    }
    return (random_float() * (max - min)) + min;
}

 src/yolo_console_dll.cpp

@@ -21,10 +21,10 @@
//#pragma comment(lib, "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.1/lib/x64/cudart.lib")
//static std::shared_ptr<image_t> device_ptr(NULL, [](void *img) { cudaDeviceReset(); });

#include "yolo_v2_class.hpp"    // imported functions from DLL
#include "yolo_v2_class.hpp"    // imported functions from DLL

#ifdef OPENCV
#include <opencv2/opencv.hpp>           // C++
#include <opencv2/opencv.hpp>            // C++
#include "opencv2/core/version.hpp"
#ifndef CV_VERSION_EPOCH
#include "opencv2/videoio/videoio.hpp"
@@ -36,67 +36,67 @@
#pragma comment(lib, "opencv_core" OPENCV_VERSION ".lib")
#pragma comment(lib, "opencv_imgproc" OPENCV_VERSION ".lib")
#pragma comment(lib, "opencv_highgui" OPENCV_VERSION ".lib")
#endif  // TRACK_OPTFLOW
#endif    // TRACK_OPTFLOW
#else
#define OPENCV_VERSION CVAUX_STR(CV_VERSION_EPOCH)""CVAUX_STR(CV_VERSION_MAJOR)""CVAUX_STR(CV_VERSION_MINOR)
#pragma comment(lib, "opencv_core" OPENCV_VERSION ".lib")
#pragma comment(lib, "opencv_imgproc" OPENCV_VERSION ".lib")
#pragma comment(lib, "opencv_highgui" OPENCV_VERSION ".lib")
#endif  // CV_VERSION_EPOCH
#endif    // CV_VERSION_EPOCH

class track_kalman {
public:
    cv::KalmanFilter kf;
    int state_size, meas_size, contr_size;
    cv::KalmanFilter kf;
    int state_size, meas_size, contr_size;


    track_kalman(int _state_size = 10, int _meas_size = 10, int _contr_size = 0)
        : state_size(_state_size), meas_size(_meas_size), contr_size(_contr_size)
    {
        kf.init(state_size, meas_size, contr_size, CV_32F);
    track_kalman(int _state_size = 10, int _meas_size = 10, int _contr_size = 0)
        : state_size(_state_size), meas_size(_meas_size), contr_size(_contr_size)
    {
        kf.init(state_size, meas_size, contr_size, CV_32F);

        cv::setIdentity(kf.measurementMatrix);
        cv::setIdentity(kf.measurementNoiseCov, cv::Scalar::all(1e-1));
        cv::setIdentity(kf.processNoiseCov, cv::Scalar::all(1e-5));
        cv::setIdentity(kf.errorCovPost, cv::Scalar::all(1e-2));
        cv::setIdentity(kf.transitionMatrix);
    }
        cv::setIdentity(kf.measurementMatrix);
        cv::setIdentity(kf.measurementNoiseCov, cv::Scalar::all(1e-1));
        cv::setIdentity(kf.processNoiseCov, cv::Scalar::all(1e-5));
        cv::setIdentity(kf.errorCovPost, cv::Scalar::all(1e-2));
        cv::setIdentity(kf.transitionMatrix);
    }

    void set(std::vector<bbox_t> result_vec) {
        for (size_t i = 0; i < result_vec.size() && i < state_size*2; ++i) {
            kf.statePost.at<float>(i * 2 + 0) = result_vec[i].x;
            kf.statePost.at<float>(i * 2 + 1) = result_vec[i].y;
        }
    }
    void set(std::vector<bbox_t> result_vec) {
        for (size_t i = 0; i < result_vec.size() && i < state_size*2; ++i) {
            kf.statePost.at<float>(i * 2 + 0) = result_vec[i].x;
            kf.statePost.at<float>(i * 2 + 1) = result_vec[i].y;
        }
    }

    // Kalman.correct() calculates: statePost = statePre + gain * (z(k)-measurementMatrix*statePre);
    // corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
    std::vector<bbox_t> correct(std::vector<bbox_t> result_vec) {
        cv::Mat measurement(meas_size, 1, CV_32F);
        for (size_t i = 0; i < result_vec.size() && i < meas_size * 2; ++i) {
            measurement.at<float>(i * 2 + 0) = result_vec[i].x;
            measurement.at<float>(i * 2 + 1) = result_vec[i].y;
        }
        cv::Mat estimated = kf.correct(measurement);
        for (size_t i = 0; i < result_vec.size() && i < meas_size * 2; ++i) {
            result_vec[i].x = estimated.at<float>(i * 2 + 0);
            result_vec[i].y = estimated.at<float>(i * 2 + 1);
        }
        return result_vec;
    }
    // Kalman.correct() calculates: statePost = statePre + gain * (z(k)-measurementMatrix*statePre);
    // corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k))
    std::vector<bbox_t> correct(std::vector<bbox_t> result_vec) {
        cv::Mat measurement(meas_size, 1, CV_32F);
        for (size_t i = 0; i < result_vec.size() && i < meas_size * 2; ++i) {
            measurement.at<float>(i * 2 + 0) = result_vec[i].x;
            measurement.at<float>(i * 2 + 1) = result_vec[i].y;
        }
        cv::Mat estimated = kf.correct(measurement);
        for (size_t i = 0; i < result_vec.size() && i < meas_size * 2; ++i) {
            result_vec[i].x = estimated.at<float>(i * 2 + 0);
            result_vec[i].y = estimated.at<float>(i * 2 + 1);
        }
        return result_vec;
    }

    // Kalman.predict() calculates: statePre = TransitionMatrix * statePost;
    // predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
    std::vector<bbox_t> predict() {
        std::vector<bbox_t> result_vec;
        cv::Mat control;
        cv::Mat prediction = kf.predict(control);
        for (size_t i = 0; i < prediction.rows && i < state_size * 2; ++i) {
            result_vec[i].x = prediction.at<float>(i * 2 + 0);
            result_vec[i].y = prediction.at<float>(i * 2 + 1);
        }
        return result_vec;
    }
    // Kalman.predict() calculates: statePre = TransitionMatrix * statePost;
    // predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k)
    std::vector<bbox_t> predict() {
        std::vector<bbox_t> result_vec;
        cv::Mat control;
        cv::Mat prediction = kf.predict(control);
        for (size_t i = 0; i < prediction.rows && i < state_size * 2; ++i) {
            result_vec[i].x = prediction.at<float>(i * 2 + 0);
            result_vec[i].y = prediction.at<float>(i * 2 + 1);
        }
        return result_vec;
    }

};

@@ -105,384 +105,384 @@

class extrapolate_coords_t {
public:
    std::vector<bbox_t> old_result_vec;
    std::vector<float> dx_vec, dy_vec, time_vec;
    std::vector<float> old_dx_vec, old_dy_vec;
    std::vector<bbox_t> old_result_vec;
    std::vector<float> dx_vec, dy_vec, time_vec;
    std::vector<float> old_dx_vec, old_dy_vec;

    void new_result(std::vector<bbox_t> new_result_vec, float new_time) {
        old_dx_vec = dx_vec;
        old_dy_vec = dy_vec;
        if (old_dx_vec.size() != old_result_vec.size()) std::cout << "old_dx != old_res \n";
        dx_vec = std::vector<float>(new_result_vec.size(), 0);
        dy_vec = std::vector<float>(new_result_vec.size(), 0);
        update_result(new_result_vec, new_time, false);
        old_result_vec = new_result_vec;
        time_vec = std::vector<float>(new_result_vec.size(), new_time);
    }
    void new_result(std::vector<bbox_t> new_result_vec, float new_time) {
        old_dx_vec = dx_vec;
        old_dy_vec = dy_vec;
        if (old_dx_vec.size() != old_result_vec.size()) std::cout << "old_dx != old_res \n";
        dx_vec = std::vector<float>(new_result_vec.size(), 0);
        dy_vec = std::vector<float>(new_result_vec.size(), 0);
        update_result(new_result_vec, new_time, false);
        old_result_vec = new_result_vec;
        time_vec = std::vector<float>(new_result_vec.size(), new_time);
    }

    void update_result(std::vector<bbox_t> new_result_vec, float new_time, bool update = true) {
        for (size_t i = 0; i < new_result_vec.size(); ++i) {
            for (size_t k = 0; k < old_result_vec.size(); ++k) {
                if (old_result_vec[k].track_id == new_result_vec[i].track_id && old_result_vec[k].obj_id == new_result_vec[i].obj_id) {
                    float const delta_time = new_time - time_vec[k];
                    if (abs(delta_time) < 1) break;
                    size_t index = (update) ? k : i;
                    float dx = ((float)new_result_vec[i].x - (float)old_result_vec[k].x) / delta_time;
                    float dy = ((float)new_result_vec[i].y - (float)old_result_vec[k].y) / delta_time;
                    float old_dx = dx, old_dy = dy;
    void update_result(std::vector<bbox_t> new_result_vec, float new_time, bool update = true) {
        for (size_t i = 0; i < new_result_vec.size(); ++i) {
            for (size_t k = 0; k < old_result_vec.size(); ++k) {
                if (old_result_vec[k].track_id == new_result_vec[i].track_id && old_result_vec[k].obj_id == new_result_vec[i].obj_id) {
                    float const delta_time = new_time - time_vec[k];
                    if (abs(delta_time) < 1) break;
                    size_t index = (update) ? k : i;
                    float dx = ((float)new_result_vec[i].x - (float)old_result_vec[k].x) / delta_time;
                    float dy = ((float)new_result_vec[i].y - (float)old_result_vec[k].y) / delta_time;
                    float old_dx = dx, old_dy = dy;

                    // if it's shaking
                    if (update) {
                        if (dx * dx_vec[i] < 0) dx = dx / 2;
                        if (dy * dy_vec[i] < 0) dy = dy / 2;
                    } else {
                        if (dx * old_dx_vec[k] < 0) dx = dx / 2;
                        if (dy * old_dy_vec[k] < 0) dy = dy / 2;
                    }
                    dx_vec[index] = dx;
                    dy_vec[index] = dy;
                    // if it's shaking
                    if (update) {
                        if (dx * dx_vec[i] < 0) dx = dx / 2;
                        if (dy * dy_vec[i] < 0) dy = dy / 2;
                    } else {
                        if (dx * old_dx_vec[k] < 0) dx = dx / 2;
                        if (dy * old_dy_vec[k] < 0) dy = dy / 2;
                    }
                    dx_vec[index] = dx;
                    dy_vec[index] = dy;

                    //if (old_dx == dx && old_dy == dy) std::cout << "not shakin \n";
                    //else std::cout << "shakin \n";
                    //if (old_dx == dx && old_dy == dy) std::cout << "not shakin \n";
                    //else std::cout << "shakin \n";

                    if (dx_vec[index] > 1000 || dy_vec[index] > 1000) {
                        //std::cout << "!!! bad dx or dy, dx = " << dx_vec[index] << ", dy = " << dy_vec[index] << 
                        //  ", delta_time = " << delta_time << ", update = " << update << std::endl;
                        dx_vec[index] = 0;
                        dy_vec[index] = 0;						
                    }
                    old_result_vec[k].x = new_result_vec[i].x;
                    old_result_vec[k].y = new_result_vec[i].y;
                    time_vec[k] = new_time;
                    break;
                }
            }
        }
    }
                    if (dx_vec[index] > 1000 || dy_vec[index] > 1000) {
                        //std::cout << "!!! bad dx or dy, dx = " << dx_vec[index] << ", dy = " << dy_vec[index] << 
                        //    ", delta_time = " << delta_time << ", update = " << update << std::endl;
                        dx_vec[index] = 0;
                        dy_vec[index] = 0;                        
                    }
                    old_result_vec[k].x = new_result_vec[i].x;
                    old_result_vec[k].y = new_result_vec[i].y;
                    time_vec[k] = new_time;
                    break;
                }
            }
        }
    }

    std::vector<bbox_t> predict(float cur_time) {
        std::vector<bbox_t> result_vec = old_result_vec;
        for (size_t i = 0; i < old_result_vec.size(); ++i) {
            float const delta_time = cur_time - time_vec[i];
            auto &bbox = result_vec[i];
            float new_x = (float) bbox.x + dx_vec[i] * delta_time;
            float new_y = (float) bbox.y + dy_vec[i] * delta_time;
            if (new_x > 0) bbox.x = new_x;
            else bbox.x = 0;
            if (new_y > 0) bbox.y = new_y;
            else bbox.y = 0;
        }
        return result_vec;
    }
    std::vector<bbox_t> predict(float cur_time) {
        std::vector<bbox_t> result_vec = old_result_vec;
        for (size_t i = 0; i < old_result_vec.size(); ++i) {
            float const delta_time = cur_time - time_vec[i];
            auto &bbox = result_vec[i];
            float new_x = (float) bbox.x + dx_vec[i] * delta_time;
            float new_y = (float) bbox.y + dy_vec[i] * delta_time;
            if (new_x > 0) bbox.x = new_x;
            else bbox.x = 0;
            if (new_y > 0) bbox.y = new_y;
            else bbox.y = 0;
        }
        return result_vec;
    }

};


void draw_boxes(cv::Mat mat_img, std::vector<bbox_t> result_vec, std::vector<std::string> obj_names, 
    int current_det_fps = -1, int current_cap_fps = -1)
    int current_det_fps = -1, int current_cap_fps = -1)
{
    int const colors[6][3] = { { 1,0,1 },{ 0,0,1 },{ 0,1,1 },{ 0,1,0 },{ 1,1,0 },{ 1,0,0 } };
    int const colors[6][3] = { { 1,0,1 },{ 0,0,1 },{ 0,1,1 },{ 0,1,0 },{ 1,1,0 },{ 1,0,0 } };

    for (auto &i : result_vec) {
        cv::Scalar color = obj_id_to_color(i.obj_id);
        cv::rectangle(mat_img, cv::Rect(i.x, i.y, i.w, i.h), color, 2);
        if (obj_names.size() > i.obj_id) {
            std::string obj_name = obj_names[i.obj_id];
            if (i.track_id > 0) obj_name += " - " + std::to_string(i.track_id);
            cv::Size const text_size = getTextSize(obj_name, cv::FONT_HERSHEY_COMPLEX_SMALL, 1.2, 2, 0);
            int const max_width = (text_size.width > i.w + 2) ? text_size.width : (i.w + 2);
            cv::rectangle(mat_img, cv::Point2f(std::max((int)i.x - 1, 0), std::max((int)i.y - 30, 0)), 
                cv::Point2f(std::min((int)i.x + max_width, mat_img.cols-1), std::min((int)i.y, mat_img.rows-1)), 
                color, CV_FILLED, 8, 0);
            putText(mat_img, obj_name, cv::Point2f(i.x, i.y - 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.2, cv::Scalar(0, 0, 0), 2);
        }
    }
    if (current_det_fps >= 0 && current_cap_fps >= 0) {
        std::string fps_str = "FPS detection: " + std::to_string(current_det_fps) + "   FPS capture: " + std::to_string(current_cap_fps);
        putText(mat_img, fps_str, cv::Point2f(10, 20), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.2, cv::Scalar(50, 255, 0), 2);
    }
    for (auto &i : result_vec) {
        cv::Scalar color = obj_id_to_color(i.obj_id);
        cv::rectangle(mat_img, cv::Rect(i.x, i.y, i.w, i.h), color, 2);
        if (obj_names.size() > i.obj_id) {
            std::string obj_name = obj_names[i.obj_id];
            if (i.track_id > 0) obj_name += " - " + std::to_string(i.track_id);
            cv::Size const text_size = getTextSize(obj_name, cv::FONT_HERSHEY_COMPLEX_SMALL, 1.2, 2, 0);
            int const max_width = (text_size.width > i.w + 2) ? text_size.width : (i.w + 2);
            cv::rectangle(mat_img, cv::Point2f(std::max((int)i.x - 1, 0), std::max((int)i.y - 30, 0)), 
                cv::Point2f(std::min((int)i.x + max_width, mat_img.cols-1), std::min((int)i.y, mat_img.rows-1)), 
                color, CV_FILLED, 8, 0);
            putText(mat_img, obj_name, cv::Point2f(i.x, i.y - 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.2, cv::Scalar(0, 0, 0), 2);
        }
    }
    if (current_det_fps >= 0 && current_cap_fps >= 0) {
        std::string fps_str = "FPS detection: " + std::to_string(current_det_fps) + "   FPS capture: " + std::to_string(current_cap_fps);
        putText(mat_img, fps_str, cv::Point2f(10, 20), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.2, cv::Scalar(50, 255, 0), 2);
    }
}
#endif  // OPENCV
#endif    // OPENCV


void show_console_result(std::vector<bbox_t> const result_vec, std::vector<std::string> const obj_names) {
    for (auto &i : result_vec) {
        if (obj_names.size() > i.obj_id) std::cout << obj_names[i.obj_id] << " - ";
        std::cout << "obj_id = " << i.obj_id << ",  x = " << i.x << ", y = " << i.y 
            << ", w = " << i.w << ", h = " << i.h
            << std::setprecision(3) << ", prob = " << i.prob << std::endl;
    }
    for (auto &i : result_vec) {
        if (obj_names.size() > i.obj_id) std::cout << obj_names[i.obj_id] << " - ";
        std::cout << "obj_id = " << i.obj_id << ",  x = " << i.x << ", y = " << i.y 
            << ", w = " << i.w << ", h = " << i.h
            << std::setprecision(3) << ", prob = " << i.prob << std::endl;
    }
}

// Reads class names from a text file, one name per line, in file order.
// Returns an empty vector (and prints nothing) when the file cannot be opened;
// otherwise prints "object names loaded" to std::cout.
// A trailing '\r' is stripped from each line so that files saved with CRLF line
// endings yield the same names as LF files.
std::vector<std::string> objects_names_from_file(std::string const filename) {
    std::ifstream file(filename);
    std::vector<std::string> file_lines;
    if (!file.is_open()) return file_lines;
    for (std::string line; std::getline(file, line);) {
        if (!line.empty() && line.back() == '\r') line.pop_back();
        file_lines.push_back(line);
    }
    std::cout << "object names loaded \n";
    return file_lines;
}


int main(int argc, char *argv[])
{
    std::string  names_file = "data/coco.names";
    std::string  cfg_file = "cfg/yolov3.cfg";
    std::string  weights_file = "yolov3.weights";
    std::string filename;
    std::string  names_file = "data/coco.names";
    std::string  cfg_file = "cfg/yolov3.cfg";
    std::string  weights_file = "yolov3.weights";
    std::string filename;

    if (argc > 4) { //voc.names yolo-voc.cfg yolo-voc.weights test.mp4		
        names_file = argv[1];
        cfg_file = argv[2];
        weights_file = argv[3];
        filename = argv[4];
    }
    else if (argc > 1) filename = argv[1];
    if (argc > 4) {    //voc.names yolo-voc.cfg yolo-voc.weights test.mp4        
        names_file = argv[1];
        cfg_file = argv[2];
        weights_file = argv[3];
        filename = argv[4];
    }
    else if (argc > 1) filename = argv[1];

    float const thresh = (argc > 5) ? std::stof(argv[5]) : 0.20;
    float const thresh = (argc > 5) ? std::stof(argv[5]) : 0.20;

    Detector detector(cfg_file, weights_file);
    Detector detector(cfg_file, weights_file);

    auto obj_names = objects_names_from_file(names_file);
    std::string out_videofile = "result.avi";
    bool const save_output_videofile = true;
    auto obj_names = objects_names_from_file(names_file);
    std::string out_videofile = "result.avi";
    bool const save_output_videofile = true;
#ifdef TRACK_OPTFLOW
    Tracker_optflow tracker_flow;
    detector.wait_stream = true;
    Tracker_optflow tracker_flow;
    detector.wait_stream = true;
#endif

    while (true) 
    {		
        std::cout << "input image or video filename: ";
        if(filename.size() == 0) std::cin >> filename;
        if (filename.size() == 0) break;
		
        try {
    while (true) 
    {        
        std::cout << "input image or video filename: ";
        if(filename.size() == 0) std::cin >> filename;
        if (filename.size() == 0) break;
        
        try {
#ifdef OPENCV
            extrapolate_coords_t extrapolate_coords;
            bool extrapolate_flag = false;
            float cur_time_extrapolate = 0, old_time_extrapolate = 0;
            preview_boxes_t large_preview(100, 150, false), small_preview(50, 50, true);
            bool show_small_boxes = false;
            extrapolate_coords_t extrapolate_coords;
            bool extrapolate_flag = false;
            float cur_time_extrapolate = 0, old_time_extrapolate = 0;
            preview_boxes_t large_preview(100, 150, false), small_preview(50, 50, true);
            bool show_small_boxes = false;

            std::string const file_ext = filename.substr(filename.find_last_of(".") + 1);
            std::string const protocol = filename.substr(0, 7);
            if (file_ext == "avi" || file_ext == "mp4" || file_ext == "mjpg" || file_ext == "mov" ||    // video file
                protocol == "rtmp://" || protocol == "rtsp://" || protocol == "http://" || protocol == "https:/")   // video network stream
            {
                cv::Mat cap_frame, cur_frame, det_frame, write_frame;
                std::queue<cv::Mat> track_optflow_queue;
                int passed_flow_frames = 0;
                std::shared_ptr<image_t> det_image;
                std::vector<bbox_t> result_vec, thread_result_vec;
                detector.nms = 0.02;    // comment it - if track_id is not required
                std::atomic<bool> consumed, videowrite_ready;
                bool exit_flag = false;
                consumed = true;
                videowrite_ready = true;
                std::atomic<int> fps_det_counter, fps_cap_counter;
                fps_det_counter = 0;
                fps_cap_counter = 0;
                int current_det_fps = 0, current_cap_fps = 0;
                std::thread t_detect, t_cap, t_videowrite;
                std::mutex mtx;
                std::condition_variable cv_detected, cv_pre_tracked;
                std::chrono::steady_clock::time_point steady_start, steady_end;
                cv::VideoCapture cap(filename); cap >> cur_frame;
                int const video_fps = cap.get(CV_CAP_PROP_FPS);
                cv::Size const frame_size = cur_frame.size();
                cv::VideoWriter output_video;
                if (save_output_videofile)
                    output_video.open(out_videofile, CV_FOURCC('D', 'I', 'V', 'X'), std::max(35, video_fps), frame_size, true);
            std::string const file_ext = filename.substr(filename.find_last_of(".") + 1);
            std::string const protocol = filename.substr(0, 7);
            if (file_ext == "avi" || file_ext == "mp4" || file_ext == "mjpg" || file_ext == "mov" ||     // video file
                protocol == "rtmp://" || protocol == "rtsp://" || protocol == "http://" || protocol == "https:/")    // video network stream
            {
                cv::Mat cap_frame, cur_frame, det_frame, write_frame;
                std::queue<cv::Mat> track_optflow_queue;
                int passed_flow_frames = 0;
                std::shared_ptr<image_t> det_image;
                std::vector<bbox_t> result_vec, thread_result_vec;
                detector.nms = 0.02;    // comment it - if track_id is not required
                std::atomic<bool> consumed, videowrite_ready;
                bool exit_flag = false;
                consumed = true;
                videowrite_ready = true;
                std::atomic<int> fps_det_counter, fps_cap_counter;
                fps_det_counter = 0;
                fps_cap_counter = 0;
                int current_det_fps = 0, current_cap_fps = 0;
                std::thread t_detect, t_cap, t_videowrite;
                std::mutex mtx;
                std::condition_variable cv_detected, cv_pre_tracked;
                std::chrono::steady_clock::time_point steady_start, steady_end;
                cv::VideoCapture cap(filename); cap >> cur_frame;
                int const video_fps = cap.get(CV_CAP_PROP_FPS);
                cv::Size const frame_size = cur_frame.size();
                cv::VideoWriter output_video;
                if (save_output_videofile)
                    output_video.open(out_videofile, CV_FOURCC('D', 'I', 'V', 'X'), std::max(35, video_fps), frame_size, true);

                while (!cur_frame.empty()) 
                {
                    // always sync
                    if (t_cap.joinable()) {
                        t_cap.join();
                        ++fps_cap_counter;
                        cur_frame = cap_frame.clone();
                    }
                    t_cap = std::thread([&]() { cap >> cap_frame; });
                    ++cur_time_extrapolate;
                while (!cur_frame.empty()) 
                {
                    // always sync
                    if (t_cap.joinable()) {
                        t_cap.join();
                        ++fps_cap_counter;
                        cur_frame = cap_frame.clone();
                    }
                    t_cap = std::thread([&]() { cap >> cap_frame; });
                    ++cur_time_extrapolate;

                    // swap result bouned-boxes and input-frame
                    if(consumed)
                    {
                        std::unique_lock<std::mutex> lock(mtx);
                        det_image = detector.mat_to_image_resize(cur_frame);
                        auto old_result_vec = detector.tracking_id(result_vec);
                        auto detected_result_vec = thread_result_vec;
                        result_vec = detected_result_vec;
                    // swap result bouned-boxes and input-frame
                    if(consumed)
                    {
                        std::unique_lock<std::mutex> lock(mtx);
                        det_image = detector.mat_to_image_resize(cur_frame);
                        auto old_result_vec = detector.tracking_id(result_vec);
                        auto detected_result_vec = thread_result_vec;
                        result_vec = detected_result_vec;
#ifdef TRACK_OPTFLOW
                        // track optical flow
                        if (track_optflow_queue.size() > 0) {
                            //std::cout << "\n !!!! all = " << track_optflow_queue.size() << ", cur = " << passed_flow_frames << std::endl;
                            cv::Mat first_frame = track_optflow_queue.front();
                            tracker_flow.update_tracking_flow(track_optflow_queue.front(), result_vec);
                        // track optical flow
                        if (track_optflow_queue.size() > 0) {
                            //std::cout << "\n !!!! all = " << track_optflow_queue.size() << ", cur = " << passed_flow_frames << std::endl;
                            cv::Mat first_frame = track_optflow_queue.front();
                            tracker_flow.update_tracking_flow(track_optflow_queue.front(), result_vec);

                            while (track_optflow_queue.size() > 1) {
                                track_optflow_queue.pop();
                                result_vec = tracker_flow.tracking_flow(track_optflow_queue.front(), true);
                            }
                            track_optflow_queue.pop();
                            passed_flow_frames = 0;
                            while (track_optflow_queue.size() > 1) {
                                track_optflow_queue.pop();
                                result_vec = tracker_flow.tracking_flow(track_optflow_queue.front(), true);
                            }
                            track_optflow_queue.pop();
                            passed_flow_frames = 0;

                            result_vec = detector.tracking_id(result_vec);
                            auto tmp_result_vec = detector.tracking_id(detected_result_vec, false);
                            small_preview.set(first_frame, tmp_result_vec);
                            result_vec = detector.tracking_id(result_vec);
                            auto tmp_result_vec = detector.tracking_id(detected_result_vec, false);
                            small_preview.set(first_frame, tmp_result_vec);

                            extrapolate_coords.new_result(tmp_result_vec, old_time_extrapolate);
                            old_time_extrapolate = cur_time_extrapolate;
                            extrapolate_coords.update_result(result_vec, cur_time_extrapolate - 1);
                        }
                            extrapolate_coords.new_result(tmp_result_vec, old_time_extrapolate);
                            old_time_extrapolate = cur_time_extrapolate;
                            extrapolate_coords.update_result(result_vec, cur_time_extrapolate - 1);
                        }
#else
                        result_vec = detector.tracking_id(result_vec);  // comment it - if track_id is not required					
                        extrapolate_coords.new_result(result_vec, cur_time_extrapolate - 1);
                        result_vec = detector.tracking_id(result_vec);    // comment it - if track_id is not required                    
                        extrapolate_coords.new_result(result_vec, cur_time_extrapolate - 1);
#endif
                        // add old tracked objects
                        for (auto &i : old_result_vec) {
                            auto it = std::find_if(result_vec.begin(), result_vec.end(),
                                [&i](bbox_t const& b) { return b.track_id == i.track_id && b.obj_id == i.obj_id; });
                            bool track_id_absent = (it == result_vec.end());
                            if (track_id_absent) {
                                if (i.frames_counter-- > 1)
                                    result_vec.push_back(i);
                            }
                            else {
                                it->frames_counter = std::min((unsigned)3, i.frames_counter + 1);
                            }
                        }
                        // add old tracked objects
                        for (auto &i : old_result_vec) {
                            auto it = std::find_if(result_vec.begin(), result_vec.end(),
                                [&i](bbox_t const& b) { return b.track_id == i.track_id && b.obj_id == i.obj_id; });
                            bool track_id_absent = (it == result_vec.end());
                            if (track_id_absent) {
                                if (i.frames_counter-- > 1)
                                    result_vec.push_back(i);
                            }
                            else {
                                it->frames_counter = std::min((unsigned)3, i.frames_counter + 1);
                            }
                        }
#ifdef TRACK_OPTFLOW
                        tracker_flow.update_cur_bbox_vec(result_vec);
                        result_vec = tracker_flow.tracking_flow(cur_frame, true);   // track optical flow
                        tracker_flow.update_cur_bbox_vec(result_vec);
                        result_vec = tracker_flow.tracking_flow(cur_frame, true);    // track optical flow
#endif
                        consumed = false;
                        cv_pre_tracked.notify_all();
                    }
                    // launch thread once - Detection
                    if (!t_detect.joinable()) {
                        t_detect = std::thread([&]() {
                            auto current_image = det_image;
                            consumed = true;
                            while (current_image.use_count() > 0 && !exit_flag) {
                                auto result = detector.detect_resized(*current_image, frame_size.width, frame_size.height, 
                                    thresh, false); // true
                                ++fps_det_counter;
                                std::unique_lock<std::mutex> lock(mtx);
                                thread_result_vec = result;
                                consumed = true;
                                cv_detected.notify_all();
                                if (detector.wait_stream) {
                                    while (consumed && !exit_flag) cv_pre_tracked.wait(lock);
                                }
                                current_image = det_image;
                            }
                        });
                    }
                    //while (!consumed);    // sync detection
                        consumed = false;
                        cv_pre_tracked.notify_all();
                    }
                    // launch thread once - Detection
                    if (!t_detect.joinable()) {
                        t_detect = std::thread([&]() {
                            auto current_image = det_image;
                            consumed = true;
                            while (current_image.use_count() > 0 && !exit_flag) {
                                auto result = detector.detect_resized(*current_image, frame_size.width, frame_size.height, 
                                    thresh, false);    // true
                                ++fps_det_counter;
                                std::unique_lock<std::mutex> lock(mtx);
                                thread_result_vec = result;
                                consumed = true;
                                cv_detected.notify_all();
                                if (detector.wait_stream) {
                                    while (consumed && !exit_flag) cv_pre_tracked.wait(lock);
                                }
                                current_image = det_image;
                            }
                        });
                    }
                    //while (!consumed);    // sync detection

                    if (!cur_frame.empty()) {
                        steady_end = std::chrono::steady_clock::now();
                        if (std::chrono::duration<double>(steady_end - steady_start).count() >= 1) {
                            current_det_fps = fps_det_counter;
                            current_cap_fps = fps_cap_counter;
                            steady_start = steady_end;
                            fps_det_counter = 0;
                            fps_cap_counter = 0;
                        }
                    if (!cur_frame.empty()) {
                        steady_end = std::chrono::steady_clock::now();
                        if (std::chrono::duration<double>(steady_end - steady_start).count() >= 1) {
                            current_det_fps = fps_det_counter;
                            current_cap_fps = fps_cap_counter;
                            steady_start = steady_end;
                            fps_det_counter = 0;
                            fps_cap_counter = 0;
                        }

                        large_preview.set(cur_frame, result_vec);
                        large_preview.set(cur_frame, result_vec);
#ifdef TRACK_OPTFLOW
                        ++passed_flow_frames;
                        track_optflow_queue.push(cur_frame.clone());
                        result_vec = tracker_flow.tracking_flow(cur_frame); // track optical flow
                        extrapolate_coords.update_result(result_vec, cur_time_extrapolate);
                        small_preview.draw(cur_frame, show_small_boxes);
#endif						
                        auto result_vec_draw = result_vec;
                        if (extrapolate_flag) {
                            result_vec_draw = extrapolate_coords.predict(cur_time_extrapolate);
                            cv::putText(cur_frame, "extrapolate", cv::Point2f(10, 40), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.0, cv::Scalar(50, 50, 0), 2);
                        }
                        draw_boxes(cur_frame, result_vec_draw, obj_names, current_det_fps, current_cap_fps);
                        //show_console_result(result_vec, obj_names);
                        large_preview.draw(cur_frame);
                        ++passed_flow_frames;
                        track_optflow_queue.push(cur_frame.clone());
                        result_vec = tracker_flow.tracking_flow(cur_frame);    // track optical flow
                        extrapolate_coords.update_result(result_vec, cur_time_extrapolate);
                        small_preview.draw(cur_frame, show_small_boxes);
#endif                        
                        auto result_vec_draw = result_vec;
                        if (extrapolate_flag) {
                            result_vec_draw = extrapolate_coords.predict(cur_time_extrapolate);
                            cv::putText(cur_frame, "extrapolate", cv::Point2f(10, 40), cv::FONT_HERSHEY_COMPLEX_SMALL, 1.0, cv::Scalar(50, 50, 0), 2);
                        }
                        draw_boxes(cur_frame, result_vec_draw, obj_names, current_det_fps, current_cap_fps);
                        //show_console_result(result_vec, obj_names);
                        large_preview.draw(cur_frame);

                        cv::imshow("window name", cur_frame);
                        int key = cv::waitKey(3);   // 3 or 16ms
                        if (key == 'f') show_small_boxes = !show_small_boxes;
                        if (key == 'p') while (true) if(cv::waitKey(100) == 'p') break;
                        if (key == 'e') extrapolate_flag = !extrapolate_flag;
                        if (key == 27) { exit_flag = true; break; }
                        cv::imshow("window name", cur_frame);
                        int key = cv::waitKey(3);    // 3 or 16ms
                        if (key == 'f') show_small_boxes = !show_small_boxes;
                        if (key == 'p') while (true) if(cv::waitKey(100) == 'p') break;
                        if (key == 'e') extrapolate_flag = !extrapolate_flag;
                        if (key == 27) { exit_flag = true; break; }

                        if (output_video.isOpened() && videowrite_ready) {
                            if (t_videowrite.joinable()) t_videowrite.join();
                            write_frame = cur_frame.clone();
                            videowrite_ready = false;
                            t_videowrite = std::thread([&]() { 
                                 output_video << write_frame; videowrite_ready = true;
                            });
                        }
                    }
                        if (output_video.isOpened() && videowrite_ready) {
                            if (t_videowrite.joinable()) t_videowrite.join();
                            write_frame = cur_frame.clone();
                            videowrite_ready = false;
                            t_videowrite = std::thread([&]() { 
                                 output_video << write_frame; videowrite_ready = true;
                            });
                        }
                    }

#ifndef TRACK_OPTFLOW
                    // wait detection result for video-file only (not for net-cam)
                    if (protocol != "rtsp://" && protocol != "http://" && protocol != "https:/") {
                        std::unique_lock<std::mutex> lock(mtx);
                        while (!consumed) cv_detected.wait(lock);
                    }
                    // wait detection result for video-file only (not for net-cam)
                    if (protocol != "rtsp://" && protocol != "http://" && protocol != "https:/") {
                        std::unique_lock<std::mutex> lock(mtx);
                        while (!consumed) cv_detected.wait(lock);
                    }
#endif
                }
                exit_flag = true;
                if (t_cap.joinable()) t_cap.join();
                if (t_detect.joinable()) t_detect.join();
                if (t_videowrite.joinable()) t_videowrite.join();
                std::cout << "Video ended \n";
                break;
            }
            else if (file_ext == "txt") {   // list of image files
                std::ifstream file(filename);
                if (!file.is_open()) std::cout << "File not found! \n";
                else 
                    for (std::string line; file >> line;) {
                        std::cout << line << std::endl;
                        cv::Mat mat_img = cv::imread(line);
                        std::vector<bbox_t> result_vec = detector.detect(mat_img);
                        show_console_result(result_vec, obj_names);
                        //draw_boxes(mat_img, result_vec, obj_names);
                        //cv::imwrite("res_" + line, mat_img);
                    }
				
            }
            else {  // image file
                cv::Mat mat_img = cv::imread(filename);
				
                auto start = std::chrono::steady_clock::now();
                std::vector<bbox_t> result_vec = detector.detect(mat_img);
                auto end = std::chrono::steady_clock::now();
                std::chrono::duration<double> spent = end - start;
                std::cout << " Time: " << spent.count() << " sec \n";
                }
                exit_flag = true;
                if (t_cap.joinable()) t_cap.join();
                if (t_detect.joinable()) t_detect.join();
                if (t_videowrite.joinable()) t_videowrite.join();
                std::cout << "Video ended \n";
                break;
            }
            else if (file_ext == "txt") {    // list of image files
                std::ifstream file(filename);
                if (!file.is_open()) std::cout << "File not found! \n";
                else 
                    for (std::string line; file >> line;) {
                        std::cout << line << std::endl;
                        cv::Mat mat_img = cv::imread(line);
                        std::vector<bbox_t> result_vec = detector.detect(mat_img);
                        show_console_result(result_vec, obj_names);
                        //draw_boxes(mat_img, result_vec, obj_names);
                        //cv::imwrite("res_" + line, mat_img);
                    }
                
            }
            else {    // image file
                cv::Mat mat_img = cv::imread(filename);
                
                auto start = std::chrono::steady_clock::now();
                std::vector<bbox_t> result_vec = detector.detect(mat_img);
                auto end = std::chrono::steady_clock::now();
                std::chrono::duration<double> spent = end - start;
                std::cout << " Time: " << spent.count() << " sec \n";

                //result_vec = detector.tracking_id(result_vec);    // comment it - if track_id is not required
                draw_boxes(mat_img, result_vec, obj_names);
                cv::imshow("window name", mat_img);
                show_console_result(result_vec, obj_names);
                cv::waitKey(0);
            }
                //result_vec = detector.tracking_id(result_vec);    // comment it - if track_id is not required
                draw_boxes(mat_img, result_vec, obj_names);
                cv::imshow("window name", mat_img);
                show_console_result(result_vec, obj_names);
                cv::waitKey(0);
            }
#else
            //std::vector<bbox_t> result_vec = detector.detect(filename);
            //std::vector<bbox_t> result_vec = detector.detect(filename);

            auto img = detector.load_image(filename);
            std::vector<bbox_t> result_vec = detector.detect(img);
            detector.free_image(img);
            show_console_result(result_vec, obj_names);
#endif			
        }
        catch (std::exception &e) { std::cerr << "exception: " << e.what() << "\n"; getchar(); }
        catch (...) { std::cerr << "unknown exception \n"; getchar(); }
        filename.clear();
    }
            auto img = detector.load_image(filename);
            std::vector<bbox_t> result_vec = detector.detect(img);
            detector.free_image(img);
            show_console_result(result_vec, obj_names);
#endif            
        }
        catch (std::exception &e) { std::cerr << "exception: " << e.what() << "\n"; getchar(); }
        catch (...) { std::cerr << "unknown exception \n"; getchar(); }
        filename.clear();
    }

    return 0;
    return 0;
}

 src/yolo_layer.c

@@ -38,8 +38,8 @@
    l.bias_updates = calloc(n*2, sizeof(float));
    l.outputs = h*w*n*(classes + 4 + 1);
    l.inputs = l.outputs;
    l.max_boxes = max_boxes;
    l.truths = l.max_boxes*(4 + 1); // 90*(4 + 1);
    l.max_boxes = max_boxes;
    l.truths = l.max_boxes*(4 + 1);    // 90*(4 + 1);
    l.delta = calloc(batch*l.outputs, sizeof(float));
    l.output = calloc(batch*l.outputs, sizeof(float));
    for(i = 0; i < total*2; ++i){
@@ -117,33 +117,33 @@
        if(avg_cat) *avg_cat += output[index + stride*class_id];
        return;
    }
    // Focal loss
    if (focal_loss) {
        // Focal Loss
        float alpha = 0.5;  // 0.25 or 0.5
        //float gamma = 2;  // hardcoded in many places of the grad-formula	
    // Focal loss
    if (focal_loss) {
        // Focal Loss
        float alpha = 0.5;    // 0.25 or 0.5
        //float gamma = 2;    // hardcoded in many places of the grad-formula    

        int ti = index + stride*class_id;
        float pt = output[ti] + 0.000000000000001F;
        // http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
        float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1);    // http://blog.csdn.net/linmingan/article/details/77885832	
        //float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1);   // https://github.com/unsky/focal-loss
        int ti = index + stride*class_id;
        float pt = output[ti] + 0.000000000000001F;
        // http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
        float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1);    // http://blog.csdn.net/linmingan/article/details/77885832    
        //float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1);    // https://github.com/unsky/focal-loss

        for (n = 0; n < classes; ++n) {
            delta[index + stride*n] = (((n == class_id) ? 1 : 0) - output[index + stride*n]);
        for (n = 0; n < classes; ++n) {
            delta[index + stride*n] = (((n == class_id) ? 1 : 0) - output[index + stride*n]);

            delta[index + stride*n] *= alpha*grad;
            delta[index + stride*n] *= alpha*grad;

            if (n == class_id) *avg_cat += output[index + stride*n];
        }
    }
    else {
        // default
        for (n = 0; n < classes; ++n) {
            delta[index + stride*n] = ((n == class_id) ? 1 : 0) - output[index + stride*n];
            if (n == class_id && avg_cat) *avg_cat += output[index + stride*n];
        }
    }
            if (n == class_id) *avg_cat += output[index + stride*n];
        }
    }
    else {
        // default
        for (n = 0; n < classes; ++n) {
            delta[index + stride*n] = ((n == class_id) ? 1 : 0) - output[index + stride*n];
            if (n == class_id && avg_cat) *avg_cat += output[index + stride*n];
        }
    }
}

static int entry_index(layer l, int batch, int location, int entry)
@@ -155,12 +155,12 @@

/*
 * Builds a box from four floats spaced `stride` elements apart:
 * x = f[0], y = f[stride], w = f[2*stride], h = f[3*stride].
 * The caller must guarantee that f[3*stride] is readable.
 */
static box float_to_box_stride(float *f, int stride)
{
    box b = { 0 };
    b.x = f[0];
    b.y = f[1 * stride];
    b.w = f[2 * stride];
    b.h = f[3 * stride];
    return b;
}

void forward_yolo_layer(const layer l, network_state state)
@@ -200,12 +200,12 @@
                    int best_t = 0;
                    for(t = 0; t < l.max_boxes; ++t){
                        box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
                        int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                        if (class_id >= l.classes) {
                            printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
                            getchar();
                            continue; // if label contains class_id more than number of classes in the cfg-file
                        }
                        int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                        if (class_id >= l.classes) {
                            printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
                            getchar();
                            continue; // if label contains class_id more than number of classes in the cfg-file
                        }
                        if(!truth.x) break;
                        float iou = box_iou(pred, truth);
                        if (iou > best_iou) {
@@ -234,8 +234,8 @@
        }
        for(t = 0; t < l.max_boxes; ++t){
            box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
            int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
            if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
            int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
            if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file

            if(!truth.x) break;
            float best_iou = 0;
@@ -291,20 +291,20 @@
    int i;
    int new_w=0;
    int new_h=0;
    if (letter) {
        if (((float)netw / w) < ((float)neth / h)) {
            new_w = netw;
            new_h = (h * netw) / w;
        }
        else {
            new_h = neth;
            new_w = (w * neth) / h;
        }
    }
    else {
        new_w = netw;
        new_h = neth;
    }
    if (letter) {
        if (((float)netw / w) < ((float)neth / h)) {
            new_w = netw;
            new_h = (h * netw) / w;
        }
        else {
            new_h = neth;
            new_w = (w * neth) / h;
        }
    }
    else {
        new_w = netw;
        new_h = neth;
    }
    for (i = 0; i < n; ++i){
        box b = dets[i].bbox;
        b.x =  (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw); 
@@ -411,25 +411,25 @@
    }

    //cuda_pull_array(l.output_gpu, state.input, l.batch*l.inputs);
    float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
    cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
    float *truth_cpu = 0;
    if (state.truth) {
        int num_truth = l.batch*l.truths;
        truth_cpu = calloc(num_truth, sizeof(float));
        cuda_pull_array(state.truth, truth_cpu, num_truth);
    }
    network_state cpu_state = state;
    cpu_state.net = state.net;
    cpu_state.index = state.index;
    cpu_state.train = state.train;
    cpu_state.truth = truth_cpu;
    cpu_state.input = in_cpu;
    forward_yolo_layer(l, cpu_state);
    float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
    cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
    float *truth_cpu = 0;
    if (state.truth) {
        int num_truth = l.batch*l.truths;
        truth_cpu = calloc(num_truth, sizeof(float));
        cuda_pull_array(state.truth, truth_cpu, num_truth);
    }
    network_state cpu_state = state;
    cpu_state.net = state.net;
    cpu_state.index = state.index;
    cpu_state.train = state.train;
    cpu_state.truth = truth_cpu;
    cpu_state.input = in_cpu;
    forward_yolo_layer(l, cpu_state);
    //forward_yolo_layer(l, state);
    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
    free(in_cpu);
    if (cpu_state.truth) free(cpu_state.truth);
    free(in_cpu);
    if (cpu_state.truth) free(cpu_state.truth);
}

void backward_yolo_layer_gpu(const layer l, network_state state)

 src/yolo_v2_class.cpp

@@ -50,310 +50,310 @@
    return detection.size();
#else
    return -1;
#endif  // OPENCV
#endif    // OPENCV
}

// Releases the detector object held in `detector` by calling reset() on it.
// Always returns 1.
int dispose() {
    // Earlier raw-pointer cleanup, kept for reference:
    //if (detector != NULL) delete detector;
    //detector = NULL;
    detector.reset();
    return 1;
}

#ifdef GPU
void check_cuda(cudaError_t status) {
    if (status != cudaSuccess) {
        const char *s = cudaGetErrorString(status);
        printf("CUDA Error Prev: %s\n", s);
    }
    if (status != cudaSuccess) {
        const char *s = cudaGetErrorString(status);
        printf("CUDA Error Prev: %s\n", s);
    }
}
#endif

struct detector_gpu_t {
    network net;
    image images[FRAMES];
    float *avg;
    float *predictions[FRAMES];
    int demo_index;
    unsigned int *track_id;
    network net;
    image images[FRAMES];
    float *avg;
    float *predictions[FRAMES];
    int demo_index;
    unsigned int *track_id;
};

YOLODLL_API Detector::Detector(std::string cfg_filename, std::string weight_filename, int gpu_id) : cur_gpu_id(gpu_id)
{
    wait_stream = 0;
    int old_gpu_index;
    wait_stream = 0;
    int old_gpu_index;
#ifdef GPU
    check_cuda( cudaGetDevice(&old_gpu_index) );
    check_cuda( cudaGetDevice(&old_gpu_index) );
#endif

    detector_gpu_ptr = std::make_shared<detector_gpu_t>();
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    detector_gpu_ptr = std::make_shared<detector_gpu_t>();
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());

#ifdef GPU
    //check_cuda( cudaSetDevice(cur_gpu_id) );
    cuda_set_device(cur_gpu_id);
    printf(" Used GPU %d \n", cur_gpu_id);
    //check_cuda( cudaSetDevice(cur_gpu_id) );
    cuda_set_device(cur_gpu_id);
    printf(" Used GPU %d \n", cur_gpu_id);
#endif
    network &net = detector_gpu.net;
    net.gpu_index = cur_gpu_id;
    //gpu_index = i;
	
    char *cfgfile = const_cast<char *>(cfg_filename.data());
    char *weightfile = const_cast<char *>(weight_filename.data());
    network &net = detector_gpu.net;
    net.gpu_index = cur_gpu_id;
    //gpu_index = i;
    
    char *cfgfile = const_cast<char *>(cfg_filename.data());
    char *weightfile = const_cast<char *>(weight_filename.data());

    net = parse_network_cfg_custom(cfgfile, 1);
    if (weightfile) {
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);
    net.gpu_index = cur_gpu_id;
    fuse_conv_batchnorm(net);
    net = parse_network_cfg_custom(cfgfile, 1);
    if (weightfile) {
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);
    net.gpu_index = cur_gpu_id;
    fuse_conv_batchnorm(net);

    layer l = net.layers[net.n - 1];
    int j;
    layer l = net.layers[net.n - 1];
    int j;

    detector_gpu.avg = (float *)calloc(l.outputs, sizeof(float));
    for (j = 0; j < FRAMES; ++j) detector_gpu.predictions[j] = (float *)calloc(l.outputs, sizeof(float));
    for (j = 0; j < FRAMES; ++j) detector_gpu.images[j] = make_image(1, 1, 3);
    detector_gpu.avg = (float *)calloc(l.outputs, sizeof(float));
    for (j = 0; j < FRAMES; ++j) detector_gpu.predictions[j] = (float *)calloc(l.outputs, sizeof(float));
    for (j = 0; j < FRAMES; ++j) detector_gpu.images[j] = make_image(1, 1, 3);

    detector_gpu.track_id = (unsigned int *)calloc(l.classes, sizeof(unsigned int));
    for (j = 0; j < l.classes; ++j) detector_gpu.track_id[j] = 1;
    detector_gpu.track_id = (unsigned int *)calloc(l.classes, sizeof(unsigned int));
    for (j = 0; j < l.classes; ++j) detector_gpu.track_id[j] = 1;

#ifdef GPU
    check_cuda( cudaSetDevice(old_gpu_index) );
    check_cuda( cudaSetDevice(old_gpu_index) );
#endif
}


YOLODLL_API Detector::~Detector() 
{
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    layer l = detector_gpu.net.layers[detector_gpu.net.n - 1];
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    layer l = detector_gpu.net.layers[detector_gpu.net.n - 1];

    free(detector_gpu.track_id);
    free(detector_gpu.track_id);

    free(detector_gpu.avg);
    for (int j = 0; j < FRAMES; ++j) free(detector_gpu.predictions[j]);
    for (int j = 0; j < FRAMES; ++j) if(detector_gpu.images[j].data) free(detector_gpu.images[j].data);
    free(detector_gpu.avg);
    for (int j = 0; j < FRAMES; ++j) free(detector_gpu.predictions[j]);
    for (int j = 0; j < FRAMES; ++j) if(detector_gpu.images[j].data) free(detector_gpu.images[j].data);

    int old_gpu_index;
    int old_gpu_index;
#ifdef GPU
    cudaGetDevice(&old_gpu_index);
    cuda_set_device(detector_gpu.net.gpu_index);
    cudaGetDevice(&old_gpu_index);
    cuda_set_device(detector_gpu.net.gpu_index);
#endif

    free_network(detector_gpu.net);
    free_network(detector_gpu.net);

#ifdef GPU
    cudaSetDevice(old_gpu_index);
    cudaSetDevice(old_gpu_index);
#endif
}

YOLODLL_API int Detector::get_net_width() const {
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    return detector_gpu.net.w;
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    return detector_gpu.net.w;
}
YOLODLL_API int Detector::get_net_height() const {
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    return detector_gpu.net.h;
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    return detector_gpu.net.h;
}
YOLODLL_API int Detector::get_net_color_depth() const {
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    return detector_gpu.net.c;
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    return detector_gpu.net.c;
}


YOLODLL_API std::vector<bbox_t> Detector::detect(std::string image_filename, float thresh, bool use_mean)
{
    std::shared_ptr<image_t> image_ptr(new image_t, [](image_t *img) { if (img->data) free(img->data); delete img; });
    *image_ptr = load_image(image_filename);
    return detect(*image_ptr, thresh, use_mean);
    std::shared_ptr<image_t> image_ptr(new image_t, [](image_t *img) { if (img->data) free(img->data); delete img; });
    *image_ptr = load_image(image_filename);
    return detect(*image_ptr, thresh, use_mean);
}

static image load_image_stb(char *filename, int channels)
{
    int w, h, c;
    unsigned char *data = stbi_load(filename, &w, &h, &c, channels);
    if (!data) 
        throw std::runtime_error("file not found");
    if (channels) c = channels;
    int i, j, k;
    image im = make_image(w, h, c);
    for (k = 0; k < c; ++k) {
        for (j = 0; j < h; ++j) {
            for (i = 0; i < w; ++i) {
                int dst_index = i + w*j + w*h*k;
                int src_index = k + c*i + c*w*j;
                im.data[dst_index] = (float)data[src_index] / 255.;
            }
        }
    }
    free(data);
    return im;
    int w, h, c;
    unsigned char *data = stbi_load(filename, &w, &h, &c, channels);
    if (!data) 
        throw std::runtime_error("file not found");
    if (channels) c = channels;
    int i, j, k;
    image im = make_image(w, h, c);
    for (k = 0; k < c; ++k) {
        for (j = 0; j < h; ++j) {
            for (i = 0; i < w; ++i) {
                int dst_index = i + w*j + w*h*k;
                int src_index = k + c*i + c*w*j;
                im.data[dst_index] = (float)data[src_index] / 255.;
            }
        }
    }
    free(data);
    return im;
}

YOLODLL_API image_t Detector::load_image(std::string image_filename)
{
    char *input = const_cast<char *>(image_filename.data());
    image im = load_image_stb(input, 3);
    char *input = const_cast<char *>(image_filename.data());
    image im = load_image_stb(input, 3);

    image_t img;
    img.c = im.c;
    img.data = im.data;
    img.h = im.h;
    img.w = im.w;
    image_t img;
    img.c = im.c;
    img.data = im.data;
    img.h = im.h;
    img.w = im.w;

    return img;
    return img;
}


YOLODLL_API void Detector::free_image(image_t m)
{
    if (m.data) {
        free(m.data);
    }
    if (m.data) {
        free(m.data);
    }
}

YOLODLL_API std::vector<bbox_t> Detector::detect(image_t img, float thresh, bool use_mean)
{
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    network &net = detector_gpu.net;
    int old_gpu_index;
    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    network &net = detector_gpu.net;
    int old_gpu_index;
#ifdef GPU
    cudaGetDevice(&old_gpu_index);
    if(cur_gpu_id != old_gpu_index)
        cudaSetDevice(net.gpu_index);
    cudaGetDevice(&old_gpu_index);
    if(cur_gpu_id != old_gpu_index)
        cudaSetDevice(net.gpu_index);

    net.wait_stream = wait_stream;  // 1 - wait CUDA-stream, 0 - not to wait
    net.wait_stream = wait_stream;    // 1 - wait CUDA-stream, 0 - not to wait
#endif
    //std::cout << "net.gpu_index = " << net.gpu_index << std::endl;
    //std::cout << "net.gpu_index = " << net.gpu_index << std::endl;

    //float nms = .4;
    //float nms = .4;

    image im;
    im.c = img.c;
    im.data = img.data;
    im.h = img.h;
    im.w = img.w;
    image im;
    im.c = img.c;
    im.data = img.data;
    im.h = img.h;
    im.w = img.w;

    image sized;
	
    if (net.w == im.w && net.h == im.h) {
        sized = make_image(im.w, im.h, im.c);
        memcpy(sized.data, im.data, im.w*im.h*im.c * sizeof(float));
    }
    else
        sized = resize_image(im, net.w, net.h);
    image sized;
    
    if (net.w == im.w && net.h == im.h) {
        sized = make_image(im.w, im.h, im.c);
        memcpy(sized.data, im.data, im.w*im.h*im.c * sizeof(float));
    }
    else
        sized = resize_image(im, net.w, net.h);

    layer l = net.layers[net.n - 1];
    layer l = net.layers[net.n - 1];

    float *X = sized.data;
    float *X = sized.data;

    float *prediction = network_predict(net, X);
    float *prediction = network_predict(net, X);

    if (use_mean) {
        memcpy(detector_gpu.predictions[detector_gpu.demo_index], prediction, l.outputs * sizeof(float));
        mean_arrays(detector_gpu.predictions, FRAMES, l.outputs, detector_gpu.avg);
        l.output = detector_gpu.avg;
        detector_gpu.demo_index = (detector_gpu.demo_index + 1) % FRAMES;
    }
    //get_region_boxes(l, 1, 1, thresh, detector_gpu.probs, detector_gpu.boxes, 0, 0);
    //if (nms) do_nms_sort(detector_gpu.boxes, detector_gpu.probs, l.w*l.h*l.n, l.classes, nms);
    if (use_mean) {
        memcpy(detector_gpu.predictions[detector_gpu.demo_index], prediction, l.outputs * sizeof(float));
        mean_arrays(detector_gpu.predictions, FRAMES, l.outputs, detector_gpu.avg);
        l.output = detector_gpu.avg;
        detector_gpu.demo_index = (detector_gpu.demo_index + 1) % FRAMES;
    }
    //get_region_boxes(l, 1, 1, thresh, detector_gpu.probs, detector_gpu.boxes, 0, 0);
    //if (nms) do_nms_sort(detector_gpu.boxes, detector_gpu.probs, l.w*l.h*l.n, l.classes, nms);

    int nboxes = 0;
    int letterbox = 0;
    float hier_thresh = 0.5;
    detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letterbox);
    if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
    int nboxes = 0;
    int letterbox = 0;
    float hier_thresh = 0.5;
    detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letterbox);
    if (nms) do_nms_sort(dets, nboxes, l.classes, nms);

    std::vector<bbox_t> bbox_vec;
    std::vector<bbox_t> bbox_vec;

    for (size_t i = 0; i < nboxes; ++i) {
        box b = dets[i].bbox;
        int const obj_id = max_index(dets[i].prob, l.classes);
        float const prob = dets[i].prob[obj_id];
		
        if (prob > thresh) 
        {
            bbox_t bbox;
            bbox.x = std::max((double)0, (b.x - b.w / 2.)*im.w);
            bbox.y = std::max((double)0, (b.y - b.h / 2.)*im.h);
            bbox.w = b.w*im.w;
            bbox.h = b.h*im.h;
            bbox.obj_id = obj_id;
            bbox.prob = prob;
            bbox.track_id = 0;
    for (size_t i = 0; i < nboxes; ++i) {
        box b = dets[i].bbox;
        int const obj_id = max_index(dets[i].prob, l.classes);
        float const prob = dets[i].prob[obj_id];
        
        if (prob > thresh) 
        {
            bbox_t bbox;
            bbox.x = std::max((double)0, (b.x - b.w / 2.)*im.w);
            bbox.y = std::max((double)0, (b.y - b.h / 2.)*im.h);
            bbox.w = b.w*im.w;
            bbox.h = b.h*im.h;
            bbox.obj_id = obj_id;
            bbox.prob = prob;
            bbox.track_id = 0;

            bbox_vec.push_back(bbox);
        }
    }
            bbox_vec.push_back(bbox);
        }
    }

    free_detections(dets, nboxes);
    if(sized.data)
        free(sized.data);
    free_detections(dets, nboxes);
    if(sized.data)
        free(sized.data);

#ifdef GPU
    if (cur_gpu_id != old_gpu_index)
        cudaSetDevice(old_gpu_index);
    if (cur_gpu_id != old_gpu_index)
        cudaSetDevice(old_gpu_index);
#endif

    return bbox_vec;
    return bbox_vec;
}

YOLODLL_API std::vector<bbox_t> Detector::tracking_id(std::vector<bbox_t> cur_bbox_vec, bool const change_history, 
    int const frames_story, int const max_dist)
    int const frames_story, int const max_dist)
{
    detector_gpu_t &det_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
    detector_gpu_t &det_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());

    bool prev_track_id_present = false;
    for (auto &i : prev_bbox_vec_deque)
        if (i.size() > 0) prev_track_id_present = true;
    bool prev_track_id_present = false;
    for (auto &i : prev_bbox_vec_deque)
        if (i.size() > 0) prev_track_id_present = true;

    if (!prev_track_id_present) {
        for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
            cur_bbox_vec[i].track_id = det_gpu.track_id[cur_bbox_vec[i].obj_id]++;
        prev_bbox_vec_deque.push_front(cur_bbox_vec);
        if (prev_bbox_vec_deque.size() > frames_story) prev_bbox_vec_deque.pop_back();
        return cur_bbox_vec;
    }
    if (!prev_track_id_present) {
        for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
            cur_bbox_vec[i].track_id = det_gpu.track_id[cur_bbox_vec[i].obj_id]++;
        prev_bbox_vec_deque.push_front(cur_bbox_vec);
        if (prev_bbox_vec_deque.size() > frames_story) prev_bbox_vec_deque.pop_back();
        return cur_bbox_vec;
    }

    std::vector<unsigned int> dist_vec(cur_bbox_vec.size(), std::numeric_limits<unsigned int>::max());
    std::vector<unsigned int> dist_vec(cur_bbox_vec.size(), std::numeric_limits<unsigned int>::max());

    for (auto &prev_bbox_vec : prev_bbox_vec_deque) {
        for (auto &i : prev_bbox_vec) {
            int cur_index = -1;
            for (size_t m = 0; m < cur_bbox_vec.size(); ++m) {
                bbox_t const& k = cur_bbox_vec[m];
                if (i.obj_id == k.obj_id) {
                    float center_x_diff = (float)(i.x + i.w/2) - (float)(k.x + k.w/2);
                    float center_y_diff = (float)(i.y + i.h/2) - (float)(k.y + k.h/2);
                    unsigned int cur_dist = sqrt(center_x_diff*center_x_diff + center_y_diff*center_y_diff);
                    if (cur_dist < max_dist && (k.track_id == 0 || dist_vec[m] > cur_dist)) {
                        dist_vec[m] = cur_dist;
                        cur_index = m;
                    }
                }
            }
    for (auto &prev_bbox_vec : prev_bbox_vec_deque) {
        for (auto &i : prev_bbox_vec) {
            int cur_index = -1;
            for (size_t m = 0; m < cur_bbox_vec.size(); ++m) {
                bbox_t const& k = cur_bbox_vec[m];
                if (i.obj_id == k.obj_id) {
                    float center_x_diff = (float)(i.x + i.w/2) - (float)(k.x + k.w/2);
                    float center_y_diff = (float)(i.y + i.h/2) - (float)(k.y + k.h/2);
                    unsigned int cur_dist = sqrt(center_x_diff*center_x_diff + center_y_diff*center_y_diff);
                    if (cur_dist < max_dist && (k.track_id == 0 || dist_vec[m] > cur_dist)) {
                        dist_vec[m] = cur_dist;
                        cur_index = m;
                    }
                }
            }

            bool track_id_absent = !std::any_of(cur_bbox_vec.begin(), cur_bbox_vec.end(), 
                [&i](bbox_t const& b) { return b.track_id == i.track_id && b.obj_id == i.obj_id; });
            bool track_id_absent = !std::any_of(cur_bbox_vec.begin(), cur_bbox_vec.end(), 
                [&i](bbox_t const& b) { return b.track_id == i.track_id && b.obj_id == i.obj_id; });

            if (cur_index >= 0 && track_id_absent){
                cur_bbox_vec[cur_index].track_id = i.track_id;
                cur_bbox_vec[cur_index].w = (cur_bbox_vec[cur_index].w + i.w) / 2;
                cur_bbox_vec[cur_index].h = (cur_bbox_vec[cur_index].h + i.h) / 2;
            }
        }
    }
            if (cur_index >= 0 && track_id_absent){
                cur_bbox_vec[cur_index].track_id = i.track_id;
                cur_bbox_vec[cur_index].w = (cur_bbox_vec[cur_index].w + i.w) / 2;
                cur_bbox_vec[cur_index].h = (cur_bbox_vec[cur_index].h + i.h) / 2;
            }
        }
    }

    for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
        if (cur_bbox_vec[i].track_id == 0)
            cur_bbox_vec[i].track_id = det_gpu.track_id[cur_bbox_vec[i].obj_id]++;
    for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
        if (cur_bbox_vec[i].track_id == 0)
            cur_bbox_vec[i].track_id = det_gpu.track_id[cur_bbox_vec[i].obj_id]++;

    if (change_history) {
        prev_bbox_vec_deque.push_front(cur_bbox_vec);
        if (prev_bbox_vec_deque.size() > frames_story) prev_bbox_vec_deque.pop_back();
    }
    if (change_history) {
        prev_bbox_vec_deque.push_front(cur_bbox_vec);
        if (prev_bbox_vec_deque.size() > frames_story) prev_bbox_vec_deque.pop_back();
    }

    return cur_bbox_vec;
    return cur_bbox_vec;
}

 src/yolo_v2_class.hpp

@@ -14,18 +14,18 @@
#endif

// One detected (and optionally tracked) object, expressed in pixel units.
struct bbox_t {
    unsigned int x, y, w, h;    // (x,y) - top-left corner, (w, h) - width & height of the bounding box
    float prob;                 // confidence - probability that the object was found correctly
    unsigned int obj_id;        // class of object - from range [0, classes-1]
    unsigned int track_id;      // tracking id for video (0 - untracked, 1 - inf - tracked object)
    unsigned int frames_counter;// counter of frames on which the object was detected
    unsigned int x, y, w, h;    // (x,y) - top-left corner, (w, h) - width & height of the bounding box
    float prob;                    // confidence - probability that the object was found correctly
    unsigned int obj_id;        // class of object - from range [0, classes-1]
    unsigned int track_id;        // tracking id for video (0 - untracked, 1 - inf - tracked object)
    unsigned int frames_counter;// counter of frames on which the object was detected
};

// Plain image descriptor: dimensions plus a pointer to float pixel data.
struct image_t {
    int h;                      // height
    int w;                      // width
    int c;                      // number of channels (3 - for RGB)
    float *data;                // pointer to the image data
    int h;                        // height
    int w;                        // width
    int c;                        // number of channels (3 - for RGB)
    float *data;                // pointer to the image data
};

#define C_SHARP_MAX_OBJECTS 1000
@@ -40,10 +40,10 @@
#include <algorithm>

#ifdef OPENCV
#include <opencv2/opencv.hpp>           // C++
#include "opencv2/highgui/highgui_c.h"  // C
#include "opencv2/imgproc/imgproc_c.h"  // C
#endif  // OPENCV
#include <opencv2/opencv.hpp>            // C++
#include "opencv2/highgui/highgui_c.h"    // C
#include "opencv2/imgproc/imgproc_c.h"    // C
#endif    // OPENCV

extern "C" YOLODLL_API int init(const char *configurationFilename, const char *weightsFilename, int gpu);
extern "C" YOLODLL_API int detect_image(const char *filename, bbox_t_container &container);
@@ -51,106 +51,106 @@
extern "C" YOLODLL_API int dispose();

class Detector {
    std::shared_ptr<void> detector_gpu_ptr;
    std::deque<std::vector<bbox_t>> prev_bbox_vec_deque;
    const int cur_gpu_id;
    std::shared_ptr<void> detector_gpu_ptr;
    std::deque<std::vector<bbox_t>> prev_bbox_vec_deque;
    const int cur_gpu_id;
public:
    float nms = .4;
    bool wait_stream;
    float nms = .4;
    bool wait_stream;

    YOLODLL_API Detector(std::string cfg_filename, std::string weight_filename, int gpu_id = 0);
    YOLODLL_API ~Detector();
    YOLODLL_API Detector(std::string cfg_filename, std::string weight_filename, int gpu_id = 0);
    YOLODLL_API ~Detector();

    YOLODLL_API std::vector<bbox_t> detect(std::string image_filename, float thresh = 0.2, bool use_mean = false);
    YOLODLL_API std::vector<bbox_t> detect(image_t img, float thresh = 0.2, bool use_mean = false);
    static YOLODLL_API image_t load_image(std::string image_filename);
    static YOLODLL_API void free_image(image_t m);
    YOLODLL_API int get_net_width() const;
    YOLODLL_API int get_net_height() const;
    YOLODLL_API int get_net_color_depth() const;
    YOLODLL_API std::vector<bbox_t> detect(std::string image_filename, float thresh = 0.2, bool use_mean = false);
    YOLODLL_API std::vector<bbox_t> detect(image_t img, float thresh = 0.2, bool use_mean = false);
    static YOLODLL_API image_t load_image(std::string image_filename);
    static YOLODLL_API void free_image(image_t m);
    YOLODLL_API int get_net_width() const;
    YOLODLL_API int get_net_height() const;
    YOLODLL_API int get_net_color_depth() const;

    YOLODLL_API std::vector<bbox_t> tracking_id(std::vector<bbox_t> cur_bbox_vec, bool const change_history = true, 
                                                int const frames_story = 10, int const max_dist = 150);
    YOLODLL_API std::vector<bbox_t> tracking_id(std::vector<bbox_t> cur_bbox_vec, bool const change_history = true, 
                                                int const frames_story = 10, int const max_dist = 150);

    std::vector<bbox_t> detect_resized(image_t img, int init_w, int init_h, float thresh = 0.2, bool use_mean = false)
    {
        if (img.data == NULL)
            throw std::runtime_error("Image is empty");
        auto detection_boxes = detect(img, thresh, use_mean);
        float wk = (float)init_w / img.w, hk = (float)init_h / img.h;
        for (auto &i : detection_boxes) i.x *= wk, i.w *= wk, i.y *= hk, i.h *= hk;
        return detection_boxes;
    }
    std::vector<bbox_t> detect_resized(image_t img, int init_w, int init_h, float thresh = 0.2, bool use_mean = false)
    {
        if (img.data == NULL)
            throw std::runtime_error("Image is empty");
        auto detection_boxes = detect(img, thresh, use_mean);
        float wk = (float)init_w / img.w, hk = (float)init_h / img.h;
        for (auto &i : detection_boxes) i.x *= wk, i.w *= wk, i.y *= hk, i.h *= hk;
        return detection_boxes;
    }

#ifdef OPENCV
    std::vector<bbox_t> detect(cv::Mat mat, float thresh = 0.2, bool use_mean = false)
    {
        if(mat.data == NULL)
            throw std::runtime_error("Image is empty");
        auto image_ptr = mat_to_image_resize(mat);
        return detect_resized(*image_ptr, mat.cols, mat.rows, thresh, use_mean);
    }
    std::vector<bbox_t> detect(cv::Mat mat, float thresh = 0.2, bool use_mean = false)
    {
        if(mat.data == NULL)
            throw std::runtime_error("Image is empty");
        auto image_ptr = mat_to_image_resize(mat);
        return detect_resized(*image_ptr, mat.cols, mat.rows, thresh, use_mean);
    }

    std::shared_ptr<image_t> mat_to_image_resize(cv::Mat mat) const
    {
        if (mat.data == NULL) return std::shared_ptr<image_t>(NULL);
        cv::Mat det_mat;
        cv::resize(mat, det_mat, cv::Size(get_net_width(), get_net_height()));
        return mat_to_image(det_mat);
    }
    std::shared_ptr<image_t> mat_to_image_resize(cv::Mat mat) const
    {
        if (mat.data == NULL) return std::shared_ptr<image_t>(NULL);
        cv::Mat det_mat;
        cv::resize(mat, det_mat, cv::Size(get_net_width(), get_net_height()));
        return mat_to_image(det_mat);
    }

    static std::shared_ptr<image_t> mat_to_image(cv::Mat img_src)
    {
        cv::Mat img;
        cv::cvtColor(img_src, img, cv::COLOR_RGB2BGR);
        std::shared_ptr<image_t> image_ptr(new image_t, [](image_t *img) { free_image(*img); delete img; });
        std::shared_ptr<IplImage> ipl_small = std::make_shared<IplImage>(img);
        *image_ptr = ipl_to_image(ipl_small.get());
        return image_ptr;
    }
    static std::shared_ptr<image_t> mat_to_image(cv::Mat img_src)
    {
        cv::Mat img;
        cv::cvtColor(img_src, img, cv::COLOR_RGB2BGR);
        std::shared_ptr<image_t> image_ptr(new image_t, [](image_t *img) { free_image(*img); delete img; });
        std::shared_ptr<IplImage> ipl_small = std::make_shared<IplImage>(img);
        *image_ptr = ipl_to_image(ipl_small.get());
        return image_ptr;
    }

private:

    // Copies the pixels of an IplImage into a newly allocated image_t.
    // The source bytes are read per pixel (index j*c + k inside a row of
    // widthStep bytes) and written channel by channel, row by row, each value
    // divided by 255. The buffer comes from make_image_custom().
    static image_t ipl_to_image(IplImage* src)
    {
        unsigned char *data = (unsigned char *)src->imageData;
        int h = src->height;
        int w = src->width;
        int c = src->nChannels;
        int step = src->widthStep;
        image_t out = make_image_custom(w, h, c);
        int count = 0;
    static image_t ipl_to_image(IplImage* src)
    {
        unsigned char *data = (unsigned char *)src->imageData;
        int h = src->height;
        int w = src->width;
        int c = src->nChannels;
        int step = src->widthStep;
        image_t out = make_image_custom(w, h, c);
        int count = 0;

        // k = channel, i = row, j = column; count walks the output linearly.
        // NOTE(review): out.data is not checked for NULL before being written.
        for (int k = 0; k < c; ++k) {
            for (int i = 0; i < h; ++i) {
                int i_step = i*step;
                for (int j = 0; j < w; ++j) {
                    out.data[count++] = data[i_step + j*c + k] / 255.;
                }
            }
        }
        for (int k = 0; k < c; ++k) {
            for (int i = 0; i < h; ++i) {
                int i_step = i*step;
                for (int j = 0; j < w; ++j) {
                    out.data[count++] = data[i_step + j*c + k] / 255.;
                }
            }
        }

        return out;
    }
        return out;
    }

    static image_t make_empty_image(int w, int h, int c)
    {
        image_t out;
        out.data = 0;
        out.h = h;
        out.w = w;
        out.c = c;
        return out;
    }
    static image_t make_empty_image(int w, int h, int c)
    {
        image_t out;
        out.data = 0;
        out.h = h;
        out.w = w;
        out.c = c;
        return out;
    }

    static image_t make_image_custom(int w, int h, int c)
    {
        image_t out = make_empty_image(w, h, c);
        out.data = (float *)calloc(h*w*c, sizeof(float));
        return out;
    }
    static image_t make_image_custom(int w, int h, int c)
    {
        image_t out = make_empty_image(w, h, c);
        out.data = (float *)calloc(h*w*c, sizeof(float));
        return out;
    }

#endif  // OPENCV
#endif    // OPENCV

};

@@ -165,170 +165,170 @@

class Tracker_optflow {
public:
    const int gpu_count;
    const int gpu_id;
    const int flow_error;
    const int gpu_count;
    const int gpu_id;
    const int flow_error;


    // Creates the CUDA sparse pyramidal Lucas-Kanade tracker.
    //   _gpu_id     - requested device, clamped to gpu_count-1
    //   win_size    - side of the square search window
    //   max_level   - value passed to setMaxLevel()
    //   iterations  - value passed to setNumIters()
    //   _flow_error - error threshold; non-positive selects win_size*4
    // The previously active CUDA device is restored before returning.
    // NOTE(review): if getCudaEnabledDeviceCount() returns 0, gpu_id becomes -1 - confirm callers never hit this.
    Tracker_optflow(int _gpu_id = 0, int win_size = 9, int max_level = 3, int iterations = 8000, int _flow_error = -1) :
        gpu_count(cv::cuda::getCudaEnabledDeviceCount()), gpu_id(std::min(_gpu_id, gpu_count-1)),
        flow_error((_flow_error > 0)? _flow_error:(win_size*4))
    {
        int const old_gpu_id = cv::cuda::getDevice();
        cv::cuda::setDevice(gpu_id);
    Tracker_optflow(int _gpu_id = 0, int win_size = 9, int max_level = 3, int iterations = 8000, int _flow_error = -1) :
        gpu_count(cv::cuda::getCudaEnabledDeviceCount()), gpu_id(std::min(_gpu_id, gpu_count-1)),
        flow_error((_flow_error > 0)? _flow_error:(win_size*4))
    {
        int const old_gpu_id = cv::cuda::getDevice();
        cv::cuda::setDevice(gpu_id);

        // The stream is (re)created while the target device is current.
        stream = cv::cuda::Stream();
        stream = cv::cuda::Stream();

        sync_PyrLKOpticalFlow_gpu = cv::cuda::SparsePyrLKOpticalFlow::create();
        sync_PyrLKOpticalFlow_gpu->setWinSize(cv::Size(win_size, win_size));    // 9, 15, 21, 31
        sync_PyrLKOpticalFlow_gpu->setMaxLevel(max_level);      // +- 3 pt
        sync_PyrLKOpticalFlow_gpu->setNumIters(iterations); // 2000, def: 30
        sync_PyrLKOpticalFlow_gpu = cv::cuda::SparsePyrLKOpticalFlow::create();
        sync_PyrLKOpticalFlow_gpu->setWinSize(cv::Size(win_size, win_size));    // 9, 15, 21, 31
        sync_PyrLKOpticalFlow_gpu->setMaxLevel(max_level);        // +- 3 pt
        sync_PyrLKOpticalFlow_gpu->setNumIters(iterations);    // 2000, def: 30

        cv::cuda::setDevice(old_gpu_id);
    }
        cv::cuda::setDevice(old_gpu_id);
    }

    // just to avoid extra allocations
    cv::cuda::GpuMat src_mat_gpu;
    cv::cuda::GpuMat dst_mat_gpu, dst_grey_gpu;
    cv::cuda::GpuMat prev_pts_flow_gpu, cur_pts_flow_gpu;
    cv::cuda::GpuMat status_gpu, err_gpu;
    // just to avoid extra allocations
    cv::cuda::GpuMat src_mat_gpu;
    cv::cuda::GpuMat dst_mat_gpu, dst_grey_gpu;
    cv::cuda::GpuMat prev_pts_flow_gpu, cur_pts_flow_gpu;
    cv::cuda::GpuMat status_gpu, err_gpu;

    cv::cuda::GpuMat src_grey_gpu;  // used in both functions
    cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> sync_PyrLKOpticalFlow_gpu;
    cv::cuda::Stream stream;
    cv::cuda::GpuMat src_grey_gpu;    // used in both functions
    cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> sync_PyrLKOpticalFlow_gpu;
    cv::cuda::Stream stream;

    std::vector<bbox_t> cur_bbox_vec;
    std::vector<bool> good_bbox_vec_flags;
    cv::Mat prev_pts_flow_cpu;
    std::vector<bbox_t> cur_bbox_vec;
    std::vector<bool> good_bbox_vec_flags;
    cv::Mat prev_pts_flow_cpu;

    // Replaces the tracked boxes and re-seeds the key points from the box centers.
    // All boxes are marked good, the centers are stored as a single-row matrix
    // in prev_pts_flow_cpu and uploaded to prev_pts_flow_gpu on `stream`.
    // The upload is only enqueued; nothing here waits for the stream.
    void update_cur_bbox_vec(std::vector<bbox_t> _cur_bbox_vec)
    {
        cur_bbox_vec = _cur_bbox_vec;
        good_bbox_vec_flags = std::vector<bool>(cur_bbox_vec.size(), true);
        // NOTE(review): cur_pts_flow_cpu is declared but never used in this function.
        cv::Mat prev_pts, cur_pts_flow_cpu;
    void update_cur_bbox_vec(std::vector<bbox_t> _cur_bbox_vec)
    {
        cur_bbox_vec = _cur_bbox_vec;
        good_bbox_vec_flags = std::vector<bool>(cur_bbox_vec.size(), true);
        cv::Mat prev_pts, cur_pts_flow_cpu;

        // One key point per box: its center.
        for (auto &i : cur_bbox_vec) {
            float x_center = (i.x + i.w / 2.0F);
            float y_center = (i.y + i.h / 2.0F);
            prev_pts.push_back(cv::Point2f(x_center, y_center));
        }
        for (auto &i : cur_bbox_vec) {
            float x_center = (i.x + i.w / 2.0F);
            float y_center = (i.y + i.h / 2.0F);
            prev_pts.push_back(cv::Point2f(x_center, y_center));
        }

        // push_back() adds rows; the transpose turns them into one row of points.
        if (prev_pts.rows == 0)
            prev_pts_flow_cpu = cv::Mat();
        else
            cv::transpose(prev_pts, prev_pts_flow_cpu);
        if (prev_pts.rows == 0)
            prev_pts_flow_cpu = cv::Mat();
        else
            cv::transpose(prev_pts, prev_pts_flow_cpu);

        // Device buffers are reallocated only when more columns are needed.
        if (prev_pts_flow_gpu.cols < prev_pts_flow_cpu.cols) {
            prev_pts_flow_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), prev_pts_flow_cpu.type());
            cur_pts_flow_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), prev_pts_flow_cpu.type());
        if (prev_pts_flow_gpu.cols < prev_pts_flow_cpu.cols) {
            prev_pts_flow_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), prev_pts_flow_cpu.type());
            cur_pts_flow_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), prev_pts_flow_cpu.type());

            status_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), CV_8UC1);
            err_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), CV_32FC1);
        }
            status_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), CV_8UC1);
            err_gpu = cv::cuda::GpuMat(prev_pts_flow_cpu.size(), CV_32FC1);
        }

        // NOTE(review): with no boxes prev_pts_flow_cpu is empty here - confirm upload() accepts an empty matrix.
        prev_pts_flow_gpu.upload(cv::Mat(prev_pts_flow_cpu), stream);
    }
        prev_pts_flow_gpu.upload(cv::Mat(prev_pts_flow_cpu), stream);
    }


    // Sets a new reference frame and a new set of boxes to track.
    // Only 3-channel frames are processed; any other input leaves the state
    // unchanged. The frame is uploaded and converted to grey on `stream`
    // without waiting for completion. The caller's CUDA device is restored.
    void update_tracking_flow(cv::Mat src_mat, std::vector<bbox_t> _cur_bbox_vec)
    {
        int const old_gpu_id = cv::cuda::getDevice();
        if (old_gpu_id != gpu_id)
            cv::cuda::setDevice(gpu_id);
    void update_tracking_flow(cv::Mat src_mat, std::vector<bbox_t> _cur_bbox_vec)
    {
        int const old_gpu_id = cv::cuda::getDevice();
        if (old_gpu_id != gpu_id)
            cv::cuda::setDevice(gpu_id);

        if (src_mat.channels() == 3) {
            // Device buffers are allocated on first use only.
            if (src_mat_gpu.cols == 0) {
                src_mat_gpu = cv::cuda::GpuMat(src_mat.size(), src_mat.type());
                src_grey_gpu = cv::cuda::GpuMat(src_mat.size(), CV_8UC1);
            }
        if (src_mat.channels() == 3) {
            if (src_mat_gpu.cols == 0) {
                src_mat_gpu = cv::cuda::GpuMat(src_mat.size(), src_mat.type());
                src_grey_gpu = cv::cuda::GpuMat(src_mat.size(), CV_8UC1);
            }

            update_cur_bbox_vec(_cur_bbox_vec);
            update_cur_bbox_vec(_cur_bbox_vec);

            //src_grey_gpu.upload(src_mat, stream); // use BGR
            src_mat_gpu.upload(src_mat, stream);
            cv::cuda::cvtColor(src_mat_gpu, src_grey_gpu, CV_BGR2GRAY, 1, stream);
        }
        if (old_gpu_id != gpu_id)
            cv::cuda::setDevice(old_gpu_id);
    }
            //src_grey_gpu.upload(src_mat, stream);    // use BGR
            src_mat_gpu.upload(src_mat, stream);
            cv::cuda::cvtColor(src_mat_gpu, src_grey_gpu, CV_BGR2GRAY, 1, stream);
        }
        if (old_gpu_id != gpu_id)
            cv::cuda::setDevice(old_gpu_id);
    }


    // Tracks the stored boxes from the reference frame into dst_mat.
    // Each box is shifted by the displacement of its key point. Boxes whose
    // point moved 100 px or more on an axis, failed the flow status/error test
    // or would leave the frame at the top/left are dropped and flagged bad.
    // dst_mat then becomes the new reference frame.
    // Returns the surviving boxes. The stored cur_bbox_vec is returned instead
    // when the tracker is not initialized or the frame size differs from the
    // reference frame.
    // NOTE(review): check_error is referenced only by the commented-out line below, so it currently has no effect.
    std::vector<bbox_t> tracking_flow(cv::Mat dst_mat, bool check_error = true)
    {
        if (sync_PyrLKOpticalFlow_gpu.empty()) {
            std::cout << "sync_PyrLKOpticalFlow_gpu isn't initialized \n";
            return cur_bbox_vec;
        }
    std::vector<bbox_t> tracking_flow(cv::Mat dst_mat, bool check_error = true)
    {
        if (sync_PyrLKOpticalFlow_gpu.empty()) {
            std::cout << "sync_PyrLKOpticalFlow_gpu isn't initialized \n";
            return cur_bbox_vec;
        }

        int const old_gpu_id = cv::cuda::getDevice();
        if(old_gpu_id != gpu_id)
            cv::cuda::setDevice(gpu_id);
        int const old_gpu_id = cv::cuda::getDevice();
        if(old_gpu_id != gpu_id)
            cv::cuda::setDevice(gpu_id);

        // Device buffers are allocated on first use only.
        if (dst_mat_gpu.cols == 0) {
            dst_mat_gpu = cv::cuda::GpuMat(dst_mat.size(), dst_mat.type());
            dst_grey_gpu = cv::cuda::GpuMat(dst_mat.size(), CV_8UC1);
        }
        if (dst_mat_gpu.cols == 0) {
            dst_mat_gpu = cv::cuda::GpuMat(dst_mat.size(), dst_mat.type());
            dst_grey_gpu = cv::cuda::GpuMat(dst_mat.size(), CV_8UC1);
        }

        //dst_grey_gpu.upload(dst_mat, stream); // use BGR
        dst_mat_gpu.upload(dst_mat, stream);
        cv::cuda::cvtColor(dst_mat_gpu, dst_grey_gpu, CV_BGR2GRAY, 1, stream);
        //dst_grey_gpu.upload(dst_mat, stream);    // use BGR
        dst_mat_gpu.upload(dst_mat, stream);
        cv::cuda::cvtColor(dst_mat_gpu, dst_grey_gpu, CV_BGR2GRAY, 1, stream);

        // Frame size changed: adopt the new frame as reference and skip tracking.
        // This path restores the device unconditionally, unlike the exit below.
        if (src_grey_gpu.rows != dst_grey_gpu.rows || src_grey_gpu.cols != dst_grey_gpu.cols) {
            stream.waitForCompletion();
            src_grey_gpu = dst_grey_gpu.clone();
            cv::cuda::setDevice(old_gpu_id);
            return cur_bbox_vec;
        }
        if (src_grey_gpu.rows != dst_grey_gpu.rows || src_grey_gpu.cols != dst_grey_gpu.cols) {
            stream.waitForCompletion();
            src_grey_gpu = dst_grey_gpu.clone();
            cv::cuda::setDevice(old_gpu_id);
            return cur_bbox_vec;
        }

        ////sync_PyrLKOpticalFlow_gpu.sparse(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, &err_gpu);    // OpenCV 2.4.x
        sync_PyrLKOpticalFlow_gpu->calc(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, err_gpu, stream);  // OpenCV 3.x
        ////sync_PyrLKOpticalFlow_gpu.sparse(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, &err_gpu);    // OpenCV 2.4.x
        sync_PyrLKOpticalFlow_gpu->calc(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, err_gpu, stream);    // OpenCV 3.x

        cv::Mat cur_pts_flow_cpu;
        cur_pts_flow_gpu.download(cur_pts_flow_cpu, stream);
        cv::Mat cur_pts_flow_cpu;
        cur_pts_flow_gpu.download(cur_pts_flow_cpu, stream);

        // The destination frame becomes the reference for the next call.
        dst_grey_gpu.copyTo(src_grey_gpu, stream);
        dst_grey_gpu.copyTo(src_grey_gpu, stream);

        cv::Mat err_cpu, status_cpu;
        err_gpu.download(err_cpu, stream);
        status_gpu.download(status_cpu, stream);
        cv::Mat err_cpu, status_cpu;
        err_gpu.download(err_cpu, stream);
        status_gpu.download(status_cpu, stream);

        // Everything above was only enqueued; the host results are valid after this wait.
        stream.waitForCompletion();
        stream.waitForCompletion();

        std::vector<bbox_t> result_bbox_vec;
        std::vector<bbox_t> result_bbox_vec;

        // NOTE(review): cols is int and size() is size_t, so this is a signed/unsigned comparison.
        if (err_cpu.cols == cur_bbox_vec.size() && status_cpu.cols == cur_bbox_vec.size()) 
        {
            for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
            {
                cv::Point2f cur_key_pt = cur_pts_flow_cpu.at<cv::Point2f>(0, i);
                cv::Point2f prev_key_pt = prev_pts_flow_cpu.at<cv::Point2f>(0, i);
        if (err_cpu.cols == cur_bbox_vec.size() && status_cpu.cols == cur_bbox_vec.size()) 
        {
            for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
            {
                cv::Point2f cur_key_pt = cur_pts_flow_cpu.at<cv::Point2f>(0, i);
                cv::Point2f prev_key_pt = prev_pts_flow_cpu.at<cv::Point2f>(0, i);

                float moved_x = cur_key_pt.x - prev_key_pt.x;
                float moved_y = cur_key_pt.y - prev_key_pt.y;
                float moved_x = cur_key_pt.x - prev_key_pt.x;
                float moved_y = cur_key_pt.y - prev_key_pt.y;

                // NOTE(review): unqualified abs() on a float may bind to the integer overload
                // depending on the included headers; std::fabs would be unambiguous.
                // The +0.5 rounds the float shift when it is added to the unsigned coordinate.
                if (abs(moved_x) < 100 && abs(moved_y) < 100 && good_bbox_vec_flags[i])
                    if (err_cpu.at<float>(0, i) < flow_error && status_cpu.at<unsigned char>(0, i) != 0 &&
                        ((float)cur_bbox_vec[i].x + moved_x) > 0 && ((float)cur_bbox_vec[i].y + moved_y) > 0)
                    {
                        cur_bbox_vec[i].x += moved_x + 0.5;
                        cur_bbox_vec[i].y += moved_y + 0.5;
                        result_bbox_vec.push_back(cur_bbox_vec[i]);
                    }
                    else good_bbox_vec_flags[i] = false;
                else good_bbox_vec_flags[i] = false;
                if (abs(moved_x) < 100 && abs(moved_y) < 100 && good_bbox_vec_flags[i])
                    if (err_cpu.at<float>(0, i) < flow_error && status_cpu.at<unsigned char>(0, i) != 0 &&
                        ((float)cur_bbox_vec[i].x + moved_x) > 0 && ((float)cur_bbox_vec[i].y + moved_y) > 0)
                    {
                        cur_bbox_vec[i].x += moved_x + 0.5;
                        cur_bbox_vec[i].y += moved_y + 0.5;
                        result_bbox_vec.push_back(cur_bbox_vec[i]);
                    }
                    else good_bbox_vec_flags[i] = false;
                else good_bbox_vec_flags[i] = false;

                //if(!check_error && !good_bbox_vec_flags[i]) result_bbox_vec.push_back(cur_bbox_vec[i]);
            }
        }
                //if(!check_error && !good_bbox_vec_flags[i]) result_bbox_vec.push_back(cur_bbox_vec[i]);
            }
        }

        // The tracked points become the starting points of the next call.
        cur_pts_flow_gpu.swap(prev_pts_flow_gpu);
        cur_pts_flow_cpu.copyTo(prev_pts_flow_cpu);
        cur_pts_flow_gpu.swap(prev_pts_flow_gpu);
        cur_pts_flow_cpu.copyTo(prev_pts_flow_cpu);

        if (old_gpu_id != gpu_id)
            cv::cuda::setDevice(old_gpu_id);
        if (old_gpu_id != gpu_id)
            cv::cuda::setDevice(old_gpu_id);

        return result_bbox_vec;
    }
        return result_bbox_vec;
    }

};

@@ -339,314 +339,314 @@

class Tracker_optflow {
public:
    const int flow_error;
    const int flow_error;


    // Creates the CPU sparse pyramidal Lucas-Kanade tracker.
    //   win_size    - side of the square search window
    //   max_level   - value passed to setMaxLevel()
    //   _flow_error - error threshold; non-positive selects win_size*4
    // NOTE(review): `iterations` is accepted but never used in this constructor.
    Tracker_optflow(int win_size = 9, int max_level = 3, int iterations = 8000, int _flow_error = -1) :
        flow_error((_flow_error > 0)? _flow_error:(win_size*4))
    {
        sync_PyrLKOpticalFlow = cv::SparsePyrLKOpticalFlow::create();
        sync_PyrLKOpticalFlow->setWinSize(cv::Size(win_size, win_size));    // 9, 15, 21, 31
        sync_PyrLKOpticalFlow->setMaxLevel(max_level);      // +- 3 pt
    Tracker_optflow(int win_size = 9, int max_level = 3, int iterations = 8000, int _flow_error = -1) :
        flow_error((_flow_error > 0)? _flow_error:(win_size*4))
    {
        sync_PyrLKOpticalFlow = cv::SparsePyrLKOpticalFlow::create();
        sync_PyrLKOpticalFlow->setWinSize(cv::Size(win_size, win_size));    // 9, 15, 21, 31
        sync_PyrLKOpticalFlow->setMaxLevel(max_level);        // +- 3 pt

    }
    }

    // just to avoid extra allocations
    cv::Mat dst_grey;
    cv::Mat prev_pts_flow, cur_pts_flow;
    cv::Mat status, err;
    // just to avoid extra allocations
    cv::Mat dst_grey;
    cv::Mat prev_pts_flow, cur_pts_flow;
    cv::Mat status, err;

    cv::Mat src_grey;   // used in both functions
    cv::Ptr<cv::SparsePyrLKOpticalFlow> sync_PyrLKOpticalFlow;
    cv::Mat src_grey;    // used in both functions
    cv::Ptr<cv::SparsePyrLKOpticalFlow> sync_PyrLKOpticalFlow;

    std::vector<bbox_t> cur_bbox_vec;
    std::vector<bool> good_bbox_vec_flags;
    std::vector<bbox_t> cur_bbox_vec;
    std::vector<bool> good_bbox_vec_flags;

    // Replaces the tracked boxes and re-seeds the key points from the box centers.
    // All boxes are marked good and the centers are stored as a single-row
    // matrix in prev_pts_flow (an empty matrix when there are no boxes).
    void update_cur_bbox_vec(std::vector<bbox_t> _cur_bbox_vec)
    {
        cur_bbox_vec = _cur_bbox_vec;
        good_bbox_vec_flags = std::vector<bool>(cur_bbox_vec.size(), true);
        // NOTE(review): this local cur_pts_flow shadows the member of the same name and is never used.
        cv::Mat prev_pts, cur_pts_flow;
    void update_cur_bbox_vec(std::vector<bbox_t> _cur_bbox_vec)
    {
        cur_bbox_vec = _cur_bbox_vec;
        good_bbox_vec_flags = std::vector<bool>(cur_bbox_vec.size(), true);
        cv::Mat prev_pts, cur_pts_flow;

        // One key point per box: its center.
        for (auto &i : cur_bbox_vec) {
            float x_center = (i.x + i.w / 2.0F);
            float y_center = (i.y + i.h / 2.0F);
            prev_pts.push_back(cv::Point2f(x_center, y_center));
        }
        for (auto &i : cur_bbox_vec) {
            float x_center = (i.x + i.w / 2.0F);
            float y_center = (i.y + i.h / 2.0F);
            prev_pts.push_back(cv::Point2f(x_center, y_center));
        }

        // push_back() adds rows; the transpose turns them into one row of points.
        if (prev_pts.rows == 0)
            prev_pts_flow = cv::Mat();
        else
            cv::transpose(prev_pts, prev_pts_flow);
    }
        if (prev_pts.rows == 0)
            prev_pts_flow = cv::Mat();
        else
            cv::transpose(prev_pts, prev_pts_flow);
    }


    // Sets a new reference frame and a new set of boxes to track.
    // Only 3-channel frames are processed; any other input leaves the state
    // unchanged. The grey version of the frame is stored in src_grey.
    void update_tracking_flow(cv::Mat new_src_mat, std::vector<bbox_t> _cur_bbox_vec)
    {
        if (new_src_mat.channels() == 3) {
    void update_tracking_flow(cv::Mat new_src_mat, std::vector<bbox_t> _cur_bbox_vec)
    {
        if (new_src_mat.channels() == 3) {

            update_cur_bbox_vec(_cur_bbox_vec);
            update_cur_bbox_vec(_cur_bbox_vec);

            cv::cvtColor(new_src_mat, src_grey, CV_BGR2GRAY, 1);
        }
    }
            cv::cvtColor(new_src_mat, src_grey, CV_BGR2GRAY, 1);
        }
    }


    // Tracks the stored boxes from the reference frame into new_dst_mat.
    // Each box is shifted by the displacement of its key point. Boxes whose
    // point moved 100 px or more on an axis, failed the flow status/error test
    // or would leave the frame at the top/left are dropped and flagged bad.
    // new_dst_mat then becomes the new reference frame.
    // Returns the surviving boxes. The stored cur_bbox_vec is returned instead
    // when the tracker is not initialized, the frame size differs from the
    // reference frame, or there are no key points.
    // NOTE(review): check_error is referenced only by the commented-out line below, so it currently has no effect.
    std::vector<bbox_t> tracking_flow(cv::Mat new_dst_mat, bool check_error = true)
    {
        if (sync_PyrLKOpticalFlow.empty()) {
            std::cout << "sync_PyrLKOpticalFlow isn't initialized \n";
            return cur_bbox_vec;
        }
    std::vector<bbox_t> tracking_flow(cv::Mat new_dst_mat, bool check_error = true)
    {
        if (sync_PyrLKOpticalFlow.empty()) {
            std::cout << "sync_PyrLKOpticalFlow isn't initialized \n";
            return cur_bbox_vec;
        }

        cv::cvtColor(new_dst_mat, dst_grey, CV_BGR2GRAY, 1);
        cv::cvtColor(new_dst_mat, dst_grey, CV_BGR2GRAY, 1);

        // Frame size changed: adopt the new frame as reference and skip tracking.
        if (src_grey.rows != dst_grey.rows || src_grey.cols != dst_grey.cols) {
            src_grey = dst_grey.clone();
            return cur_bbox_vec;
        }
        if (src_grey.rows != dst_grey.rows || src_grey.cols != dst_grey.cols) {
            src_grey = dst_grey.clone();
            return cur_bbox_vec;
        }

        // Nothing to track: the reference frame is left unchanged on this path.
        if (prev_pts_flow.cols < 1) {
            return cur_bbox_vec;
        }
        if (prev_pts_flow.cols < 1) {
            return cur_bbox_vec;
        }

        ////sync_PyrLKOpticalFlow_gpu.sparse(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, &err_gpu);    // OpenCV 2.4.x
        sync_PyrLKOpticalFlow->calc(src_grey, dst_grey, prev_pts_flow, cur_pts_flow, status, err);  // OpenCV 3.x
        ////sync_PyrLKOpticalFlow_gpu.sparse(src_grey_gpu, dst_grey_gpu, prev_pts_flow_gpu, cur_pts_flow_gpu, status_gpu, &err_gpu);    // OpenCV 2.4.x
        sync_PyrLKOpticalFlow->calc(src_grey, dst_grey, prev_pts_flow, cur_pts_flow, status, err);    // OpenCV 3.x

        // The destination frame becomes the reference for the next call.
        dst_grey.copyTo(src_grey);
        dst_grey.copyTo(src_grey);

        std::vector<bbox_t> result_bbox_vec;
        std::vector<bbox_t> result_bbox_vec;

        // NOTE(review): the size check uses rows while elements are read as (0, i) below -
        // confirm the layout of err/status produced by calc(). rows is int and size() is
        // size_t, so this is also a signed/unsigned comparison.
        if (err.rows == cur_bbox_vec.size() && status.rows == cur_bbox_vec.size())
        {
            for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
            {
                cv::Point2f cur_key_pt = cur_pts_flow.at<cv::Point2f>(0, i);
                cv::Point2f prev_key_pt = prev_pts_flow.at<cv::Point2f>(0, i);
        if (err.rows == cur_bbox_vec.size() && status.rows == cur_bbox_vec.size())
        {
            for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
            {
                cv::Point2f cur_key_pt = cur_pts_flow.at<cv::Point2f>(0, i);
                cv::Point2f prev_key_pt = prev_pts_flow.at<cv::Point2f>(0, i);

                float moved_x = cur_key_pt.x - prev_key_pt.x;
                float moved_y = cur_key_pt.y - prev_key_pt.y;
                float moved_x = cur_key_pt.x - prev_key_pt.x;
                float moved_y = cur_key_pt.y - prev_key_pt.y;

                // NOTE(review): unqualified abs() on a float may bind to the integer overload
                // depending on the included headers; std::fabs would be unambiguous.
                // The +0.5 rounds the float shift when it is added to the unsigned coordinate.
                if (abs(moved_x) < 100 && abs(moved_y) < 100 && good_bbox_vec_flags[i])
                    if (err.at<float>(0, i) < flow_error && status.at<unsigned char>(0, i) != 0 &&
                        ((float)cur_bbox_vec[i].x + moved_x) > 0 && ((float)cur_bbox_vec[i].y + moved_y) > 0)
                    {
                        cur_bbox_vec[i].x += moved_x + 0.5;
                        cur_bbox_vec[i].y += moved_y + 0.5;
                        result_bbox_vec.push_back(cur_bbox_vec[i]);
                    }
                    else good_bbox_vec_flags[i] = false;
                else good_bbox_vec_flags[i] = false;
                if (abs(moved_x) < 100 && abs(moved_y) < 100 && good_bbox_vec_flags[i])
                    if (err.at<float>(0, i) < flow_error && status.at<unsigned char>(0, i) != 0 &&
                        ((float)cur_bbox_vec[i].x + moved_x) > 0 && ((float)cur_bbox_vec[i].y + moved_y) > 0)
                    {
                        cur_bbox_vec[i].x += moved_x + 0.5;
                        cur_bbox_vec[i].y += moved_y + 0.5;
                        result_bbox_vec.push_back(cur_bbox_vec[i]);
                    }
                    else good_bbox_vec_flags[i] = false;
                else good_bbox_vec_flags[i] = false;

                //if(!check_error && !good_bbox_vec_flags[i]) result_bbox_vec.push_back(cur_bbox_vec[i]);
            }
        }
                //if(!check_error && !good_bbox_vec_flags[i]) result_bbox_vec.push_back(cur_bbox_vec[i]);
            }
        }

        // The tracked points become the starting points of the next call.
        prev_pts_flow = cur_pts_flow.clone();
        prev_pts_flow = cur_pts_flow.clone();

        return result_bbox_vec;
    }
        return result_bbox_vec;
    }

};
#else

class Tracker_optflow {};

#endif  // defined(TRACK_OPTFLOW) && defined(OPENCV)
#endif    // defined(TRACK_OPTFLOW) && defined(OPENCV)


#ifdef OPENCV

static cv::Scalar obj_id_to_color(int obj_id) {
    int const colors[6][3] = { { 1,0,1 },{ 0,0,1 },{ 0,1,1 },{ 0,1,0 },{ 1,1,0 },{ 1,0,0 } };
    int const offset = obj_id * 123457 % 6;
    int const color_scale = 150 + (obj_id * 123457) % 100;
    cv::Scalar color(colors[offset][0], colors[offset][1], colors[offset][2]);
    color *= color_scale;
    return color;
    int const colors[6][3] = { { 1,0,1 },{ 0,0,1 },{ 0,1,1 },{ 0,1,0 },{ 1,1,0 },{ 1,0,0 } };
    int const offset = obj_id * 123457 % 6;
    int const color_scale = 150 + (obj_id * 123457) % 100;
    cv::Scalar color(colors[offset][0], colors[offset][1], colors[offset][2]);
    color *= color_scale;
    return color;
}

class preview_boxes_t {
    enum { frames_history = 30 };   // how long to keep the history saved
    enum { frames_history = 30 };    // how long to keep the history saved

    struct preview_box_track_t {
        unsigned int track_id, obj_id, last_showed_frames_ago;
        bool current_detection;
        bbox_t bbox;
        cv::Mat mat_obj, mat_resized_obj;
        preview_box_track_t() : track_id(0), obj_id(0), last_showed_frames_ago(frames_history), current_detection(false) {}
    };
    std::vector<preview_box_track_t> preview_box_track_id;
    size_t const preview_box_size, bottom_offset;
    bool const one_off_detections;
    struct preview_box_track_t {
        unsigned int track_id, obj_id, last_showed_frames_ago;
        bool current_detection;
        bbox_t bbox;
        cv::Mat mat_obj, mat_resized_obj;
        preview_box_track_t() : track_id(0), obj_id(0), last_showed_frames_ago(frames_history), current_detection(false) {}
    };
    std::vector<preview_box_track_t> preview_box_track_id;
    size_t const preview_box_size, bottom_offset;
    bool const one_off_detections;
public:
    preview_boxes_t(size_t _preview_box_size = 100, size_t _bottom_offset = 100, bool _one_off_detections = false) :
        preview_box_size(_preview_box_size), bottom_offset(_bottom_offset), one_off_detections(_one_off_detections)
    {}
    preview_boxes_t(size_t _preview_box_size = 100, size_t _bottom_offset = 100, bool _one_off_detections = false) :
        preview_box_size(_preview_box_size), bottom_offset(_bottom_offset), one_off_detections(_one_off_detections)
    {}

    void set(cv::Mat src_mat, std::vector<bbox_t> result_vec)
    {
        size_t const count_preview_boxes = src_mat.cols / preview_box_size;
        if (preview_box_track_id.size() != count_preview_boxes) preview_box_track_id.resize(count_preview_boxes);
    void set(cv::Mat src_mat, std::vector<bbox_t> result_vec)
    {
        size_t const count_preview_boxes = src_mat.cols / preview_box_size;
        if (preview_box_track_id.size() != count_preview_boxes) preview_box_track_id.resize(count_preview_boxes);

        // increment frames history
        for (auto &i : preview_box_track_id)
            i.last_showed_frames_ago = std::min((unsigned)frames_history, i.last_showed_frames_ago + 1);
        // increment frames history
        for (auto &i : preview_box_track_id)
            i.last_showed_frames_ago = std::min((unsigned)frames_history, i.last_showed_frames_ago + 1);

        // occupy empty boxes
        for (auto &k : result_vec) {
            bool found = false;
            // find the same (track_id)
            for (auto &i : preview_box_track_id) {
                if (i.track_id == k.track_id) {
                    if (!one_off_detections) i.last_showed_frames_ago = 0; // for tracked objects
                    found = true;
                    break;
                }
            }
            if (!found) {
                // find empty box
                for (auto &i : preview_box_track_id) {
                    if (i.last_showed_frames_ago == frames_history) {
                        if (!one_off_detections && k.frames_counter == 0) break; // don't show if obj isn't tracked yet
                        i.track_id = k.track_id;
                        i.obj_id = k.obj_id;
                        i.bbox = k;
                        i.last_showed_frames_ago = 0;
                        break;
                    }
                }
            }
        }
        // occupy empty boxes
        for (auto &k : result_vec) {
            bool found = false;
            // find the same (track_id)
            for (auto &i : preview_box_track_id) {
                if (i.track_id == k.track_id) {
                    if (!one_off_detections) i.last_showed_frames_ago = 0; // for tracked objects
                    found = true;
                    break;
                }
            }
            if (!found) {
                // find empty box
                for (auto &i : preview_box_track_id) {
                    if (i.last_showed_frames_ago == frames_history) {
                        if (!one_off_detections && k.frames_counter == 0) break; // don't show if obj isn't tracked yet
                        i.track_id = k.track_id;
                        i.obj_id = k.obj_id;
                        i.bbox = k;
                        i.last_showed_frames_ago = 0;
                        break;
                    }
                }
            }
        }

        // draw preview box (from old or current frame)
        for (size_t i = 0; i < preview_box_track_id.size(); ++i)
        {
            // get object image
            cv::Mat dst = preview_box_track_id[i].mat_resized_obj;
            preview_box_track_id[i].current_detection = false;
        // draw preview box (from old or current frame)
        for (size_t i = 0; i < preview_box_track_id.size(); ++i)
        {
            // get object image
            cv::Mat dst = preview_box_track_id[i].mat_resized_obj;
            preview_box_track_id[i].current_detection = false;

            for (auto &k : result_vec) {
                if (preview_box_track_id[i].track_id == k.track_id) {
                    if (one_off_detections && preview_box_track_id[i].last_showed_frames_ago > 0) {
                        preview_box_track_id[i].last_showed_frames_ago = frames_history; break;
                    }
                    bbox_t b = k;
                    cv::Rect r(b.x, b.y, b.w, b.h);
                    cv::Rect img_rect(cv::Point2i(0, 0), src_mat.size());
                    cv::Rect rect_roi = r & img_rect;
                    if (rect_roi.width > 1 || rect_roi.height > 1) {
                        cv::Mat roi = src_mat(rect_roi);
                        cv::resize(roi, dst, cv::Size(preview_box_size, preview_box_size), cv::INTER_NEAREST);
                        preview_box_track_id[i].mat_obj = roi.clone();
                        preview_box_track_id[i].mat_resized_obj = dst.clone();
                        preview_box_track_id[i].current_detection = true;
                        preview_box_track_id[i].bbox = k;
                    }
                    break;
                }
            }
        }
    }
            for (auto &k : result_vec) {
                if (preview_box_track_id[i].track_id == k.track_id) {
                    if (one_off_detections && preview_box_track_id[i].last_showed_frames_ago > 0) {
                        preview_box_track_id[i].last_showed_frames_ago = frames_history; break;
                    }
                    bbox_t b = k;
                    cv::Rect r(b.x, b.y, b.w, b.h);
                    cv::Rect img_rect(cv::Point2i(0, 0), src_mat.size());
                    cv::Rect rect_roi = r & img_rect;
                    if (rect_roi.width > 1 || rect_roi.height > 1) {
                        cv::Mat roi = src_mat(rect_roi);
                        cv::resize(roi, dst, cv::Size(preview_box_size, preview_box_size), cv::INTER_NEAREST);
                        preview_box_track_id[i].mat_obj = roi.clone();
                        preview_box_track_id[i].mat_resized_obj = dst.clone();
                        preview_box_track_id[i].current_detection = true;
                        preview_box_track_id[i].bbox = k;
                    }
                    break;
                }
            }
        }
    }


    void draw(cv::Mat draw_mat, bool show_small_boxes = false)
    {
        // draw preview box (from old or current frame)
        for (size_t i = 0; i < preview_box_track_id.size(); ++i)
        {
            auto &prev_box = preview_box_track_id[i];
    void draw(cv::Mat draw_mat, bool show_small_boxes = false)
    {
        // draw preview box (from old or current frame)
        for (size_t i = 0; i < preview_box_track_id.size(); ++i)
        {
            auto &prev_box = preview_box_track_id[i];

            // draw object image
            cv::Mat dst = prev_box.mat_resized_obj;
            if (prev_box.last_showed_frames_ago < frames_history &&
                dst.size() == cv::Size(preview_box_size, preview_box_size))
            {
                cv::Rect dst_rect_roi(cv::Point2i(i * preview_box_size, draw_mat.rows - bottom_offset), dst.size());
                cv::Mat dst_roi = draw_mat(dst_rect_roi);
                dst.copyTo(dst_roi);
            // draw object image
            cv::Mat dst = prev_box.mat_resized_obj;
            if (prev_box.last_showed_frames_ago < frames_history &&
                dst.size() == cv::Size(preview_box_size, preview_box_size))
            {
                cv::Rect dst_rect_roi(cv::Point2i(i * preview_box_size, draw_mat.rows - bottom_offset), dst.size());
                cv::Mat dst_roi = draw_mat(dst_rect_roi);
                dst.copyTo(dst_roi);

                cv::Scalar color = obj_id_to_color(prev_box.obj_id);
                int thickness = (prev_box.current_detection) ? 5 : 1;
                cv::rectangle(draw_mat, dst_rect_roi, color, thickness);
                cv::Scalar color = obj_id_to_color(prev_box.obj_id);
                int thickness = (prev_box.current_detection) ? 5 : 1;
                cv::rectangle(draw_mat, dst_rect_roi, color, thickness);

                unsigned int const track_id = prev_box.track_id;
                std::string track_id_str = (track_id > 0) ? std::to_string(track_id) : "";
                putText(draw_mat, track_id_str, dst_rect_roi.tl() - cv::Point2i(-4, 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.9, cv::Scalar(0, 0, 0), 2);
                unsigned int const track_id = prev_box.track_id;
                std::string track_id_str = (track_id > 0) ? std::to_string(track_id) : "";
                putText(draw_mat, track_id_str, dst_rect_roi.tl() - cv::Point2i(-4, 5), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.9, cv::Scalar(0, 0, 0), 2);

                std::string size_str = std::to_string(prev_box.bbox.w) + "x" + std::to_string(prev_box.bbox.h);
                putText(draw_mat, size_str, dst_rect_roi.tl() + cv::Point2i(0, 12), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.8, cv::Scalar(0, 0, 0), 1);
                std::string size_str = std::to_string(prev_box.bbox.w) + "x" + std::to_string(prev_box.bbox.h);
                putText(draw_mat, size_str, dst_rect_roi.tl() + cv::Point2i(0, 12), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.8, cv::Scalar(0, 0, 0), 1);

                if (!one_off_detections && prev_box.current_detection) {
                    cv::line(draw_mat, dst_rect_roi.tl() + cv::Point2i(preview_box_size, 0),
                        cv::Point2i(prev_box.bbox.x, prev_box.bbox.y + prev_box.bbox.h),
                        color);
                }
                if (!one_off_detections && prev_box.current_detection) {
                    cv::line(draw_mat, dst_rect_roi.tl() + cv::Point2i(preview_box_size, 0),
                        cv::Point2i(prev_box.bbox.x, prev_box.bbox.y + prev_box.bbox.h),
                        color);
                }

                if (one_off_detections && show_small_boxes) {
                    cv::Rect src_rect_roi(cv::Point2i(prev_box.bbox.x, prev_box.bbox.y),
                        cv::Size(prev_box.bbox.w, prev_box.bbox.h));
                    unsigned int const color_history = (255 * prev_box.last_showed_frames_ago) / frames_history;
                    color = cv::Scalar(255 - 3 * color_history, 255 - 2 * color_history, 255 - 1 * color_history);
                    if (prev_box.mat_obj.size() == src_rect_roi.size()) {
                        prev_box.mat_obj.copyTo(draw_mat(src_rect_roi));
                    }
                    cv::rectangle(draw_mat, src_rect_roi, color, thickness);
                    putText(draw_mat, track_id_str, src_rect_roi.tl() - cv::Point2i(0, 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.8, cv::Scalar(0, 0, 0), 1);
                }
            }
        }
    }
                if (one_off_detections && show_small_boxes) {
                    cv::Rect src_rect_roi(cv::Point2i(prev_box.bbox.x, prev_box.bbox.y),
                        cv::Size(prev_box.bbox.w, prev_box.bbox.h));
                    unsigned int const color_history = (255 * prev_box.last_showed_frames_ago) / frames_history;
                    color = cv::Scalar(255 - 3 * color_history, 255 - 2 * color_history, 255 - 1 * color_history);
                    if (prev_box.mat_obj.size() == src_rect_roi.size()) {
                        prev_box.mat_obj.copyTo(draw_mat(src_rect_roi));
                    }
                    cv::rectangle(draw_mat, src_rect_roi, color, thickness);
                    putText(draw_mat, track_id_str, src_rect_roi.tl() - cv::Point2i(0, 10), cv::FONT_HERSHEY_COMPLEX_SMALL, 0.8, cv::Scalar(0, 0, 0), 1);
                }
            }
        }
    }
};
#endif  // OPENCV
#endif    // OPENCV

//extern "C" {
#endif  // __cplusplus
#endif    // __cplusplus

/*
    // C - wrappers
    YOLODLL_API void create_detector(char const* cfg_filename, char const* weight_filename, int gpu_id);
    YOLODLL_API void delete_detector();
    YOLODLL_API bbox_t* detect_custom(image_t img, float thresh, bool use_mean, int *result_size);
    YOLODLL_API bbox_t* detect_resized(image_t img, int init_w, int init_h, float thresh, bool use_mean, int *result_size);
    YOLODLL_API bbox_t* detect(image_t img, int *result_size);
    YOLODLL_API image_t load_img(char *image_filename);
    YOLODLL_API void free_img(image_t m);
    // C - wrappers
    YOLODLL_API void create_detector(char const* cfg_filename, char const* weight_filename, int gpu_id);
    YOLODLL_API void delete_detector();
    YOLODLL_API bbox_t* detect_custom(image_t img, float thresh, bool use_mean, int *result_size);
    YOLODLL_API bbox_t* detect_resized(image_t img, int init_w, int init_h, float thresh, bool use_mean, int *result_size);
    YOLODLL_API bbox_t* detect(image_t img, int *result_size);
    YOLODLL_API image_t load_img(char *image_filename);
    YOLODLL_API void free_img(image_t m);

#ifdef __cplusplus
}   // extern "C"
}    // extern "C"

static std::shared_ptr<void> c_detector_ptr;
static std::vector<bbox_t> c_result_vec;

void create_detector(char const* cfg_filename, char const* weight_filename, int gpu_id) {
    c_detector_ptr = std::make_shared<YOLODLL_API Detector>(cfg_filename, weight_filename, gpu_id);
    c_detector_ptr = std::make_shared<YOLODLL_API Detector>(cfg_filename, weight_filename, gpu_id);
}

void delete_detector() { c_detector_ptr.reset(); }

bbox_t* detect_custom(image_t img, float thresh, bool use_mean, int *result_size) {
    c_result_vec = static_cast<Detector*>(c_detector_ptr.get())->detect(img, thresh, use_mean);
    *result_size = c_result_vec.size();
    return c_result_vec.data();
    c_result_vec = static_cast<Detector*>(c_detector_ptr.get())->detect(img, thresh, use_mean);
    *result_size = c_result_vec.size();
    return c_result_vec.data();
}

bbox_t* detect_resized(image_t img, int init_w, int init_h, float thresh, bool use_mean, int *result_size) {
    c_result_vec = static_cast<Detector*>(c_detector_ptr.get())->detect_resized(img, init_w, init_h, thresh, use_mean);
    *result_size = c_result_vec.size();
    return c_result_vec.data();
    c_result_vec = static_cast<Detector*>(c_detector_ptr.get())->detect_resized(img, init_w, init_h, thresh, use_mean);
    *result_size = c_result_vec.size();
    return c_result_vec.data();
}

bbox_t* detect(image_t img, int *result_size) {
    return detect_custom(img, 0.24, true, result_size);
    return detect_custom(img, 0.24, true, result_size);
}

image_t load_img(char *image_filename) {
    return static_cast<Detector*>(c_detector_ptr.get())->load_image(image_filename);
    return static_cast<Detector*>(c_detector_ptr.get())->load_image(image_filename);
}
void free_img(image_t m) {
    static_cast<Detector*>(c_detector_ptr.get())->free_image(m);
    static_cast<Detector*>(c_detector_ptr.get())->free_image(m);
}

#endif  // __cplusplus
#endif    // __cplusplus
*/

New file
			@@ -0,0 +1,8 @@
			root=true

			[*]
			trim_trailing_whitespace = true
			indent_style = space
			indent_size = 4

			@@ -53,10 +53,10 @@
			layer.x_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
			layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
			#ifdef CUDNN
			cudnnCreateTensorDescriptor(&layer.normTensorDesc);
			cudnnCreateTensorDescriptor(&layer.normDstTensorDesc);
			cudnnSetTensor4dDescriptor(layer.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w);
			cudnnSetTensor4dDescriptor(layer.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, layer.out_c, 1, 1);
			cudnnCreateTensorDescriptor(&layer.normTensorDesc);
			cudnnCreateTensorDescriptor(&layer.normDstTensorDesc);
			cudnnSetTensor4dDescriptor(layer.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w);
			cudnnSetTensor4dDescriptor(layer.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, layer.out_c, 1, 1);
			#endif
			#endif
			return layer;
			@@ -179,93 +179,93 @@

			void forward_batchnorm_layer_gpu(layer l, network_state state)
			{
			if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
			copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
			if (state.train) {
			if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
			copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
			if (state.train) {
			#ifdef CUDNN
			float one = 1;
			float zero = 0;
			cudnnBatchNormalizationForwardTraining(cudnn_handle(),
			CUDNN_BATCHNORM_SPATIAL,
			&one,
			&zero,
			l.normDstTensorDesc,
			l.x_gpu, // input
			l.normDstTensorDesc,
			l.output_gpu, // output
			l.normTensorDesc,
			l.scales_gpu,
			l.biases_gpu,
			.01,
			l.rolling_mean_gpu, // output (should be FP32)
			l.rolling_variance_gpu, // output (should be FP32)
			.00001,
			l.mean_gpu, // output (should be FP32)
			l.variance_gpu); // output (should be FP32)
			float one = 1;
			float zero = 0;
			cudnnBatchNormalizationForwardTraining(cudnn_handle(),
			CUDNN_BATCHNORM_SPATIAL,
			&one,
			&zero,
			l.normDstTensorDesc,
			l.x_gpu, // input
			l.normDstTensorDesc,
			l.output_gpu, // output
			l.normTensorDesc,
			l.scales_gpu,
			l.biases_gpu,
			.01,
			l.rolling_mean_gpu, // output (should be FP32)
			l.rolling_variance_gpu, // output (should be FP32)
			.00001,
			l.mean_gpu, // output (should be FP32)
			l.variance_gpu); // output (should be FP32)
			#else
			fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
			fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
			fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
			fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

			scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
			axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
			scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
			axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
			scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
			axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
			scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
			axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

			copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
			normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
			copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
			normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

			scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
			scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
			#endif
			}
			else {
			normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
			}
			}
			else {
			normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
			}

			}

			void backward_batchnorm_layer_gpu(layer l, network_state state)
			{
			if (!state.train) {
			l.mean_gpu = l.rolling_mean_gpu;
			l.variance_gpu = l.rolling_variance_gpu;
			}
			if (!state.train) {
			l.mean_gpu = l.rolling_mean_gpu;
			l.variance_gpu = l.rolling_variance_gpu;
			}
			#ifdef CUDNN
			float one = 1;
			float zero = 0;
			cudnnBatchNormalizationBackward(cudnn_handle(),
			CUDNN_BATCHNORM_SPATIAL,
			&one,
			&zero,
			&one,
			&one,
			l.normDstTensorDesc,
			l.x_gpu, // input
			l.normDstTensorDesc,
			l.delta_gpu, // input
			l.normDstTensorDesc,
			l.x_norm_gpu, // output
			l.normTensorDesc,
			l.scales_gpu, // output (should be FP32)
			l.scale_updates_gpu, // output (should be FP32)
			l.bias_updates_gpu, // output (should be FP32)
			.00001,
			l.mean_gpu, // input (should be FP32)
			l.variance_gpu); // input (should be FP32)
			copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
			float one = 1;
			float zero = 0;
			cudnnBatchNormalizationBackward(cudnn_handle(),
			CUDNN_BATCHNORM_SPATIAL,
			&one,
			&zero,
			&one,
			&one,
			l.normDstTensorDesc,
			l.x_gpu, // input
			l.normDstTensorDesc,
			l.delta_gpu, // input
			l.normDstTensorDesc,
			l.x_norm_gpu, // output
			l.normTensorDesc,
			l.scales_gpu, // output (should be FP32)
			l.scale_updates_gpu, // output (should be FP32)
			l.bias_updates_gpu, // output (should be FP32)
			.00001,
			l.mean_gpu, // input (should be FP32)
			l.variance_gpu); // input (should be FP32)
			copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
			#else
			backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
			backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
			backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
			backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

			scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

			fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
			fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
			normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
			fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
			fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
			normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
			#endif
			if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
			if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
			}
			#endif

			@@ -11,8 +11,8 @@
			int b,i,j,k;
			int in_c = out_c/(stride*stride);

			//printf("\n out_c = %d, out_w = %d, out_h = %d, stride = %d, forward = %d \n", out_c, out_w, out_h, stride, forward);
			//printf(" in_c = %d, in_w = %d, in_h = %d \n", in_c, out_w*stride, out_h*stride);
			//printf("\n out_c = %d, out_w = %d, out_h = %d, stride = %d, forward = %d \n", out_c, out_w, out_h, stride, forward);
			//printf(" in_c = %d, in_w = %d, in_h = %d \n", in_c, out_w*stride, out_h*stride);

			for(b = 0; b < batch; ++b){
			for(k = 0; k < out_c; ++k){
			@@ -24,7 +24,7 @@
			int w2 = i*stride + offset % stride;
			int h2 = j*stride + offset / stride;
			int out_index = w2 + out_w*stride*(h2 + out_h*stride*(c2 + in_c*b));
			if(forward) out[out_index] = x[in_index]; // used by default for forward (i.e. forward = 0)
			if(forward) out[out_index] = x[in_index]; // used by default for forward (i.e. forward = 0)
			else out[in_index] = x[out_index];
			}
			}
			@@ -293,17 +293,17 @@

			void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
			{
			int i, j, k, b;
			for (b = 0; b < batch; ++b) {
			for (k = 0; k < c; ++k) {
			for (j = 0; j < h*stride; ++j) {
			for (i = 0; i < w*stride; ++i) {
			int in_index = b*w*h*c + k*w*h + (j / stride)*w + i / stride;
			int out_index = b*w*h*c*stride*stride + k*w*h*stride*stride + j*w*stride + i;
			if (forward) out[out_index] = scale*in[in_index];
			else in[in_index] += scale*out[out_index];
			}
			}
			}
			}
			int i, j, k, b;
			for (b = 0; b < batch; ++b) {
			for (k = 0; k < c; ++k) {
			for (j = 0; j < h*stride; ++j) {
			for (i = 0; i < w*stride; ++i) {
			int in_index = b*w*h*c + k*w*h + (j / stride)*w + i / stride;
			int out_index = b*w*h*c*stride*stride + k*w*h*stride*stride + j*w*stride + i;
			if (forward) out[out_index] = scale*in[in_index];
			else in[in_index] += scale*out[out_index];
			}
			}
			}
			}
			}

			@@ -157,16 +157,16 @@

			extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t)
			{
			scal_ongpu(n, B1, m, 1);
			scal_ongpu(n, B2, v, 1);
			axpy_ongpu(n, -decay*batch, w, 1, d, 1);
			scal_ongpu(n, B1, m, 1);
			scal_ongpu(n, B2, v, 1);
			axpy_ongpu(n, -decay*batch, w, 1, d, 1);

			axpy_ongpu(n, (1 - B1), d, 1, m, 1);
			mul_ongpu(n, d, 1, d, 1);
			axpy_ongpu(n, (1 - B2), d, 1, v, 1);
			axpy_ongpu(n, (1 - B1), d, 1, m, 1);
			mul_ongpu(n, d, 1, d, 1);
			axpy_ongpu(n, (1 - B2), d, 1, v, 1);

			adam_gpu(n, w, m, v, B1, B2, rate, eps, t);
			fill_ongpu(n, 0, d, 1);
			adam_gpu(n, w, m, v, B1, B2, rate, eps, t);
			fill_ongpu(n, 0, d, 1);
			}

			__global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial)
			@@ -237,7 +237,7 @@
			local[id] += (i+id < spatial) ? delta[index] : 0;
			}
			}
			__syncthreads();
			__syncthreads();

			if(id == 0){
			mean_delta[filter] = 0;
			@@ -266,7 +266,7 @@
			local[id] += (i+id < spatial) ? delta[index]*(x[index] - mean[filter]) : 0;
			}
			}
			__syncthreads();
			__syncthreads();

			if(id == 0){
			variance_delta[filter] = 0;
			@@ -462,7 +462,7 @@
			local[id] += (i+id < spatial) ? x[index] : 0;
			}
			}
			__syncthreads();
			__syncthreads();

			if(id == 0){
			mean[filter] = 0;
			@@ -491,7 +491,7 @@
			local[id] += (i+id < spatial) ? powf((x[index] - mean[filter]), 2) : 0;
			}
			}
			__syncthreads();
			__syncthreads();

			if(id == 0){
			variance[filter] = 0;
			@@ -787,31 +787,31 @@

			__global__ void upsample_kernel(size_t N, float *x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
			{
			size_t i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
			if (i >= N) return;
			int out_index = i;
			int out_w = i % (w*stride);
			i = i / (w*stride);
			int out_h = i % (h*stride);
			i = i / (h*stride);
			int out_c = i%c;
			i = i / c;
			int b = i%batch;
			size_t i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
			if (i >= N) return;
			int out_index = i;
			int out_w = i % (w*stride);
			i = i / (w*stride);
			int out_h = i % (h*stride);
			i = i / (h*stride);
			int out_c = i%c;
			i = i / c;
			int b = i%batch;

			int in_w = out_w / stride;
			int in_h = out_h / stride;
			int in_c = out_c;
			int in_w = out_w / stride;
			int in_h = out_h / stride;
			int in_c = out_c;

			int in_index = b*w*h*c + in_c*w*h + in_h*w + in_w;
			int in_index = b*w*h*c + in_c*w*h + in_h*w + in_w;


			if (forward) out[out_index] += scale * x[in_index];
			else atomicAdd(x + in_index, scale * out[out_index]);
			if (forward) out[out_index] += scale * x[in_index];
			else atomicAdd(x + in_index, scale * out[out_index]);
			}

			extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
			{
			size_t size = w*h*c*batch*stride*stride;
			upsample_kernel << <cuda_gridsize(size), BLOCK >> >(size, in, w, h, c, batch, stride, forward, scale, out);
			check_error(cudaPeekAtLastError());
			size_t size = w*h*c*batch*stride*stride;
			upsample_kernel << <cuda_gridsize(size), BLOCK >> >(size, in, w, h, c, batch, stride, forward, scale, out);
			check_error(cudaPeekAtLastError());
			}

			@@ -278,88 +278,88 @@

			int nms_comparator_v3(const void *pa, const void *pb)
			{
			detection a = *(detection *)pa;
			detection b = *(detection *)pb;
			float diff = 0;
			if (b.sort_class >= 0) {
			diff = a.prob[b.sort_class] - b.prob[b.sort_class];
			}
			else {
			diff = a.objectness - b.objectness;
			}
			if (diff < 0) return 1;
			else if (diff > 0) return -1;
			return 0;
			detection a = *(detection *)pa;
			detection b = *(detection *)pb;
			float diff = 0;
			if (b.sort_class >= 0) {
			diff = a.prob[b.sort_class] - b.prob[b.sort_class];
			}
			else {
			diff = a.objectness - b.objectness;
			}
			if (diff < 0) return 1;
			else if (diff > 0) return -1;
			return 0;
			}

			/*
			 * Class-agnostic non-maximum suppression driven by objectness.
			 *
			 * dets    - array of detections, reordered and modified in place
			 * total   - number of entries in dets
			 * classes - number of entries in each dets[i].prob array
			 * thresh  - IoU above which the weaker of two boxes is suppressed
			 *
			 * Suppressed detections get objectness = 0 and every class probability
			 * set to 0; they are not removed from the array.
			 */
			void do_nms_obj(detection *dets, int total, int classes, float thresh)
			{
			    int i, j, k;

			    /* Move detections that already have zero objectness to the tail and
			     * shrink the working range so they are neither sorted nor compared. */
			    k = total - 1;
			    for (i = 0; i <= k; ++i) {
			        if (dets[i].objectness == 0) {
			            detection swap = dets[i];
			            dets[i] = dets[k];
			            dets[k] = swap;
			            --k;
			            --i; /* re-examine the element that was just swapped in */
			        }
			    }
			    total = k + 1;

			    /* A negative sort_class makes nms_comparator_v3 compare objectness
			     * instead of a per-class probability. */
			    for (i = 0; i < total; ++i) {
			        dets[i].sort_class = -1;
			    }
			    qsort(dets, total, sizeof(detection), nms_comparator_v3);

			    /* Walk the sorted list; each surviving box suppresses every later
			     * box that overlaps it by more than thresh. */
			    for (i = 0; i < total; ++i) {
			        if (dets[i].objectness == 0) continue;
			        box a = dets[i].bbox;
			        for (j = i + 1; j < total; ++j) {
			            if (dets[j].objectness == 0) continue;
			            box b = dets[j].bbox;
			            if (box_iou(a, b) > thresh) {
			                dets[j].objectness = 0;
			                for (k = 0; k < classes; ++k) {
			                    dets[j].prob[k] = 0;
			                }
			            }
			        }
			    }
			}

			/*
			 * Per-class non-maximum suppression.
			 *
			 * dets    - array of detections, reordered and modified in place
			 * total   - number of entries in dets
			 * classes - number of entries in each dets[i].prob array
			 * thresh  - IoU above which the later box loses its score for the class
			 *
			 * For every class k the detections are sorted with sort_class = k, then
			 * each box with a non-zero prob[k] zeroes prob[k] of every later box that
			 * overlaps it by more than thresh. Objectness is left untouched.
			 */
			void do_nms_sort(detection *dets, int total, int classes, float thresh)
			{
			    int i, j, k;

			    /* Move detections with zero objectness to the tail and shrink the
			     * working range so they take no part in the per-class passes. */
			    k = total - 1;
			    for (i = 0; i <= k; ++i) {
			        if (dets[i].objectness == 0) {
			            detection swap = dets[i];
			            dets[i] = dets[k];
			            dets[k] = swap;
			            --k;
			            --i; /* re-examine the element that was just swapped in */
			        }
			    }
			    total = k + 1;

			    for (k = 0; k < classes; ++k) {
			        /* Tell the comparator which class probability to sort by. */
			        for (i = 0; i < total; ++i) {
			            dets[i].sort_class = k;
			        }
			        qsort(dets, total, sizeof(detection), nms_comparator_v3);

			        for (i = 0; i < total; ++i) {
			            //printf(" k = %d, \t i = %d \n", k, i);
			            if (dets[i].prob[k] == 0) continue;
			            box a = dets[i].bbox;
			            for (j = i + 1; j < total; ++j) {
			                box b = dets[j].bbox;
			                if (box_iou(a, b) > thresh) {
			                    dets[j].prob[k] = 0;
			                }
			            }
			        }
			    }
			}

			void do_nms(box boxes, float *probs, int total, int classes, float thresh)

			@@ -76,36 +76,36 @@

			// Element-wise FP32 -> FP16 conversion kernel.
			// Indexing uses only the x dimension of the grid and block; each thread
			// converts at most one element, and threads whose index is >= size do nothing.
			//   input_f32  - source array of `size` floats
			//   size       - number of elements to convert
			//   output_f16 - destination array of `size` half values
			__global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16)
			{
			    // Index computed in size_t so it has the same type as `size`.
			    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
			    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
			    //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
			}

			// Host wrapper: launches cuda_f32_to_f16 once over `size` elements on the
			// stream returned by get_cuda_stream().
			// output_f16 is declared float* but is reinterpreted as half* for the kernel,
			// so it must point to storage for `size` half values.
			void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16) {
			    // size / BLOCK + 1 blocks always cover `size`; the kernel guards the tail.
			    cuda_f32_to_f16 <<< size / BLOCK + 1, BLOCK, 0, get_cuda_stream() >>> (input_f32, size, (half *)output_f16);
			}

			// Element-wise FP16 -> FP32 conversion kernel.
			// Indexing uses only the x dimension of the grid and block; each thread
			// converts at most one element, and threads whose index is >= size do nothing.
			//   input_f16  - source array of `size` half values
			//   size       - number of elements to convert
			//   output_f32 - destination array of `size` floats
			__global__ void cuda_f16_to_f32(half* input_f16, size_t size, float *output_f32)
			{
			    // Index computed in size_t so it has the same type as `size`.
			    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
			    if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
			    //if (idx < size) output_f32[idx] = __half2float(*((unsigned short *)input_f16 + idx));
			}

			// Host wrapper: launches cuda_f16_to_f32 once over `size` elements on the
			// stream returned by get_cuda_stream().
			// input_f16 is declared float* but is reinterpreted as half* for the kernel,
			// so it must point to `size` half values.
			void cuda_convert_f16_to_f32(float* input_f16, size_t size, float *output_f32) {
			    // size / BLOCK + 1 blocks always cover `size`; the kernel guards the tail.
			    cuda_f16_to_f32 <<< size / BLOCK + 1, BLOCK, 0, get_cuda_stream() >>> ((half *)input_f16, size, output_f32);
			}

			// Allocates a device buffer of n half values and, when src is non-NULL,
			// fills it with the FP16 conversion of the n floats at src.
			//   src - array of n floats handed to cuda_convert_f32_to_f16, or NULL to
			//         leave the new buffer uninitialized
			//   n   - number of elements
			// Returns the new buffer; allocation failure is reported through
			// check_error() / error().
			half *cuda_make_f16_from_f32_array(float *src, size_t n)
			{
			    half *dst16 = NULL;
			    const size_t size = sizeof(half) * n;
			    check_error(cudaMalloc((void **)&dst16, size));
			    // Validate the allocation before the buffer is handed to a kernel.
			    if (!dst16) error("Cuda malloc failed\n");
			    if (src) {
			        cuda_convert_f32_to_f16(src, n, (float *)dst16);
			    }
			    return dst16;
			}

			void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
			@@ -124,96 +124,96 @@
			}

			#ifdef CUDNN
			float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT
			float alpha = 1, beta = 0;
			float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT
			float alpha = 1, beta = 0;

			#ifdef CUDNN_HALF
			// Note: For improved performance it is advised to use beta[0] = 0.0.
			// For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH;
			// 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF
			// 2. or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED
			// More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
			// Note: For improved performance it is advised to use beta[0] = 0.0.
			// For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH;
			// 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF
			// 2. or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED
			// More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops

			const size_t input16_size = l.batchl.cl.w*l.h;
			const size_t output16_size = l.batchl.out_cl.out_h*l.out_w;
			const size_t input16_size = l.batchl.cl.w*l.h;
			const size_t output16_size = l.batchl.out_cl.out_h*l.out_w;

			if (*state.net.max_input16_size < input16_size) {
			//printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size);
			*state.net.max_input16_size = input16_size;
			if (state.net.input16_gpu) cuda_free(state.net.input16_gpu);
			state.net.input16_gpu = (float )cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
			}
			float input16 = state.net.input16_gpu;
			if (*state.net.max_input16_size < input16_size) {
			//printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size);
			*state.net.max_input16_size = input16_size;
			if (state.net.input16_gpu) cuda_free(state.net.input16_gpu);
			state.net.input16_gpu = (float )cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
			}
			float input16 = state.net.input16_gpu;

			if (*state.net.max_output16_size < output16_size) {
			*state.net.max_output16_size = output16_size;
			if (state.net.output16_gpu) cuda_free(state.net.output16_gpu);
			state.net.output16_gpu = (float )cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
			}
			float output16 = state.net.output16_gpu;
			if (*state.net.max_output16_size < output16_size) {
			*state.net.max_output16_size = output16_size;
			if (state.net.output16_gpu) cuda_free(state.net.output16_gpu);
			state.net.output16_gpu = (float )cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
			}
			float output16 = state.net.output16_gpu;

			cuda_convert_f32_to_f16(state.input, input16_size, input16);
			cuda_convert_f32_to_f16(state.input, input16_size, input16);

			//fill_ongpu(output16_size / 2, 0, (float *)output16, 1);
			cudnnConvolutionForward(cudnn_handle(),
			&alpha,
			l.srcTensorDesc,
			input16,
			l.weightDesc,
			l.weights_gpu16,
			l.convDesc,
			l.fw_algo,
			state.workspace,
			l.workspace_size,
			&beta,
			l.dstTensorDesc,
			output16);

			//fill_ongpu(output16_size / 2, 0, (float *)output16, 1);
			cudnnConvolutionForward(cudnn_handle(),
			&alpha,
			l.srcTensorDesc,
			input16,
			l.weightDesc,
			l.weights_gpu16,
			l.convDesc,
			l.fw_algo,
			state.workspace,
			l.workspace_size,
			&beta,
			l.dstTensorDesc,
			output16);


			if (l.batch_normalize)
			{
			if (state.train) // Training
			{
			copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1);
			//cudaMemcpyAsync(l.x_gpu, output16, l.outputsl.batchsizeof(half), cudaMemcpyDefault, get_cuda_stream());
			float one = 1;
			float zero = 0;
			// Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth
			// compared to FP32, its just that the statistics and value adjustment should be done in FP32.
			cudnnBatchNormalizationForwardTraining(cudnn_handle(),
			CUDNN_BATCHNORM_SPATIAL,
			&one,
			&zero,
			l.normDstTensorDescF16,
			l.x_gpu, // input
			l.normDstTensorDescF16,
			output16, // output
			l.normTensorDesc,
			l.scales_gpu,
			l.biases_gpu,
			.01,
			l.rolling_mean_gpu, // output (should be FP32)
			l.rolling_variance_gpu, // output (should be FP32)
			.00001,
			l.mean_gpu, // output (should be FP32)
			l.variance_gpu); // output (should be FP32)
			if (l.batch_normalize)
			{
			if (state.train) // Training
			{
			copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1);
			//cudaMemcpyAsync(l.x_gpu, output16, l.outputsl.batchsizeof(half), cudaMemcpyDefault, get_cuda_stream());
			float one = 1;
			float zero = 0;
			// Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth
			// compared to FP32, its just that the statistics and value adjustment should be done in FP32.
			cudnnBatchNormalizationForwardTraining(cudnn_handle(),
			CUDNN_BATCHNORM_SPATIAL,
			&one,
			&zero,
			l.normDstTensorDescF16,
			l.x_gpu, // input
			l.normDstTensorDescF16,
			output16, // output
			l.normTensorDesc,
			l.scales_gpu,
			l.biases_gpu,
			.01,
			l.rolling_mean_gpu, // output (should be FP32)
			l.rolling_variance_gpu, // output (should be FP32)
			.00001,
			l.mean_gpu, // output (should be FP32)
			l.variance_gpu); // output (should be FP32)

			cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
			//forward_batchnorm_layer_gpu(l, state);
			}
			else // Detection
			{
			cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
			normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
			}
			}
			else // BIAS only
			{
			cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
			}
			cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
			//forward_batchnorm_layer_gpu(l, state);
			}
			else // Detection
			{
			cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
			normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
			}
			}
			else // BIAS only
			{
			cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
			}

			#else

			@@ -230,7 +230,7 @@
			&one,
			l.dstTensorDesc,
			l.output_gpu);
			#endif // CUDNN_HALF
			#endif // CUDNN_HALF


			#else
			@@ -250,16 +250,16 @@
			#ifndef CUDNN_HALF
			if (l.batch_normalize) {
			forward_batchnorm_layer_gpu(l, state);
			}
			else {
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
			}
			}
			else {
			add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
			}
			#endif // no CUDNN_HALF

			activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
			//if(l.dot > 0) dot_error_gpu(l);
			if(l.binary \|\| l.xnor) swap_binary(&l);
			//cudaDeviceSynchronize(); // for correct profiling of performance
			//cudaDeviceSynchronize(); // for correct profiling of performance
			}

			void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
			@@ -272,126 +272,126 @@
			if(l.batch_normalize){
			backward_batchnorm_layer_gpu(l, state);
			} else {
			//backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
			//backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
			}
			#endif // no CUDNN_HALF
			float *original_input = state.input;

			if(l.xnor) state.input = l.binary_input_gpu;
			#ifdef CUDNN
			float one = 1;
			float alpha = 1, beta = 0;
			float one = 1;
			float alpha = 1, beta = 0;

			#ifdef CUDNN_HALF

			const size_t input16_size = l.batchl.cl.w*l.h;
			const size_t delta16_size = l.batchl.nl.out_w*l.out_h;

			if (*state.net.max_input16_size < input16_size) {
			*state.net.max_input16_size = input16_size;
			if(state.net.input16_gpu) cuda_free(state.net.input16_gpu);
			state.net.input16_gpu = (float )cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
			}
			float input16 = state.net.input16_gpu;

			const size_t input16_size = l.batchl.cl.w*l.h;
			const size_t delta16_size = l.batchl.nl.out_w*l.out_h;

			if (*state.net.max_input16_size < input16_size) {
			*state.net.max_input16_size = input16_size;
			if(state.net.input16_gpu) cuda_free(state.net.input16_gpu);
			state.net.input16_gpu = (float )cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
			}
			float input16 = state.net.input16_gpu;

			if (*state.net.max_output16_size < delta16_size) {
			*state.net.max_output16_size = delta16_size;
			if(state.net.output16_gpu) cuda_free(state.net.output16_gpu);
			state.net.output16_gpu = (float )cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
			}
			float delta16 = state.net.output16_gpu;
			if (*state.net.max_output16_size < delta16_size) {
			*state.net.max_output16_size = delta16_size;
			if(state.net.output16_gpu) cuda_free(state.net.output16_gpu);
			state.net.output16_gpu = (float )cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
			}
			float delta16 = state.net.output16_gpu;

			cuda_convert_f32_to_f16(state.input, input16_size, input16);
			cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16);
			cuda_convert_f32_to_f16(state.input, input16_size, input16);
			cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16);

			if (l.batch_normalize) {
			//if (!state.train) {
			// l.mean_gpu = l.rolling_mean_gpu;
			// l.variance_gpu = l.rolling_variance_gpu;
			//}
			float one = 1;
			float zero = 0;
			cudnnBatchNormalizationBackward(cudnn_handle(),
			CUDNN_BATCHNORM_SPATIAL,
			&one,
			&zero,
			&one,
			&one,
			l.normDstTensorDescF16,
			l.x_gpu, // input
			l.normDstTensorDescF16,
			delta16, // input
			l.normDstTensorDescF16,
			l.x_norm_gpu, // output
			l.normTensorDesc,
			l.scales_gpu, // output (should be FP32)
			l.scale_updates_gpu, // output (should be FP32)
			l.bias_updates_gpu, // output (should be FP32)
			.00001,
			l.mean_gpu, // input (should be FP32)
			l.variance_gpu); // input (should be FP32)
			copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1);
			//cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputsl.batch sizeof(half), cudaMemcpyDefault, get_cuda_stream());
			}
			else
			{
			//backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
			}
			if (l.batch_normalize) {
			//if (!state.train) {
			// l.mean_gpu = l.rolling_mean_gpu;
			// l.variance_gpu = l.rolling_variance_gpu;
			//}
			float one = 1;
			float zero = 0;
			cudnnBatchNormalizationBackward(cudnn_handle(),
			CUDNN_BATCHNORM_SPATIAL,
			&one,
			&zero,
			&one,
			&one,
			l.normDstTensorDescF16,
			l.x_gpu, // input
			l.normDstTensorDescF16,
			delta16, // input
			l.normDstTensorDescF16,
			l.x_norm_gpu, // output
			l.normTensorDesc,
			l.scales_gpu, // output (should be FP32)
			l.scale_updates_gpu, // output (should be FP32)
			l.bias_updates_gpu, // output (should be FP32)
			.00001,
			l.mean_gpu, // input (should be FP32)
			l.variance_gpu); // input (should be FP32)
			copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1);
			//cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputsl.batch sizeof(half), cudaMemcpyDefault, get_cuda_stream());
			}
			else
			{
			//backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
			}

			// convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16
			// get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16)
			// convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16
			// get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16)

			// calculate conv weight updates
			// Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum
			// so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m
			cuda_convert_f32_to_f16(l.weight_updates_gpu, l.cl.nl.size*l.size, l.weight_updates_gpu16);
			// calculate conv weight updates
			// Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum
			// so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m
			cuda_convert_f32_to_f16(l.weight_updates_gpu, l.cl.nl.size*l.size, l.weight_updates_gpu16);

			cudnnConvolutionBackwardFilter(cudnn_handle(),
			&one,
			l.srcTensorDesc,
			input16, //state.input,
			l.ddstTensorDesc,
			delta16, //l.delta_gpu,
			l.convDesc,
			l.bf_algo,
			state.workspace,
			l.workspace_size,
			&one,
			l.dweightDesc,
			l.weight_updates_gpu16); // l.weight_updates_gpu);
			cudnnConvolutionBackwardFilter(cudnn_handle(),
			&one,
			l.srcTensorDesc,
			input16, //state.input,
			l.ddstTensorDesc,
			delta16, //l.delta_gpu,
			l.convDesc,
			l.bf_algo,
			state.workspace,
			l.workspace_size,
			&one,
			l.dweightDesc,
			l.weight_updates_gpu16); // l.weight_updates_gpu);

			cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.cl.nl.size*l.size, l.weight_updates_gpu);
			cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.cl.nl.size*l.size, l.weight_updates_gpu);

			if (state.delta) {
			if (l.binary \|\| l.xnor) swap_binary(&l);
			if (state.delta) {
			if (l.binary \|\| l.xnor) swap_binary(&l);

			// http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
			// calculate delta for the next layer
			// convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16
			// get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16)
			cudnnConvolutionBackwardData(cudnn_handle(),
			&alpha,
			l.weightDesc,
			l.weights_gpu16, //l.weights_gpu,
			l.ddstTensorDesc,
			delta16, //l.delta_gpu,
			l.convDesc,
			l.bd_algo,
			state.workspace,
			l.workspace_size,
			&beta,
			l.dsrcTensorDesc,
			input16); // state.delta);
			// http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
			// calculate delta for the next layer
			// convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16
			// get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16)
			cudnnConvolutionBackwardData(cudnn_handle(),
			&alpha,
			l.weightDesc,
			l.weights_gpu16, //l.weights_gpu,
			l.ddstTensorDesc,
			delta16, //l.delta_gpu,
			l.convDesc,
			l.bd_algo,
			state.workspace,
			l.workspace_size,
			&beta,
			l.dsrcTensorDesc,
			input16); // state.delta);

			cuda_convert_f16_to_f32(input16, input16_size, state.delta);
			cuda_convert_f16_to_f32(input16, input16_size, state.delta);

			if (l.binary \|\| l.xnor) swap_binary(&l);
			if (l.xnor) gradient_array_ongpu(original_input, l.batchl.cl.h*l.w, HARDTAN, state.delta);
			}
			#else // CUDNN_HALF
			if (l.binary \|\| l.xnor) swap_binary(&l);
			if (l.xnor) gradient_array_ongpu(original_input, l.batchl.cl.h*l.w, HARDTAN, state.delta);
			}
			#else // CUDNN_HALF

			// calculate conv weight updates
			// if used: beta=1 then loss decreases faster
			// calculate conv weight updates
			// if used: beta=1 then loss decreases faster
			cudnnConvolutionBackwardFilter(cudnn_handle(),
			&one,
			l.srcTensorDesc,
			@@ -408,8 +408,8 @@

			if(state.delta){
			if(l.binary \|\| l.xnor) swap_binary(&l);
			// http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
			// calculate delta for the next layer
			// http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
			// calculate delta for the next layer
			cudnnConvolutionBackwardData(cudnn_handle(),
			&one,
			l.weightDesc,
			@@ -427,9 +427,9 @@
			if(l.xnor) gradient_array_ongpu(original_input, l.batchl.cl.h*l.w, HARDTAN, state.delta);
			}

			#endif // CUDNN_HALF
			#endif // CUDNN_HALF

			#else // CUDNN
			#else // CUDNN
			int m = l.n;
			int n = l.sizel.sizel.c;
			int k = l.out_w*l.out_h;
			@@ -482,7 +482,7 @@
			{
			cuda_push_array(layer.weights_gpu, layer.weights, layer.clayer.nlayer.size*layer.size);
			#ifdef CUDNN_HALF
			cuda_convert_f32_to_f16(layer.weights_gpu, layer.clayer.nlayer.size*layer.size, layer.weights_gpu16);
			cuda_convert_f32_to_f16(layer.weights_gpu, layer.clayer.nlayer.size*layer.size, layer.weights_gpu16);
			#endif
			cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
			cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.clayer.nlayer.size*layer.size);
			@@ -522,14 +522,14 @@
			adam_gpu(size, layer.weights_gpu, layer.m_gpu, layer.v_gpu, layer.B1, layer.B2, learning_rate/batch, layer.eps, layer.t+1);
			fill_ongpu(size, 0, layer.weight_updates_gpu, 1);
			}else{
			// update weights:
			// weights_gpu = weights_gpu*(1 - decay*lr) + weight_updates_gpu*lr / (batch*subdivision) =
			// weights_gpu*(1 - 0.0005*0.001) + weight_updates_gpu*0.001/(64*8) =
			// weights_gpu * 0.999 999 5 + weight_updates_gpu * 0.000 001 953125
			//
			// weight_updates_gpu = (weight_updates_gpu - weights_gpu*decay*batch*subdivision)*momentum =
			// (weight_updates_gpu - weights_gpu * 0.0005 * 64 * 8) * 0.9 =
			// weight_updates_gpu*0.9 - weights_gpu*0.2304
			// update weights:
			// weights_gpu = weights_gpu*(1 - decay*lr) + weight_updates_gpu*lr / (batch*subdivision) =
			// weights_gpu*(1 - 0.0005*0.001) + weight_updates_gpu*0.001/(64*8) =
			// weights_gpu * 0.999 999 5 + weight_updates_gpu * 0.000 001 953125
			//
			// weight_updates_gpu = (weight_updates_gpu - weights_gpu*decay*batch*subdivision)*momentum =
			// (weight_updates_gpu - weights_gpu * 0.0005 * 64 * 8) * 0.9 =
			// weight_updates_gpu*0.9 - weights_gpu*0.2304
			axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
			axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
			scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);

			@@ -141,67 +141,67 @@
			{

			#ifdef CUDNN_HALF
			// TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0):
			// Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
			// PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
			const cudnnDataType_t data_type = CUDNN_DATA_HALF;
			// TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0):
			// Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
			// PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
			const cudnnDataType_t data_type = CUDNN_DATA_HALF;
			#else
			cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
			cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
			#endif

			#if(CUDNN_MAJOR >= 7)
			// Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
			// For *_ALGO_WINOGRAD_NONFUSED can be used CUDNN_DATA_FLOAT
			// otherwise Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
			// Three techniques for training using Mixed-precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
			// 1. Accumulation into FP32
			// 2. Loss Scaling - required only for: activation gradients. We do not use.
			// 3. FP32 Master Copy of Weights
			// More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
			cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
			// Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
			// For *_ALGO_WINOGRAD_NONFUSED can be used CUDNN_DATA_FLOAT
			// otherwise Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
			// Three techniques for training using Mixed-precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
			// 1. Accumulation into FP32
			// 2. Loss Scaling - required only for: activation gradients. We do not use.
			// 3. FP32 Master Copy of Weights
			// More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
			cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
			#endif

			// INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported
			// on architectures with DP4A support (compute capability 6.1 and later).
			//cudnnDataType_t data_type = CUDNN_DATA_INT8;
			// INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported
			// on architectures with DP4A support (compute capability 6.1 and later).
			//cudnnDataType_t data_type = CUDNN_DATA_INT8;

			// backward delta
			// backward delta
			cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
			cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
			cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

			// forward
			// forward
			cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
			cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
			cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

			// batch norm
			cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
			cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
			// batch norm
			cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
			cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);

			cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
			cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
			#if(CUDNN_MAJOR >= 6)
			cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0
			cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0
			#else
			cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1
			cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); // cudnn 5.1
			#endif
			int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
			int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
			int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
			if (cudnn_preference == cudnn_smallest)
			{
			forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
			backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
			backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
			printf(" CUDNN-slow ");
			}
			int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
			int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
			int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
			if (cudnn_preference == cudnn_smallest)
			{
			forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
			backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
			backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
			printf(" CUDNN-slow ");
			}

			cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
			cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
			l->srcTensorDesc,
			l->weightDesc,
			l->convDesc,
			l->dstTensorDesc,
			forward_algo,
			forward_algo,
			0,
			&l->fw_algo);
			cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
			@@ -209,7 +209,7 @@
			l->ddstTensorDesc,
			l->convDesc,
			l->dsrcTensorDesc,
			backward_algo,
			backward_algo,
			0,
			&l->bd_algo);
			cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
			@@ -217,41 +217,41 @@
			l->ddstTensorDesc,
			l->convDesc,
			l->dweightDesc,
			backward_filter,
			backward_filter,
			0,
			&l->bf_algo);

			if (data_type == CUDNN_DATA_HALF)
			{
			// HALF-16 if(data_type == CUDNN_DATA_HALF)
			l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
			l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
			l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
			if (data_type == CUDNN_DATA_HALF)
			{
			// HALF-16 if(data_type == CUDNN_DATA_HALF)
			l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
			l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
			l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;

			// FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
			//l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
			//l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
			//l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
			// FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
			//l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
			//l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
			//l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;

			int fw = 0, bd = 0, bf = 0;
			if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
			if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
			int fw = 0, bd = 0, bf = 0;
			if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
			if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");

			if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n");
			if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
			if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n");
			if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");

			if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n");
			if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
			if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n");
			if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");

			if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
			else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
			}
			if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
			else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
			}
			}
			#endif
			#endif
			@@ -344,8 +344,8 @@

			l.weights_gpu = cuda_make_array(l.weights, cnsize*size);
			#ifdef CUDNN_HALF
			l.weights_gpu16 = cuda_make_array(NULL, cnsizesize / 2); //cuda_make_array(l.weights, cnsizesize / 2);
			l.weight_updates_gpu16 = cuda_make_array(NULL, cnsizesize / 2); //cuda_make_array(l.weight_updates, cnsizesize / 2);
			l.weights_gpu16 = cuda_make_array(NULL, cnsizesize / 2); //cuda_make_array(l.weights, cnsizesize / 2);
			l.weight_updates_gpu16 = cuda_make_array(NULL, cnsizesize / 2); //cuda_make_array(l.weight_updates, cnsizesize / 2);
			#endif
			l.weight_updates_gpu = cuda_make_array(l.weight_updates, cnsize*size);

			@@ -379,10 +379,10 @@
			l.x_gpu = cuda_make_array(l.output, l.batchout_hout_w*n);
			l.x_norm_gpu = cuda_make_array(l.output, l.batchout_hout_w*n);
			}
			#ifdef CUDNN
			cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
			cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
			cudnnCreateTensorDescriptor(&l.normTensorDesc);
			#ifdef CUDNN
			cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
			cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
			cudnnCreateTensorDescriptor(&l.normTensorDesc);
			cudnnCreateTensorDescriptor(&l.srcTensorDesc);
			cudnnCreateTensorDescriptor(&l.dstTensorDesc);
			cudnnCreateFilterDescriptor(&l.weightDesc);
			@@ -398,8 +398,8 @@
			l.activation = activation;

			//fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
			l.bflops = (2.0 * l.n * l.sizel.sizel.c * l.out_h*l.out_w) / 1000000000.;
			fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
			l.bflops = (2.0 * l.n * l.sizel.sizel.c * l.out_h*l.out_w) / 1000000000.;
			fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);

			return l;
			}
			@@ -445,8 +445,8 @@

			void resize_convolutional_layer(convolutional_layer *l, int w, int h)
			{
			int old_w = l->w;
			int old_h = l->h;
			int old_w = l->w;
			int old_h = l->h;
			l->w = w;
			l->h = h;
			int out_w = convolutional_out_width(*l);
			@@ -465,31 +465,31 @@
			l->x_norm = realloc(l->x_norm, l->batchl->outputssizeof(float));
			}

			if (l->xnor) {
			//l->binary_input = realloc(l->inputs*l->batch, sizeof(float));
			}
			if (l->xnor) {
			//l->binary_input = realloc(l->inputs*l->batch, sizeof(float));
			}

			#ifdef GPU
			if (old_w < w \|\| old_h < h) {
			cuda_free(l->delta_gpu);
			cuda_free(l->output_gpu);
			if (old_w < w \|\| old_h < h) {
			cuda_free(l->delta_gpu);
			cuda_free(l->output_gpu);

			l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
			l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
			l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
			l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);

			if (l->batch_normalize) {
			cuda_free(l->x_gpu);
			cuda_free(l->x_norm_gpu);
			if (l->batch_normalize) {
			cuda_free(l->x_gpu);
			cuda_free(l->x_norm_gpu);

			l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
			l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
			}
			l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
			l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
			}

			if (l->xnor) {
			cuda_free(l->binary_input_gpu);
			l->binary_input_gpu = cuda_make_array(0, l->inputs*l->batch);
			}
			}
			if (l->xnor) {
			cuda_free(l->binary_input_gpu);
			l->binary_input_gpu = cuda_make_array(0, l->inputs*l->batch);
			}
			}
			#ifdef CUDNN
			cudnn_convolutional_setup(l, cudnn_fastest);
			#endif
			@@ -497,15 +497,15 @@
			l->workspace_size = get_workspace_size(*l);

			#ifdef CUDNN
			// check for excessive memory consumption
			size_t free_byte;
			size_t total_byte;
			check_error(cudaMemGetInfo(&free_byte, &total_byte));
			if (l->workspace_size > free_byte \|\| l->workspace_size >= total_byte / 2) {
			printf(" used slow CUDNN algo without Workspace! Need memory: %zu, available: %zu\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
			cudnn_convolutional_setup(l, cudnn_smallest);
			l->workspace_size = get_workspace_size(*l);
			}
			// check for excessive memory consumption
			size_t free_byte;
			size_t total_byte;
			check_error(cudaMemGetInfo(&free_byte, &total_byte));
			if (l->workspace_size > free_byte \|\| l->workspace_size >= total_byte / 2) {
			printf(" used slow CUDNN algo without Workspace! Need memory: %zu, available: %zu\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
			cudnn_convolutional_setup(l, cudnn_smallest);
			l->workspace_size = get_workspace_size(*l);
			}
			#endif
			}

			@@ -61,25 +61,25 @@
			return d;
			}

			static cudaStream_t streamsArray[16]; // cudaStreamSynchronize( get_cuda_stream() );
			static cudaStream_t streamsArray[16]; // cudaStreamSynchronize( get_cuda_stream() );
			static int streamInit[16] = { 0 };

			cudaStream_t get_cuda_stream() {
			int i = cuda_get_device();
			if (!streamInit[i]) {
			cudaError_t status = cudaStreamCreate(&streamsArray[i]);
			//cudaError_t status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamNonBlocking);
			if (status != cudaSuccess) {
			printf(" cudaStreamCreate error: %d \n", status);
			const char *s = cudaGetErrorString(status);
			char buffer[256];
			printf("CUDA Error: %s\n", s);
			status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamDefault);
			check_error(status);
			}
			streamInit[i] = 1;
			}
			return streamsArray[i];
			int i = cuda_get_device();
			if (!streamInit[i]) {
			cudaError_t status = cudaStreamCreate(&streamsArray[i]);
			//cudaError_t status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamNonBlocking);
			if (status != cudaSuccess) {
			printf(" cudaStreamCreate error: %d \n", status);
			const char *s = cudaGetErrorString(status);
			char buffer[256];
			printf("CUDA Error: %s\n", s);
			status = cudaStreamCreateWithFlags(&streamsArray[i], cudaStreamDefault);
			check_error(status);
			}
			streamInit[i] = 1;
			}
			return streamsArray[i];
			}


			@@ -92,7 +92,7 @@
			if(!init[i]) {
			cudnnCreate(&handle[i]);
			init[i] = 1;
			cudnnStatus_t status = cudnnSetStream(handle[i], get_cuda_stream());
			cudnnStatus_t status = cudnnSetStream(handle[i], get_cuda_stream());
			}
			return handle[i];
			}
			@@ -105,7 +105,7 @@
			int i = cuda_get_device();
			if(!init[i]) {
			cublasCreate(&handle[i]);
			cublasStatus_t status = cublasSetStream(handle[i], get_cuda_stream());
			cublasStatus_t status = cublasSetStream(handle[i], get_cuda_stream());
			init[i] = 1;
			}
			return handle[i];
			@@ -119,7 +119,7 @@
			check_error(status);
			if(x){
			//status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
			status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream());
			status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream());
			check_error(status);
			}
			if(!x_gpu) error("Cuda malloc failed\n");
			@@ -164,7 +164,7 @@

			void cuda_free(float *x_gpu)
			{
			//cudaStreamSynchronize(get_cuda_stream());
			//cudaStreamSynchronize(get_cuda_stream());
			cudaError_t status = cudaFree(x_gpu);
			check_error(status);
			}
			@@ -173,7 +173,7 @@
			{
			size_t size = sizeof(float)*n;
			//cudaError_t status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
			cudaError_t status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream());
			cudaError_t status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream());
			check_error(status);
			}

			@@ -181,9 +181,9 @@
			{
			size_t size = sizeof(float)*n;
			//cudaError_t status = cudaMemcpy(x, x_gpu, size, cudaMemcpyDeviceToHost);
			cudaError_t status = cudaMemcpyAsync(x, x_gpu, size, cudaMemcpyDeviceToHost, get_cuda_stream());
			cudaError_t status = cudaMemcpyAsync(x, x_gpu, size, cudaMemcpyDeviceToHost, get_cuda_stream());
			check_error(status);
			cudaStreamSynchronize(get_cuda_stream());
			cudaStreamSynchronize(get_cuda_stream());
			}

			#else // GPU

			@@ -44,15 +44,15 @@
			char *random_paths = calloc(n, sizeof(char));
			int i;
			pthread_mutex_lock(&mutex);
			//printf("n = %d \n", n);
			//printf("n = %d \n", n);
			for(i = 0; i < n; ++i){
			do {
			int index = random_gen() % m;
			random_paths[i] = paths[index];
			//if(i == 0) printf("%s\n", paths[index]);
			//printf("grp: %s\n", paths[index]);
			if (strlen(random_paths[i]) <= 4) printf(" Very small path to the image: %s \n", random_paths[i]);
			} while (strlen(random_paths[i]) == 0);
			do {
			int index = random_gen() % m;
			random_paths[i] = paths[index];
			//if(i == 0) printf("%s\n", paths[index]);
			//printf("grp: %s\n", paths[index]);
			if (strlen(random_paths[i]) <= 4) printf(" Very small path to the image: %s \n", random_paths[i]);
			} while (strlen(random_paths[i]) == 0);
			}
			pthread_mutex_unlock(&mutex);
			return random_paths;
			@@ -140,18 +140,18 @@
			{
			box_label *boxes = calloc(1, sizeof(box_label));
			FILE *file = fopen(filename, "r");
			if (!file) {
			printf("Can't open label file. (This can be normal only if you use MSCOCO) \n");
			//file_error(filename);
			FILE* fw = fopen("bad.list", "a");
			fwrite(filename, sizeof(char), strlen(filename), fw);
			char *new_line = "\n";
			fwrite(new_line, sizeof(char), strlen(new_line), fw);
			fclose(fw);
			if (!file) {
			printf("Can't open label file. (This can be normal only if you use MSCOCO) \n");
			//file_error(filename);
			FILE* fw = fopen("bad.list", "a");
			fwrite(filename, sizeof(char), strlen(filename), fw);
			char *new_line = "\n";
			fwrite(new_line, sizeof(char), strlen(new_line), fw);
			fclose(fw);

			*n = 0;
			return boxes;
			}
			*n = 0;
			return boxes;
			}
			float x, y, h, w;
			int id;
			int count = 0;
			@@ -224,7 +224,7 @@
			void fill_truth_swag(char path, float truth, int classes, int flip, float dx, float dy, float sx, float sy)
			{
			char labelpath[4096];
			replace_image_to_label(path, labelpath);
			replace_image_to_label(path, labelpath);

			int count = 0;
			box_label *boxes = read_boxes(labelpath, &count);
			@@ -258,9 +258,9 @@
			void fill_truth_region(char path, float truth, int classes, int num_boxes, int flip, float dx, float dy, float sx, float sy)
			{
			char labelpath[4096];
			replace_image_to_label(path, labelpath);
			replace_image_to_label(path, labelpath);

			int count = 0;
			int count = 0;
			box_label *boxes = read_boxes(labelpath, &count);
			randomize_boxes(boxes, count);
			correct_boxes(boxes, count, dx, dy, sx, sy, flip);
			@@ -299,77 +299,77 @@
			}

			void fill_truth_detection(char path, int num_boxes, float truth, int classes, int flip, float dx, float dy, float sx, float sy,
			int small_object, int net_w, int net_h)
			int small_object, int net_w, int net_h)
			{
			char labelpath[4096];
			replace_image_to_label(path, labelpath);
			char labelpath[4096];
			replace_image_to_label(path, labelpath);

			int count = 0;
			int i;
			box_label *boxes = read_boxes(labelpath, &count);
			float lowest_w = 1.F / net_w;
			float lowest_h = 1.F / net_h;
			if (small_object == 1) {
			for (i = 0; i < count; ++i) {
			if (boxes[i].w < lowest_w) boxes[i].w = lowest_w;
			if (boxes[i].h < lowest_h) boxes[i].h = lowest_h;
			}
			}
			randomize_boxes(boxes, count);
			correct_boxes(boxes, count, dx, dy, sx, sy, flip);
			if (count > num_boxes) count = num_boxes;
			float x, y, w, h;
			int id;
			int count = 0;
			int i;
			box_label *boxes = read_boxes(labelpath, &count);
			float lowest_w = 1.F / net_w;
			float lowest_h = 1.F / net_h;
			if (small_object == 1) {
			for (i = 0; i < count; ++i) {
			if (boxes[i].w < lowest_w) boxes[i].w = lowest_w;
			if (boxes[i].h < lowest_h) boxes[i].h = lowest_h;
			}
			}
			randomize_boxes(boxes, count);
			correct_boxes(boxes, count, dx, dy, sx, sy, flip);
			if (count > num_boxes) count = num_boxes;
			float x, y, w, h;
			int id;

			for (i = 0; i < count; ++i) {
			x = boxes[i].x;
			y = boxes[i].y;
			w = boxes[i].w;
			h = boxes[i].h;
			id = boxes[i].id;
			for (i = 0; i < count; ++i) {
			x = boxes[i].x;
			y = boxes[i].y;
			w = boxes[i].w;
			h = boxes[i].h;
			id = boxes[i].id;

			// not detect small objects
			//if ((w < 0.001F \|\| h < 0.001F)) continue;
			// if truth (box for object) is smaller than 1x1 pix
			char buff[256];
			if (id >= classes) {
			printf("\n Wrong annotation: class_id = %d. But class_id should be [from 0 to %d] \n", id, classes);
			sprintf(buff, "echo %s \"Wrong annotation: class_id = %d. But class_id should be [from 0 to %d]\" >> bad_label.list", labelpath, id, classes);
			system(buff);
			getchar();
			continue;
			}
			if ((w < lowest_w \|\| h < lowest_h)) {
			//sprintf(buff, "echo %s \"Very small object: w < lowest_w OR h < lowest_h\" >> bad_label.list", labelpath);
			//system(buff);
			continue;
			}
			if (x == 999999 \|\| y == 999999) {
			printf("\n Wrong annotation: x = 0, y = 0 \n");
			sprintf(buff, "echo %s \"Wrong annotation: x = 0 or y = 0\" >> bad_label.list", labelpath);
			system(buff);
			continue;
			}
			if (x <= 0 \|\| x > 1 \|\| y <= 0 \|\| y > 1) {
			printf("\n Wrong annotation: x = %f, y = %f \n", x, y);
			sprintf(buff, "echo %s \"Wrong annotation: x = %f, y = %f\" >> bad_label.list", labelpath, x, y);
			system(buff);
			continue;
			}
			if (w > 1) {
			printf("\n Wrong annotation: w = %f \n", w);
			sprintf(buff, "echo %s \"Wrong annotation: w = %f\" >> bad_label.list", labelpath, w);
			system(buff);
			w = 1;
			}
			if (h > 1) {
			printf("\n Wrong annotation: h = %f \n", h);
			sprintf(buff, "echo %s \"Wrong annotation: h = %f\" >> bad_label.list", labelpath, h);
			system(buff);
			h = 1;
			}
			if (x == 0) x += lowest_w;
			if (y == 0) y += lowest_h;
			// not detect small objects
			//if ((w < 0.001F \|\| h < 0.001F)) continue;
			// if truth (box for object) is smaller than 1x1 pix
			char buff[256];
			if (id >= classes) {
			printf("\n Wrong annotation: class_id = %d. But class_id should be [from 0 to %d] \n", id, classes);
			sprintf(buff, "echo %s \"Wrong annotation: class_id = %d. But class_id should be [from 0 to %d]\" >> bad_label.list", labelpath, id, classes);
			system(buff);
			getchar();
			continue;
			}
			if ((w < lowest_w \|\| h < lowest_h)) {
			//sprintf(buff, "echo %s \"Very small object: w < lowest_w OR h < lowest_h\" >> bad_label.list", labelpath);
			//system(buff);
			continue;
			}
			if (x == 999999 \|\| y == 999999) {
			printf("\n Wrong annotation: x = 0, y = 0 \n");
			sprintf(buff, "echo %s \"Wrong annotation: x = 0 or y = 0\" >> bad_label.list", labelpath);
			system(buff);
			continue;
			}
			if (x <= 0 \|\| x > 1 \|\| y <= 0 \|\| y > 1) {
			printf("\n Wrong annotation: x = %f, y = %f \n", x, y);
			sprintf(buff, "echo %s \"Wrong annotation: x = %f, y = %f\" >> bad_label.list", labelpath, x, y);
			system(buff);
			continue;
			}
			if (w > 1) {
			printf("\n Wrong annotation: w = %f \n", w);
			sprintf(buff, "echo %s \"Wrong annotation: w = %f\" >> bad_label.list", labelpath, w);
			system(buff);
			w = 1;
			}
			if (h > 1) {
			printf("\n Wrong annotation: h = %f \n", h);
			sprintf(buff, "echo %s \"Wrong annotation: h = %f\" >> bad_label.list", labelpath, h);
			system(buff);
			h = 1;
			}
			if (x == 0) x += lowest_w;
			if (y == 0) y += lowest_h;

			truth[i*5+0] = x;
			truth[i*5+1] = y;
			@@ -524,7 +524,7 @@
			char *get_labels_custom(char filename, int *size)
			{
			list *plist = get_paths(filename);
			if(size) *size = plist->size;
			if(size) *size = plist->size;
			char labels = (char )list_to_array(plist);
			free_list(plist);
			return labels;
			@@ -532,7 +532,7 @@

			char *get_labels(char filename)
			{
			return get_labels_custom(filename, NULL);
			return get_labels_custom(filename, NULL);
			}

			void free_data(data d)
			@@ -742,22 +742,22 @@

			d.y = make_matrix(n, 5*boxes);
			for(i = 0; i < n; ++i){
			const char *filename = random_paths[i];
			const char *filename = random_paths[i];

			int flag = (c >= 3);
			IplImage *src;
			if ((src = cvLoadImage(filename, flag)) == 0)
			{
			fprintf(stderr, "Cannot load image \"%s\"\n", filename);
			char buff[256];
			sprintf(buff, "echo %s >> bad.list", filename);
			system(buff);
			continue;
			//exit(0);
			}
			int flag = (c >= 3);
			IplImage *src;
			if ((src = cvLoadImage(filename, flag)) == 0)
			{
			fprintf(stderr, "Cannot load image \"%s\"\n", filename);
			char buff[256];
			sprintf(buff, "echo %s >> bad.list", filename);
			system(buff);
			continue;
			//exit(0);
			}

			int oh = src->height;
			int ow = src->width;
			int oh = src->height;
			int ow = src->width;

			int dw = (ow*jitter);
			int dh = (oh*jitter);
			@@ -778,81 +778,81 @@
			float dx = ((float)pleft/ow)/sx;
			float dy = ((float)ptop /oh)/sy;

			float dhue = rand_uniform_strong(-hue, hue);
			float dsat = rand_scale(saturation);
			float dexp = rand_scale(exposure);
			float dhue = rand_uniform_strong(-hue, hue);
			float dsat = rand_scale(saturation);
			float dexp = rand_scale(exposure);

			image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, jitter, dhue, dsat, dexp);
			d.X.vals[i] = ai.data;

			//show_image(ai, "aug");
			//cvWaitKey(0);
			image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, jitter, dhue, dsat, dexp);
			d.X.vals[i] = ai.data;

			//show_image(ai, "aug");
			//cvWaitKey(0);

			fill_truth_detection(filename, boxes, d.y.vals[i], classes, flip, dx, dy, 1./sx, 1./sy, small_object, w, h);

			cvReleaseImage(&src);
			cvReleaseImage(&src);
			}
			free(random_paths);
			return d;
			}
			#else // OPENCV
			#else // OPENCV
			data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, float jitter, float hue, float saturation, float exposure, int small_object)
			{
			c = c ? c : 3;
			char **random_paths = get_random_paths(paths, n, m);
			int i;
			data d = { 0 };
			d.shallow = 0;
			char **random_paths = get_random_paths(paths, n, m);
			int i;
			data d = { 0 };
			d.shallow = 0;

			d.X.rows = n;
			d.X.vals = calloc(d.X.rows, sizeof(float*));
			d.X.cols = hwc;
			d.X.rows = n;
			d.X.vals = calloc(d.X.rows, sizeof(float*));
			d.X.cols = hwc;

			d.y = make_matrix(n, 5 * boxes);
			for (i = 0; i < n; ++i) {
			image orig = load_image(random_paths[i], 0, 0, c);
			d.y = make_matrix(n, 5 * boxes);
			for (i = 0; i < n; ++i) {
			image orig = load_image(random_paths[i], 0, 0, c);

			int oh = orig.h;
			int ow = orig.w;
			int oh = orig.h;
			int ow = orig.w;

			int dw = (ow*jitter);
			int dh = (oh*jitter);
			int dw = (ow*jitter);
			int dh = (oh*jitter);

			int pleft = rand_uniform_strong(-dw, dw);
			int pright = rand_uniform_strong(-dw, dw);
			int ptop = rand_uniform_strong(-dh, dh);
			int pbot = rand_uniform_strong(-dh, dh);
			int pleft = rand_uniform_strong(-dw, dw);
			int pright = rand_uniform_strong(-dw, dw);
			int ptop = rand_uniform_strong(-dh, dh);
			int pbot = rand_uniform_strong(-dh, dh);

			int swidth = ow - pleft - pright;
			int sheight = oh - ptop - pbot;
			int swidth = ow - pleft - pright;
			int sheight = oh - ptop - pbot;

			float sx = (float)swidth / ow;
			float sy = (float)sheight / oh;
			float sx = (float)swidth / ow;
			float sy = (float)sheight / oh;

			int flip = use_flip ? random_gen() % 2 : 0;
			image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
			int flip = use_flip ? random_gen() % 2 : 0;
			image cropped = crop_image(orig, pleft, ptop, swidth, sheight);

			float dx = ((float)pleft / ow) / sx;
			float dy = ((float)ptop / oh) / sy;
			float dx = ((float)pleft / ow) / sx;
			float dy = ((float)ptop / oh) / sy;

			image sized = resize_image(cropped, w, h);
			if (flip) flip_image(sized);
			random_distort_image(sized, hue, saturation, exposure);
			d.X.vals[i] = sized.data;
			image sized = resize_image(cropped, w, h);
			if (flip) flip_image(sized);
			random_distort_image(sized, hue, saturation, exposure);
			d.X.vals[i] = sized.data;

			fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, dx, dy, 1. / sx, 1. / sy, small_object, w, h);
			fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, dx, dy, 1. / sx, 1. / sy, small_object, w, h);

			free_image(orig);
			free_image(cropped);
			}
			free(random_paths);
			return d;
			free_image(orig);
			free_image(cropped);
			}
			free(random_paths);
			return d;
			}
			#endif // OPENCV
			#endif // OPENCV

			void load_thread(void ptr)
			{
			//srand(time(0));
			//srand(time(0));
			//printf("Loading data: %d\n", random_gen());
			load_args a = (struct load_args)ptr;
			if(a.exposure == 0) a.exposure = 1;
			@@ -878,9 +878,9 @@
			} else if (a.type == IMAGE_DATA){
			*(a.im) = load_image(a.path, 0, 0, a.c);
			(a.resized) = resize_image((a.im), a.w, a.h);
			}else if (a.type == LETTERBOX_DATA) {
			*(a.im) = load_image(a.path, 0, 0, a.c);
			(a.resized) = letterbox_image((a.im), a.w, a.h);
			}else if (a.type == LETTERBOX_DATA) {
			*(a.im) = load_image(a.path, 0, 0, a.c);
			(a.resized) = letterbox_image((a.im), a.w, a.h);
			} else if (a.type == TAG_DATA){
			*a.d = load_data_tag(a.paths, a.n, a.m, a.classes, a.flip, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
			}
			@@ -899,7 +899,7 @@

			void load_threads(void ptr)
			{
			//srand(time(0));
			//srand(time(0));
			int i;
			load_args args = (load_args )ptr;
			if (args.threads == 0) args.threads = 1;

			@@ -64,25 +64,25 @@
			void fetch_in_thread(void ptr)
			{
			//in = get_image_from_stream(cap);
			int dont_close_stream = 0; // set 1 if your IP-camera periodically turns off and turns on video-stream
			if(letter_box)
			in_s = get_image_from_stream_letterbox(cap, net.w, net.h, net.c, &in_img, cpp_video_capture, dont_close_stream);
			else
			in_s = get_image_from_stream_resize(cap, net.w, net.h, net.c, &in_img, cpp_video_capture, dont_close_stream);
			int dont_close_stream = 0; // set 1 if your IP-camera periodically turns off and turns on video-stream
			if(letter_box)
			in_s = get_image_from_stream_letterbox(cap, net.w, net.h, net.c, &in_img, cpp_video_capture, dont_close_stream);
			else
			in_s = get_image_from_stream_resize(cap, net.w, net.h, net.c, &in_img, cpp_video_capture, dont_close_stream);
			if(!in_s.data){
			//error("Stream closed.");
			printf("Stream closed.\n");
			flag_exit = 1;
			return EXIT_FAILURE;
			printf("Stream closed.\n");
			flag_exit = 1;
			return EXIT_FAILURE;
			}
			//in_s = resize_image(in, net.w, net.h);


			return 0;
			}

			void detect_in_thread(void ptr)
			{
			float nms = .45; // 0.4F
			float nms = .45; // 0.4F

			layer l = net.layers[net.n-1];
			float *X = det_s.data;
			@@ -94,29 +94,29 @@

			free_image(det_s);

			int nboxes = 0;
			detection *dets = NULL;
			if (letter_box)
			dets = get_network_boxes(&net, in_img->width, in_img->height, demo_thresh, demo_thresh, 0, 1, &nboxes, 1); // letter box
			else
			dets = get_network_boxes(&net, det_s.w, det_s.h, demo_thresh, demo_thresh, 0, 1, &nboxes, 0); // resized
			//if (nms) do_nms_obj(dets, nboxes, l.classes, nms); // bad results
			if (nms) do_nms_sort(dets, nboxes, l.classes, nms);

			int nboxes = 0;
			detection *dets = NULL;
			if (letter_box)
			dets = get_network_boxes(&net, in_img->width, in_img->height, demo_thresh, demo_thresh, 0, 1, &nboxes, 1); // letter box
			else
			dets = get_network_boxes(&net, det_s.w, det_s.h, demo_thresh, demo_thresh, 0, 1, &nboxes, 0); // resized
			//if (nms) do_nms_obj(dets, nboxes, l.classes, nms); // bad results
			if (nms) do_nms_sort(dets, nboxes, l.classes, nms);


			printf("\033[2J");
			printf("\033[1;1H");
			printf("\nFPS:%.1f\n",fps);
			printf("Objects:\n\n");

			ipl_images[demo_index] = det_img;
			det_img = ipl_images[(demo_index + FRAMES / 2 + 1) % FRAMES];
			ipl_images[demo_index] = det_img;
			det_img = ipl_images[(demo_index + FRAMES / 2 + 1) % FRAMES];
			demo_index = (demo_index + 1)%FRAMES;

			draw_detections_cv_v3(det_img, dets, nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes, demo_ext_output);
			free_detections(dets, nboxes);

			return 0;
			draw_detections_cv_v3(det_img, dets, nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes, demo_ext_output);
			free_detections(dets, nboxes);

			return 0;
			}

			double get_wall_time()
			@@ -129,7 +129,7 @@
			}

			void demo(char cfgfile, char weightfile, float thresh, float hier_thresh, int cam_index, const char filename, char *names, int classes,
			int frame_skip, char prefix, char out_filename, int http_stream_port, int dont_show, int ext_output)
			int frame_skip, char prefix, char out_filename, int http_stream_port, int dont_show, int ext_output)
			{
			//skip = frame_skip;
			image **alphabet = load_alphabet();
			@@ -138,40 +138,40 @@
			demo_alphabet = alphabet;
			demo_classes = classes;
			demo_thresh = thresh;
			demo_ext_output = ext_output;
			demo_ext_output = ext_output;
			printf("Demo\n");
			net = parse_network_cfg_custom(cfgfile, 1); // set batch=1
			net = parse_network_cfg_custom(cfgfile, 1); // set batch=1
			if(weightfile){
			load_weights(&net, weightfile);
			}
			//set_batch_network(&net, 1);
			fuse_conv_batchnorm(net);
			fuse_conv_batchnorm(net);
			srand(2222222);

			if(filename){
			printf("video file: %s\n", filename);
			//#ifdef CV_VERSION_EPOCH // OpenCV 2.x
			// cap = cvCaptureFromFile(filename);
			//#else // OpenCV 3.x
			cpp_video_capture = 1;
			cap = get_capture_video_stream(filename);
			//#ifdef CV_VERSION_EPOCH // OpenCV 2.x
			// cap = cvCaptureFromFile(filename);
			//#else // OpenCV 3.x
			cpp_video_capture = 1;
			cap = get_capture_video_stream(filename);
			//#endif
			}else{
			printf("Webcam index: %d\n", cam_index);
			//#ifdef CV_VERSION_EPOCH // OpenCV 2.x
			printf("Webcam index: %d\n", cam_index);
			//#ifdef CV_VERSION_EPOCH // OpenCV 2.x
			// cap = cvCaptureFromCAM(cam_index);
			//#else // OpenCV 3.x
			cpp_video_capture = 1;
			cap = get_capture_webcam(cam_index);
			//#else // OpenCV 3.x
			cpp_video_capture = 1;
			cap = get_capture_webcam(cam_index);
			//#endif
			}

			if (!cap) {
			if (!cap) {
			#ifdef WIN32
			printf("Check that you have copied file opencv_ffmpeg340_64.dll to the same directory where is darknet.exe \n");
			printf("Check that you have copied file opencv_ffmpeg340_64.dll to the same directory where is darknet.exe \n");
			#endif
			error("Couldn't connect to webcam.\n");
			}
			error("Couldn't connect to webcam.\n");
			}

			layer l = net.layers[net.n-1];
			int j;
			@@ -184,51 +184,51 @@
			probs = (float *)calloc(l.wl.hl.n, sizeof(float ));
			for(j = 0; j < l.wl.hl.n; ++j) probs[j] = (float )calloc(l.classes, sizeof(float ));

			flag_exit = 0;
			flag_exit = 0;

			pthread_t fetch_thread;
			pthread_t detect_thread;

			fetch_in_thread(0);
			det_img = in_img;
			det_img = in_img;
			det_s = in_s;

			fetch_in_thread(0);
			detect_in_thread(0);
			det_img = in_img;
			det_img = in_img;
			det_s = in_s;

			for(j = 0; j < FRAMES/2; ++j){
			fetch_in_thread(0);
			detect_in_thread(0);
			det_img = in_img;
			det_img = in_img;
			det_s = in_s;
			}

			int count = 0;
			if(!prefix && !dont_show){
			cvNamedWindow("Demo", CV_WINDOW_NORMAL);
			cvNamedWindow("Demo", CV_WINDOW_NORMAL);
			cvMoveWindow("Demo", 0, 0);
			cvResizeWindow("Demo", 1352, 1013);
			}

			CvVideoWriter* output_video_writer = NULL; // cv::VideoWriter output_video;
			if (out_filename && !flag_exit)
			{
			CvSize size;
			size.width = det_img->width, size.height = det_img->height;
			int src_fps = 25;
			src_fps = get_stream_fps(cap, cpp_video_capture);
			CvVideoWriter* output_video_writer = NULL; // cv::VideoWriter output_video;
			if (out_filename && !flag_exit)
			{
			CvSize size;
			size.width = det_img->width, size.height = det_img->height;
			int src_fps = 25;
			src_fps = get_stream_fps(cap, cpp_video_capture);

			//const char* output_name = "test_dnn_out.avi";
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('H', '2', '6', '4'), src_fps, size, 1);
			output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('D', 'I', 'V', 'X'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'J', 'P', 'G'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'P', '4', 'V'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'P', '4', '2'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('X', 'V', 'I', 'D'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('W', 'M', 'V', '2'), src_fps, size, 1);
			}
			//const char* output_name = "test_dnn_out.avi";
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('H', '2', '6', '4'), src_fps, size, 1);
			output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('D', 'I', 'V', 'X'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'J', 'P', 'G'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'P', '4', 'V'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('M', 'P', '4', '2'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('X', 'V', 'I', 'D'), src_fps, size, 1);
			//output_video_writer = cvCreateVideoWriter(out_filename, CV_FOURCC('W', 'M', 'V', '2'), src_fps, size, 1);
			}

			double before = get_wall_time();

			@@ -239,66 +239,66 @@
			if(pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");

			if(!prefix){
			if (!dont_show) {
			show_image_cv_ipl(show_img, "Demo");
			int c = cvWaitKey(1);
			if (c == 10) {
			if (frame_skip == 0) frame_skip = 60;
			else if (frame_skip == 4) frame_skip = 0;
			else if (frame_skip == 60) frame_skip = 4;
			else frame_skip = 0;
			}
			else if (c == 27 \|\| c == 1048603) // ESC - exit (OpenCV 2.x / 3.x)
			{
			flag_exit = 1;
			}
			}
			if (!dont_show) {
			show_image_cv_ipl(show_img, "Demo");
			int c = cvWaitKey(1);
			if (c == 10) {
			if (frame_skip == 0) frame_skip = 60;
			else if (frame_skip == 4) frame_skip = 0;
			else if (frame_skip == 60) frame_skip = 4;
			else frame_skip = 0;
			}
			else if (c == 27 \|\| c == 1048603) // ESC - exit (OpenCV 2.x / 3.x)
			{
			flag_exit = 1;
			}
			}
			}else{
			char buff[256];
			sprintf(buff, "%s_%08d.jpg", prefix, count);
			cvSaveImage(buff, show_img, 0);
			cvSaveImage(buff, show_img, 0);
			//save_image(disp, buff);
			}

			// if you run it with param -http_port 8090 then open URL in your web-browser: http://localhost:8090
			if (http_stream_port > 0 && show_img) {
			//int port = 8090;
			int port = http_stream_port;
			int timeout = 200;
			int jpeg_quality = 30; // 1 - 100
			send_mjpeg(show_img, port, timeout, jpeg_quality);
			}
			// if you run it with param -http_port 8090 then open URL in your web-browser: http://localhost:8090
			if (http_stream_port > 0 && show_img) {
			//int port = 8090;
			int port = http_stream_port;
			int timeout = 200;
			int jpeg_quality = 30; // 1 - 100
			send_mjpeg(show_img, port, timeout, jpeg_quality);
			}

			// save video file
			if (output_video_writer && show_img) {
			cvWriteFrame(output_video_writer, show_img);
			printf("\n cvWriteFrame \n");
			}
			// save video file
			if (output_video_writer && show_img) {
			cvWriteFrame(output_video_writer, show_img);
			printf("\n cvWriteFrame \n");
			}

			cvReleaseImage(&show_img);
			cvReleaseImage(&show_img);

			pthread_join(fetch_thread, 0);
			pthread_join(detect_thread, 0);

			if (flag_exit == 1) break;
			if (flag_exit == 1) break;

			if(delay == 0){
			show_img = det_img;
			show_img = det_img;
			}
			det_img = in_img;
			det_img = in_img;
			det_s = in_s;
			}else {
			fetch_in_thread(0);
			det_img = in_img;
			det_img = in_img;
			det_s = in_s;
			detect_in_thread(0);

			show_img = det_img;
			if (!dont_show) {
			show_image_cv_ipl(show_img, "Demo");
			cvWaitKey(1);
			}
			cvReleaseImage(&show_img);
			show_img = det_img;
			if (!dont_show) {
			show_image_cv_ipl(show_img, "Demo");
			cvWaitKey(1);
			}
			cvReleaseImage(&show_img);
			}
			--delay;
			if(delay < 0){
			@@ -310,42 +310,42 @@
			before = after;
			}
			}
			printf("input video stream closed. \n");
			if (output_video_writer) {
			cvReleaseVideoWriter(&output_video_writer);
			printf("output_video_writer closed. \n");
			}
			printf("input video stream closed. \n");
			if (output_video_writer) {
			cvReleaseVideoWriter(&output_video_writer);
			printf("output_video_writer closed. \n");
			}

			// free memory
			cvReleaseImage(&show_img);
			cvReleaseImage(&in_img);
			free_image(in_s);
			// free memory
			cvReleaseImage(&show_img);
			cvReleaseImage(&in_img);
			free_image(in_s);

			free(avg);
			for (j = 0; j < FRAMES; ++j) free(predictions[j]);
			for (j = 0; j < FRAMES; ++j) free_image(images[j]);
			free(avg);
			for (j = 0; j < FRAMES; ++j) free(predictions[j]);
			for (j = 0; j < FRAMES; ++j) free_image(images[j]);

			for (j = 0; j < l.wl.hl.n; ++j) free(probs[j]);
			free(boxes);
			free(probs);
			for (j = 0; j < l.wl.hl.n; ++j) free(probs[j]);
			free(boxes);
			free(probs);

			free_ptrs(names, net.layers[net.n - 1].classes);
			free_ptrs(names, net.layers[net.n - 1].classes);

			int i;
			const int nsize = 8;
			for (j = 0; j < nsize; ++j) {
			for (i = 32; i < 127; ++i) {
			free_image(alphabet[j][i]);
			}
			free(alphabet[j]);
			}
			free(alphabet);
			int i;
			const int nsize = 8;
			for (j = 0; j < nsize; ++j) {
			for (i = 32; i < 127; ++i) {
			free_image(alphabet[j][i]);
			}
			free(alphabet[j]);
			}
			free(alphabet);

			free_network(net);
			free_network(net);
			}
			#else
			/*
			 * Fallback used when the project is built without OpenCV: it accepts the
			 * same argument list as the OpenCV-enabled demo() but only reports on
			 * stderr that the demo is unavailable. All arguments are ignored.
			 */
			void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int cam_index, const char *filename, char **names, int classes,
			    int frame_skip, char *prefix, char *out_filename, int http_stream_port, int dont_show, int ext_output)
			{
			    fprintf(stderr, "Demo needs OpenCV for webcam images.\n");
			}

			@@ -87,7 +87,7 @@
			#include <immintrin.h>
			#include <smmintrin.h>

			#else // Linux GCC/Clang
			#else // Linux GCC/Clang
			#include <x86intrin.h>
			#include <ammintrin.h>
			#include <immintrin.h>
			@@ -96,124 +96,124 @@

			/*
			 * Execute the x86 CPUID instruction for the leaf given in `eax` and store
			 * the resulting EAX, EBX, ECX and EDX values in abcd[0..3], in that order.
			 *
			 * abcd - output array with room for four 32-bit values
			 * eax  - CPUID leaf to query
			 */
			void asm_cpuid(uint32_t* abcd, uint32_t eax)
			{
			    uint32_t ebx = 0, edx = 0, ecx = 0;

			    // EBX is saved to EDI and later restored
			    __asm__("movl %%ebx, %%edi;"
			            "cpuid;"
			            "xchgl %%ebx, %%edi;"
			            : "=D"(ebx),
			            "+a"(eax), "+c"(ecx), "=d"(edx));

			    abcd[0] = eax;
			    abcd[1] = ebx;
			    abcd[2] = ecx;
			    abcd[3] = edx;
			}

			#endif

			/*
			 * Report whether every bit of `idFeature` is set in the ECX register
			 * returned by CPUID leaf 1.
			 *
			 * Returns 1 when all requested bits are present, 0 otherwise. Also returns
			 * 0 when leaf 1 cannot be queried: in that case ECX would still hold the
			 * leaf-0 result, which is not a feature mask.
			 */
			int simd_detect_x86(unsigned int idFeature)
			{
			    uint32_t regs[4] = { 0, 0, 0, 0 };    // EAX, EBX, ECX, EDX
			#ifdef _WIN32
			    __cpuid(regs, 0);
			    if (regs[0] < 1U) return 0;            // leaf 1 not supported
			    __cpuid(regs, 1);
			#else
			    // __get_cpuid() checks the highest supported leaf itself and returns 0
			    // when the requested leaf is unavailable.
			    if (!__get_cpuid(1, &regs[0], &regs[1], &regs[2], &regs[3])) return 0;
			#endif

			    if ((regs[2] & idFeature) != idFeature)
			        return 0;
			    return 1;
			}

			/*
			 * Return 1 when simd_detect_x86(AVXFlag) reports the feature, 0 otherwise.
			 * The probe runs only on the first call; the answer is cached in a
			 * function-local static and a one-line notice is printed at that time.
			 */
			int is_fma_avx() {
			    static int result = -1;    // -1 = not probed yet
			    if (result == -1) {
			        result = simd_detect_x86(AVXFlag);
			        if (result == 1) printf(" Used AVX \n");
			        else printf(" Not used AVX \n");
			    }
			    return result;
			}

			// https://software.intel.com/sites/landingpage/IntrinsicsGuide
			/*
			 * C += ALPHA * A * B for row-major, non-transposed operands.
			 *
			 * A is read as M x K with row stride lda, B as K x N with row stride ldb,
			 * and C is updated in place as M x N with row stride ldc.
			 *
			 * When is_fma_avx() reports AVX, each row update is done eight floats at a
			 * time with unaligned 256-bit loads/stores and a scalar loop finishes the
			 * remaining (< 8) columns. Otherwise a plain scalar triple loop is used.
			 * The commented-out SSE (128-bit) variant of the original is not carried over.
			 */
			void gemm_nn(int M, int N, int K, float ALPHA,
			    float *A, int lda,
			    float *B, int ldb,
			    float *C, int ldc)
			{
			    int i, j, k;
			    if (is_fma_avx() == 1) {    // AVX
			        for (i = 0; i < M; ++i) {
			            for (k = 0; k < K; ++k) {
			                float A_PART = ALPHA*A[i*lda + k];
			                __m256 a256, b256, c256, result256;    // AVX
			                a256 = _mm256_set1_ps(A_PART);
			                // Full groups of eight columns. The bound is written as
			                // j + 8 <= N so that N == 0 (or any N < 8) never produces a
			                // negative start index for the tail loop below.
			                for (j = 0; j + 8 <= N; j += 8) {
			                    b256 = _mm256_loadu_ps(&B[k*ldb + j]);
			                    c256 = _mm256_loadu_ps(&C[i*ldc + j]);
			                    // FMA - Intel Haswell (2013), AMD Piledriver (2012)
			                    //result256 = _mm256_fmadd_ps(a256, b256, c256);
			                    result256 = _mm256_mul_ps(a256, b256);
			                    result256 = _mm256_add_ps(result256, c256);
			                    _mm256_storeu_ps(&C[i*ldc + j], result256);
			                }

			                // Scalar tail: continues from where the vector loop stopped.
			                for (; j < N; ++j)
			                    C[i*ldc + j] += A_PART*B[k*ldb + j];
			            }
			        }
			    }
			    else {
			        for (i = 0; i < M; ++i) {
			            for (k = 0; k < K; ++k) {
			                float A_PART = ALPHA*A[i*lda + k];
			                for (j = 0; j < N; ++j) {
			                    C[i*ldc + j] += A_PART*B[k*ldb + j];
			                }
			            }
			        }
			    }
			}
			#else

			/*
			 * Portable scalar version: C += ALPHA * A * B for row-major,
			 * non-transposed operands.
			 *
			 * A is read as M x K with row stride lda, B as K x N with row stride ldb,
			 * and C is updated in place as M x N with row stride ldc. C is accumulated
			 * into, not overwritten, so callers must initialise it.
			 */
			void gemm_nn(int M, int N, int K, float ALPHA,
			    float *A, int lda,
			    float *B, int ldb,
			    float *C, int ldc)
			{
			    int i, j, k;
			    for (i = 0; i < M; ++i) {
			        for (k = 0; k < K; ++k) {
			            // Scaled A element is constant across the inner loop.
			            const float A_PART = ALPHA*A[i*lda + k];
			            for (j = 0; j < N; ++j) {
			                C[i*ldc + j] += A_PART*B[k*ldb + j];
			            }
			        }
			    }
			}
			#endif // __x86_64
			#endif // __x86_64

			void gemm_nt(int M, int N, int K, float ALPHA,
			float *A, int lda,
			@@ -282,18 +282,18 @@
			}
			}

			int t;
			#pragma omp parallel for
			for (t = 0; t < M; ++t) {
			if (!TA && !TB)
			gemm_nn(1, N, K, ALPHA, A + tlda, lda, B, ldb, C + tldc, ldc);
			else if (TA && !TB)
			gemm_tn(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
			else if (!TA && TB)
			gemm_nt(1, N, K, ALPHA, A + tlda, lda, B, ldb, C + tldc, ldc);
			else
			gemm_tt(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
			}
			int t;
			#pragma omp parallel for
			for (t = 0; t < M; ++t) {
			if (!TA && !TB)
			gemm_nn(1, N, K, ALPHA, A + tlda, lda, B, ldb, C + tldc, ldc);
			else if (TA && !TB)
			gemm_tn(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
			else if (!TA && TB)
			gemm_nt(1, N, K, ALPHA, A + tlda, lda, B, ldb, C + tldc, ldc);
			else
			gemm_tt(1, N, K, ALPHA, A + t, lda, B, ldb, C + t*ldc, ldc);
			}
			}

			#ifdef GPU
			@@ -307,7 +307,7 @@
			float *C_gpu, int ldc)
			{
			cublasHandle_t handle = blas_handle();
			cudaError_t stream_status = cublasSetStream(handle, get_cuda_stream());
			cudaError_t stream_status = cublasSetStream(handle, get_cuda_stream());
			cudaError_t status = cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N),
			(TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
			check_error(status);

			@@ -1,5 +1,5 @@
			#include "gettimeofday.h"


			int gettimeofday(struct timeval tv, struct timezone tz)
			{
			FILETIME ft;

			@@ -7,7 +7,7 @@
			//
			// socket related abstractions:
			//
			#ifdef _WIN32
			#ifdef _WIN32
			#pragma comment(lib, "ws2_32.lib")
			#include <winsock.h>
			#include <windows.h>
			@@ -16,8 +16,8 @@
			#define ADDRPOINTER int*
			// Calls WSAStartup() (requesting Winsock 2.1) from the constructor of the
			// single global instance `_init_once`, i.e. during static initialisation.
			struct _INIT_W32DATA
			{
			    WSADATA w;
			    _INIT_W32DATA() { WSAStartup(MAKEWORD(2, 1), &w); }
			} _init_once;
			#else /* ! win32 */
			#include <unistd.h>
			@@ -58,274 +58,274 @@

			class MJPGWriter
			{
			SOCKET sock;
			SOCKET maxfd;
			fd_set master;
			int timeout; // master sock timeout, shutdown after timeout millis.
			int quality; // jpeg compression [1..100]
			SOCKET sock;
			SOCKET maxfd;
			fd_set master;
			int timeout; // master sock timeout, shutdown after timeout millis.
			int quality; // jpeg compression [1..100]

			int _write(int sock, char const*const s, int len)
			{
			if (len < 1) { len = strlen(s); }
			return ::send(sock, s, len, 0);
			}
			int _write(int sock, char const*const s, int len)
			{
			if (len < 1) { len = strlen(s); }
			return ::send(sock, s, len, 0);
			}

			public:

			MJPGWriter(int port = 0, int _timeout = 200000, int _quality = 30)
			: sock(INVALID_SOCKET)
			, timeout(_timeout)
			, quality(_quality)
			{
			FD_ZERO(&master);
			if (port)
			open(port);
			}
			MJPGWriter(int port = 0, int _timeout = 200000, int _quality = 30)
			: sock(INVALID_SOCKET)
			, timeout(_timeout)
			, quality(_quality)
			{
			FD_ZERO(&master);
			if (port)
			open(port);
			}

			~MJPGWriter()
			{
			release();
			}
			~MJPGWriter()
			{
			release();
			}

			bool release()
			{
			if (sock != INVALID_SOCKET)
			::shutdown(sock, 2);
			sock = (INVALID_SOCKET);
			return false;
			}
			bool release()
			{
			if (sock != INVALID_SOCKET)
			::shutdown(sock, 2);
			sock = (INVALID_SOCKET);
			return false;
			}

			bool open(int port)
			{
			sock = ::socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
			bool open(int port)
			{
			sock = ::socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

			SOCKADDR_IN address;
			address.sin_addr.s_addr = INADDR_ANY;
			address.sin_family = AF_INET;
			address.sin_port = htons(port); // ::htons(port);
			if (::bind(sock, (SOCKADDR*)&address, sizeof(SOCKADDR_IN)) == SOCKET_ERROR)
			{
			cerr << "error : couldn't bind sock " << sock << " to port " << port << "!" << endl;
			return release();
			}
			if (::listen(sock, 10) == SOCKET_ERROR)
			{
			cerr << "error : couldn't listen on sock " << sock << " on port " << port << " !" << endl;
			return release();
			}
			FD_ZERO(&master);
			FD_SET(sock, &master);
			maxfd = sock;
			return true;
			}
			SOCKADDR_IN address;
			address.sin_addr.s_addr = INADDR_ANY;
			address.sin_family = AF_INET;
			address.sin_port = htons(port); // ::htons(port);
			if (::bind(sock, (SOCKADDR*)&address, sizeof(SOCKADDR_IN)) == SOCKET_ERROR)
			{
			cerr << "error : couldn't bind sock " << sock << " to port " << port << "!" << endl;
			return release();
			}
			if (::listen(sock, 10) == SOCKET_ERROR)
			{
			cerr << "error : couldn't listen on sock " << sock << " on port " << port << " !" << endl;
			return release();
			}
			FD_ZERO(&master);
			FD_SET(sock, &master);
			maxfd = sock;
			return true;
			}

			bool isOpened()
			{
			return sock != INVALID_SOCKET;
			}
			bool isOpened()
			{
			return sock != INVALID_SOCKET;
			}

			bool write(const Mat & frame)
			{
			fd_set rread = master;
			struct timeval to = { 0,timeout };
			if (::select(maxfd+1, &rread, NULL, NULL, &to) <= 0)
			return true; // nothing broken, there's just noone listening
			bool write(const Mat & frame)
			{
			fd_set rread = master;
			struct timeval to = { 0,timeout };
			if (::select(maxfd+1, &rread, NULL, NULL, &to) <= 0)
			return true; // nothing broken, there's just noone listening

			std::vector<uchar> outbuf;
			std::vector<int> params;
			params.push_back(IMWRITE_JPEG_QUALITY);
			params.push_back(quality);
			cv::imencode(".jpg", frame, outbuf, params);
			size_t outlen = outbuf.size();
			std::vector<uchar> outbuf;
			std::vector<int> params;
			params.push_back(IMWRITE_JPEG_QUALITY);
			params.push_back(quality);
			cv::imencode(".jpg", frame, outbuf, params);
			size_t outlen = outbuf.size();

			#ifdef _WIN32
			for (unsigned i = 0; i<rread.fd_count; i++)
			{
			int addrlen = sizeof(SOCKADDR);
			SOCKET s = rread.fd_array[i]; // fd_set on win is an array, while ...
			#else
			for (int s = 0; s<=maxfd; s++)
			{
			socklen_t addrlen = sizeof(SOCKADDR);
			if (!FD_ISSET(s, &rread)) // ... on linux it's a bitmask ;)
			continue;
			#endif
			if (s == sock) // request on master socket, accept and send main header.
			{
			SOCKADDR_IN address = { 0 };
			SOCKET client = ::accept(sock, (SOCKADDR*)&address, &addrlen);
			if (client == SOCKET_ERROR)
			{
			cerr << "error : couldn't accept connection on sock " << sock << " !" << endl;
			return false;
			}
			maxfd = (maxfd>client ? maxfd : client);
			FD_SET(client, &master);
			_write(client, "HTTP/1.0 200 OK\r\n", 0);
			_write(client,
			"Server: Mozarella/2.2\r\n"
			"Accept-Range: bytes\r\n"
			"Connection: close\r\n"
			"Max-Age: 0\r\n"
			"Expires: 0\r\n"
			"Cache-Control: no-cache, private\r\n"
			"Pragma: no-cache\r\n"
			"Content-Type: multipart/x-mixed-replace; boundary=mjpegstream\r\n"
			"\r\n", 0);
			cerr << "new client " << client << endl;
			}
			else // existing client, just stream pix
			{
			char head[400];
			sprintf(head, "--mjpegstream\r\nContent-Type: image/jpeg\r\nContent-Length: %zu\r\n\r\n", outlen);
			_write(s, head, 0);
			int n = _write(s, (char*)(&outbuf[0]), outlen);
			//cerr << "known client " << s << " " << n << endl;
			if (n < outlen)
			{
			cerr << "kill client " << s << endl;
			::shutdown(s, 2);
			FD_CLR(s, &master);
			}
			}
			}
			return true;
			}
			#ifdef _WIN32
			for (unsigned i = 0; i<rread.fd_count; i++)
			{
			int addrlen = sizeof(SOCKADDR);
			SOCKET s = rread.fd_array[i]; // fd_set on win is an array, while ...
			#else
			for (int s = 0; s<=maxfd; s++)
			{
			socklen_t addrlen = sizeof(SOCKADDR);
			if (!FD_ISSET(s, &rread)) // ... on linux it's a bitmask ;)
			continue;
			#endif
			if (s == sock) // request on master socket, accept and send main header.
			{
			SOCKADDR_IN address = { 0 };
			SOCKET client = ::accept(sock, (SOCKADDR*)&address, &addrlen);
			if (client == SOCKET_ERROR)
			{
			cerr << "error : couldn't accept connection on sock " << sock << " !" << endl;
			return false;
			}
			maxfd = (maxfd>client ? maxfd : client);
			FD_SET(client, &master);
			_write(client, "HTTP/1.0 200 OK\r\n", 0);
			_write(client,
			"Server: Mozarella/2.2\r\n"
			"Accept-Range: bytes\r\n"
			"Connection: close\r\n"
			"Max-Age: 0\r\n"
			"Expires: 0\r\n"
			"Cache-Control: no-cache, private\r\n"
			"Pragma: no-cache\r\n"
			"Content-Type: multipart/x-mixed-replace; boundary=mjpegstream\r\n"
			"\r\n", 0);
			cerr << "new client " << client << endl;
			}
			else // existing client, just stream pix
			{
			char head[400];
			sprintf(head, "--mjpegstream\r\nContent-Type: image/jpeg\r\nContent-Length: %zu\r\n\r\n", outlen);
			_write(s, head, 0);
			int n = _write(s, (char*)(&outbuf[0]), outlen);
			//cerr << "known client " << s << " " << n << endl;
			if (n < outlen)
			{
			cerr << "kill client " << s << endl;
			::shutdown(s, 2);
			FD_CLR(s, &master);
			}
			}
			}
			return true;
			}
			};
			// ----------------------------------------

			// Push one frame to the MJPEG HTTP stream.
			// The writer is a function-local static, so it is constructed from
			// port/timeout/quality on the first call only; later calls reuse it and
			// their port/timeout/quality arguments have no effect.
			void send_mjpeg(IplImage* ipl, int port, int timeout, int quality) {
			    static MJPGWriter wri(port, timeout, quality);
			    cv::Mat mat = cv::cvarrToMat(ipl);
			    wri.write(mat);
			    std::cout << " MJPEG-stream sent. \n";
			}
			// ----------------------------------------

			// Create a heap-allocated cv::VideoCapture for `path` and return it cast to
			// CvCapture*. The result is really a cv::VideoCapture and must only be
			// used by code that casts it back. Returns NULL if construction throws.
			CvCapture* get_capture_video_stream(char *path) {
			    CvCapture* cap = NULL;
			    try {
			        cap = (CvCapture*)new cv::VideoCapture(path);
			    }
			    catch (...) {
			        std::cout << " Error: video-stream " << path << " can't be opened! \n";
			    }
			    return cap;
			}
			// ----------------------------------------

			// Create a heap-allocated cv::VideoCapture for camera `index` and return it
			// cast to CvCapture*. The result is really a cv::VideoCapture and must only
			// be used by code that casts it back. Returns NULL if construction throws.
			CvCapture* get_capture_webcam(int index) {
			    CvCapture* cap = NULL;
			    try {
			        cap = (CvCapture*)new cv::VideoCapture(index);
			        //((cv::VideoCapture*)cap)->set(CV_CAP_PROP_FRAME_WIDTH, 1280);
			        //((cv::VideoCapture*)cap)->set(CV_CAP_PROP_FRAME_HEIGHT, 960);
			    }
			    catch (...) {
			        std::cout << " Error: Web-camera " << index << " can't be opened! \n";
			    }
			    return cap;
			}
			// ----------------------------------------

			// Read the next frame from `cap`, which is treated as a cv::VideoCapture
			// that was cast to CvCapture*, and return a cloned IplImage owned by the
			// caller. Returns NULL when the capture is not opened or an exception
			// is thrown.
			IplImage* get_webcam_frame(CvCapture *cap) {
			    IplImage* src = NULL;
			    try {
			        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;
			        cv::Mat frame;
			        if (cpp_cap.isOpened())
			        {
			            cpp_cap >> frame;
			            // `tmp` only wraps the Mat data; the clone gives the caller its own copy.
			            IplImage tmp = frame;
			            src = cvCloneImage(&tmp);
			        }
			        else {
			            std::cout << " Video-stream stoped! \n";
			        }
			    }
			    catch (...) {
			        std::cout << " Video-stream stoped! \n";
			    }
			    return src;
			}

			// Query the FPS property of `cap`, which is treated as a cv::VideoCapture
			// that was cast to CvCapture*. The value is truncated to int. If the query
			// throws, the default of 25 is returned.
			int get_stream_fps_cpp(CvCapture *cap) {
			    int fps = 25;
			    try {
			        cv::VideoCapture &cpp_cap = *(cv::VideoCapture *)cap;
			#ifndef CV_VERSION_EPOCH    // OpenCV 3.x
			        fps = cpp_cap.get(CAP_PROP_FPS);
			#else                        // OpenCV 2.x
			        fps = cpp_cap.get(CV_CAP_PROP_FPS);
			#endif
			    }
			    catch (...) {
			        std::cout << " Can't get FPS of source videofile. For output video FPS = 25 by default. \n";
			    }
			    return fps;
			}
			// ----------------------------------------
			extern "C" {
			image ipl_to_image(IplImage* src); // image.c
			image ipl_to_image(IplImage* src); // image.c
			}

			// Crop, resize, optionally flip and colour-jitter an image, then convert it
			// to the project's `image` type via ipl_to_image().
			//
			//   ipl                          - source image
			//   w, h                         - output size passed to cv::resize
			//   pleft, ptop, swidth, sheight - crop rectangle in source coordinates; parts
			//                                  outside the source are filled with zeros
			//   flip                         - non-zero: mirror around the y-axis
			//   jitter                       - not used by this function
			//   dhue, dsat, dexp             - hue offset (scaled by 179) and saturation /
			//                                  value multipliers applied in HSV space;
			//                                  images with fewer than 3 channels are only
			//                                  multiplied by dexp
			image image_data_augmentation(IplImage* ipl, int w, int h,
			    int pleft, int ptop, int swidth, int sheight, int flip,
			    float jitter, float dhue, float dsat, float dexp)
			{
			    cv::Mat img = cv::cvarrToMat(ipl);

			    // crop
			    cv::Rect src_rect(pleft, ptop, swidth, sheight);
			    cv::Rect img_rect(cv::Point2i(0, 0), img.size());
			    cv::Rect new_src_rect = src_rect & img_rect;

			    // Where the valid part of the crop lands inside the zero-filled canvas.
			    cv::Rect dst_rect(cv::Point2i(std::max(0, -pleft), std::max(0, -ptop)), new_src_rect.size());

			    cv::Mat cropped(cv::Size(src_rect.width, src_rect.height), img.type());
			    cropped.setTo(cv::Scalar::all(0));

			    img(new_src_rect).copyTo(cropped(dst_rect));

			    // resize
			    cv::Mat sized;
			    cv::resize(cropped, sized, cv::Size(w, h), 0, 0, INTER_LINEAR);

			    // flip
			    if (flip) {
			        cv::flip(sized, cropped, 1);    // 0 - x-axis, 1 - y-axis, -1 - both axes (x & y)
			        sized = cropped.clone();
			    }

			    // HSV augmentation
			    // CV_BGR2HSV, CV_RGB2HSV, CV_HSV2BGR, CV_HSV2RGB
			    if (ipl->nChannels >= 3)
			    {
			        cv::Mat hsv_src;
			        cvtColor(sized, hsv_src, CV_BGR2HSV);    // also BGR -> RGB

			        std::vector<cv::Mat> hsv;
			        cv::split(hsv_src, hsv);

			        hsv[1] *= dsat;
			        hsv[2] *= dexp;
			        hsv[0] += 179 * dhue;

			        cv::merge(hsv, hsv_src);

			        cvtColor(hsv_src, sized, CV_HSV2RGB);    // now RGB instead of BGR
			    }
			    else
			    {
			        sized *= dexp;
			    }

			    // Mat -> IplImage -> image
			    IplImage src = sized;
			    image out = ipl_to_image(&src);

			    return out;
			}


			#endif // OPENCV
			#endif // OPENCV

			@@ -14,8 +14,8 @@
			int get_stream_fps_cpp(CvCapture *cap);

			image image_data_augmentation(IplImage* ipl, int w, int h,
			int pleft, int ptop, int swidth, int sheight, int flip,
			float jitter, float dhue, float dsat, float dexp);
			int pleft, int ptop, int swidth, int sheight, int flip,
			float jitter, float dhue, float dsat, float dexp);

			#ifdef __cplusplus
			}

			@@ -5,11 +5,11 @@

			/* Allocate an empty list: size 0, no front node, no back node. */
			list *make_list()
			{
			    list *l = malloc(sizeof(list));
			    l->size = 0;
			    l->front = 0;
			    l->back = 0;
			    return l;
			}

			/*
			@@ -40,55 +40,55 @@

			/*
			 * Append `val` to the back of list `l`. The list stores the pointer
			 * itself; the pointed-to data is not copied.
			 */
			void list_insert(list *l, void *val)
			{
			    node *new_node = malloc(sizeof(node));
			    new_node->val = val;
			    new_node->next = 0;

			    if(!l->back){
			        /* empty list: the new node is also the front */
			        l->front = new_node;
			        new_node->prev = 0;
			    }else{
			        l->back->next = new_node;
			        new_node->prev = l->back;
			    }
			    l->back = new_node;
			    ++l->size;
			}

			/*
			 * Free `n` and every node reachable from it through ->next.
			 * The ->val payloads are not freed.
			 */
			void free_node(node *n)
			{
			    node *next;
			    while(n) {
			        next = n->next;    /* read the link before the node is released */
			        free(n);
			        n = next;
			    }
			}

			/* Free all nodes of `l` (via free_node) and then the list header itself. */
			void free_list(list *l)
			{
			    free_node(l->front);
			    free(l);
			}

			/*
			 * Call free() on the ->val payload of every node in `l`.
			 * The nodes and the list header are left allocated.
			 */
			void free_list_contents(list *l)
			{
			    node *n = l->front;
			    while(n){
			        free(n->val);
			        n = n->next;
			    }
			}

			/*
			 * For a list whose payloads are `kvp` records: free each record's `key`
			 * and then the record itself. The nodes and the list header are left
			 * allocated.
			 */
			void free_list_contents_kvp(list *l)
			{
			    node *n = l->front;
			    while (n) {
			        kvp *p = n->val;
			        free(p->key);
			        free(n->val);
			        n = n->next;
			    }
			}

			void *list_to_array(list l)

			@@ -33,19 +33,19 @@

			/*
			 * Build a heap-allocated network from a cfg file, with `batch` passed to
			 * parse_network_cfg_custom().
			 *
			 * weights - weights file; skipped when NULL or an empty string
			 * clear   - non-zero resets the network's `seen` counter to 0
			 *
			 * Returns the network pointer obtained from calloc().
			 */
			network *load_network_custom(char *cfg, char *weights, int clear, int batch)
			{
			    printf(" Try to load cfg: %s, weights: %s, clear = %d \n", cfg, weights, clear);
			    network *net = calloc(1, sizeof(network));
			    *net = parse_network_cfg_custom(cfg, batch);
			    if (weights && weights[0] != 0) {
			        load_weights(net, weights);
			    }
			    if (clear) (*net->seen) = 0;
			    return net;
			}

			/* Same as load_network_custom() with the batch argument fixed to 0. */
			network *load_network(char *cfg, char *weights, int clear)
			{
			    return load_network_custom(cfg, weights, clear, 0);
			}

			int get_current_batch(network net)
			@@ -67,23 +67,23 @@

			/*
			 * For every layer that has a state_gpu and/or h_gpu buffer, fill the
			 * l.outputs values starting at offset l.outputs*b with zeros.
			 * Without GPU support compiled in, the loop body is empty.
			 */
			void reset_network_state(network *net, int b)
			{
			    int i;
			    for (i = 0; i < net->n; ++i) {
			#ifdef GPU
			        layer l = net->layers[i];
			        if (l.state_gpu) {
			            fill_ongpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
			        }
			        if (l.h_gpu) {
			            fill_ongpu(l.outputs, 0, l.h_gpu + l.outputs*b, 1);
			        }
			#endif
			    }
			}

			/* Reset the state of the network at index 0 (see reset_network_state). */
			void reset_rnn(network *net)
			{
			    reset_network_state(net, 0);
			}

			float get_current_rate(network net)
			@@ -91,7 +91,7 @@
			int batch_num = get_current_batch(net);
			int i;
			float rate;
			if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
			if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
			switch (net.policy) {
			case CONSTANT:
			return net.learning_rate;
			@@ -108,7 +108,7 @@
			case EXP:
			return net.learning_rate * pow(net.gamma, batch_num);
			case POLY:
			return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
			return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
			//if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
			//return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
			case RANDOM:
			@@ -182,10 +182,10 @@
			net.input_gpu = calloc(1, sizeof(float *));
			net.truth_gpu = calloc(1, sizeof(float *));

			net.input16_gpu = calloc(1, sizeof(float *));
			net.output16_gpu = calloc(1, sizeof(float *));
			net.max_input16_size = calloc(1, sizeof(size_t));
			net.max_output16_size = calloc(1, sizeof(size_t));
			net.input16_gpu = calloc(1, sizeof(float *));
			net.output16_gpu = calloc(1, sizeof(float *));
			net.max_input16_size = calloc(1, sizeof(size_t));
			net.max_output16_size = calloc(1, sizeof(size_t));
			#endif
			return net;
			}
			@@ -362,20 +362,20 @@
			net->layers[i].batch = b;
			#ifdef CUDNN
			if(net->layers[i].type == CONVOLUTIONAL){
			cudnn_convolutional_setup(net->layers + i, cudnn_fastest);
			/*
			layer *l = net->layers + i;
			cudnn_convolutional_setup(net->layers + i, cudnn_fastest);
			/*
			layer *l = net->layers + i;
			cudnn_convolutional_setup(l, cudnn_fastest);
			// check for excessive memory consumption
			size_t free_byte;
			size_t total_byte;
			check_error(cudaMemGetInfo(&free_byte, &total_byte));
			if (l->workspace_size > free_byte \|\| l->workspace_size >= total_byte / 2) {
			printf(" used slow CUDNN algo without Workspace! \n");
			cudnn_convolutional_setup(l, cudnn_smallest);
			l->workspace_size = get_workspace_size(*l);
			}
			*/
			// check for excessive memory consumption
			size_t free_byte;
			size_t total_byte;
			check_error(cudaMemGetInfo(&free_byte, &total_byte));
			if (l->workspace_size > free_byte \|\| l->workspace_size >= total_byte / 2) {
			printf(" used slow CUDNN algo without Workspace! \n");
			cudnn_convolutional_setup(l, cudnn_smallest);
			l->workspace_size = get_workspace_size(*l);
			}
			*/
			}
			#endif
			}
			@@ -387,12 +387,12 @@
			cuda_set_device(net->gpu_index);
			if(gpu_index >= 0){
			cuda_free(net->workspace);
			if (net->input_gpu) {
			cuda_free(*net->input_gpu);
			*net->input_gpu = 0;
			cuda_free(*net->truth_gpu);
			*net->truth_gpu = 0;
			}
			if (net->input_gpu) {
			cuda_free(*net->input_gpu);
			*net->input_gpu = 0;
			cuda_free(*net->truth_gpu);
			*net->truth_gpu = 0;
			}
			}
			#endif
			int i;
			@@ -405,7 +405,7 @@
			//fflush(stderr);
			for (i = 0; i < net->n; ++i){
			layer l = net->layers[i];
			//printf(" %d: layer = %d,", i, l.type);
			//printf(" %d: layer = %d,", i, l.type);
			if(l.type == CONVOLUTIONAL){
			resize_convolutional_layer(&l, w, h);
			}else if(l.type == CROP){
			@@ -414,14 +414,14 @@
			resize_maxpool_layer(&l, w, h);
			}else if(l.type == REGION){
			resize_region_layer(&l, w, h);
			}else if (l.type == YOLO) {
			resize_yolo_layer(&l, w, h);
			}else if (l.type == YOLO) {
			resize_yolo_layer(&l, w, h);
			}else if(l.type == ROUTE){
			resize_route_layer(&l, net);
			}else if (l.type == SHORTCUT) {
			resize_shortcut_layer(&l, w, h);
			}else if (l.type == UPSAMPLE) {
			resize_upsample_layer(&l, w, h);
			}else if (l.type == SHORTCUT) {
			resize_shortcut_layer(&l, w, h);
			}else if (l.type == UPSAMPLE) {
			resize_upsample_layer(&l, w, h);
			}else if(l.type == REORG){
			resize_reorg_layer(&l, w, h);
			}else if(l.type == AVGPOOL){
			@@ -431,7 +431,7 @@
			}else if(l.type == COST){
			resize_cost_layer(&l, inputs);
			}else{
			fprintf(stderr, "Resizing type %d \n", (int)l.type);
			fprintf(stderr, "Resizing type %d \n", (int)l.type);
			error("Cannot resize this type of layer");
			}
			if(l.workspace_size > workspace_size) workspace_size = l.workspace_size;
			@@ -443,9 +443,9 @@
			}
			#ifdef GPU
			if(gpu_index >= 0){
			printf(" try to allocate workspace = %zu * sizeof(float), ", workspace_size / sizeof(float) + 1);
			printf(" try to allocate workspace = %zu * sizeof(float), ", workspace_size / sizeof(float) + 1);
			net->workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
			printf(" CUDA allocate done! \n");
			printf(" CUDA allocate done! \n");
			}else {
			free(net->workspace);
			net->workspace = calloc(1, workspace_size);
			@@ -551,112 +551,112 @@

			int num_detections(network *net, float thresh)
			{
			int i;
			int s = 0;
			for (i = 0; i < net->n; ++i) {
			layer l = net->layers[i];
			if (l.type == YOLO) {
			s += yolo_num_detections(l, thresh);
			}
			if (l.type == DETECTION \|\| l.type == REGION) {
			s += l.wl.hl.n;
			}
			}
			return s;
			int i;
			int s = 0;
			for (i = 0; i < net->n; ++i) {
			layer l = net->layers[i];
			if (l.type == YOLO) {
			s += yolo_num_detections(l, thresh);
			}
			if (l.type == DETECTION \|\| l.type == REGION) {
			s += l.wl.hl.n;
			}
			}
			return s;
			}

			detection make_network_boxes(network net, float thresh, int *num)
			{
			layer l = net->layers[net->n - 1];
			int i;
			int nboxes = num_detections(net, thresh);
			if (num) *num = nboxes;
			detection *dets = calloc(nboxes, sizeof(detection));
			for (i = 0; i < nboxes; ++i) {
			dets[i].prob = calloc(l.classes, sizeof(float));
			if (l.coords > 4) {
			dets[i].mask = calloc(l.coords - 4, sizeof(float));
			}
			}
			return dets;
			layer l = net->layers[net->n - 1];
			int i;
			int nboxes = num_detections(net, thresh);
			if (num) *num = nboxes;
			detection *dets = calloc(nboxes, sizeof(detection));
			for (i = 0; i < nboxes; ++i) {
			dets[i].prob = calloc(l.classes, sizeof(float));
			if (l.coords > 4) {
			dets[i].mask = calloc(l.coords - 4, sizeof(float));
			}
			}
			return dets;
			}


			void custom_get_region_detections(layer l, int w, int h, int net_w, int net_h, float thresh, int map, float hier, int relative, detection dets, int letter)
			{
			box boxes = calloc(l.wl.h*l.n, sizeof(box));
			float *probs = calloc(l.wl.hl.n, sizeof(float ));
			int i, j;
			for (j = 0; j < l.wl.hl.n; ++j) probs[j] = calloc(l.classes, sizeof(float));
			get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, map);
			for (j = 0; j < l.wl.hl.n; ++j) {
			dets[j].classes = l.classes;
			dets[j].bbox = boxes[j];
			dets[j].objectness = 1;
			for (i = 0; i < l.classes; ++i) {
			dets[j].prob[i] = probs[j][i];
			}
			}
			box boxes = calloc(l.wl.h*l.n, sizeof(box));
			float *probs = calloc(l.wl.hl.n, sizeof(float ));
			int i, j;
			for (j = 0; j < l.wl.hl.n; ++j) probs[j] = calloc(l.classes, sizeof(float));
			get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, map);
			for (j = 0; j < l.wl.hl.n; ++j) {
			dets[j].classes = l.classes;
			dets[j].bbox = boxes[j];
			dets[j].objectness = 1;
			for (i = 0; i < l.classes; ++i) {
			dets[j].prob[i] = probs[j][i];
			}
			}

			free(boxes);
			free_ptrs((void *)probs, l.wl.h*l.n);
			free(boxes);
			free_ptrs((void *)probs, l.wl.h*l.n);

			//correct_region_boxes(dets, l.wl.hl.n, w, h, net_w, net_h, relative);
			correct_yolo_boxes(dets, l.wl.hl.n, w, h, net_w, net_h, relative, letter);
			//correct_region_boxes(dets, l.wl.hl.n, w, h, net_w, net_h, relative);
			correct_yolo_boxes(dets, l.wl.hl.n, w, h, net_w, net_h, relative, letter);
			}

			void fill_network_boxes(network net, int w, int h, float thresh, float hier, int map, int relative, detection *dets, int letter)
			{
			int prev_classes = -1;
			int j;
			for (j = 0; j < net->n; ++j) {
			layer l = net->layers[j];
			if (l.type == YOLO) {
			int count = get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets, letter);
			dets += count;
			if (prev_classes < 0) prev_classes = l.classes;
			else if (prev_classes != l.classes) {
			printf(" Error: Different [yolo] layers have different number of classes = %d and %d - check your cfg-file! \n",
			prev_classes, l.classes);
			}
			}
			if (l.type == REGION) {
			custom_get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets, letter);
			//get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets);
			dets += l.wl.hl.n;
			}
			if (l.type == DETECTION) {
			get_detection_detections(l, w, h, thresh, dets);
			dets += l.wl.hl.n;
			}
			}
			int prev_classes = -1;
			int j;
			for (j = 0; j < net->n; ++j) {
			layer l = net->layers[j];
			if (l.type == YOLO) {
			int count = get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets, letter);
			dets += count;
			if (prev_classes < 0) prev_classes = l.classes;
			else if (prev_classes != l.classes) {
			printf(" Error: Different [yolo] layers have different number of classes = %d and %d - check your cfg-file! \n",
			prev_classes, l.classes);
			}
			}
			if (l.type == REGION) {
			custom_get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets, letter);
			//get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets);
			dets += l.wl.hl.n;
			}
			if (l.type == DETECTION) {
			get_detection_detections(l, w, h, thresh, dets);
			dets += l.wl.hl.n;
			}
			}
			}

			detection get_network_boxes(network net, int w, int h, float thresh, float hier, int map, int relative, int num, int letter)
			{
			detection *dets = make_network_boxes(net, thresh, num);
			fill_network_boxes(net, w, h, thresh, hier, map, relative, dets, letter);
			return dets;
			detection *dets = make_network_boxes(net, thresh, num);
			fill_network_boxes(net, w, h, thresh, hier, map, relative, dets, letter);
			return dets;
			}

			void free_detections(detection *dets, int n)
			{
			int i;
			for (i = 0; i < n; ++i) {
			free(dets[i].prob);
			if (dets[i].mask) free(dets[i].mask);
			}
			free(dets);
			int i;
			for (i = 0; i < n; ++i) {
			free(dets[i].prob);
			if (dets[i].mask) free(dets[i].mask);
			}
			free(dets);
			}

			float network_predict_image(network net, image im)
			{
			//image imr = letterbox_image(im, net->w, net->h);
			image imr = resize_image(im, net->w, net->h);
			set_batch_network(net, 1);
			float p = network_predict(net, imr.data);
			free_image(imr);
			return p;
			//image imr = letterbox_image(im, net->w, net->h);
			image imr = resize_image(im, net->w, net->h);
			set_batch_network(net, 1);
			float p = network_predict(net, imr.data);
			free_image(imr);
			return p;
			}

			int network_width(network *net) { return net->w; }
			@@ -780,70 +780,70 @@

			void free_network(network net)
			{
			int i;
			for (i = 0; i < net.n; ++i) {
			free_layer(net.layers[i]);
			}
			free(net.layers);
			int i;
			for (i = 0; i < net.n; ++i) {
			free_layer(net.layers[i]);
			}
			free(net.layers);

			free(net.scales);
			free(net.steps);
			free(net.seen);
			free(net.scales);
			free(net.steps);
			free(net.seen);

			#ifdef GPU
			if (gpu_index >= 0) cuda_free(net.workspace);
			else free(net.workspace);
			if (net.input_gpu) cuda_free(net.input_gpu);
			if (net.truth_gpu) cuda_free(net.truth_gpu);
			if (net.input_gpu) free(net.input_gpu);
			if (net.truth_gpu) free(net.truth_gpu);
			if (gpu_index >= 0) cuda_free(net.workspace);
			else free(net.workspace);
			if (net.input_gpu) cuda_free(net.input_gpu);
			if (net.truth_gpu) cuda_free(net.truth_gpu);
			if (net.input_gpu) free(net.input_gpu);
			if (net.truth_gpu) free(net.truth_gpu);

			if (net.input16_gpu) cuda_free(net.input16_gpu);
			if (net.output16_gpu) cuda_free(net.output16_gpu);
			if (net.input16_gpu) free(net.input16_gpu);
			if (net.output16_gpu) free(net.output16_gpu);
			if (net.max_input16_size) free(net.max_input16_size);
			if (net.max_output16_size) free(net.max_output16_size);
			if (net.input16_gpu) cuda_free(net.input16_gpu);
			if (net.output16_gpu) cuda_free(net.output16_gpu);
			if (net.input16_gpu) free(net.input16_gpu);
			if (net.output16_gpu) free(net.output16_gpu);
			if (net.max_input16_size) free(net.max_input16_size);
			if (net.max_output16_size) free(net.max_output16_size);
			#else
			free(net.workspace);
			free(net.workspace);
			#endif
			}


			void fuse_conv_batchnorm(network net)
			{
			int j;
			for (j = 0; j < net.n; ++j) {
			layer *l = &net.layers[j];
			int j;
			for (j = 0; j < net.n; ++j) {
			layer *l = &net.layers[j];

			if (l->type == CONVOLUTIONAL) {
			//printf(" Merges Convolutional-%d and batch_norm \n", j);
			if (l->type == CONVOLUTIONAL) {
			//printf(" Merges Convolutional-%d and batch_norm \n", j);

			if (l->batch_normalize) {
			int f;
			for (f = 0; f < l->n; ++f)
			{
			l->biases[f] = l->biases[f] - (double)l->scales[f] * l->rolling_mean[f] / (sqrt((double)l->rolling_variance[f]) + .000001f);
			if (l->batch_normalize) {
			int f;
			for (f = 0; f < l->n; ++f)
			{
			l->biases[f] = l->biases[f] - (double)l->scales[f] * l->rolling_mean[f] / (sqrt((double)l->rolling_variance[f]) + .000001f);

			const size_t filter_size = l->sizel->sizel->c;
			int i;
			for (i = 0; i < filter_size; ++i) {
			int w_index = f*filter_size + i;
			const size_t filter_size = l->sizel->sizel->c;
			int i;
			for (i = 0; i < filter_size; ++i) {
			int w_index = f*filter_size + i;

			l->weights[w_index] = (double)l->weights[w_index] * l->scales[f] / (sqrt((double)l->rolling_variance[f]) + .000001f);
			}
			}
			l->weights[w_index] = (double)l->weights[w_index] * l->scales[f] / (sqrt((double)l->rolling_variance[f]) + .000001f);
			}
			}

			l->batch_normalize = 0;
			l->batch_normalize = 0;
			#ifdef GPU
			if (gpu_index >= 0) {
			push_convolutional_layer(*l);
			}
			if (gpu_index >= 0) {
			push_convolutional_layer(*l);
			}
			#endif
			}
			}
			else {
			//printf(" Fusion skip layer type: %d \n", l->type);
			}
			}
			}
			}
			else {
			//printf(" Fusion skip layer type: %d \n", l->type);
			}
			}
			}

			@@ -55,23 +55,23 @@
			fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
			}
			l.forward_gpu(l, state);
			if(net.wait_stream)
			cudaStreamSynchronize(get_cuda_stream());
			if(net.wait_stream)
			cudaStreamSynchronize(get_cuda_stream());
			state.input = l.output_gpu;
			/*
			cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
			if (l.out_w >= 0 && l.out_h >= 1 && l.c >= 3) {
			int j;
			for (j = 0; j < l.out_c; ++j) {
			image img = make_image(l.out_w, l.out_h, 3);
			memcpy(img.data, l.output+ l.out_wl.out_hj, l.out_wl.out_h 1 * sizeof(float));
			char buff[256];
			sprintf(buff, "layer-%d slice-%d", i, j);
			show_image(img, buff);
			}
			cvWaitKey(0); // wait press-key in console
			cvDestroyAllWindows();
			}
			cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
			if (l.out_w >= 0 && l.out_h >= 1 && l.c >= 3) {
			int j;
			for (j = 0; j < l.out_c; ++j) {
			image img = make_image(l.out_w, l.out_h, 3);
			memcpy(img.data, l.output+ l.out_wl.out_hj, l.out_wl.out_h 1 * sizeof(float));
			char buff[256];
			sprintf(buff, "layer-%d slice-%d", i, j);
			show_image(img, buff);
			}
			cvWaitKey(0); // wait press-key in console
			cvDestroyAllWindows();
			}
			*/
			}
			}
			@@ -133,14 +133,14 @@
			state.truth = *net.truth_gpu;
			state.train = 1;
			#ifdef CUDNN_HALF
			int i;
			for (i = 0; i < net.n; ++i) {
			layer l = net.layers[i];
			cuda_convert_f32_to_f16(l.weights_gpu, l.cl.nl.size*l.size, l.weights_gpu16);
			}
			int i;
			for (i = 0; i < net.n; ++i) {
			layer l = net.layers[i];
			cuda_convert_f32_to_f16(l.weights_gpu, l.cl.nl.size*l.size, l.weights_gpu16);
			}
			#endif
			forward_network_gpu(net, state);
			//cudaStreamSynchronize(get_cuda_stream());
			//cudaStreamSynchronize(get_cuda_stream());
			backward_network_gpu(net, state);
			}

			@@ -421,8 +421,8 @@

			float network_predict_gpu(network net, float input)
			{
			if (net.gpu_index != cuda_get_device())
			cuda_set_device(net.gpu_index);
			if (net.gpu_index != cuda_get_device())
			cuda_set_device(net.gpu_index);
			int size = get_network_input_size(net) * net.batch;
			network_state state;
			state.index = 0;

			@@ -34,21 +34,21 @@

			metadata get_metadata(char *file)
			{
			metadata m = { 0 };
			list *options = read_data_cfg(file);
			metadata m = { 0 };
			list *options = read_data_cfg(file);

			char *name_list = option_find_str(options, "names", 0);
			if (!name_list) name_list = option_find_str(options, "labels", 0);
			if (!name_list) {
			fprintf(stderr, "No names or labels found\n");
			}
			else {
			m.names = get_labels(name_list);
			}
			m.classes = option_find_int(options, "classes", 2);
			free_list(options);
			printf("Loaded - names_list: %s, classes = %d \n", name_list, m.classes);
			return m;
			char *name_list = option_find_str(options, "names", 0);
			if (!name_list) name_list = option_find_str(options, "labels", 0);
			if (!name_list) {
			fprintf(stderr, "No names or labels found\n");
			}
			else {
			m.names = get_labels(name_list);
			}
			m.classes = option_find_int(options, "classes", 2);
			free_list(options);
			printf("Loaded - names_list: %s, classes = %d \n", name_list, m.classes);
			return m;
			}

			int read_option(char s, list options)

			@@ -49,7 +49,7 @@
			if (strcmp(type, "[cost]")==0) return COST;
			if (strcmp(type, "[detection]")==0) return DETECTION;
			if (strcmp(type, "[region]")==0) return REGION;
			if (strcmp(type, "[yolo]") == 0) return YOLO;
			if (strcmp(type, "[yolo]") == 0) return YOLO;
			if (strcmp(type, "[local]")==0) return LOCAL;
			if (strcmp(type, "[conv]")==0
			\|\| strcmp(type, "[convolutional]")==0) return CONVOLUTIONAL;
			@@ -64,7 +64,7 @@
			if (strcmp(type, "[max]")==0
			\|\| strcmp(type, "[maxpool]")==0) return MAXPOOL;
			if (strcmp(type, "[reorg]")==0) return REORG;
			if (strcmp(type, "[reorg_old]") == 0) return REORG_OLD;
			if (strcmp(type, "[reorg_old]") == 0) return REORG_OLD;
			if (strcmp(type, "[avg]")==0
			\|\| strcmp(type, "[avgpool]")==0) return AVGPOOL;
			if (strcmp(type, "[dropout]")==0) return DROPOUT;
			@@ -74,7 +74,7 @@
			if (strcmp(type, "[soft]")==0
			\|\| strcmp(type, "[softmax]")==0) return SOFTMAX;
			if (strcmp(type, "[route]")==0) return ROUTE;
			if (strcmp(type, "[upsample]") == 0) return UPSAMPLE;
			if (strcmp(type, "[upsample]") == 0) return UPSAMPLE;
			return BLANK;
			}

			@@ -241,68 +241,68 @@

			int parse_yolo_mask(char a, int *num)
			{
			int *mask = 0;
			if (a) {
			int len = strlen(a);
			int n = 1;
			int i;
			for (i = 0; i < len; ++i) {
			if (a[i] == ',') ++n;
			}
			mask = calloc(n, sizeof(int));
			for (i = 0; i < n; ++i) {
			int val = atoi(a);
			mask[i] = val;
			a = strchr(a, ',') + 1;
			}
			*num = n;
			}
			return mask;
			int *mask = 0;
			if (a) {
			int len = strlen(a);
			int n = 1;
			int i;
			for (i = 0; i < len; ++i) {
			if (a[i] == ',') ++n;
			}
			mask = calloc(n, sizeof(int));
			for (i = 0; i < n; ++i) {
			int val = atoi(a);
			mask[i] = val;
			a = strchr(a, ',') + 1;
			}
			*num = n;
			}
			return mask;
			}

			layer parse_yolo(list *options, size_params params)
			{
			int classes = option_find_int(options, "classes", 20);
			int total = option_find_int(options, "num", 1);
			int num = total;
			int classes = option_find_int(options, "classes", 20);
			int total = option_find_int(options, "num", 1);
			int num = total;

			char *a = option_find_str(options, "mask", 0);
			int *mask = parse_yolo_mask(a, &num);
			int max_boxes = option_find_int_quiet(options, "max", 90);
			layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes, max_boxes);
			if (l.outputs != params.inputs) {
			printf("Error: l.outputs == params.inputs \n");
			printf("filters= in the [convolutional]-layer doesn't correspond to classes= or mask= in [yolo]-layer \n");
			exit(EXIT_FAILURE);
			}
			//assert(l.outputs == params.inputs);
			char *a = option_find_str(options, "mask", 0);
			int *mask = parse_yolo_mask(a, &num);
			int max_boxes = option_find_int_quiet(options, "max", 90);
			layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes, max_boxes);
			if (l.outputs != params.inputs) {
			printf("Error: l.outputs == params.inputs \n");
			printf("filters= in the [convolutional]-layer doesn't correspond to classes= or mask= in [yolo]-layer \n");
			exit(EXIT_FAILURE);
			}
			//assert(l.outputs == params.inputs);

			//l.max_boxes = option_find_int_quiet(options, "max", 90);
			l.jitter = option_find_float(options, "jitter", .2);
			l.focal_loss = option_find_int_quiet(options, "focal_loss", 0);
			//l.max_boxes = option_find_int_quiet(options, "max", 90);
			l.jitter = option_find_float(options, "jitter", .2);
			l.focal_loss = option_find_int_quiet(options, "focal_loss", 0);

			l.ignore_thresh = option_find_float(options, "ignore_thresh", .5);
			l.truth_thresh = option_find_float(options, "truth_thresh", 1);
			l.random = option_find_int_quiet(options, "random", 0);
			l.ignore_thresh = option_find_float(options, "ignore_thresh", .5);
			l.truth_thresh = option_find_float(options, "truth_thresh", 1);
			l.random = option_find_int_quiet(options, "random", 0);

			char *map_file = option_find_str(options, "map", 0);
			if (map_file) l.map = read_map(map_file);
			char *map_file = option_find_str(options, "map", 0);
			if (map_file) l.map = read_map(map_file);

			a = option_find_str(options, "anchors", 0);
			if (a) {
			int len = strlen(a);
			int n = 1;
			int i;
			for (i = 0; i < len; ++i) {
			if (a[i] == ',') ++n;
			}
			for (i = 0; i < n && i < total*2; ++i) {
			float bias = atof(a);
			l.biases[i] = bias;
			a = strchr(a, ',') + 1;
			}
			}
			return l;
			a = option_find_str(options, "anchors", 0);
			if (a) {
			int len = strlen(a);
			int n = 1;
			int i;
			for (i = 0; i < len; ++i) {
			if (a[i] == ',') ++n;
			}
			for (i = 0; i < n && i < total*2; ++i) {
			float bias = atof(a);
			l.biases[i] = bias;
			a = strchr(a, ',') + 1;
			}
			}
			return l;
			}

			layer parse_region(list *options, size_params params)
			@@ -310,21 +310,21 @@
			int coords = option_find_int(options, "coords", 4);
			int classes = option_find_int(options, "classes", 20);
			int num = option_find_int(options, "num", 1);
			int max_boxes = option_find_int_quiet(options, "max", 90);
			int max_boxes = option_find_int_quiet(options, "max", 90);

			layer l = make_region_layer(params.batch, params.w, params.h, num, classes, coords, max_boxes);
			if (l.outputs != params.inputs) {
			printf("Error: l.outputs == params.inputs \n");
			printf("filters= in the [convolutional]-layer doesn't correspond to classes= or num= in [region]-layer \n");
			exit(EXIT_FAILURE);
			}
			if (l.outputs != params.inputs) {
			printf("Error: l.outputs == params.inputs \n");
			printf("filters= in the [convolutional]-layer doesn't correspond to classes= or num= in [region]-layer \n");
			exit(EXIT_FAILURE);
			}
			//assert(l.outputs == params.inputs);

			l.log = option_find_int_quiet(options, "log", 0);
			l.sqrt = option_find_int_quiet(options, "sqrt", 0);

			l.softmax = option_find_int(options, "softmax", 0);
			l.focal_loss = option_find_int_quiet(options, "focal_loss", 0);
			l.focal_loss = option_find_int_quiet(options, "focal_loss", 0);
			//l.max_boxes = option_find_int_quiet(options, "max",30);
			l.jitter = option_find_float(options, "jitter", .2);
			l.rescore = option_find_int_quiet(options, "rescore",0);
			@@ -337,7 +337,7 @@
			l.coord_scale = option_find_float(options, "coord_scale", 1);
			l.object_scale = option_find_float(options, "object_scale", 1);
			l.noobject_scale = option_find_float(options, "noobject_scale", 1);
			l.mask_scale = option_find_float(options, "mask_scale", 1);
			l.mask_scale = option_find_float(options, "mask_scale", 1);
			l.class_scale = option_find_float(options, "class_scale", 1);
			l.bias_match = option_find_int_quiet(options, "bias_match",0);

			@@ -438,19 +438,19 @@

			layer parse_reorg_old(list *options, size_params params)
			{
			printf("\n reorg_old \n");
			int stride = option_find_int(options, "stride", 1);
			int reverse = option_find_int_quiet(options, "reverse", 0);
			printf("\n reorg_old \n");
			int stride = option_find_int(options, "stride", 1);
			int reverse = option_find_int_quiet(options, "reverse", 0);

			int batch, h, w, c;
			h = params.h;
			w = params.w;
			c = params.c;
			batch = params.batch;
			if (!(h && w && c)) error("Layer before reorg layer must output image.");
			int batch, h, w, c;
			h = params.h;
			w = params.w;
			c = params.c;
			batch = params.batch;
			if (!(h && w && c)) error("Layer before reorg layer must output image.");

			layer layer = make_reorg_old_layer(batch, w, h, c, stride, reverse);
			return layer;
			layer layer = make_reorg_old_layer(batch, w, h, c, stride, reverse);
			return layer;
			}

			maxpool_layer parse_maxpool(list *options, size_params params)
			@@ -547,10 +547,10 @@
			layer parse_upsample(list *options, size_params params, network net)
			{

			int stride = option_find_int(options, "stride", 2);
			layer l = make_upsample_layer(params.batch, params.w, params.h, params.c, stride);
			l.scale = option_find_float_quiet(options, "scale", 1);
			return l;
			int stride = option_find_int(options, "stride", 2);
			layer l = make_upsample_layer(params.batch, params.w, params.h, params.c, stride);
			l.scale = option_find_float_quiet(options, "scale", 1);
			return l;
			}

			route_layer parse_route(list *options, size_params params, network net)
			@@ -632,15 +632,15 @@
			net->inputs = option_find_int_quiet(options, "inputs", net->h * net->w * net->c);
			net->max_crop = option_find_int_quiet(options, "max_crop",net->w*2);
			net->min_crop = option_find_int_quiet(options, "min_crop",net->w);
			net->flip = option_find_int_quiet(options, "flip", 1);
			net->flip = option_find_int_quiet(options, "flip", 1);

			net->small_object = option_find_int_quiet(options, "small_object", 0);
			net->small_object = option_find_int_quiet(options, "small_object", 0);
			net->angle = option_find_float_quiet(options, "angle", 0);
			net->aspect = option_find_float_quiet(options, "aspect", 1);
			net->saturation = option_find_float_quiet(options, "saturation", 1);
			net->exposure = option_find_float_quiet(options, "exposure", 1);
			net->hue = option_find_float_quiet(options, "hue", 0);
			net->power = option_find_float_quiet(options, "power", 4);
			net->power = option_find_float_quiet(options, "power", 4);

			if(!net->inputs && !(net->h && net->w && net->c)) error("No input parameters supplied");

			@@ -648,7 +648,7 @@
			net->policy = get_policy(policy_s);
			net->burn_in = option_find_int_quiet(options, "burn_in", 0);
			#ifdef CUDNN_HALF
			net->burn_in = 0;
			net->burn_in = 0;
			#endif
			if(net->policy == STEP){
			net->step = option_find_int(options, "step", 1);
			@@ -696,7 +696,7 @@

			network parse_network_cfg(char *filename)
			{
			return parse_network_cfg_custom(filename, 0);
			return parse_network_cfg_custom(filename, 0);
			}

			network parse_network_cfg_custom(char *filename, int batch)
			@@ -717,12 +717,12 @@
			params.w = net.w;
			params.c = net.c;
			params.inputs = net.inputs;
			if (batch > 0) net.batch = batch;
			if (batch > 0) net.batch = batch;
			params.batch = net.batch;
			params.time_steps = net.time_steps;
			params.net = net;

			float bflops = 0;
			float bflops = 0;
			size_t workspace_size = 0;
			n = n->next;
			int count = 0;
			@@ -755,8 +755,8 @@
			l = parse_cost(options, params);
			}else if(lt == REGION){
			l = parse_region(options, params);
			}else if (lt == YOLO) {
			l = parse_yolo(options, params);
			}else if (lt == YOLO) {
			l = parse_yolo(options, params);
			}else if(lt == DETECTION){
			l = parse_detection(options, params);
			}else if(lt == SOFTMAX){
			@@ -769,15 +769,15 @@
			}else if(lt == MAXPOOL){
			l = parse_maxpool(options, params);
			}else if(lt == REORG){
			l = parse_reorg(options, params); }
			else if (lt == REORG_OLD) {
			l = parse_reorg_old(options, params);
			l = parse_reorg(options, params); }
			else if (lt == REORG_OLD) {
			l = parse_reorg_old(options, params);
			}else if(lt == AVGPOOL){
			l = parse_avgpool(options, params);
			}else if(lt == ROUTE){
			l = parse_route(options, params, net);
			}else if (lt == UPSAMPLE) {
			l = parse_upsample(options, params, net);
			}else if (lt == UPSAMPLE) {
			l = parse_upsample(options, params, net);
			}else if(lt == SHORTCUT){
			l = parse_shortcut(options, params, net);
			}else if(lt == DROPOUT){
			@@ -807,12 +807,12 @@
			params.c = l.out_c;
			params.inputs = l.outputs;
			}
			if (l.bflops > 0) bflops += l.bflops;
			if (l.bflops > 0) bflops += l.bflops;
			}
			free_list(sections);
			net.outputs = get_network_output_size(net);
			net.output = get_network_output(net);
			printf("Total BFLOPS %5.3f \n", bflops);
			printf("Total BFLOPS %5.3f \n", bflops);
			if(workspace_size){
			//printf("%ld\n", workspace_size);
			#ifdef GPU
			@@ -825,11 +825,11 @@
			net.workspace = calloc(1, workspace_size);
			#endif
			}
			LAYER_TYPE lt = net.layers[net.n - 1].type;
			if ((net.w % 32 != 0 \|\| net.h % 32 != 0) && (lt == YOLO \|\| lt == REGION \|\| lt == DETECTION)) {
			printf("\n Warning: width=%d and height=%d in cfg-file must be divisible by 32 for default networks Yolo v1/v2/v3!!! \n\n",
			net.w, net.h);
			}
			LAYER_TYPE lt = net.layers[net.n - 1].type;
			if ((net.w % 32 != 0 \|\| net.h % 32 != 0) && (lt == YOLO \|\| lt == REGION \|\| lt == DETECTION)) {
			printf("\n Warning: width=%d and height=%d in cfg-file must be divisible by 32 for default networks Yolo v1/v2/v3!!! \n\n",
			net.w, net.h);
			}
			return net;
			}

			@@ -1160,16 +1160,16 @@
			fread(&major, sizeof(int), 1, fp);
			fread(&minor, sizeof(int), 1, fp);
			fread(&revision, sizeof(int), 1, fp);
			if ((major * 10 + minor) >= 2) {
			printf("\n seen 64 \n");
			uint64_t iseen = 0;
			fread(&iseen, sizeof(uint64_t), 1, fp);
			*net->seen = iseen;
			}
			else {
			printf("\n seen 32 \n");
			fread(net->seen, sizeof(int), 1, fp);
			}
			if ((major * 10 + minor) >= 2) {
			printf("\n seen 64 \n");
			uint64_t iseen = 0;
			fread(&iseen, sizeof(uint64_t), 1, fp);
			*net->seen = iseen;
			}
			else {
			printf("\n seen 32 \n");
			fread(net->seen, sizeof(int), 1, fp);
			}
			int transpose = (major > 1000) \|\| (minor > 1000);

			int i;

			@@ -27,7 +27,7 @@
			l.bias_updates = calloc(n*2, sizeof(float));
			l.outputs = hwn*(classes + coords + 1);
			l.inputs = l.outputs;
			l.max_boxes = max_boxes;
			l.max_boxes = max_boxes;
			l.truths = max_boxes*(5);
			l.delta = calloc(batch*l.outputs, sizeof(float));
			l.output = calloc(batch*l.outputs, sizeof(float));
			@@ -53,8 +53,8 @@

			void resize_region_layer(layer *l, int w, int h)
			{
			int old_w = l->w;
			int old_h = l->h;
			int old_w = l->w;
			int old_h = l->h;
			l->w = w;
			l->h = h;

			@@ -65,13 +65,13 @@
			l->delta = realloc(l->delta, l->batchl->outputssizeof(float));

			#ifdef GPU
			if (old_w < w \|\| old_h < h) {
			cuda_free(l->delta_gpu);
			cuda_free(l->output_gpu);
			if (old_w < w \|\| old_h < h) {
			cuda_free(l->delta_gpu);
			cuda_free(l->output_gpu);

			l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
			l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
			}
			l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
			l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
			}
			#endif
			}

			@@ -127,34 +127,34 @@
			class_id = hier->parent[class_id];
			}
			*avg_cat += pred;
			} else {
			// Focal loss
			if (focal_loss) {
			// Focal Loss
			float alpha = 0.5; // 0.25 or 0.5
			//float gamma = 2; // hardcoded in many places of the grad-formula
			} else {
			// Focal loss
			if (focal_loss) {
			// Focal Loss
			float alpha = 0.5; // 0.25 or 0.5
			//float gamma = 2; // hardcoded in many places of the grad-formula

			int ti = index + class_id;
			float pt = output[ti] + 0.000000000000001F;
			// http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
			float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1); // http://blog.csdn.net/linmingan/article/details/77885832
			//float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1); // https://github.com/unsky/focal-loss
			int ti = index + class_id;
			float pt = output[ti] + 0.000000000000001F;
			// http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
			float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1); // http://blog.csdn.net/linmingan/article/details/77885832
			//float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1); // https://github.com/unsky/focal-loss

			for (n = 0; n < classes; ++n) {
			delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);
			for (n = 0; n < classes; ++n) {
			delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);

			delta[index + n] = alphagrad;
			delta[index + n] = alphagrad;

			if (n == class_id) *avg_cat += output[index + n];
			}
			}
			else {
			// default
			for (n = 0; n < classes; ++n) {
			delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);
			if (n == class_id) *avg_cat += output[index + n];
			}
			}
			if (n == class_id) *avg_cat += output[index + n];
			}
			}
			else {
			// default
			for (n = 0; n < classes; ++n) {
			delta[index + n] = scale * (((n == class_id) ? 1 : 0) - output[index + n]);
			if (n == class_id) *avg_cat += output[index + n];
			}
			}
			}
			}

			@@ -170,9 +170,9 @@

			static int entry_index(layer l, int batch, int location, int entry)
			{
			int n = location / (l.w*l.h);
			int loc = location % (l.w*l.h);
			return batchl.outputs + nl.wl.h(l.coords + l.classes + 1) + entryl.wl.h + loc;
			int n = location / (l.w*l.h);
			int loc = location % (l.w*l.h);
			return batchl.outputs + nl.wl.h(l.coords + l.classes + 1) + entryl.wl.h + loc;
			}

			void softmax_tree(float input, int batch, int inputs, float temp, tree hierarchy, float *output);
			@@ -256,8 +256,8 @@
			int best_class_id = -1;
			for(t = 0; t < l.max_boxes; ++t){
			box truth = float_to_box(state.truth + t5 + bl.truths);
			int class_id = state.truth[t * 5 + b*l.truths + 4];
			if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
			int class_id = state.truth[t * 5 + b*l.truths + 4];
			if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
			if(!truth.x) break;
			float iou = box_iou(pred, truth);
			if (iou > best_iou) {
			@@ -295,12 +295,12 @@
			}
			for(t = 0; t < l.max_boxes; ++t){
			box truth = float_to_box(state.truth + t5 + bl.truths);
			int class_id = state.truth[t * 5 + b*l.truths + 4];
			if (class_id >= l.classes) {
			printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes-1);
			getchar();
			continue; // if label contains class_id more than number of classes in the cfg-file
			}
			int class_id = state.truth[t * 5 + b*l.truths + 4];
			if (class_id >= l.classes) {
			printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes-1);
			getchar();
			continue; // if label contains class_id more than number of classes in the cfg-file
			}

			if(!truth.x) break;
			float best_iou = 0;
			@@ -450,7 +450,7 @@
			cuda_pull_array(state.truth, truth_cpu, num_truth);
			}
			cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
			//cudaStreamSynchronize(get_cuda_stream());
			//cudaStreamSynchronize(get_cuda_stream());
			network_state cpu_state = state;
			cpu_state.train = state.train;
			cpu_state.truth = truth_cpu;
			@@ -460,7 +460,7 @@
			free(cpu_state.input);
			if(!state.train) return;
			cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
			//cudaStreamSynchronize(get_cuda_stream());
			//cudaStreamSynchronize(get_cuda_stream());
			if(cpu_state.truth) free(cpu_state.truth);
			}

			@@ -473,107 +473,107 @@

			// Rescales the boxes in dets[0..n) from network-input coordinates to the
			// coordinates of a w x h image.
			//
			// new_w/new_h are the dimensions the image takes when it is scaled to fit
			// inside netw x neth with its aspect ratio kept. Each box has the centering
			// offset removed and is rescaled by netw/new_w and neth/new_h. When
			// `relative` is zero the result is additionally multiplied by w and h.
			void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
			{
			    int i;
			    int new_w = 0;
			    int new_h = 0;

			    // The integer divisions below would trap on an empty image.
			    if (w <= 0 || h <= 0) return;

			    if (((float)netw / w) < ((float)neth / h)) {
			        new_w = netw;
			        new_h = (h * netw) / w;
			    }
			    else {
			        new_h = neth;
			        new_w = (w * neth) / h;
			    }
			    for (i = 0; i < n; ++i) {
			        box b = dets[i].bbox;
			        b.x = (b.x - (netw - new_w) / 2. / netw) / ((float)new_w / netw);
			        b.y = (b.y - (neth - new_h) / 2. / neth) / ((float)new_h / neth);
			        b.w *= (float)netw / new_w;
			        b.h *= (float)neth / new_h;
			        if (!relative) {
			            b.x *= w;
			            b.w *= w;
			            b.y *= h;
			            b.h *= h;
			        }
			        dets[i].bbox = b;
			    }
			}


			// Fills dets[] with one detection per (anchor n, cell i) of the region layer.
			//
			// l           - region layer whose l.output holds the predictions
			// w, h        - image size handed to correct_region_boxes()
			// netw, neth  - network input size handed to correct_region_boxes()
			// thresh      - objectness / class probabilities at or below this become 0
			// map         - optional class remapping table (may be NULL); only used
			//               together with l.softmax_tree
			// tree_thresh - threshold forwarded to hierarchy_top_prediction()
			// relative    - forwarded to correct_region_boxes()
			// dets        - output array, indexed by n*l.w*l.h + i
			void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets)
			{
			    int i, j, n, z;
			    float *predictions = l.output;
			    if (l.batch == 2) {
			        // The second batch entry is mirrored column-wise (channel z == 0 is
			        // also negated) and then averaged into the first one.
			        float *flip = l.output + l.outputs;
			        for (j = 0; j < l.h; ++j) {
			            for (i = 0; i < l.w / 2; ++i) {
			                for (n = 0; n < l.n; ++n) {
			                    for (z = 0; z < l.classes + l.coords + 1; ++z) {
			                        int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i;
			                        int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1);
			                        float swap = flip[i1];
			                        flip[i1] = flip[i2];
			                        flip[i2] = swap;
			                        if (z == 0) {
			                            flip[i1] = -flip[i1];
			                            flip[i2] = -flip[i2];
			                        }
			                    }
			                }
			            }
			        }
			        for (i = 0; i < l.outputs; ++i) {
			            l.output[i] = (l.output[i] + flip[i]) / 2.;
			        }
			    }
			    for (i = 0; i < l.w*l.h; ++i) {
			        int row = i / l.w;
			        int col = i % l.w;
			        for (n = 0; n < l.n; ++n) {
			            int index = n*l.w*l.h + i;
			            for (j = 0; j < l.classes; ++j) {
			                dets[index].prob[j] = 0;
			            }
			            int obj_index = entry_index(l, 0, n*l.w*l.h + i, l.coords);
			            int box_index = entry_index(l, 0, n*l.w*l.h + i, 0);
			            int mask_index = entry_index(l, 0, n*l.w*l.h + i, 4);
			            float scale = l.background ? 1 : predictions[obj_index];
			            dets[index].bbox = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h);// , l.w*l.h);
			            dets[index].objectness = scale > thresh ? scale : 0;
			            if (dets[index].mask) {
			                for (j = 0; j < l.coords - 4; ++j) {
			                    dets[index].mask[j] = l.output[mask_index + j*l.w*l.h];
			                }
			            }

			            int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + !l.background);
			            if (l.softmax_tree) {

			                hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0);// , l.w*l.h);
			                if (map) {
			                    // NOTE(review): 200 is hard-coded here; presumably the length of `map` - confirm.
			                    for (j = 0; j < 200; ++j) {
			                        int mapped_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + map[j]);
			                        float prob = scale*predictions[mapped_index];
			                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
			                    }
			                }
			                else {
			                    int top = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h);
			                    dets[index].prob[top] = (scale > thresh) ? scale : 0;
			                }
			            }
			            else {
			                // Class probabilities are only filled in for boxes that passed
			                // the objectness threshold.
			                if (dets[index].objectness) {
			                    for (j = 0; j < l.classes; ++j) {
			                        int cls_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + j);
			                        float prob = scale*predictions[cls_index];
			                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
			                    }
			                }
			            }
			        }
			    }
			    correct_region_boxes(dets, l.w*l.h*l.n, w, h, netw, neth, relative);
			}

			@@ -77,42 +77,42 @@

			// CPU forward pass: reorganises state.input into l.output with reorg_cpu().
			// The direction flag handed to reorg_cpu() is 1 when l.reverse is set and
			// 0 otherwise. The reorg is performed exactly once.
			void forward_reorg_layer(const layer l, network_state state)
			{
			    const int direction = l.reverse ? 1 : 0;
			    reorg_cpu(state.input, l.out_w, l.out_h, l.out_c, l.batch, l.stride, direction, l.output);
			}

			// CPU backward pass: reorganises l.delta into state.delta with reorg_cpu(),
			// using the opposite direction flag to the forward pass (0 when l.reverse
			// is set, 1 otherwise). The reorg is performed exactly once.
			void backward_reorg_layer(const layer l, network_state state)
			{
			    const int direction = l.reverse ? 0 : 1;
			    reorg_cpu(l.delta, l.out_w, l.out_h, l.out_c, l.batch, l.stride, direction, state.delta);
			}

			#ifdef GPU
			// GPU forward pass: reorganises state.input into l.output_gpu with
			// reorg_ongpu(). The direction flag is 1 when l.reverse is set, 0 otherwise.
			// The reorg is launched exactly once.
			void forward_reorg_layer_gpu(layer l, network_state state)
			{
			    const int direction = l.reverse ? 1 : 0;
			    reorg_ongpu(state.input, l.out_w, l.out_h, l.out_c, l.batch, l.stride, direction, l.output_gpu);
			}

			// GPU backward pass: reorganises l.delta_gpu into state.delta with
			// reorg_ongpu(), using the opposite direction flag to the forward pass
			// (0 when l.reverse is set, 1 otherwise). The reorg is launched exactly once.
			void backward_reorg_layer_gpu(layer l, network_state state)
			{
			    const int direction = l.reverse ? 0 : 1;
			    reorg_ongpu(l.delta_gpu, l.out_w, l.out_h, l.out_c, l.batch, l.stride, direction, state.delta);
			}
			#endif

			@@ -77,42 +77,42 @@

			// CPU forward pass of the "old" reorg layer: reorganises state.input into
			// l.output with reorg_cpu(), passing the layer's input dimensions
			// (l.w, l.h, l.c). The direction flag is 1 when l.reverse is set, 0
			// otherwise. The reorg is performed exactly once.
			void forward_reorg_old_layer(const layer l, network_state state)
			{
			    const int direction = l.reverse ? 1 : 0;
			    reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, direction, l.output);
			}

			// CPU backward pass of the "old" reorg layer: reorganises l.delta into
			// state.delta with reorg_cpu(), using the opposite direction flag to the
			// forward pass (0 when l.reverse is set, 1 otherwise). Performed exactly once.
			void backward_reorg_old_layer(const layer l, network_state state)
			{
			    const int direction = l.reverse ? 0 : 1;
			    reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, direction, state.delta);
			}

			#ifdef GPU
			// GPU forward pass of the "old" reorg layer: reorganises state.input into
			// l.output_gpu with reorg_ongpu(), passing the layer's input dimensions
			// (l.w, l.h, l.c). The direction flag is 1 when l.reverse is set, 0
			// otherwise. The reorg is launched exactly once.
			void forward_reorg_old_layer_gpu(layer l, network_state state)
			{
			    const int direction = l.reverse ? 1 : 0;
			    reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, direction, l.output_gpu);
			}

			// GPU backward pass of the "old" reorg layer: reorganises l.delta_gpu into
			// state.delta with reorg_ongpu(), using the opposite direction flag to the
			// forward pass (0 when l.reverse is set, 1 otherwise). Launched exactly once.
			void backward_reorg_old_layer_gpu(layer l, network_state state)
			{
			    const int direction = l.reverse ? 0 : 1;
			    reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, direction, state.delta);
			}
			#endif

			@@ -38,20 +38,20 @@

			// Resizes a shortcut layer to a new spatial size w x h.
			//
			// Input and output sizes are set to the same value (w*h*out_c), the host
			// delta/output buffers are reallocated for outputs*batch floats, and, in
			// GPU builds, the device buffers are freed and recreated once.
			void resize_shortcut_layer(layer *l, int w, int h)
			{
			    //assert(l->w == l->out_w);
			    //assert(l->h == l->out_h);
			    l->w = l->out_w = w;
			    l->h = l->out_h = h;
			    l->outputs = w*h*l->out_c;
			    l->inputs = l->outputs;
			    l->delta = realloc(l->delta, l->outputs*l->batch*sizeof(float));
			    l->output = realloc(l->output, l->outputs*l->batch*sizeof(float));

			#ifdef GPU
			    // Free each device buffer exactly once before recreating it.
			    cuda_free(l->output_gpu);
			    cuda_free(l->delta_gpu);
			    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
			    l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch);
			#endif

			}

			@@ -52,34 +52,34 @@

			// Walks the class hierarchy from group 0 downwards and returns a node index.
			//
			// In every visited group the member with the largest prediction (read at
			// predictions[node*stride]) is selected; values that are not above 0 never
			// win, in which case node 0 is selected. While the running product of the
			// selected values stays above `thresh` the walk descends into the selected
			// node's child group, and returns the node itself once it has no child
			// (child < 0). When the product drops to `thresh` or below, the selected
			// node is returned if still in group 0, otherwise the parent of the current
			// group's first node.
			int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride)
			{
			    float p = 1;
			    int group = 0;
			    int i;
			    while (1) {
			        float max = 0;
			        int max_i = 0;

			        for (i = 0; i < hier->group_size[group]; ++i) {
			            int index = i + hier->group_offset[group];
			            float val = predictions[index*stride];
			            if (val > max) {
			                max_i = index;
			                max = val;
			            }
			        }
			        if (p*max > thresh) {
			            p = p*max;
			            group = hier->child[max_i];
			            if (hier->child[max_i] < 0) return max_i;
			        }
			        else if (group == 0) {
			            return max_i;
			        }
			        else {
			            return hier->parent[hier->group_offset[group]];
			        }
			    }
			    return 0;
			}

			tree read_tree(char filename)

			@@ -18,11 +18,11 @@

			// Returns the current time from gettimeofday() as seconds with a fractional
			// microsecond part, or 0 when gettimeofday() reports failure.
			double what_time_is_it_now()
			{
			    struct timeval time;
			    if (gettimeofday(&time, NULL)) {
			        return 0;
			    }
			    return (double)time.tv_sec + (double)time.tv_usec * .000001;
			}

			int read_map(char filename)
			@@ -57,7 +57,7 @@
			void *swp = calloc(1, size);
			for(i = 0; i < n-1; ++i){
			size_t j = i + rand()/(RAND_MAX / (n-i)+1);
			memcpy(swp, (char)arr+(jsize), size);
			memcpy(swp, (char)arr+(jsize), size);
			memcpy((char)arr+(jsize), (char)arr+(isize), size);
			memcpy((char)arr+(isize), swp, size);
			}
			@@ -137,7 +137,7 @@
			{
			c = next+1;
			}
			if(!next) while ((next = strchr(c, '\\'))) { c = next + 1; }
			if(!next) while ((next = strchr(c, '\\'))) { c = next + 1; }
			c = copy_string(c);
			next = strchr(c, '.');
			if (next) *next = 0;
			@@ -169,63 +169,63 @@

			// Writes `str` to `output` with the first occurrence of `orig` replaced by
			// `rep`. When `orig` does not occur, `output` receives `str` unchanged.
			//
			// `output` may be the same pointer as `str`; the search runs on a private
			// copy. That copy is limited to 8191 characters, so longer inputs are
			// searched (and, on a match, emitted) in truncated form instead of
			// overflowing the scratch buffer. The caller must size `output` for the
			// result.
			void find_replace(char *str, char *orig, char *rep, char *output)
			{
			    enum { FIND_REPLACE_BUF = 8192 };
			    char *buffer = (char *)calloc(FIND_REPLACE_BUF, sizeof(char));
			    char *p;

			    if (!buffer) {
			        // Out of memory: degrade to a plain copy.
			        if (output != str) sprintf(output, "%s", str);
			        return;
			    }

			    snprintf(buffer, FIND_REPLACE_BUF, "%s", str);
			    if (!(p = strstr(buffer, orig))) {  // Is 'orig' even in 'str'?
			        // sprintf() with overlapping source and destination is undefined,
			        // so skip the copy when the caller works in place.
			        if (output != str) sprintf(output, "%s", str);
			        free(buffer);
			        return;
			    }

			    *p = '\0';

			    sprintf(output, "%s%s%s", buffer, rep, p + strlen(orig));
			    free(buffer);
			}

			// Writes `str` to `output`, replacing `orig` with `rep` only when `str`
			// ends with `orig`. Otherwise `output` receives `str` unchanged.
			//
			// The suffix is compared directly, so a name such as "a.jpg.jpg" is
			// recognised as ending in ".jpg" even though ".jpg" also occurs earlier.
			// An empty `orig` never matches. `output` may be the same pointer as
			// `str`; the caller must size `output` for the result.
			void find_replace_extension(char *str, char *orig, char *rep, char *output)
			{
			    size_t str_len = strlen(str);
			    size_t orig_len = strlen(orig);
			    size_t stem_len;

			    // Is 'orig' found at the end of 'str'?
			    if (orig_len == 0 || orig_len > str_len || strcmp(str + (str_len - orig_len), orig) != 0) {
			        if (output != str) sprintf(output, "%s", str);
			        return;
			    }

			    stem_len = str_len - orig_len;
			    // When working in place the stem is already where it belongs.
			    if (output != str) memcpy(output, str, stem_len);
			    strcpy(output + stem_len, rep);
			}

			// Derives a label-file path from an image path and stores it in
			// `output_path`: known image directory components are replaced by their
			// "labels" counterparts, then a trailing image extension is replaced by
			// ".txt". Every step after the first rewrites `output_path` in place.
			void replace_image_to_label(char *input_path, char *output_path) {
			    //find_replace(input_path, "/images/", "/labels/", output_path);    // COCO
			    find_replace(input_path, "/images/train2014/", "/labels/train2014/", output_path);    // COCO
			    find_replace(output_path, "/images/val2014/", "/labels/val2014/", output_path);        // COCO
			    find_replace(output_path, "/JPEGImages/", "/labels/", output_path);    // PascalVOC
			    //find_replace(output_path, "/VOC2007/JPEGImages/", "/VOC2007/labels/", output_path);        // PascalVOC
			    //find_replace(output_path, "/VOC2012/JPEGImages/", "/VOC2012/labels/", output_path);        // PascalVOC

			    //find_replace(output_path, "/raw/", "/labels/", output_path);

			    // replace only ext of files
			    find_replace_extension(output_path, ".jpg", ".txt", output_path);
			    find_replace_extension(output_path, ".JPG", ".txt", output_path); // error
			    find_replace_extension(output_path, ".jpeg", ".txt", output_path);
			    find_replace_extension(output_path, ".JPEG", ".txt", output_path);
			    find_replace_extension(output_path, ".png", ".txt", output_path);
			    find_replace_extension(output_path, ".PNG", ".txt", output_path);
			    find_replace_extension(output_path, ".bmp", ".txt", output_path);
			    find_replace_extension(output_path, ".BMP", ".txt", output_path);
			    find_replace_extension(output_path, ".ppm", ".txt", output_path);
			    find_replace_extension(output_path, ".PPM", ".txt", output_path);
			}

			float sec(clock_t clocks)
			@@ -299,15 +299,15 @@

			// Removes every tab, line feed and carriage return from `s` in place,
			// keeping all other characters (including spaces) in their original order.
			void strip_args(char *s)
			{
			    size_t len = strlen(s);
			    size_t kept = 0;
			    size_t i;
			    for (i = 0; i < len; ++i) {
			        char c = s[i];
			        // 0x0d and 0x0a are '\r' and '\n', so three comparisons suffice.
			        if (c == '\t' || c == '\n' || c == '\r') continue;
			        s[kept++] = c;
			    }
			    s[kept] = '\0';
			}

			void strip_char(char *s, char bad)
			@@ -356,11 +356,11 @@
			fgets(&line[curr], readsize, fp);
			curr = strlen(line);
			}
			if(curr >= 2)
			if(line[curr-2] == 0x0d) line[curr-2] = 0x00;
			if(curr >= 2)
			if(line[curr-2] == 0x0d) line[curr-2] = 0x00;

			if(curr >= 1)
			if(line[curr-1] == 0x0a) line[curr-1] = 0x00;
			if(curr >= 1)
			if(line[curr-1] == 0x0a) line[curr-1] = 0x00;

			return line;
			}
			@@ -620,11 +620,11 @@

			// Returns the index of the first element of a[0..n) equal to `val`,
			// or -1 when no element matches.
			int int_index(int *a, int val, int n)
			{
			    int i;
			    for (i = 0; i < n; ++i) {
			        if (a[i] == val) return i;
			    }
			    return -1;
			}

			int rand_int(int min, int max)
			@@ -691,7 +691,7 @@
			max = swap;
			}
			return ((float)rand()/RAND_MAX * (max - min)) + min;
			//return (random_float() * (max - min)) + min;
			//return (random_float() * (max - min)) + min;
			}

			float rand_scale(float s)
			@@ -715,30 +715,30 @@

			// Returns one random unsigned integer: from rand_s() when WIN32 is
			// defined, from rand() otherwise. Exactly one value is drawn per call.
			unsigned int random_gen()
			{
			    unsigned int rnd = 0;
			#ifdef WIN32
			    rand_s(&rnd);
			#else
			    rnd = rand();
			#endif
			    return rnd;
			}

			// Returns random_gen() divided by the largest value it can produce on the
			// current platform: UINT_MAX when WIN32 is defined, RAND_MAX otherwise.
			float random_float()
			{
			#ifdef WIN32
			    return ((float)random_gen() / (float)UINT_MAX);
			#else
			    return ((float)random_gen() / (float)RAND_MAX);
			#endif
			}

			// Returns random_float() scaled into the range spanned by `min` and `max`.
			// The bounds are swapped first when they are given in the wrong order.
			float rand_uniform_strong(float min, float max)
			{
			    if (max < min) {
			        float swap = min;
			        min = max;
			        max = swap;
			    }
			    return (random_float() * (max - min)) + min;
			}

			@@ -38,8 +38,8 @@
			l.bias_updates = calloc(n*2, sizeof(float));
			l.outputs = hwn*(classes + 4 + 1);
			l.inputs = l.outputs;
			l.max_boxes = max_boxes;
			l.truths = l.max_boxes(4 + 1); // 90(4 + 1);
			l.max_boxes = max_boxes;
			l.truths = l.max_boxes(4 + 1); // 90(4 + 1);
			l.delta = calloc(batch*l.outputs, sizeof(float));
			l.output = calloc(batch*l.outputs, sizeof(float));
			for(i = 0; i < total*2; ++i){
			@@ -117,33 +117,33 @@
			if(avg_cat) avg_cat += output[index + strideclass_id];
			return;
			}
			// Focal loss
			if (focal_loss) {
			// Focal Loss
			float alpha = 0.5; // 0.25 or 0.5
			//float gamma = 2; // hardcoded in many places of the grad-formula
			// Focal loss
			if (focal_loss) {
			// Focal Loss
			float alpha = 0.5; // 0.25 or 0.5
			//float gamma = 2; // hardcoded in many places of the grad-formula

			int ti = index + stride*class_id;
			float pt = output[ti] + 0.000000000000001F;
			// http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
			float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1); // http://blog.csdn.net/linmingan/article/details/77885832
			//float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1); // https://github.com/unsky/focal-loss
			int ti = index + stride*class_id;
			float pt = output[ti] + 0.000000000000001F;
			// http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
			float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1); // http://blog.csdn.net/linmingan/article/details/77885832
			//float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1); // https://github.com/unsky/focal-loss

			for (n = 0; n < classes; ++n) {
			delta[index + striden] = (((n == class_id) ? 1 : 0) - output[index + striden]);
			for (n = 0; n < classes; ++n) {
			delta[index + striden] = (((n == class_id) ? 1 : 0) - output[index + striden]);

			delta[index + striden] = alpha*grad;
			delta[index + striden] = alpha*grad;

			if (n == class_id) avg_cat += output[index + striden];
			}
			}
			else {
			// default
			for (n = 0; n < classes; ++n) {
			delta[index + striden] = ((n == class_id) ? 1 : 0) - output[index + striden];
			if (n == class_id && avg_cat) avg_cat += output[index + striden];
			}
			}
			if (n == class_id) avg_cat += output[index + striden];
			}
			}
			else {
			// default
			for (n = 0; n < classes; ++n) {
			delta[index + striden] = ((n == class_id) ? 1 : 0) - output[index + striden];
			if (n == class_id && avg_cat) avg_cat += output[index + striden];
			}
			}
			}

			static int entry_index(layer l, int batch, int location, int entry)
			@@ -155,12 +155,12 @@

			// Builds a box from four floats spaced `stride` elements apart:
			// x = f[0], y = f[stride], w = f[2*stride], h = f[3*stride].
			static box float_to_box_stride(float *f, int stride)
			{
			    box b = { 0 };
			    b.x = f[0];
			    b.y = f[1 * stride];
			    b.w = f[2 * stride];
			    b.h = f[3 * stride];
			    return b;
			}

			void forward_yolo_layer(const layer l, network_state state)
			@@ -200,12 +200,12 @@
			int best_t = 0;
			for(t = 0; t < l.max_boxes; ++t){
			box truth = float_to_box_stride(state.truth + t(4 + 1) + bl.truths, 1);
			int class_id = state.truth[t(4 + 1) + bl.truths + 4];
			if (class_id >= l.classes) {
			printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
			getchar();
			continue; // if label contains class_id more than number of classes in the cfg-file
			}
			int class_id = state.truth[t(4 + 1) + bl.truths + 4];
			if (class_id >= l.classes) {
			printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
			getchar();
			continue; // if label contains class_id more than number of classes in the cfg-file
			}
			if(!truth.x) break;
			float iou = box_iou(pred, truth);
			if (iou > best_iou) {
			@@ -234,8 +234,8 @@
			}
			for(t = 0; t < l.max_boxes; ++t){
			box truth = float_to_box_stride(state.truth + t(4 + 1) + bl.truths, 1);
			int class_id = state.truth[t(4 + 1) + bl.truths + 4];
			if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
			int class_id = state.truth[t(4 + 1) + bl.truths + 4];
			if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file

			if(!truth.x) break;
			float best_iou = 0;
			@@ -291,20 +291,20 @@
			int i;
			int new_w=0;
			int new_h=0;
			if (letter) {
			if (((float)netw / w) < ((float)neth / h)) {
			new_w = netw;
			new_h = (h * netw) / w;
			}
			else {
			new_h = neth;
			new_w = (w * neth) / h;
			}
			}
			else {
			new_w = netw;
			new_h = neth;
			}
			if (letter) {
			if (((float)netw / w) < ((float)neth / h)) {
			new_w = netw;
			new_h = (h * netw) / w;
			}
			else {
			new_h = neth;
			new_w = (w * neth) / h;
			}
			}
			else {
			new_w = netw;
			new_h = neth;
			}
			for (i = 0; i < n; ++i){
			box b = dets[i].bbox;
			b.x = (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw);
			@@ -411,25 +411,25 @@
			}

			//cuda_pull_array(l.output_gpu, state.input, l.batch*l.inputs);
			float in_cpu = calloc(l.batchl.inputs, sizeof(float));
			cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
			float *truth_cpu = 0;
			if (state.truth) {
			int num_truth = l.batch*l.truths;
			truth_cpu = calloc(num_truth, sizeof(float));
			cuda_pull_array(state.truth, truth_cpu, num_truth);
			}
			network_state cpu_state = state;
			cpu_state.net = state.net;
			cpu_state.index = state.index;
			cpu_state.train = state.train;
			cpu_state.truth = truth_cpu;
			cpu_state.input = in_cpu;
			forward_yolo_layer(l, cpu_state);
			float in_cpu = calloc(l.batchl.inputs, sizeof(float));
			cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
			float *truth_cpu = 0;
			if (state.truth) {
			int num_truth = l.batch*l.truths;
			truth_cpu = calloc(num_truth, sizeof(float));
			cuda_pull_array(state.truth, truth_cpu, num_truth);
			}
			network_state cpu_state = state;
			cpu_state.net = state.net;
			cpu_state.index = state.index;
			cpu_state.train = state.train;
			cpu_state.truth = truth_cpu;
			cpu_state.input = in_cpu;
			forward_yolo_layer(l, cpu_state);
			//forward_yolo_layer(l, state);
			cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
			free(in_cpu);
			if (cpu_state.truth) free(cpu_state.truth);
			free(in_cpu);
			if (cpu_state.truth) free(cpu_state.truth);
			}

			void backward_yolo_layer_gpu(const layer l, network_state state)

			@@ -50,310 +50,310 @@
			return detection.size();
			#else
			return -1;
			#endif // OPENCV
			#endif // OPENCV
			}

			// Releases the `detector` object by calling reset() on it and returns 1.
			// NOTE(review): `detector` is declared outside this block - presumably a
			// smart pointer that replaced the manual delete shown below; confirm.
			int dispose() {
			//if (detector != NULL) delete detector;
			//detector = NULL;
			detector.reset();
			return 1;
			}

			#ifdef GPU
			// Prints the CUDA error string for `status` once when it is not
			// cudaSuccess. Does not abort or otherwise alter control flow.
			void check_cuda(cudaError_t status) {
			    if (status != cudaSuccess) {
			        const char *s = cudaGetErrorString(status);
			        printf("CUDA Error Prev: %s\n", s);
			    }
			}
			#endif

			// Per-Detector state: the loaded network plus FRAMES-deep buffers.
			struct detector_gpu_t {
			    network net;                    // network parsed from the cfg file
			    image images[FRAMES];           // one image slot per buffered frame
			    float *avg;                     // l.outputs floats, allocated by the Detector constructor
			    float *predictions[FRAMES];     // one l.outputs-sized buffer per buffered frame
			    int demo_index;
			    unsigned int *track_id;         // one counter per class, initialised to 1
			};

			// Loads the network described by `cfg_filename`, reads `weight_filename`
			// into it, sets batch size 1, fuses conv+batchnorm and allocates the
			// per-frame buffers. In GPU builds the work is done on device `gpu_id`
			// and the previously current device is restored afterwards.
			YOLODLL_API Detector::Detector(std::string cfg_filename, std::string weight_filename, int gpu_id) : cur_gpu_id(gpu_id)
			{
			    wait_stream = 0;
			    int old_gpu_index = 0;  // defined even if cudaGetDevice() fails
			#ifdef GPU
			    check_cuda( cudaGetDevice(&old_gpu_index) );
			#endif

			    detector_gpu_ptr = std::make_shared<detector_gpu_t>();
			    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());

			#ifdef GPU
			    //check_cuda( cudaSetDevice(cur_gpu_id) );
			    cuda_set_device(cur_gpu_id);
			    printf(" Used GPU %d \n", cur_gpu_id);
			#endif
			    network &net = detector_gpu.net;
			    net.gpu_index = cur_gpu_id;
			    //gpu_index = i;

			    // c_str() guarantees NUL termination on every C++ standard version.
			    char *cfgfile = const_cast<char *>(cfg_filename.c_str());
			    char *weightfile = const_cast<char *>(weight_filename.c_str());

			    net = parse_network_cfg_custom(cfgfile, 1);
			    if (weightfile) {
			        load_weights(&net, weightfile);
			    }
			    set_batch_network(&net, 1);
			    net.gpu_index = cur_gpu_id;
			    fuse_conv_batchnorm(net);

			    layer l = net.layers[net.n - 1];
			    int j;

			    detector_gpu.avg = (float *)calloc(l.outputs, sizeof(float));
			    for (j = 0; j < FRAMES; ++j) detector_gpu.predictions[j] = (float *)calloc(l.outputs, sizeof(float));
			    for (j = 0; j < FRAMES; ++j) detector_gpu.images[j] = make_image(1, 1, 3);

			    detector_gpu.track_id = (unsigned int *)calloc(l.classes, sizeof(unsigned int));
			    for (j = 0; j < l.classes; ++j) detector_gpu.track_id[j] = 1;

			#ifdef GPU
			    check_cuda( cudaSetDevice(old_gpu_index) );
			#endif
			}


			// Frees the buffers allocated by the constructor (each exactly once) and
			// then the network itself. In GPU builds the network is freed with its
			// own device selected and the previously current device is restored.
			YOLODLL_API Detector::~Detector()
			{
			    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());

			    free(detector_gpu.track_id);

			    free(detector_gpu.avg);
			    for (int j = 0; j < FRAMES; ++j) free(detector_gpu.predictions[j]);
			    for (int j = 0; j < FRAMES; ++j) if(detector_gpu.images[j].data) free(detector_gpu.images[j].data);

			#ifdef GPU
			    int old_gpu_index = 0;
			    cudaGetDevice(&old_gpu_index);
			    cuda_set_device(detector_gpu.net.gpu_index);
			#endif

			    free_network(detector_gpu.net);

			#ifdef GPU
			    cudaSetDevice(old_gpu_index);
			#endif
			}

			// Returns the `w` field of the loaded network.
			YOLODLL_API int Detector::get_net_width() const {
			    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
			    return detector_gpu.net.w;
			}
			// Returns the `h` field of the loaded network.
			YOLODLL_API int Detector::get_net_height() const {
			    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
			    return detector_gpu.net.h;
			}
			// Returns the `c` field of the loaded network.
			YOLODLL_API int Detector::get_net_color_depth() const {
			    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
			    return detector_gpu.net.c;
			}


			// Convenience overload: loads the image from `image_filename`, runs detect() on it
			// and frees the pixel buffer when done (also on exceptions, via the shared_ptr deleter).
			//   image_filename - path passed to load_image()
			//   thresh, use_mean - forwarded unchanged to detect(image_t, ...)
			// Exceptions raised by load_image() propagate to the caller.
			YOLODLL_API std::vector<bbox_t> Detector::detect(std::string image_filename, float thresh, bool use_mean)
			{
			    // Value-initialize so `data` is null if load_image() throws before assignment;
			    // otherwise the deleter would free an indeterminate pointer.
			    // NOTE(review): assumes image_t is a plain struct without a user-provided constructor - confirm.
			    std::shared_ptr<image_t> image_ptr(new image_t(), [](image_t *img) { if (img->data) free(img->data); delete img; });
			    *image_ptr = load_image(image_filename);
			    return detect(*image_ptr, thresh, use_mean);
			}

			// Loads an image file with stbi_load and converts it to a float `image`.
			//   filename - path of the file to load
			//   channels - requested channel count; 0 keeps the file's own channel count
			// The 8-bit interleaved pixels (index k + c*i + c*w*j) are rewritten as planar floats
			// (index i + w*j + w*h*k), each divided by 255.
			// Throws std::runtime_error if stbi_load returns null.
			static image load_image_stb(char *filename, int channels)
			{
			    int w, h, c;
			    unsigned char *data = stbi_load(filename, &w, &h, &c, channels);
			    if (!data)
			        throw std::runtime_error("file not found");
			    if (channels) c = channels;
			    int i, j, k;
			    image im = make_image(w, h, c);
			    for (k = 0; k < c; ++k) {
			        for (j = 0; j < h; ++j) {
			            for (i = 0; i < w; ++i) {
			                int dst_index = i + w*j + w*h*k;    // planar: channel, row, column
			                int src_index = k + c*i + c*w*j;    // interleaved: row, column, channel
			                im.data[dst_index] = (float)data[src_index] / 255.;
			            }
			        }
			    }
			    free(data);
			    return im;
			}

			// Loads `image_filename` as a 3-channel image through load_image_stb() and returns it
			// as an image_t. The returned `data` pointer is the buffer produced by load_image_stb();
			// this function does not free it.
			// Exceptions from load_image_stb() propagate.
			YOLODLL_API image_t Detector::load_image(std::string image_filename)
			{
			    // load_image_stb takes a non-const char*, hence the const_cast.
			    char *input = const_cast<char *>(image_filename.data());
			    image im = load_image_stb(input, 3);

			    image_t img;
			    img.c = im.c;
			    img.data = im.data;
			    img.h = im.h;
			    img.w = im.w;

			    return img;
			}


			// Frees the pixel buffer of `m` if it is non-null.
			// `m` is passed by value, so the caller's copy still holds the (now dangling) pointer.
			YOLODLL_API void Detector::free_image(image_t m)
			{
			    if (m.data) {
			        free(m.data);
			    }
			}

			// Runs the network on `img` and returns the detected boxes.
			//   img      - input image; its buffer is read, never freed here
			//   thresh   - passed to get_network_boxes() and used as the minimum best-class probability
			//   use_mean - when true, the raw prediction is stored in the predictions ring and averaged
			//              over FRAMES entries via mean_arrays()
			// Returned boxes have x/y/w/h scaled by the input image's width/height, with x and y
			// clamped at 0, and track_id set to 0.
			YOLODLL_API std::vector<bbox_t> Detector::detect(image_t img, float thresh, bool use_mean)
			{
			    detector_gpu_t &detector_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());
			    network &net = detector_gpu.net;
			#ifdef GPU
			    // Switch to the network's device only if a different one is current; restored before returning.
			    int old_gpu_index;
			    cudaGetDevice(&old_gpu_index);
			    if (cur_gpu_id != old_gpu_index)
			        cudaSetDevice(net.gpu_index);

			    net.wait_stream = wait_stream;    // 1 - wait CUDA-stream, 0 - not to wait
			#endif
			    //std::cout << "net.gpu_index = " << net.gpu_index << std::endl;

			    //float nms = .4;

			    image im;
			    im.c = img.c;
			    im.data = img.data;
			    im.h = img.h;
			    im.w = img.w;

			    // `sized` always owns a fresh buffer (a copy or a resize), freed at the end of this function.
			    image sized;

			    if (net.w == im.w && net.h == im.h) {
			        sized = make_image(im.w, im.h, im.c);
			        // Widen to size_t before multiplying so the byte count cannot overflow int.
			        memcpy(sized.data, im.data, (size_t)im.w * im.h * im.c * sizeof(float));
			    }
			    else
			        sized = resize_image(im, net.w, net.h);

			    layer l = net.layers[net.n - 1];

			    float *X = sized.data;

			    float *prediction = network_predict(net, X);

			    if (use_mean) {
			        memcpy(detector_gpu.predictions[detector_gpu.demo_index], prediction, l.outputs * sizeof(float));
			        mean_arrays(detector_gpu.predictions, FRAMES, l.outputs, detector_gpu.avg);
			        // NOTE(review): `l` is a by-value copy of the last layer, so this assignment is not
			        // visible through `net`, which is what get_network_boxes() receives - confirm intent.
			        l.output = detector_gpu.avg;
			        detector_gpu.demo_index = (detector_gpu.demo_index + 1) % FRAMES;
			    }
			    //get_region_boxes(l, 1, 1, thresh, detector_gpu.probs, detector_gpu.boxes, 0, 0);
			    //if (nms) do_nms_sort(detector_gpu.boxes, detector_gpu.probs, l.w*l.h*l.n, l.classes, nms);

			    int nboxes = 0;
			    int letterbox = 0;
			    float hier_thresh = 0.5;
			    detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letterbox);
			    if (nms) do_nms_sort(dets, nboxes, l.classes, nms);

			    std::vector<bbox_t> bbox_vec;

			    for (int i = 0; i < nboxes; ++i) {
			        box b = dets[i].bbox;
			        int const obj_id = max_index(dets[i].prob, l.classes);
			        float const prob = dets[i].prob[obj_id];

			        if (prob > thresh)
			        {
			            bbox_t bbox;
			            // Convert centre/size to a top-left corner, clamped at 0.
			            bbox.x = std::max((double)0, (b.x - b.w / 2.)*im.w);
			            bbox.y = std::max((double)0, (b.y - b.h / 2.)*im.h);
			            bbox.w = b.w*im.w;
			            bbox.h = b.h*im.h;
			            bbox.obj_id = obj_id;
			            bbox.prob = prob;
			            bbox.track_id = 0;

			            bbox_vec.push_back(bbox);
			        }
			    }

			    free_detections(dets, nboxes);
			    if (sized.data)
			        free(sized.data);

			#ifdef GPU
			    if (cur_gpu_id != old_gpu_index)
			        cudaSetDevice(old_gpu_index);
			#endif

			    return bbox_vec;
			}

			// Assigns track ids to the boxes of the current frame by matching them against the
			// boxes stored in prev_bbox_vec_deque.
			//   cur_bbox_vec   - boxes of the current frame (taken by value, returned with track_id filled in)
			//   change_history - when true, the result is pushed to the front of the history
			//   frames_story   - maximum number of frames kept in the history
			//   max_dist       - a previous box only matches a current box of the same obj_id whose
			//                    centre distance is strictly below this value
			// Boxes left with track_id == 0 receive a fresh id from the per-obj_id counter in track_id[].
			YOLODLL_API std::vector<bbox_t> Detector::tracking_id(std::vector<bbox_t> cur_bbox_vec, bool const change_history,
			    int const frames_story, int const max_dist)
			{
			    detector_gpu_t &det_gpu = *static_cast<detector_gpu_t *>(detector_gpu_ptr.get());

			    bool prev_track_id_present = false;
			    for (auto &i : prev_bbox_vec_deque)
			        if (i.size() > 0) prev_track_id_present = true;

			    // No stored boxes at all: every current box gets a new id and the frame is
			    // recorded regardless of change_history.
			    if (!prev_track_id_present) {
			        for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
			            cur_bbox_vec[i].track_id = det_gpu.track_id[cur_bbox_vec[i].obj_id]++;
			        prev_bbox_vec_deque.push_front(cur_bbox_vec);
			        if (prev_bbox_vec_deque.size() > frames_story) prev_bbox_vec_deque.pop_back();
			        return cur_bbox_vec;
			    }

			    // Best (smallest) matching distance found so far for each current box.
			    std::vector<unsigned int> dist_vec(cur_bbox_vec.size(), std::numeric_limits<unsigned int>::max());

			    for (auto &prev_bbox_vec : prev_bbox_vec_deque) {
			        for (auto &i : prev_bbox_vec) {
			            int cur_index = -1;
			            for (size_t m = 0; m < cur_bbox_vec.size(); ++m) {
			                bbox_t const& k = cur_bbox_vec[m];
			                if (i.obj_id == k.obj_id) {
			                    float center_x_diff = (float)(i.x + i.w/2) - (float)(k.x + k.w/2);
			                    float center_y_diff = (float)(i.y + i.h/2) - (float)(k.y + k.h/2);
			                    // Euclidean centre distance, truncated to an unsigned integer.
			                    unsigned int cur_dist = sqrt(center_x_diff*center_x_diff + center_y_diff*center_y_diff);
			                    if (cur_dist < max_dist && (k.track_id == 0 || dist_vec[m] > cur_dist)) {
			                        dist_vec[m] = cur_dist;
			                        cur_index = m;
			                    }
			                }
			            }

			            // Do not hand the same (track_id, obj_id) pair to two current boxes.
			            bool track_id_absent = !std::any_of(cur_bbox_vec.begin(), cur_bbox_vec.end(),
			                [&i](bbox_t const& b) { return b.track_id == i.track_id && b.obj_id == i.obj_id; });

			            if (cur_index >= 0 && track_id_absent) {
			                cur_bbox_vec[cur_index].track_id = i.track_id;
			                // Average the size with the matched previous box.
			                cur_bbox_vec[cur_index].w = (cur_bbox_vec[cur_index].w + i.w) / 2;
			                cur_bbox_vec[cur_index].h = (cur_bbox_vec[cur_index].h + i.h) / 2;
			            }
			        }
			    }

			    for (size_t i = 0; i < cur_bbox_vec.size(); ++i)
			        if (cur_bbox_vec[i].track_id == 0)
			            cur_bbox_vec[i].track_id = det_gpu.track_id[cur_bbox_vec[i].obj_id]++;

			    if (change_history) {
			        prev_bbox_vec_deque.push_front(cur_bbox_vec);
			        if (prev_bbox_vec_deque.size() > frames_story) prev_bbox_vec_deque.pop_back();
			    }

			    return cur_bbox_vec;
			}