From 1b5afb45838e603fa6780762eb8cc59246dc2d81 Mon Sep 17 00:00:00 2001
From: IlyaOvodov <b@ovdv.ru>
Date: Tue, 08 May 2018 11:09:35 +0000
Subject: [PATCH] Output improvements for detector results: When printing detector results, output was done in random order, obfuscating results for interpreting. Now: 1. Text output includes coordinates of rects in (left,right,top,bottom in pixels) along with label and score 2. Text output is sorted by rect lefts to simplify finding appropriate rects on image 3. If several class probs are > thresh for some detection, the most probable is written first and coordinates for others are not repeated 4. Rects are imprinted in image in order by their best class prob, so most probable rects are always on top and not overlayed by less probable ones 5. Most probable label for rect is always written first Also: 6. Message about low GPU memory include required amount

---
 src/convolutional_layer.c | 1020 +++++++++++++++++++++++++++++++++++++-----------------------
 1 files changed, 629 insertions(+), 391 deletions(-)

diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 7531415..b8065fd 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -1,450 +1,688 @@
 #include "convolutional_layer.h"
 #include "utils.h"
-#include "mini_blas.h"
+#include "batchnorm_layer.h"
+#include "im2col.h"
+#include "col2im.h"
+#include "blas.h"
+#include "gemm.h"
 #include <stdio.h>
 #include <time.h>
 
-int convolutional_out_height(convolutional_layer layer)
+#ifdef CUDNN
+#pragma comment(lib, "cudnn.lib")  
+#endif
+
+#ifdef AI2
+#include "xnor_layer.h"
+#endif
+
+#ifndef AI2
+#define AI2 0
+void forward_xnor_layer(layer l, network_state state);
+#endif
+
+void swap_binary(convolutional_layer *l)
 {
-    int h = layer.h;
-    if (!layer.pad) h -= layer.size;
-    else h -= 1;
-    return h/layer.stride + 1;
+    float *swap = l->weights;
+    l->weights = l->binary_weights;
+    l->binary_weights = swap;
+
+    #ifdef GPU
+    swap = l->weights_gpu;
+    l->weights_gpu = l->binary_weights_gpu;
+    l->binary_weights_gpu = swap;
+    #endif
 }
 
-int convolutional_out_width(convolutional_layer layer)
+void binarize_weights(float *weights, int n, int size, float *binary)
 {
-    int w = layer.w;
-    if (!layer.pad) w -= layer.size;
-    else w -= 1;
-    return w/layer.stride + 1;
+    int i, f;
+    for(f = 0; f < n; ++f){
+        float mean = 0;
+        for(i = 0; i < size; ++i){
+            mean += fabs(weights[f*size + i]);
+        }
+        mean = mean / size;
+        for(i = 0; i < size; ++i){
+            binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
+        }
+    }
 }
 
-image get_convolutional_image(convolutional_layer layer)
-{
-    int h,w,c;
-    h = convolutional_out_height(layer);
-    w = convolutional_out_width(layer);
-    c = layer.n;
-    return float_to_image(h,w,c,layer.output);
-}
-
-image get_convolutional_delta(convolutional_layer layer)
-{
-    int h,w,c;
-    h = convolutional_out_height(layer);
-    w = convolutional_out_width(layer);
-    c = layer.n;
-    return float_to_image(h,w,c,layer.delta);
-}
-
-convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay)
+void binarize_cpu(float *input, int n, float *binary)
 {
     int i;
-    size = 2*(size/2)+1; //HA! And you thought you'd use an even sized filter...
-    convolutional_layer *layer = calloc(1, sizeof(convolutional_layer));
-
-    layer->learning_rate = learning_rate;
-    layer->momentum = momentum;
-    layer->decay = decay;
-
-    layer->h = h;
-    layer->w = w;
-    layer->c = c;
-    layer->n = n;
-    layer->batch = batch;
-    layer->stride = stride;
-    layer->size = size;
-    layer->pad = pad;
-
-    layer->filters = calloc(c*n*size*size, sizeof(float));
-    layer->filter_updates = calloc(c*n*size*size, sizeof(float));
-    layer->filter_momentum = calloc(c*n*size*size, sizeof(float));
-
-    layer->biases = calloc(n, sizeof(float));
-    layer->bias_updates = calloc(n, sizeof(float));
-    layer->bias_momentum = calloc(n, sizeof(float));
-    float scale = 1./(size*size*c);
-    scale = .01;
-    for(i = 0; i < c*n*size*size; ++i) layer->filters[i] = scale*2*(rand_uniform()-.5);
     for(i = 0; i < n; ++i){
-        //layer->biases[i] = rand_normal()*scale + scale;
-        layer->biases[i] = .5;
+        binary[i] = (input[i] > 0) ? 1 : -1;
     }
-    int out_h = convolutional_out_height(*layer);
-    int out_w = convolutional_out_width(*layer);
-
-    layer->col_image = calloc(layer->batch*out_h*out_w*size*size*c, sizeof(float));
-    layer->output = calloc(layer->batch*out_h * out_w * n, sizeof(float));
-    layer->delta  = calloc(layer->batch*out_h * out_w * n, sizeof(float));
-    #ifdef GPU
-    layer->filters_cl = cl_make_array(layer->filters, c*n*size*size);
-    layer->filter_updates_cl = cl_make_array(layer->filter_updates, c*n*size*size);
-    layer->filter_momentum_cl = cl_make_array(layer->filter_momentum, c*n*size*size);
-
-    layer->biases_cl = cl_make_array(layer->biases, n);
-    layer->bias_updates_cl = cl_make_array(layer->bias_updates, n);
-    layer->bias_momentum_cl = cl_make_array(layer->bias_momentum, n);
-
-    layer->col_image_cl = cl_make_array(layer->col_image, layer->batch*out_h*out_w*size*size*c);
-    layer->delta_cl = cl_make_array(layer->delta, layer->batch*out_h*out_w*n);
-    layer->output_cl = cl_make_array(layer->output, layer->batch*out_h*out_w*n);
-    #endif
-    layer->activation = activation;
-
-    fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
-
-    return layer;
 }
 
-void resize_convolutional_layer(convolutional_layer *layer, int h, int w, int c)
+void binarize_input(float *input, int n, int size, float *binary)
 {
-    layer->h = h;
-    layer->w = w;
-    layer->c = c;
-    int out_h = convolutional_out_height(*layer);
-    int out_w = convolutional_out_width(*layer);
-
-    layer->col_image = realloc(layer->col_image,
-                                layer->batch*out_h*out_w*layer->size*layer->size*layer->c*sizeof(float));
-    layer->output = realloc(layer->output,
-                                layer->batch*out_h * out_w * layer->n*sizeof(float));
-    layer->delta  = realloc(layer->delta,
-                                layer->batch*out_h * out_w * layer->n*sizeof(float));
+    int i, s;
+    for(s = 0; s < size; ++s){
+        float mean = 0;
+        for(i = 0; i < n; ++i){
+            mean += fabs(input[i*size + s]);
+        }
+        mean = mean / n;
+        for(i = 0; i < n; ++i){
+            binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
+        }
+    }
 }
 
-void bias_output(const convolutional_layer layer)
+int convolutional_out_height(convolutional_layer l)
+{
+    return (l.h + 2*l.pad - l.size) / l.stride + 1;
+}
+
+int convolutional_out_width(convolutional_layer l)
+{
+    return (l.w + 2*l.pad - l.size) / l.stride + 1;
+}
+
+image get_convolutional_image(convolutional_layer l)
+{
+    int h,w,c;
+    h = convolutional_out_height(l);
+    w = convolutional_out_width(l);
+    c = l.n;
+    return float_to_image(w,h,c,l.output);
+}
+
+image get_convolutional_delta(convolutional_layer l)
+{
+    int h,w,c;
+    h = convolutional_out_height(l);
+    w = convolutional_out_width(l);
+    c = l.n;
+    return float_to_image(w,h,c,l.delta);
+}
+
+size_t get_workspace_size(layer l){
+#ifdef CUDNN
+    if(gpu_index >= 0){
+        size_t most = 0;
+        size_t s = 0;
+        cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
+                l.srcTensorDesc,
+                l.weightDesc,
+                l.convDesc,
+                l.dstTensorDesc,
+                l.fw_algo,
+                &s);
+        if (s > most) most = s;
+        cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
+                l.srcTensorDesc,
+                l.ddstTensorDesc,
+                l.convDesc,
+                l.dweightDesc,
+                l.bf_algo,
+                &s);
+        if (s > most) most = s;
+        cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
+                l.weightDesc,
+                l.ddstTensorDesc,
+                l.convDesc,
+                l.dsrcTensorDesc,
+                l.bd_algo,
+                &s);
+        if (s > most) most = s;
+        return most;
+    }
+    #endif
+    return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
+}
+
+#ifdef GPU
+#ifdef CUDNN
+void cudnn_convolutional_setup(layer *l, int cudnn_preference)
+{
+
+#ifdef CUDNN_HALF
+	// TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): 
+	//   Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
+	// PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
+	const cudnnDataType_t data_type = CUDNN_DATA_HALF;
+#else
+	cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#endif
+
+#if(CUDNN_MAJOR >= 7)
+	// Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
+	// For *_ALGO_WINOGRAD_NONFUSED can be used CUDNN_DATA_FLOAT
+	// otherwise Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
+	// Three techniques for training using Mixed-precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
+	// 1. Accumulation into FP32
+	// 2. Loss Scaling - required only for: activation gradients. We do not use.
+	// 3. FP32 Master Copy of Weights
+	// More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
+	cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
+#endif
+
+	// INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported 
+	//   on architectures with DP4A support (compute capability 6.1 and later).
+	//cudnnDataType_t data_type = CUDNN_DATA_INT8;
+
+	// backward delta
+    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
+    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
+    cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+
+	// forward
+    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
+    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
+    cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+
+	// batch norm
+	cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
+	cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
+
+	cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
+#if(CUDNN_MAJOR >= 6)
+	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);	// cudnn >= 6.0
+#else
+	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);	// cudnn 5.1
+#endif
+	int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+	int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+	int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+	if (cudnn_preference == cudnn_smallest) 
+	{
+		forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+		backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+		backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+		printf(" CUDNN-slow ");
+	}
+
+	cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
+            l->srcTensorDesc,
+            l->weightDesc,
+            l->convDesc,
+            l->dstTensorDesc,
+			forward_algo,
+            0,
+            &l->fw_algo);
+    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
+            l->weightDesc,
+            l->ddstTensorDesc,
+            l->convDesc,
+            l->dsrcTensorDesc,
+			backward_algo,
+            0,
+            &l->bd_algo);
+    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
+            l->srcTensorDesc,
+            l->ddstTensorDesc,
+            l->convDesc,
+            l->dweightDesc,
+			backward_filter,
+            0,
+            &l->bf_algo);
+
+	if (data_type == CUDNN_DATA_HALF) 
+	{
+		// HALF-16 if(data_type == CUDNN_DATA_HALF)
+		l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+		l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+		l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+
+		// FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
+		//l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
+		//l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
+		//l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
+
+		int fw = 0, bd = 0, bf = 0;
+		if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
+			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
+		if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
+			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
+
+		if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
+			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  \n");
+		if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
+			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
+
+		if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
+			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1   \n");
+		if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
+			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
+
+		if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
+		else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
+	}
+}
+#endif
+#endif
+
+convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
+{
+    int i;
+    convolutional_layer l = {0};
+    l.type = CONVOLUTIONAL;
+
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.n = n;
+    l.binary = binary;
+    l.xnor = xnor;
+    l.batch = batch;
+    l.stride = stride;
+    l.size = size;
+    l.pad = padding;
+    l.batch_normalize = batch_normalize;
+
+    l.weights = calloc(c*n*size*size, sizeof(float));
+    l.weight_updates = calloc(c*n*size*size, sizeof(float));
+
+    l.biases = calloc(n, sizeof(float));
+    l.bias_updates = calloc(n, sizeof(float));
+
+    // float scale = 1./sqrt(size*size*c);
+    float scale = sqrt(2./(size*size*c));
+    for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
+    int out_h = convolutional_out_height(l);
+    int out_w = convolutional_out_width(l);
+    l.out_h = out_h;
+    l.out_w = out_w;
+    l.out_c = n;
+    l.outputs = l.out_h * l.out_w * l.out_c;
+    l.inputs = l.w * l.h * l.c;
+
+    l.output = calloc(l.batch*l.outputs, sizeof(float));
+    l.delta  = calloc(l.batch*l.outputs, sizeof(float));
+
+    l.forward = forward_convolutional_layer;
+    l.backward = backward_convolutional_layer;
+    l.update = update_convolutional_layer;
+    if(binary){
+        l.binary_weights = calloc(c*n*size*size, sizeof(float));
+        l.cweights = calloc(c*n*size*size, sizeof(char));
+        l.scales = calloc(n, sizeof(float));
+    }
+    if(xnor){
+        l.binary_weights = calloc(c*n*size*size, sizeof(float));
+        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
+    }
+
+    if(batch_normalize){
+        l.scales = calloc(n, sizeof(float));
+        l.scale_updates = calloc(n, sizeof(float));
+        for(i = 0; i < n; ++i){
+            l.scales[i] = 1;
+        }
+
+        l.mean = calloc(n, sizeof(float));
+        l.variance = calloc(n, sizeof(float));
+
+        l.mean_delta = calloc(n, sizeof(float));
+        l.variance_delta = calloc(n, sizeof(float));
+
+        l.rolling_mean = calloc(n, sizeof(float));
+        l.rolling_variance = calloc(n, sizeof(float));
+        l.x = calloc(l.batch*l.outputs, sizeof(float));
+        l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
+    }
+    if(adam){
+        l.adam = 1;
+        l.m = calloc(c*n*size*size, sizeof(float));
+        l.v = calloc(c*n*size*size, sizeof(float));
+    }
+
+#ifdef GPU
+    l.forward_gpu = forward_convolutional_layer_gpu;
+    l.backward_gpu = backward_convolutional_layer_gpu;
+    l.update_gpu = update_convolutional_layer_gpu;
+
+    if(gpu_index >= 0){
+        if (adam) {
+            l.m_gpu = cuda_make_array(l.m, c*n*size*size);
+            l.v_gpu = cuda_make_array(l.v, c*n*size*size);
+        }
+
+        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+#ifdef CUDNN_HALF
+		l.weights_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weights, c*n*size*size / 2);
+		l.weight_updates_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weight_updates, c*n*size*size / 2);
+#endif
+        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
+
+        l.biases_gpu = cuda_make_array(l.biases, n);
+        l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
+
+        l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
+        l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
+
+        if(binary){
+            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+        }
+        if(xnor){
+            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+            l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
+        }
+
+        if(batch_normalize){
+            l.mean_gpu = cuda_make_array(l.mean, n);
+            l.variance_gpu = cuda_make_array(l.variance, n);
+
+            l.rolling_mean_gpu = cuda_make_array(l.mean, n);
+            l.rolling_variance_gpu = cuda_make_array(l.variance, n);
+
+            l.mean_delta_gpu = cuda_make_array(l.mean, n);
+            l.variance_delta_gpu = cuda_make_array(l.variance, n);
+
+            l.scales_gpu = cuda_make_array(l.scales, n);
+            l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);
+
+            l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
+            l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
+        }
+#ifdef CUDNN		
+		cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
+		cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
+		cudnnCreateTensorDescriptor(&l.normTensorDesc);
+        cudnnCreateTensorDescriptor(&l.srcTensorDesc);
+        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
+        cudnnCreateFilterDescriptor(&l.weightDesc);
+        cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
+        cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
+        cudnnCreateFilterDescriptor(&l.dweightDesc);
+        cudnnCreateConvolutionDescriptor(&l.convDesc);
+        cudnn_convolutional_setup(&l, cudnn_fastest);
+#endif
+    }
+#endif
+    l.workspace_size = get_workspace_size(l);
+    l.activation = activation;
+
+    fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
+
+    return l;
+}
+
+void denormalize_convolutional_layer(convolutional_layer l)
+{
+    int i, j;
+    for(i = 0; i < l.n; ++i){
+        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
+        for(j = 0; j < l.c*l.size*l.size; ++j){
+            l.weights[i*l.c*l.size*l.size + j] *= scale;
+        }
+        l.biases[i] -= l.rolling_mean[i] * scale;
+        l.scales[i] = 1;
+        l.rolling_mean[i] = 0;
+        l.rolling_variance[i] = 1;
+    }
+}
+
+void test_convolutional_layer()
+{
+    convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0);
+    l.batch_normalize = 1;
+    float data[] = {1,1,1,1,1,
+        1,1,1,1,1,
+        1,1,1,1,1,
+        1,1,1,1,1,
+        1,1,1,1,1,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        3,3,3,3,3,
+        3,3,3,3,3,
+        3,3,3,3,3,
+        3,3,3,3,3,
+        3,3,3,3,3};
+    network_state state = {0};
+    state.input = data;
+    forward_convolutional_layer(l, state);
+}
+
+void resize_convolutional_layer(convolutional_layer *l, int w, int h)
+{
+	int old_w = l->w;
+	int old_h = l->h;
+    l->w = w;
+    l->h = h;
+    int out_w = convolutional_out_width(*l);
+    int out_h = convolutional_out_height(*l);
+
+    l->out_w = out_w;
+    l->out_h = out_h;
+
+    l->outputs = l->out_h * l->out_w * l->out_c;
+    l->inputs = l->w * l->h * l->c;
+
+    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
+    l->delta  = realloc(l->delta,  l->batch*l->outputs*sizeof(float));
+    if(l->batch_normalize){
+        l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
+        l->x_norm  = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
+    }
+
+#ifdef GPU
+	if (old_w < w || old_h < h) {
+		cuda_free(l->delta_gpu);
+		cuda_free(l->output_gpu);
+
+		l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
+		l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+
+		if (l->batch_normalize) {
+			cuda_free(l->x_gpu);
+			cuda_free(l->x_norm_gpu);
+
+			l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+			l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+		}
+	}
+#ifdef CUDNN
+    cudnn_convolutional_setup(l, cudnn_fastest);
+#endif
+#endif
+    l->workspace_size = get_workspace_size(*l);
+
+#ifdef CUDNN
+	// check for excessive memory consumption 
+	size_t free_byte;
+	size_t total_byte;
+	check_error(cudaMemGetInfo(&free_byte, &total_byte));
+	if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+		printf(" used slow CUDNN algo without Workspace! Need memory: %d, available: %d\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
+		cudnn_convolutional_setup(l, cudnn_smallest);
+		l->workspace_size = get_workspace_size(*l);
+	}
+#endif
+}
+
+void add_bias(float *output, float *biases, int batch, int n, int size)
 {
     int i,j,b;
-    int out_h = convolutional_out_height(layer);
-    int out_w = convolutional_out_width(layer);
-    for(b = 0; b < layer.batch; ++b){
-        for(i = 0; i < layer.n; ++i){
-            for(j = 0; j < out_h*out_w; ++j){
-                layer.output[(b*layer.n + i)*out_h*out_w + j] = layer.biases[i];
+    for(b = 0; b < batch; ++b){
+        for(i = 0; i < n; ++i){
+            for(j = 0; j < size; ++j){
+                output[(b*n + i)*size + j] += biases[i];
             }
         }
     }
 }
 
-void forward_convolutional_layer(const convolutional_layer layer, float *in)
+void scale_bias(float *output, float *scales, int batch, int n, int size)
 {
-    int out_h = convolutional_out_height(layer);
-    int out_w = convolutional_out_width(layer);
-    int i;
-
-    bias_output(layer);
-
-    int m = layer.n;
-    int k = layer.size*layer.size*layer.c;
-    int n = out_h*out_w;
-
-    float *a = layer.filters;
-    float *b = layer.col_image;
-    float *c = layer.output;
-
-    im2col_cpu(in, layer.batch, layer.c, layer.h, layer.w, 
-        layer.size, layer.stride, layer.pad, b);
-
-    for(i = 0; i < layer.batch; ++i){
-        gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
-        b += k*n;
-        c += n*m;
+    int i,j,b;
+    for(b = 0; b < batch; ++b){
+        for(i = 0; i < n; ++i){
+            for(j = 0; j < size; ++j){
+                output[(b*n + i)*size + j] *= scales[i];
+            }
+        }
     }
-    activate_array(layer.output, m*n*layer.batch, layer.activation);
 }
 
-void learn_bias_convolutional_layer(convolutional_layer layer)
+void backward_bias(float *bias_updates, float *delta, int batch, int n, int size)
 {
     int i,b;
-    int size = convolutional_out_height(layer)
-        *convolutional_out_width(layer);
-    for(b = 0; b < layer.batch; ++b){
-        for(i = 0; i < layer.n; ++i){
-            layer.bias_updates[i] += sum_array(layer.delta+size*(i+b*layer.n), size);
+    for(b = 0; b < batch; ++b){
+        for(i = 0; i < n; ++i){
+            bias_updates[i] += sum_array(delta+size*(i+b*n), size);
         }
     }
 }
 
-void backward_convolutional_layer(convolutional_layer layer, float *delta)
+void forward_convolutional_layer(convolutional_layer l, network_state state)
+{
+    int out_h = convolutional_out_height(l);
+    int out_w = convolutional_out_width(l);
+    int i;
+
+    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
+
+    if(l.xnor){
+        binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
+        swap_binary(&l);
+        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
+        state.input = l.binary_input;
+    }
+
+    int m = l.n;
+    int k = l.size*l.size*l.c;
+    int n = out_h*out_w;
+
+
+    float *a = l.weights;
+    float *b = state.workspace;
+    float *c = l.output;
+
+    for(i = 0; i < l.batch; ++i){
+        im2col_cpu(state.input, l.c, l.h, l.w, 
+                l.size, l.stride, l.pad, b);
+        gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        c += n*m;
+        state.input += l.c*l.h*l.w;
+    }
+
+    if(l.batch_normalize){
+        forward_batchnorm_layer(l, state);
+    }
+    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
+
+    activate_array(l.output, m*n*l.batch, l.activation);
+    if(l.binary || l.xnor) swap_binary(&l);
+}
+
+void backward_convolutional_layer(convolutional_layer l, network_state state)
 {
     int i;
-    int m = layer.n;
-    int n = layer.size*layer.size*layer.c;
-    int k = convolutional_out_height(layer)*
-        convolutional_out_width(layer);
-    gradient_array(layer.output, m*k*layer.batch, layer.activation, layer.delta);
-    learn_bias_convolutional_layer(layer);
+    int m = l.n;
+    int n = l.size*l.size*l.c;
+    int k = convolutional_out_height(l)*
+        convolutional_out_width(l);
 
-    float *a = layer.delta;
-    float *b = layer.col_image;
-    float *c = layer.filter_updates;
+    gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
+    backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
 
-    for(i = 0; i < layer.batch; ++i){
+    if(l.batch_normalize){
+        backward_batchnorm_layer(l, state);
+    }
+
+    for(i = 0; i < l.batch; ++i){
+        float *a = l.delta + i*m*k;
+        float *b = state.workspace;
+        float *c = l.weight_updates;
+
+        float *im = state.input+i*l.c*l.h*l.w;
+
+        im2col_cpu(im, l.c, l.h, l.w, 
+                l.size, l.stride, l.pad, b);
         gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
-        a += m*k;
-        b += k*n;
-    }
 
-    if(delta){
-        m = layer.size*layer.size*layer.c;
-        k = layer.n;
-        n = convolutional_out_height(layer)*
-            convolutional_out_width(layer);
+        if(state.delta){
+            a = l.weights;
+            b = l.delta + i*m*k;
+            c = state.workspace;
 
-        a = layer.filters;
-        b = layer.delta;
-        c = layer.col_image;
+            gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);
 
-        for(i = 0; i < layer.batch; ++i){
-            gemm(1,0,m,n,k,1,a,m,b,n,0,c,n);
-            b += k*n;
-            c += m*n;
-        }
-
-        memset(delta, 0, layer.batch*layer.h*layer.w*layer.c*sizeof(float));
-
-        col2im_cpu(layer.col_image, layer.batch, layer.c,  layer.h,  layer.w,  layer.size,  layer.stride, layer.pad, delta);
-    }
-}
-
-void update_convolutional_layer(convolutional_layer layer)
-{
-    int size = layer.size*layer.size*layer.c*layer.n;
-    axpy_cpu(layer.n, layer.learning_rate, layer.bias_updates, 1, layer.biases, 1);
-    scal_cpu(layer.n, layer.momentum, layer.bias_updates, 1);
-
-    scal_cpu(size, 1.-layer.learning_rate*layer.decay, layer.filters, 1);
-    axpy_cpu(size, layer.learning_rate, layer.filter_updates, 1, layer.filters, 1);
-    scal_cpu(size, layer.momentum, layer.filter_updates, 1);
-}
-
-
-image get_convolutional_filter(convolutional_layer layer, int i)
-{
-    int h = layer.size;
-    int w = layer.size;
-    int c = layer.c;
-    return float_to_image(h,w,c,layer.filters+i*h*w*c);
-}
-
-image *weighted_sum_filters(convolutional_layer layer, image *prev_filters)
-{
-    image *filters = calloc(layer.n, sizeof(image));
-    int i,j,k,c;
-    if(!prev_filters){
-        for(i = 0; i < layer.n; ++i){
-            filters[i] = copy_image(get_convolutional_filter(layer, i));
+            col2im_cpu(state.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
         }
     }
-    else{
-        image base = prev_filters[0];
-        for(i = 0; i < layer.n; ++i){
-            image filter = get_convolutional_filter(layer, i);
-            filters[i] = make_image(base.h, base.w, base.c);
-            for(j = 0; j < layer.size; ++j){
-                for(k = 0; k < layer.size; ++k){
-                    for(c = 0; c < layer.c; ++c){
-                        float weight = get_pixel(filter, j, k, c);
-                        image prev_filter = copy_image(prev_filters[c]);
-                        scale_image(prev_filter, weight);
-                        add_into_image(prev_filter, filters[i], 0,0);
-                        free_image(prev_filter);
-                    }
-                }
-            }
-        }
-    }
-    return filters;
 }
 
-image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_filters)
+void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay)
 {
-    image *single_filters = weighted_sum_filters(layer, 0);
-    show_images(single_filters, layer.n, window);
+    int size = l.size*l.size*l.c*l.n;
+    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
+    scal_cpu(l.n, momentum, l.bias_updates, 1);
 
-    image delta = get_convolutional_image(layer);
+    if(l.scales){
+        axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
+        scal_cpu(l.n, momentum, l.scale_updates, 1);
+    }
+
+    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(size, momentum, l.weight_updates, 1);
+}
+
+
+image get_convolutional_weight(convolutional_layer l, int i)
+{
+    int h = l.size;
+    int w = l.size;
+    int c = l.c;
+    return float_to_image(w,h,c,l.weights+i*h*w*c);
+}
+
+void rgbgr_weights(convolutional_layer l)
+{
+    int i;
+    for(i = 0; i < l.n; ++i){
+        image im = get_convolutional_weight(l, i);
+        if (im.c == 3) {
+            rgbgr_image(im);
+        }
+    }
+}
+
+void rescale_weights(convolutional_layer l, float scale, float trans)
+{
+    int i;
+    for(i = 0; i < l.n; ++i){
+        image im = get_convolutional_weight(l, i);
+        if (im.c == 3) {
+            scale_image(im, scale);
+            float sum = sum_array(im.data, im.w*im.h*im.c);
+            l.biases[i] += sum*trans;
+        }
+    }
+}
+
+image *get_weights(convolutional_layer l)
+{
+    image *weights = calloc(l.n, sizeof(image));
+    int i;
+    for(i = 0; i < l.n; ++i){
+        weights[i] = copy_image(get_convolutional_weight(l, i));
+        //normalize_image(weights[i]);
+    }
+    return weights;
+}
+
+image *visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_weights)
+{
+    image *single_weights = get_weights(l);
+    show_images(single_weights, l.n, window);
+
+    image delta = get_convolutional_image(l);
     image dc = collapse_image_layers(delta, 1);
     char buff[256];
     sprintf(buff, "%s: Output", window);
     //show_image(dc, buff);
     //save_image(dc, buff);
     free_image(dc);
-    return single_filters;
+    return single_weights;
 }
 
-#ifdef GPU
-
-cl_kernel get_convolutional_learn_bias_kernel()
-{
-    static int init = 0;
-    static cl_kernel kernel;
-    if(!init){
-        kernel = get_kernel("src/convolutional_layer.cl", "learn_bias", 0);
-        init = 1;
-    }
-    return kernel;
-}
-
-void learn_bias_convolutional_layer_ongpu(convolutional_layer layer)
-{
-    int size = convolutional_out_height(layer) * convolutional_out_width(layer);
-
-    cl_setup();
-    cl_kernel kernel = get_convolutional_learn_bias_kernel();
-    cl_command_queue queue = cl.queue;
-
-    cl_uint i = 0;
-    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.batch), (void*) &layer.batch);
-    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n);
-    cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size);
-    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.delta_cl), (void*) &layer.delta_cl);
-    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.bias_updates_cl), (void*) &layer.bias_updates_cl);
-    check_error(cl);
-
-    const size_t global_size[] = {layer.n};
-
-    clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0);
-    check_error(cl);
-}
-
-cl_kernel get_convolutional_bias_kernel()
-{
-    static int init = 0;
-    static cl_kernel kernel;
-    if(!init){
-        kernel = get_kernel("src/convolutional_layer.cl", "bias", 0);
-        init = 1;
-    }
-    return kernel;
-}
-
-void bias_output_gpu(const convolutional_layer layer)
-{
-    int out_h = convolutional_out_height(layer);
-    int out_w = convolutional_out_width(layer);
-    int size = out_h*out_w;
-
-    cl_setup();
-    cl_kernel kernel = get_convolutional_bias_kernel();
-    cl_command_queue queue = cl.queue;
-
-    cl_uint i = 0;
-    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n);
-    cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size);
-    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.biases_cl), (void*) &layer.biases_cl);
-    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl);
-    check_error(cl);
-
-    const size_t global_size[] = {layer.n*size, layer.batch};
-
-    clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0);
-    check_error(cl);
-}
-
-//#define TIMEIT
-
-void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in)
-{
-    int i;
-    int m = layer.n;
-    int k = layer.size*layer.size*layer.c;
-    int n = convolutional_out_height(layer)*
-        convolutional_out_width(layer);
-
-    bias_output_gpu(layer);
-
-    #ifdef TIMEIT
-    clock_t time = clock();
-    printf("Forward\n");
-    #endif
-
-    im2col_ongpu(in, layer.batch, layer.c,  layer.h,  layer.w,  layer.size,  layer.stride, layer.pad, layer.col_image_cl);
-
-    #ifdef TIMEIT
-    clFinish(cl.queue);
-    printf("Im2col %f\n", sec(clock()-time));
-    time = clock();
-    #endif
-
-    for(i = 0; i < layer.batch; ++i){
-        cl_mem a = layer.filters_cl;
-        cl_mem b = layer.col_image_cl;
-        cl_mem c = layer.output_cl;
-        gemm_ongpu_offset(0,0,m,n,k,1.,a,0,k,b,i*k*n,n,1.,c,i*m*n,n);
-    }
-    #ifdef TIMEIT
-    clFinish(cl.queue);
-    printf("Gemm %f\n", sec(clock()-time));
-    #endif
-    activate_array_ongpu(layer.output_cl, m*n*layer.batch, layer.activation);
-    #ifdef TIMEIT
-    cl_read_array(layer.output_cl, layer.output, m*n*layer.batch);
-    #endif
-}
-
-void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl)
-{
-    int i;
-    int m = layer.n;
-    int n = layer.size*layer.size*layer.c;
-    int k = convolutional_out_height(layer)*
-        convolutional_out_width(layer);
-    gradient_array_ongpu(layer.output_cl, m*k*layer.batch, layer.activation, layer.delta_cl);
-    learn_bias_convolutional_layer_ongpu(layer);
-
-    for(i = 0; i < layer.batch; ++i){
-        cl_mem a = layer.delta_cl;
-        cl_mem b = layer.col_image_cl;
-        cl_mem c = layer.filter_updates_cl;
-
-        gemm_ongpu_offset(0,1,m,n,k,1,a,i*m*k,k,b,i*k*n,k,1,c,0,n);
-    }
-    //cl_read_array(layer.delta_cl, layer.delta, m*k*layer.batch);
-
-    if(delta_cl){
-        m = layer.size*layer.size*layer.c;
-        k = layer.n;
-        n = convolutional_out_height(layer)*
-            convolutional_out_width(layer);
-
-        for(i = 0; i < layer.batch; ++i){
-            cl_mem a = layer.filters_cl;
-            cl_mem b = layer.delta_cl;
-            cl_mem c = layer.col_image_cl;
-
-            gemm_ongpu_offset(1,0,m,n,k,1,a,0,m,b,i*k*n,n,0,c,i*m*n,n);
-        }
-
-        scal_ongpu(layer.batch*layer.h*layer.w*layer.c,0,delta_cl, 1);
-        col2im_ongpu(layer.col_image_cl, layer.batch, layer.c,  layer.h,  layer.w,  layer.size,  layer.stride, layer.pad, delta_cl);
-    }
-}
-
-void pull_convolutional_layer(convolutional_layer layer)
-{
-    cl_read_array(layer.filters_cl, layer.filters, layer.c*layer.n*layer.size*layer.size);
-    cl_read_array(layer.biases_cl, layer.biases, layer.n);
-}
-
-void push_convolutional_layer(convolutional_layer layer)
-{
-    cl_write_array(layer.filters_cl, layer.filters, layer.c*layer.n*layer.size*layer.size);
-    cl_write_array(layer.biases_cl, layer.biases, layer.n);
-}
-
-void update_convolutional_layer_gpu(convolutional_layer layer)
-{
-    int size = layer.size*layer.size*layer.c*layer.n;
-    axpy_ongpu(layer.n, layer.learning_rate, layer.bias_updates_cl, 1, layer.biases_cl, 1);
-    scal_ongpu(layer.n,layer.momentum, layer.bias_updates_cl, 1);
-
-    scal_ongpu(size, 1.-layer.learning_rate*layer.decay, layer.filters_cl, 1);
-    axpy_ongpu(size, layer.learning_rate, layer.filter_updates_cl, 1, layer.filters_cl, 1);
-    scal_ongpu(size, layer.momentum, layer.filter_updates_cl, 1);
-    pull_convolutional_layer(layer);
-}
-
-
-#endif
-

--
Gitblit v1.10.0