From 028696bf15efeca3acb3db8c42a96f7b9e0f55ff Mon Sep 17 00:00:00 2001
From: iovodov <b@ovdv.ru>
Date: Thu, 03 May 2018 13:33:46 +0000
Subject: [PATCH] Output improvements for detector results: When printing detector results, output was done in random order, obfuscating results for interpreting. Now: 1. Text output includes coordinates of rects in (left,right,top,bottom in pixels) along with label and score 2. Text output is sorted by rect lefts to simplify finding appropriate rects on image 3. If several class probs are > thresh for some detection, the most probable is written first and coordinates for others are not repeated 4. Rects are imprinted in image in order by their best class prob, so most probable rects are always on top and not overlayed by less probable ones 5. Most probable label for rect is always written first Also: 6. Message about low GPU memory include required amount

---
 src/convolutional_layer.c |  910 +++++++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 631 insertions(+), 279 deletions(-)

diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index f7c9c10..b8065fd 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -1,336 +1,688 @@
 #include "convolutional_layer.h"
 #include "utils.h"
-#include "mini_blas.h"
+#include "batchnorm_layer.h"
+#include "im2col.h"
+#include "col2im.h"
+#include "blas.h"
+#include "gemm.h"
 #include <stdio.h>
+#include <time.h>
 
-int convolutional_out_height(convolutional_layer layer)
+#ifdef CUDNN
+#pragma comment(lib, "cudnn.lib")  
+#endif
+
+#ifdef AI2
+#include "xnor_layer.h"
+#endif
+
+#ifndef AI2
+#define AI2 0
+void forward_xnor_layer(layer l, network_state state);
+#endif
+
+void swap_binary(convolutional_layer *l)
 {
-    return (layer.h-layer.size)/layer.stride + 1;
+    float *swap = l->weights;
+    l->weights = l->binary_weights;
+    l->binary_weights = swap;
+
+    #ifdef GPU
+    swap = l->weights_gpu;
+    l->weights_gpu = l->binary_weights_gpu;
+    l->binary_weights_gpu = swap;
+    #endif
 }
 
-int convolutional_out_width(convolutional_layer layer)
+void binarize_weights(float *weights, int n, int size, float *binary)
 {
-    return (layer.w-layer.size)/layer.stride + 1;
+    int i, f;
+    for(f = 0; f < n; ++f){
+        float mean = 0;
+        for(i = 0; i < size; ++i){
+            mean += fabs(weights[f*size + i]);
+        }
+        mean = mean / size;
+        for(i = 0; i < size; ++i){
+            binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
+        }
+    }
 }
 
-image get_convolutional_image(convolutional_layer layer)
-{
-    int h,w,c;
-    h = convolutional_out_height(layer);
-    w = convolutional_out_width(layer);
-    c = layer.n;
-    return float_to_image(h,w,c,layer.output);
-}
-
-image get_convolutional_delta(convolutional_layer layer)
-{
-    int h,w,c;
-    h = convolutional_out_height(layer);
-    w = convolutional_out_width(layer);
-    c = layer.n;
-    return float_to_image(h,w,c,layer.delta);
-}
-
-convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation)
+void binarize_cpu(float *input, int n, float *binary)
 {
     int i;
-    size = 2*(size/2)+1; //HA! And you thought you'd use an even sized filter...
-    convolutional_layer *layer = calloc(1, sizeof(convolutional_layer));
-    layer->h = h;
-    layer->w = w;
-    layer->c = c;
-    layer->n = n;
-    layer->batch = batch;
-    layer->stride = stride;
-    layer->size = size;
-
-    layer->filters = calloc(c*n*size*size, sizeof(float));
-    layer->filter_updates = calloc(c*n*size*size, sizeof(float));
-    layer->filter_momentum = calloc(c*n*size*size, sizeof(float));
-
-    layer->biases = calloc(n, sizeof(float));
-    layer->bias_updates = calloc(n, sizeof(float));
-    layer->bias_momentum = calloc(n, sizeof(float));
-    float scale = 1./(size*size*c);
-    for(i = 0; i < c*n*size*size; ++i) layer->filters[i] = scale*(rand_uniform());
     for(i = 0; i < n; ++i){
-        //layer->biases[i] = rand_normal()*scale + scale;
-        layer->biases[i] = 0;
-    }
-    int out_h = convolutional_out_height(*layer);
-    int out_w = convolutional_out_width(*layer);
-
-    layer->col_image = calloc(layer->batch*out_h*out_w*size*size*c, sizeof(float));
-    layer->output = calloc(layer->batch*out_h * out_w * n, sizeof(float));
-    layer->delta  = calloc(layer->batch*out_h * out_w * n, sizeof(float));
-    layer->activation = activation;
-
-    fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
-    srand(0);
-
-    return layer;
-}
-
-void resize_convolutional_layer(convolutional_layer *layer, int h, int w, int c)
-{
-    layer->h = h;
-    layer->w = w;
-    layer->c = c;
-    int out_h = convolutional_out_height(*layer);
-    int out_w = convolutional_out_width(*layer);
-
-    layer->col_image = realloc(layer->col_image,
-                                layer->batch*out_h*out_w*layer->size*layer->size*layer->c*sizeof(float));
-    layer->output = realloc(layer->output,
-                                layer->batch*out_h * out_w * layer->n*sizeof(float));
-    layer->delta  = realloc(layer->delta,
-                                layer->batch*out_h * out_w * layer->n*sizeof(float));
-}
-
-void forward_convolutional_layer(const convolutional_layer layer, float *in)
-{
-    int i;
-    int m = layer.n;
-    int k = layer.size*layer.size*layer.c;
-    int n = convolutional_out_height(layer)*
-            convolutional_out_width(layer)*
-            layer.batch;
-
-    memset(layer.output, 0, m*n*sizeof(float));
-
-    float *a = layer.filters;
-    float *b = layer.col_image;
-    float *c = layer.output;
-    for(i = 0; i < layer.batch; ++i){
-        im2col_cpu(in+i*(n/layer.batch),  layer.c,  layer.h,  layer.w,  layer.size,  layer.stride, b+i*(n/layer.batch));
-    }
-    gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
-
-    for(i = 0; i < m*n; ++i){
-        layer.output[i] = activate(layer.output[i], layer.activation);
-    }
-    //for(i = 0; i < m*n; ++i) if(i%(m*n/10+1)==0) printf("%f, ", layer.output[i]); printf("\n");
-
-}
-
-void gradient_delta_convolutional_layer(convolutional_layer layer)
-{
-    int i;
-    int size = convolutional_out_height(layer)*
-                convolutional_out_width(layer)*
-                layer.n*
-                layer.batch;
-    for(i = 0; i < size; ++i){
-        layer.delta[i] *= gradient(layer.output[i], layer.activation);
+        binary[i] = (input[i] > 0) ? 1 : -1;
     }
 }
 
-void learn_bias_convolutional_layer(convolutional_layer layer)
+void binarize_input(float *input, int n, int size, float *binary)
 {
-    int i,j,b;
-    int size = convolutional_out_height(layer)
-                *convolutional_out_width(layer);
-    for(b = 0; b < layer.batch; ++b){
-        for(i = 0; i < layer.n; ++i){
-            float sum = 0;
-            for(j = 0; j < size; ++j){
-                sum += layer.delta[j+size*(i+b*layer.n)];
-            }
-            layer.bias_updates[i] += sum/size;
+    int i, s;
+    for(s = 0; s < size; ++s){
+        float mean = 0;
+        for(i = 0; i < n; ++i){
+            mean += fabs(input[i*size + s]);
+        }
+        mean = mean / n;
+        for(i = 0; i < n; ++i){
+            binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
         }
     }
 }
 
-void learn_convolutional_layer(convolutional_layer layer)
+int convolutional_out_height(convolutional_layer l)
 {
-    gradient_delta_convolutional_layer(layer);
-    learn_bias_convolutional_layer(layer);
-    int m = layer.n;
-    int n = layer.size*layer.size*layer.c;
-    int k = convolutional_out_height(layer)*
-            convolutional_out_width(layer)*
-            layer.batch;
-
-    float *a = layer.delta;
-    float *b = layer.col_image;
-    float *c = layer.filter_updates;
-
-    gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
+    return (l.h + 2*l.pad - l.size) / l.stride + 1;
 }
 
-void backward_convolutional_layer(convolutional_layer layer, float *delta)
+int convolutional_out_width(convolutional_layer l)
+{
+    return (l.w + 2*l.pad - l.size) / l.stride + 1;
+}
+
+image get_convolutional_image(convolutional_layer l)
+{
+    int h,w,c;
+    h = convolutional_out_height(l);
+    w = convolutional_out_width(l);
+    c = l.n;
+    return float_to_image(w,h,c,l.output);
+}
+
+image get_convolutional_delta(convolutional_layer l)
+{
+    int h,w,c;
+    h = convolutional_out_height(l);
+    w = convolutional_out_width(l);
+    c = l.n;
+    return float_to_image(w,h,c,l.delta);
+}
+
+size_t get_workspace_size(layer l){
+#ifdef CUDNN
+    if(gpu_index >= 0){
+        size_t most = 0;
+        size_t s = 0;
+        cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
+                l.srcTensorDesc,
+                l.weightDesc,
+                l.convDesc,
+                l.dstTensorDesc,
+                l.fw_algo,
+                &s);
+        if (s > most) most = s;
+        cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
+                l.srcTensorDesc,
+                l.ddstTensorDesc,
+                l.convDesc,
+                l.dweightDesc,
+                l.bf_algo,
+                &s);
+        if (s > most) most = s;
+        cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
+                l.weightDesc,
+                l.ddstTensorDesc,
+                l.convDesc,
+                l.dsrcTensorDesc,
+                l.bd_algo,
+                &s);
+        if (s > most) most = s;
+        return most;
+    }
+    #endif
+    return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
+}
+
+#ifdef GPU
+#ifdef CUDNN
+void cudnn_convolutional_setup(layer *l, int cudnn_preference)
+{
+
+#ifdef CUDNN_HALF
+	// TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): 
+	//   Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
+	// PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
+	const cudnnDataType_t data_type = CUDNN_DATA_HALF;
+#else
+	cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#endif
+
+#if(CUDNN_MAJOR >= 7)
+	// Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
+	// For *_ALGO_WINOGRAD_NONFUSED can be used CUDNN_DATA_FLOAT
+	// otherwise Input, Filter and Output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) have dataType = CUDNN_DATA_HALF
+	// Three techniques for training using Mixed-precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
+	// 1. Accumulation into FP32
+	// 2. Loss Scaling - required only for: activation gradients. We do not use.
+	// 3. FP32 Master Copy of Weights
+	// More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
+	cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
+#endif
+
+	// INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported 
+	//   on architectures with DP4A support (compute capability 6.1 and later).
+	//cudnnDataType_t data_type = CUDNN_DATA_INT8;
+
+	// backward delta
+    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
+    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
+    cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+
+	// forward
+    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w);
+    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
+    cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+
+	// batch norm
+	cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
+	cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
+
+	cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
+#if(CUDNN_MAJOR >= 6)
+	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);	// cudnn >= 6.0
+#else
+	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);	// cudnn 5.1
+#endif
+	int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+	int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+	int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+	if (cudnn_preference == cudnn_smallest) 
+	{
+		forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+		backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+		backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+		printf(" CUDNN-slow ");
+	}
+
+	cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
+            l->srcTensorDesc,
+            l->weightDesc,
+            l->convDesc,
+            l->dstTensorDesc,
+			forward_algo,
+            0,
+            &l->fw_algo);
+    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
+            l->weightDesc,
+            l->ddstTensorDesc,
+            l->convDesc,
+            l->dsrcTensorDesc,
+			backward_algo,
+            0,
+            &l->bd_algo);
+    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
+            l->srcTensorDesc,
+            l->ddstTensorDesc,
+            l->convDesc,
+            l->dweightDesc,
+			backward_filter,
+            0,
+            &l->bf_algo);
+
+	if (data_type == CUDNN_DATA_HALF) 
+	{
+		// HALF-16 if(data_type == CUDNN_DATA_HALF)
+		l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+		l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+		l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+
+		// FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
+		//l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
+		//l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
+		//l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
+
+		int fw = 0, bd = 0, bf = 0;
+		if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
+			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
+		if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
+			//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
+
+		if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
+			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  \n");
+		if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
+			//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
+
+		if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
+			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1   \n");
+		if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
+			//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
+
+		if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
+		else if (fw == 1 && bd == 1 && bf == 1) printf("TH ");
+	}
+}
+#endif
+#endif
+
+convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
 {
     int i;
-    int m = layer.size*layer.size*layer.c;
-    int k = layer.n;
-    int n = convolutional_out_height(layer)*
-            convolutional_out_width(layer)*
-            layer.batch;
+    convolutional_layer l = {0};
+    l.type = CONVOLUTIONAL;
 
-    float *a = layer.filters;
-    float *b = layer.delta;
-    float *c = layer.col_image;
+    l.h = h;
+    l.w = w;
+    l.c = c;
+    l.n = n;
+    l.binary = binary;
+    l.xnor = xnor;
+    l.batch = batch;
+    l.stride = stride;
+    l.size = size;
+    l.pad = padding;
+    l.batch_normalize = batch_normalize;
 
+    l.weights = calloc(c*n*size*size, sizeof(float));
+    l.weight_updates = calloc(c*n*size*size, sizeof(float));
 
-    memset(c, 0, m*n*sizeof(float));
-    gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);
+    l.biases = calloc(n, sizeof(float));
+    l.bias_updates = calloc(n, sizeof(float));
 
-    memset(delta, 0, layer.batch*layer.h*layer.w*layer.c*sizeof(float));
-    for(i = 0; i < layer.batch; ++i){
-        col2im_cpu(c+i*n/layer.batch,  layer.c,  layer.h,  layer.w,  layer.size,  layer.stride, delta+i*n/layer.batch);
+    // float scale = 1./sqrt(size*size*c);
+    float scale = sqrt(2./(size*size*c));
+    for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
+    int out_h = convolutional_out_height(l);
+    int out_w = convolutional_out_width(l);
+    l.out_h = out_h;
+    l.out_w = out_w;
+    l.out_c = n;
+    l.outputs = l.out_h * l.out_w * l.out_c;
+    l.inputs = l.w * l.h * l.c;
+
+    l.output = calloc(l.batch*l.outputs, sizeof(float));
+    l.delta  = calloc(l.batch*l.outputs, sizeof(float));
+
+    l.forward = forward_convolutional_layer;
+    l.backward = backward_convolutional_layer;
+    l.update = update_convolutional_layer;
+    if(binary){
+        l.binary_weights = calloc(c*n*size*size, sizeof(float));
+        l.cweights = calloc(c*n*size*size, sizeof(char));
+        l.scales = calloc(n, sizeof(float));
     }
-}
-
-void update_convolutional_layer(convolutional_layer layer, float step, float momentum, float decay)
-{
-    int i;
-    int size = layer.size*layer.size*layer.c*layer.n;
-    for(i = 0; i < layer.n; ++i){
-        layer.biases[i] += step*layer.bias_updates[i];
-        layer.bias_updates[i] *= momentum;
-    }
-    for(i = 0; i < size; ++i){
-        layer.filters[i] += step*(layer.filter_updates[i] - decay*layer.filters[i]);
-        layer.filter_updates[i] *= momentum;
-    }
-}
-/*
-
-void backward_convolutional_layer2(convolutional_layer layer, float *input, float *delta)
-{
-    image in_delta = float_to_image(layer.h, layer.w, layer.c, delta);
-    image out_delta = get_convolutional_delta(layer);
-    int i,j;
-    for(i = 0; i < layer.n; ++i){
-        rotate_image(layer.kernels[i]);
+    if(xnor){
+        l.binary_weights = calloc(c*n*size*size, sizeof(float));
+        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
     }
 
-    zero_image(in_delta);
-    upsample_image(out_delta, layer.stride, layer.upsampled);
-    for(j = 0; j < in_delta.c; ++j){
-        for(i = 0; i < layer.n; ++i){
-            two_d_convolve(layer.upsampled, i, layer.kernels[i], j, 1, in_delta, j, layer.edge);
+    if(batch_normalize){
+        l.scales = calloc(n, sizeof(float));
+        l.scale_updates = calloc(n, sizeof(float));
+        for(i = 0; i < n; ++i){
+            l.scales[i] = 1;
         }
+
+        l.mean = calloc(n, sizeof(float));
+        l.variance = calloc(n, sizeof(float));
+
+        l.mean_delta = calloc(n, sizeof(float));
+        l.variance_delta = calloc(n, sizeof(float));
+
+        l.rolling_mean = calloc(n, sizeof(float));
+        l.rolling_variance = calloc(n, sizeof(float));
+        l.x = calloc(l.batch*l.outputs, sizeof(float));
+        l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
+    }
+    if(adam){
+        l.adam = 1;
+        l.m = calloc(c*n*size*size, sizeof(float));
+        l.v = calloc(c*n*size*size, sizeof(float));
     }
 
-    for(i = 0; i < layer.n; ++i){
-        rotate_image(layer.kernels[i]);
-    }
-}
+#ifdef GPU
+    l.forward_gpu = forward_convolutional_layer_gpu;
+    l.backward_gpu = backward_convolutional_layer_gpu;
+    l.update_gpu = update_convolutional_layer_gpu;
 
-
-void learn_convolutional_layer(convolutional_layer layer, float *input)
-{
-    int i;
-    image in_image = float_to_image(layer.h, layer.w, layer.c, input);
-    image out_delta = get_convolutional_delta(layer);
-    gradient_delta_convolutional_layer(layer);
-    for(i = 0; i < layer.n; ++i){
-        kernel_update(in_image, layer.kernel_updates[i], layer.stride, i, out_delta, layer.edge);
-        layer.bias_updates[i] += avg_image_layer(out_delta, i);
-    }
-}
-
-void update_convolutional_layer(convolutional_layer layer, float step, float momentum, float decay)
-{
-    int i,j;
-    for(i = 0; i < layer.n; ++i){
-        layer.bias_momentum[i] = step*(layer.bias_updates[i]) 
-                                + momentum*layer.bias_momentum[i];
-        layer.biases[i] += layer.bias_momentum[i];
-        layer.bias_updates[i] = 0;
-        int pixels = layer.kernels[i].h*layer.kernels[i].w*layer.kernels[i].c;
-        for(j = 0; j < pixels; ++j){
-            layer.kernel_momentum[i].data[j] = step*(layer.kernel_updates[i].data[j] - decay*layer.kernels[i].data[j]) 
-                                                + momentum*layer.kernel_momentum[i].data[j];
-            layer.kernels[i].data[j] += layer.kernel_momentum[i].data[j];
+    if(gpu_index >= 0){
+        if (adam) {
+            l.m_gpu = cuda_make_array(l.m, c*n*size*size);
+            l.v_gpu = cuda_make_array(l.v, c*n*size*size);
         }
-        zero_image(layer.kernel_updates[i]);
+
+        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+#ifdef CUDNN_HALF
+		l.weights_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weights, c*n*size*size / 2);
+		l.weight_updates_gpu16 = cuda_make_array(NULL, c*n*size*size / 2); //cuda_make_array(l.weight_updates, c*n*size*size / 2);
+#endif
+        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
+
+        l.biases_gpu = cuda_make_array(l.biases, n);
+        l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
+
+        l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
+        l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
+
+        if(binary){
+            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+        }
+        if(xnor){
+            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+            l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
+        }
+
+        if(batch_normalize){
+            l.mean_gpu = cuda_make_array(l.mean, n);
+            l.variance_gpu = cuda_make_array(l.variance, n);
+
+            l.rolling_mean_gpu = cuda_make_array(l.mean, n);
+            l.rolling_variance_gpu = cuda_make_array(l.variance, n);
+
+            l.mean_delta_gpu = cuda_make_array(l.mean, n);
+            l.variance_delta_gpu = cuda_make_array(l.variance, n);
+
+            l.scales_gpu = cuda_make_array(l.scales, n);
+            l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);
+
+            l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
+            l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
+        }
+#ifdef CUDNN		
+		cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
+		cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
+		cudnnCreateTensorDescriptor(&l.normTensorDesc);
+        cudnnCreateTensorDescriptor(&l.srcTensorDesc);
+        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
+        cudnnCreateFilterDescriptor(&l.weightDesc);
+        cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
+        cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
+        cudnnCreateFilterDescriptor(&l.dweightDesc);
+        cudnnCreateConvolutionDescriptor(&l.convDesc);
+        cudnn_convolutional_setup(&l, cudnn_fastest);
+#endif
+    }
+#endif
+    l.workspace_size = get_workspace_size(l);
+    l.activation = activation;
+
+    fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
+
+    return l;
+}
+
+void denormalize_convolutional_layer(convolutional_layer l)
+{
+    int i, j;
+    for(i = 0; i < l.n; ++i){
+        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
+        for(j = 0; j < l.c*l.size*l.size; ++j){
+            l.weights[i*l.c*l.size*l.size + j] *= scale;
+        }
+        l.biases[i] -= l.rolling_mean[i] * scale;
+        l.scales[i] = 1;
+        l.rolling_mean[i] = 0;
+        l.rolling_variance[i] = 1;
     }
 }
-*/
 
 void test_convolutional_layer()
 {
-    convolutional_layer l = *make_convolutional_layer(1,4,4,1,1,3,1,LINEAR);
-    float input[] =    {1,2,3,4,
-                        5,6,7,8,
-                        9,10,11,12,
-                        13,14,15,16};
-    float filter[] =   {.5, 0, .3,
-                        0  , 1,  0,
-                        .2 , 0,  1};
-    float delta[] =    {1, 2,
-                        3,  4};
-    float in_delta[] = {.5,1,.3,.6,
-                        5,6,7,8,
-                        9,10,11,12,
-                        13,14,15,16};
-    l.filters = filter;
-    forward_convolutional_layer(l, input);
-    l.delta = delta;
-    learn_convolutional_layer(l);
-    image filter_updates = float_to_image(3,3,1,l.filter_updates);
-    print_image(filter_updates);
-    printf("Delta:\n");
-    backward_convolutional_layer(l, in_delta);
-    pm(4,4,in_delta);
+    convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0);
+    l.batch_normalize = 1;
+    float data[] = {1,1,1,1,1,
+        1,1,1,1,1,
+        1,1,1,1,1,
+        1,1,1,1,1,
+        1,1,1,1,1,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        2,2,2,2,2,
+        3,3,3,3,3,
+        3,3,3,3,3,
+        3,3,3,3,3,
+        3,3,3,3,3,
+        3,3,3,3,3};
+    network_state state = {0};
+    state.input = data;
+    forward_convolutional_layer(l, state);
 }
 
-image get_convolutional_filter(convolutional_layer layer, int i)
+void resize_convolutional_layer(convolutional_layer *l, int w, int h)
 {
-    int h = layer.size;
-    int w = layer.size;
-    int c = layer.c;
-    return float_to_image(h,w,c,layer.filters+i*h*w*c);
-}
+	int old_w = l->w;
+	int old_h = l->h;
+    l->w = w;
+    l->h = h;
+    int out_w = convolutional_out_width(*l);
+    int out_h = convolutional_out_height(*l);
 
-void visualize_convolutional_layer(convolutional_layer layer, char *window)
-{
-    int color = 1;
-    int border = 1;
-    int h,w,c;
-    int size = layer.size;
-    h = size;
-    w = (size + border) * layer.n - border;
-    c = layer.c;
-    if(c != 3 || !color){
-        h = (h+border)*c - border;
-        c = 1;
+    l->out_w = out_w;
+    l->out_h = out_h;
+
+    l->outputs = l->out_h * l->out_w * l->out_c;
+    l->inputs = l->w * l->h * l->c;
+
+    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
+    l->delta  = realloc(l->delta,  l->batch*l->outputs*sizeof(float));
+    if(l->batch_normalize){
+        l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
+        l->x_norm  = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
     }
 
-    image filters = make_image(h,w,c);
-    int i,j;
-    for(i = 0; i < layer.n; ++i){
-        int w_offset = i*(size+border);
-        image k = get_convolutional_filter(layer, i);
-        //printf("%f ** ", layer.biases[i]);
-        //print_image(k);
-        image copy = copy_image(k);
-        normalize_image(copy);
-        for(j = 0; j < k.c; ++j){
-            //set_pixel(copy,0,0,j,layer.biases[i]);
-        }
-        if(c == 3 && color){
-            embed_image(copy, filters, 0, w_offset);
-        }
-        else{
-            for(j = 0; j < k.c; ++j){
-                int h_offset = j*(size+border);
-                image layer = get_image_layer(k, j);
-                embed_image(layer, filters, h_offset, w_offset);
-                free_image(layer);
+#ifdef GPU
+	if (old_w < w || old_h < h) {
+		cuda_free(l->delta_gpu);
+		cuda_free(l->output_gpu);
+
+		l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
+		l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+
+		if (l->batch_normalize) {
+			cuda_free(l->x_gpu);
+			cuda_free(l->x_norm_gpu);
+
+			l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+			l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+		}
+	}
+#ifdef CUDNN
+    cudnn_convolutional_setup(l, cudnn_fastest);
+#endif
+#endif
+    l->workspace_size = get_workspace_size(*l);
+
+#ifdef CUDNN
+	// check for excessive memory consumption 
+	size_t free_byte;
+	size_t total_byte;
+	check_error(cudaMemGetInfo(&free_byte, &total_byte));
+	if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+		printf(" used slow CUDNN algo without Workspace! Need memory: %d, available: %d\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
+		cudnn_convolutional_setup(l, cudnn_smallest);
+		l->workspace_size = get_workspace_size(*l);
+	}
+#endif
+}
+
+void add_bias(float *output, float *biases, int batch, int n, int size)
+{
+    int i,j,b;
+    for(b = 0; b < batch; ++b){
+        for(i = 0; i < n; ++i){
+            for(j = 0; j < size; ++j){
+                output[(b*n + i)*size + j] += biases[i];
             }
         }
-        free_image(copy);
     }
-    image delta = get_convolutional_delta(layer);
+}
+
+void scale_bias(float *output, float *scales, int batch, int n, int size)
+{
+    int i,j,b;
+    for(b = 0; b < batch; ++b){
+        for(i = 0; i < n; ++i){
+            for(j = 0; j < size; ++j){
+                output[(b*n + i)*size + j] *= scales[i];
+            }
+        }
+    }
+}
+
+void backward_bias(float *bias_updates, float *delta, int batch, int n, int size)
+{
+    int i,b;
+    for(b = 0; b < batch; ++b){
+        for(i = 0; i < n; ++i){
+            bias_updates[i] += sum_array(delta+size*(i+b*n), size);
+        }
+    }
+}
+
+void forward_convolutional_layer(convolutional_layer l, network_state state)
+{
+    int out_h = convolutional_out_height(l);
+    int out_w = convolutional_out_width(l);
+    int i;
+
+    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
+
+    if(l.xnor){
+        binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
+        swap_binary(&l);
+        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
+        state.input = l.binary_input;
+    }
+
+    int m = l.n;
+    int k = l.size*l.size*l.c;
+    int n = out_h*out_w;
+
+
+    float *a = l.weights;
+    float *b = state.workspace;
+    float *c = l.output;
+
+    for(i = 0; i < l.batch; ++i){
+        im2col_cpu(state.input, l.c, l.h, l.w, 
+                l.size, l.stride, l.pad, b);
+        gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        c += n*m;
+        state.input += l.c*l.h*l.w;
+    }
+
+    if(l.batch_normalize){
+        forward_batchnorm_layer(l, state);
+    }
+    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
+
+    activate_array(l.output, m*n*l.batch, l.activation);
+    if(l.binary || l.xnor) swap_binary(&l);
+}
+
+void backward_convolutional_layer(convolutional_layer l, network_state state)
+{
+    int i;
+    int m = l.n;
+    int n = l.size*l.size*l.c;
+    int k = convolutional_out_height(l)*
+        convolutional_out_width(l);
+
+    gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
+    backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
+
+    if(l.batch_normalize){
+        backward_batchnorm_layer(l, state);
+    }
+
+    for(i = 0; i < l.batch; ++i){
+        float *a = l.delta + i*m*k;
+        float *b = state.workspace;
+        float *c = l.weight_updates;
+
+        float *im = state.input+i*l.c*l.h*l.w;
+
+        im2col_cpu(im, l.c, l.h, l.w, 
+                l.size, l.stride, l.pad, b);
+        gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
+
+        if(state.delta){
+            a = l.weights;
+            b = l.delta + i*m*k;
+            c = state.workspace;
+
+            gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);
+
+            col2im_cpu(state.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
+        }
+    }
+}
+
+void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay)
+{
+    int size = l.size*l.size*l.c*l.n;
+    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
+    scal_cpu(l.n, momentum, l.bias_updates, 1);
+
+    if(l.scales){
+        axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
+        scal_cpu(l.n, momentum, l.scale_updates, 1);
+    }
+
+    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(size, momentum, l.weight_updates, 1);
+}
+
+
+image get_convolutional_weight(convolutional_layer l, int i)
+{
+    int h = l.size;
+    int w = l.size;
+    int c = l.c;
+    return float_to_image(w,h,c,l.weights+i*h*w*c);
+}
+
+void rgbgr_weights(convolutional_layer l)
+{
+    int i;
+    for(i = 0; i < l.n; ++i){
+        image im = get_convolutional_weight(l, i);
+        if (im.c == 3) {
+            rgbgr_image(im);
+        }
+    }
+}
+
+void rescale_weights(convolutional_layer l, float scale, float trans)
+{
+    int i;
+    for(i = 0; i < l.n; ++i){
+        image im = get_convolutional_weight(l, i);
+        if (im.c == 3) {
+            scale_image(im, scale);
+            float sum = sum_array(im.data, im.w*im.h*im.c);
+            l.biases[i] += sum*trans;
+        }
+    }
+}
+
+image *get_weights(convolutional_layer l)
+{
+    image *weights = calloc(l.n, sizeof(image));
+    int i;
+    for(i = 0; i < l.n; ++i){
+        weights[i] = copy_image(get_convolutional_weight(l, i));
+        //normalize_image(weights[i]);
+    }
+    return weights;
+}
+
+image *visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_weights)
+{
+    image *single_weights = get_weights(l);
+    show_images(single_weights, l.n, window);
+
+    image delta = get_convolutional_image(l);
     image dc = collapse_image_layers(delta, 1);
     char buff[256];
-    sprintf(buff, "%s: Delta", window);
-    show_image(dc, buff);
+    sprintf(buff, "%s: Output", window);
+    //show_image(dc, buff);
+    //save_image(dc, buff);
     free_image(dc);
-    show_image(filters, window);
-    free_image(filters);
+    return single_weights;
 }
 

--
Gitblit v1.10.0