From 64aa0180bb74e84a75958b3da0061a9f5615729d Mon Sep 17 00:00:00 2001
From: Alexey <AlexeyAB@users.noreply.github.com>
Date: Sat, 03 Feb 2018 12:42:16 +0000
Subject: [PATCH] Merge pull request #355 from PTS93/patch-1

---
 src/convolutional_layer.c |  126 ++++++++++++++++++++++--------------------
 1 file changed, 66 insertions(+), 60 deletions(-)

diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 01bb700..a3247d0 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -8,6 +8,10 @@
 #include <stdio.h>
 #include <time.h>
 
+#ifdef CUDNN
+#pragma comment(lib, "cudnn.lib")  
+#endif
+
 #ifdef AI2
 #include "xnor_layer.h"
 #endif
@@ -142,8 +146,12 @@
     cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
     cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
     cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
-    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
-    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
+#if(CUDNN_MAJOR >= 6)
+	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);	// cudnn 6.0
+#else
+	cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);	// cudnn 5.1
+#endif
+	cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
             l->srcTensorDesc,
             l->weightDesc,
             l->convDesc,
@@ -171,7 +179,7 @@
 #endif
 #endif
 
-convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor)
+convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
 {
     int i;
     convolutional_layer l = {0};
@@ -206,9 +214,12 @@
     l.outputs = l.out_h * l.out_w * l.out_c;
     l.inputs = l.w * l.h * l.c;
 
-    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
-    l.delta  = calloc(l.batch*out_h * out_w * n, sizeof(float));
+    l.output = calloc(l.batch*l.outputs, sizeof(float));
+    l.delta  = calloc(l.batch*l.outputs, sizeof(float));
 
+    l.forward = forward_convolutional_layer;
+    l.backward = backward_convolutional_layer;
+    l.update = update_convolutional_layer;
     if(binary){
         l.binary_weights = calloc(c*n*size*size, sizeof(float));
         l.cweights = calloc(c*n*size*size, sizeof(char));
@@ -229,12 +240,31 @@
         l.mean = calloc(n, sizeof(float));
         l.variance = calloc(n, sizeof(float));
 
+        l.mean_delta = calloc(n, sizeof(float));
+        l.variance_delta = calloc(n, sizeof(float));
+
         l.rolling_mean = calloc(n, sizeof(float));
         l.rolling_variance = calloc(n, sizeof(float));
+        l.x = calloc(l.batch*l.outputs, sizeof(float));
+        l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
+    }
+    if(adam){
+        l.adam = 1;
+        l.m = calloc(c*n*size*size, sizeof(float));
+        l.v = calloc(c*n*size*size, sizeof(float));
     }
 
 #ifdef GPU
+    l.forward_gpu = forward_convolutional_layer_gpu;
+    l.backward_gpu = backward_convolutional_layer_gpu;
+    l.update_gpu = update_convolutional_layer_gpu;
+
     if(gpu_index >= 0){
+        if (adam) {
+            l.m_gpu = cuda_make_array(l.m, c*n*size*size);
+            l.v_gpu = cuda_make_array(l.v, c*n*size*size);
+        }
+
         l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
         l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
 
@@ -283,7 +313,7 @@
     l.workspace_size = get_workspace_size(l);
     l.activation = activation;
 
-    fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
+    fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
 
     return l;
 }
@@ -305,7 +335,7 @@
 
 void test_convolutional_layer()
 {
-    convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0);
+    convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0);
     l.batch_normalize = 1;
     float data[] = {1,1,1,1,1,
         1,1,1,1,1,
@@ -340,17 +370,27 @@
     l->outputs = l->out_h * l->out_w * l->out_c;
     l->inputs = l->w * l->h * l->c;
 
-    l->output = realloc(l->output,
-            l->batch*out_h * out_w * l->n*sizeof(float));
-    l->delta  = realloc(l->delta,
-            l->batch*out_h * out_w * l->n*sizeof(float));
+    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
+    l->delta  = realloc(l->delta,  l->batch*l->outputs*sizeof(float));
+    if(l->batch_normalize){
+        l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
+        l->x_norm  = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
+    }
 
 #ifdef GPU
     cuda_free(l->delta_gpu);
     cuda_free(l->output_gpu);
 
-    l->delta_gpu =     cuda_make_array(l->delta, l->batch*out_h*out_w*l->n);
-    l->output_gpu =    cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
+    l->delta_gpu =  cuda_make_array(l->delta,  l->batch*l->outputs);
+    l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+
+    if(l->batch_normalize){
+        cuda_free(l->x_gpu);
+        cuda_free(l->x_norm_gpu);
+
+        l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+        l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+    }
 #ifdef CUDNN
     cudnn_convolutional_setup(l);
 #endif
@@ -398,41 +438,8 @@
     int out_w = convolutional_out_width(l);
     int i;
 
-
     fill_cpu(l.outputs*l.batch, 0, l.output, 1);
 
-    /*
-       if(l.binary){
-       binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
-       binarize_weights2(l.weights, l.n, l.c*l.size*l.size, l.cweights, l.scales);
-       swap_binary(&l);
-       }
-     */
-
-    /*
-       if(l.binary){
-       int m = l.n;
-       int k = l.size*l.size*l.c;
-       int n = out_h*out_w;
-
-       char  *a = l.cweights;
-       float *b = state.workspace;
-       float *c = l.output;
-
-       for(i = 0; i < l.batch; ++i){
-       im2col_cpu(state.input, l.c, l.h, l.w, 
-       l.size, l.stride, l.pad, b);
-       gemm_bin(m,n,k,1,a,k,b,n,c,n);
-       c += n*m;
-       state.input += l.c*l.h*l.w;
-       }
-       scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
-       add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
-       activate_array(l.output, m*n*l.batch, l.activation);
-       return;
-       }
-     */
-
     if(l.xnor){
         binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
         swap_binary(&l);
@@ -444,22 +451,17 @@
     int k = l.size*l.size*l.c;
     int n = out_h*out_w;
 
-    if (l.xnor && l.c%32 == 0 && AI2) {
-        forward_xnor_layer(l, state);
-        printf("xnor\n");
-    } else {
 
-        float *a = l.weights;
-        float *b = state.workspace;
-        float *c = l.output;
+    float *a = l.weights;
+    float *b = state.workspace;
+    float *c = l.output;
 
-        for(i = 0; i < l.batch; ++i){
-            im2col_cpu(state.input, l.c, l.h, l.w, 
-                    l.size, l.stride, l.pad, b);
-            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
-            c += n*m;
-            state.input += l.c*l.h*l.w;
-        }
+    for(i = 0; i < l.batch; ++i){
+        im2col_cpu(state.input, l.c, l.h, l.w, 
+                l.size, l.stride, l.pad, b);
+        gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        c += n*m;
+        state.input += l.c*l.h*l.w;
     }
 
     if(l.batch_normalize){
@@ -482,6 +484,10 @@
     gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
     backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
 
+    if(l.batch_normalize){
+        backward_batchnorm_layer(l, state);
+    }
+
     for(i = 0; i < l.batch; ++i){
         float *a = l.delta + i*m*k;
         float *b = state.workspace;

--
Gitblit v1.10.0