From 1b5afb45838e603fa6780762eb8cc59246dc2d81 Mon Sep 17 00:00:00 2001
From: IlyaOvodov <b@ovdv.ru>
Date: Tue, 08 May 2018 11:09:35 +0000
Subject: [PATCH] Output improvements for detector results: When printing detector results, output was done in random order, obfuscating results for interpreting. Now: 1. Text output includes coordinates of rects in (left,right,top,bottom in pixels) along with label and score 2. Text output is sorted by rect lefts to simplify finding appropriate rects on image 3. If several class probs are > thresh for some detection, the most probable is written first and coordinates for others are not repeated 4. Rects are imprinted in image in order by their best class prob, so most probable rects are always on top and not overlayed by less probable ones 5. Most probable label for rect is always written first Also: 6. Message about low GPU memory include required amount

---
 src/network.c |  409 ++++++++++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 325 insertions(+), 84 deletions(-)

diff --git a/src/network.c b/src/network.c
index d9585c4..ce6d28a 100644
--- a/src/network.c
+++ b/src/network.c
@@ -1,5 +1,6 @@
 #include <stdio.h>
 #include <time.h>
+#include <assert.h>
 #include "network.h"
 #include "image.h"
 #include "data.h"
@@ -8,17 +9,38 @@
 
 #include "crop_layer.h"
 #include "connected_layer.h"
+#include "gru_layer.h"
+#include "rnn_layer.h"
+#include "crnn_layer.h"
 #include "local_layer.h"
 #include "convolutional_layer.h"
-#include "deconvolutional_layer.h"
+#include "activation_layer.h"
 #include "detection_layer.h"
+#include "region_layer.h"
 #include "normalization_layer.h"
+#include "batchnorm_layer.h"
 #include "maxpool_layer.h"
+#include "reorg_layer.h"
 #include "avgpool_layer.h"
 #include "cost_layer.h"
 #include "softmax_layer.h"
 #include "dropout_layer.h"
 #include "route_layer.h"
+#include "shortcut_layer.h"
+#include "yolo_layer.h"
+#include "parser.h"
+
+network *load_network(char *cfg, char *weights, int clear)
+{
+	printf(" Try to load cfg: %s, weights: %s, clear = %d \n", cfg, weights, clear);
+	network *net = calloc(1, sizeof(network));
+	*net = parse_network_cfg(cfg);
+	if (weights && weights[0] != 0) {
+		load_weights(net, weights);
+	}
+	if (clear) (*net->seen) = 0;
+	return net;
+}
 
 int get_current_batch(network net)
 {
@@ -33,15 +55,37 @@
     net.momentum = 0;
     net.decay = 0;
     #ifdef GPU
-        if(gpu_index >= 0) update_network_gpu(net);
+        //if(net.gpu_index >= 0) update_network_gpu(net);
     #endif
 }
 
+void reset_network_state(network *net, int b)
+{
+	int i;
+	for (i = 0; i < net->n; ++i) {
+#ifdef GPU
+		layer l = net->layers[i];
+		if (l.state_gpu) {
+			fill_ongpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
+		}
+		if (l.h_gpu) {
+			fill_ongpu(l.outputs, 0, l.h_gpu + l.outputs*b, 1);
+		}
+#endif
+	}
+}
+
+void reset_rnn(network *net)
+{
+	reset_network_state(net, 0);
+}
+
 float get_current_rate(network net)
 {
     int batch_num = get_current_batch(net);
     int i;
     float rate;
+	if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
     switch (net.policy) {
         case CONSTANT:
             return net.learning_rate;
@@ -52,13 +96,17 @@
             for(i = 0; i < net.num_steps; ++i){
                 if(net.steps[i] > batch_num) return rate;
                 rate *= net.scales[i];
-                if(net.steps[i] > batch_num - 1) reset_momentum(net);
+                //if(net.steps[i] > batch_num - 1 && net.scales[i] > 1) reset_momentum(net);
             }
             return rate;
         case EXP:
             return net.learning_rate * pow(net.gamma, batch_num);
         case POLY:
-            return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
+			return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
+            //if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
+            //return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
+        case RANDOM:
+            return net.learning_rate * pow(rand_uniform(0,1), net.power);
         case SIG:
             return net.learning_rate * (1./(1.+exp(net.gamma*(batch_num - net.step))));
         default:
@@ -72,20 +120,32 @@
     switch(a){
         case CONVOLUTIONAL:
             return "convolutional";
+        case ACTIVE:
+            return "activation";
         case LOCAL:
             return "local";
         case DECONVOLUTIONAL:
             return "deconvolutional";
         case CONNECTED:
             return "connected";
+        case RNN:
+            return "rnn";
+        case GRU:
+            return "gru";
+        case CRNN:
+            return "crnn";
         case MAXPOOL:
             return "maxpool";
+        case REORG:
+            return "reorg";
         case AVGPOOL:
             return "avgpool";
         case SOFTMAX:
             return "softmax";
         case DETECTION:
             return "detection";
+        case REGION:
+            return "region";
         case DROPOUT:
             return "dropout";
         case CROP:
@@ -94,8 +154,12 @@
             return "cost";
         case ROUTE:
             return "route";
+        case SHORTCUT:
+            return "shortcut";
         case NORMALIZATION:
             return "normalization";
+        case BATCHNORM:
+            return "batchnorm";
         default:
             break;
     }
@@ -111,45 +175,26 @@
     #ifdef GPU
     net.input_gpu = calloc(1, sizeof(float *));
     net.truth_gpu = calloc(1, sizeof(float *));
+
+	net.input16_gpu = calloc(1, sizeof(float *));
+	net.output16_gpu = calloc(1, sizeof(float *));
+	net.max_input16_size = calloc(1, sizeof(size_t));
+	net.max_output16_size = calloc(1, sizeof(size_t));
     #endif
     return net;
 }
 
 void forward_network(network net, network_state state)
 {
+    state.workspace = net.workspace;
     int i;
     for(i = 0; i < net.n; ++i){
+        state.index = i;
         layer l = net.layers[i];
         if(l.delta){
             scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
         }
-        if(l.type == CONVOLUTIONAL){
-            forward_convolutional_layer(l, state);
-        } else if(l.type == DECONVOLUTIONAL){
-            forward_deconvolutional_layer(l, state);
-        } else if(l.type == LOCAL){
-            forward_local_layer(l, state);
-        } else if(l.type == NORMALIZATION){
-            forward_normalization_layer(l, state);
-        } else if(l.type == DETECTION){
-            forward_detection_layer(l, state);
-        } else if(l.type == CONNECTED){
-            forward_connected_layer(l, state);
-        } else if(l.type == CROP){
-            forward_crop_layer(l, state);
-        } else if(l.type == COST){
-            forward_cost_layer(l, state);
-        } else if(l.type == SOFTMAX){
-            forward_softmax_layer(l, state);
-        } else if(l.type == MAXPOOL){
-            forward_maxpool_layer(l, state);
-        } else if(l.type == AVGPOOL){
-            forward_avgpool_layer(l, state);
-        } else if(l.type == DROPOUT){
-            forward_dropout_layer(l, state);
-        } else if(l.type == ROUTE){
-            forward_route_layer(l, net);
-        }
+        l.forward(l, state);
         state.input = l.output;
     }
 }
@@ -161,20 +206,17 @@
     float rate = get_current_rate(net);
     for(i = 0; i < net.n; ++i){
         layer l = net.layers[i];
-        if(l.type == CONVOLUTIONAL){
-            update_convolutional_layer(l, update_batch, rate, net.momentum, net.decay);
-        } else if(l.type == DECONVOLUTIONAL){
-            update_deconvolutional_layer(l, rate, net.momentum, net.decay);
-        } else if(l.type == CONNECTED){
-            update_connected_layer(l, update_batch, rate, net.momentum, net.decay);
-        } else if(l.type == LOCAL){
-            update_local_layer(l, update_batch, rate, net.momentum, net.decay);
+        if(l.update){
+            l.update(l, update_batch, rate, net.momentum, net.decay);
         }
     }
 }
 
 float *get_network_output(network net)
 {
+#ifdef GPU
+    if (gpu_index >= 0) return get_network_output_gpu(net);
+#endif 
     int i;
     for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break;
     return net.layers[i].output;
@@ -186,11 +228,7 @@
     float sum = 0;
     int count = 0;
     for(i = 0; i < net.n; ++i){
-        if(net.layers[i].type == COST){
-            sum += net.layers[i].output[0];
-            ++count;
-        }
-        if(net.layers[i].type == DETECTION){
+        if(net.layers[i].cost){
             sum += net.layers[i].cost[0];
             ++count;
         }
@@ -210,7 +248,9 @@
     int i;
     float *original_input = state.input;
     float *original_delta = state.delta;
+    state.workspace = net.workspace;
     for(i = net.n-1; i >= 0; --i){
+        state.index = i;
         if(i == 0){
             state.input = original_input;
             state.delta = original_delta;
@@ -220,41 +260,20 @@
             state.delta = prev.delta;
         }
         layer l = net.layers[i];
-        if(l.type == CONVOLUTIONAL){
-            backward_convolutional_layer(l, state);
-        } else if(l.type == DECONVOLUTIONAL){
-            backward_deconvolutional_layer(l, state);
-        } else if(l.type == NORMALIZATION){
-            backward_normalization_layer(l, state);
-        } else if(l.type == MAXPOOL){
-            if(i != 0) backward_maxpool_layer(l, state);
-        } else if(l.type == AVGPOOL){
-            backward_avgpool_layer(l, state);
-        } else if(l.type == DROPOUT){
-            backward_dropout_layer(l, state);
-        } else if(l.type == DETECTION){
-            backward_detection_layer(l, state);
-        } else if(l.type == SOFTMAX){
-            if(i != 0) backward_softmax_layer(l, state);
-        } else if(l.type == CONNECTED){
-            backward_connected_layer(l, state);
-        } else if(l.type == LOCAL){
-            backward_local_layer(l, state);
-        } else if(l.type == COST){
-            backward_cost_layer(l, state);
-        } else if(l.type == ROUTE){
-            backward_route_layer(l, net);
-        }
+        if (l.stopbackward) break;
+        l.backward(l, state);
     }
 }
 
 float train_network_datum(network net, float *x, float *y)
 {
-    *net.seen += net.batch;
 #ifdef GPU
     if(gpu_index >= 0) return train_network_datum_gpu(net, x, y);
 #endif
     network_state state;
+    *net.seen += net.batch;
+    state.index = 0;
+    state.net = net;
     state.input = x;
     state.delta = 0;
     state.truth = y;
@@ -286,6 +305,7 @@
 
 float train_network(network net, data d)
 {
+    assert(d.X.rows % net.batch == 0);
     int batch = net.batch;
     int n = d.X.rows / batch;
     float *X = calloc(batch*d.X.cols, sizeof(float));
@@ -303,10 +323,13 @@
     return (float)sum/(n*batch);
 }
 
+
 float train_network_batch(network net, data d, int n)
 {
     int i,j;
     network_state state;
+    state.index = 0;
+    state.net = net;
     state.train = 1;
     state.delta = 0;
     float sum = 0;
@@ -331,39 +354,100 @@
     int i;
     for(i = 0; i < net->n; ++i){
         net->layers[i].batch = b;
+#ifdef CUDNN
+        if(net->layers[i].type == CONVOLUTIONAL){
+			cudnn_convolutional_setup(net->layers + i, cudnn_fastest);
+			/*
+			layer *l = net->layers + i;
+            cudnn_convolutional_setup(l, cudnn_fastest);
+			// check for excessive memory consumption 
+			size_t free_byte;
+			size_t total_byte;
+			check_error(cudaMemGetInfo(&free_byte, &total_byte));
+			if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
+				printf(" used slow CUDNN algo without Workspace! \n");
+				cudnn_convolutional_setup(l, cudnn_smallest);
+				l->workspace_size = get_workspace_size(*l);
+			}
+			*/
+        }
+#endif
     }
 }
 
 int resize_network(network *net, int w, int h)
 {
+#ifdef GPU
+    cuda_set_device(net->gpu_index);
+    if(gpu_index >= 0){
+        cuda_free(net->workspace);
+		if (net->input_gpu) {
+			cuda_free(*net->input_gpu);
+			*net->input_gpu = 0;
+			cuda_free(*net->truth_gpu);
+			*net->truth_gpu = 0;
+		}
+    }
+#endif
     int i;
     //if(w == net->w && h == net->h) return 0;
     net->w = w;
     net->h = h;
     int inputs = 0;
-    //fprintf(stderr, "Resizing to %d x %d...", w, h);
+    size_t workspace_size = 0;
+    //fprintf(stderr, "Resizing to %d x %d...\n", w, h);
     //fflush(stderr);
     for (i = 0; i < net->n; ++i){
         layer l = net->layers[i];
+		//printf(" %d: layer = %d,", i, l.type);
         if(l.type == CONVOLUTIONAL){
             resize_convolutional_layer(&l, w, h);
+        }else if(l.type == CROP){
+            resize_crop_layer(&l, w, h);
         }else if(l.type == MAXPOOL){
             resize_maxpool_layer(&l, w, h);
+        }else if(l.type == REGION){
+            resize_region_layer(&l, w, h);
+		}else if (l.type == YOLO) {
+			resize_yolo_layer(&l, w, h);
+        }else if(l.type == ROUTE){
+            resize_route_layer(&l, net);
+		}else if (l.type == SHORTCUT) {
+			resize_shortcut_layer(&l, w, h);
+		}else if (l.type == UPSAMPLE) {
+			resize_upsample_layer(&l, w, h);
+        }else if(l.type == REORG){
+            resize_reorg_layer(&l, w, h);
         }else if(l.type == AVGPOOL){
             resize_avgpool_layer(&l, w, h);
-            break;
         }else if(l.type == NORMALIZATION){
             resize_normalization_layer(&l, w, h);
         }else if(l.type == COST){
             resize_cost_layer(&l, inputs);
         }else{
+			fprintf(stderr, "Resizing type %d \n", (int)l.type);
             error("Cannot resize this type of layer");
         }
+        if(l.workspace_size > workspace_size) workspace_size = l.workspace_size;
         inputs = l.outputs;
         net->layers[i] = l;
         w = l.out_w;
         h = l.out_h;
+        if(l.type == AVGPOOL) break;
     }
+#ifdef GPU
+    if(gpu_index >= 0){
+		printf(" try to allocate workspace = %zu * sizeof(float), ", (workspace_size - 1) / sizeof(float) + 1);
+        net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
+		printf(" CUDA allocate done! \n");
+    }else {
+        free(net->workspace);
+        net->workspace = calloc(1, workspace_size);
+    }
+#else
+    free(net->workspace);
+    net->workspace = calloc(1, workspace_size);
+#endif
     //fprintf(stderr, " Done!\n");
     return 0;
 }
@@ -443,6 +527,8 @@
 #endif
 
     network_state state;
+    state.net = net;
+    state.index = 0;
     state.input = input;
     state.truth = 0;
     state.train = 0;
@@ -452,6 +538,112 @@
     return out;
 }
 
+int num_detections(network *net, float thresh)
+{
+	int i;
+	int s = 0;
+	for (i = 0; i < net->n; ++i) {
+		layer l = net->layers[i];
+		if (l.type == YOLO) {
+			s += yolo_num_detections(l, thresh);
+		}
+		if (l.type == DETECTION || l.type == REGION) {
+			s += l.w*l.h*l.n;
+		}
+	}
+	return s;
+}
+
+detection *make_network_boxes(network *net, float thresh, int *num)
+{
+	layer l = net->layers[net->n - 1];
+	int i;
+	int nboxes = num_detections(net, thresh);
+	if (num) *num = nboxes;
+	detection *dets = calloc(nboxes, sizeof(detection));
+	for (i = 0; i < nboxes; ++i) {
+		dets[i].prob = calloc(l.classes, sizeof(float));
+		if (l.coords > 4) {
+			dets[i].mask = calloc(l.coords - 4, sizeof(float));
+		}
+	}
+	return dets;
+}
+
+
+void custom_get_region_detections(layer l, int w, int h, int net_w, int net_h, float thresh, int *map, float hier, int relative, detection *dets, int letter)
+{
+	box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
+	float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
+	int i, j;
+	for (j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *));
+	get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, map);
+	for (j = 0; j < l.w*l.h*l.n; ++j) {
+		dets[j].classes = l.classes;
+		dets[j].bbox = boxes[j];
+		dets[j].objectness = 1;
+		for (i = 0; i < l.classes; ++i) {
+			dets[j].prob[i] = probs[j][i];
+		}
+	}
+
+	free(boxes);
+	free_ptrs((void **)probs, l.w*l.h*l.n);
+
+	//correct_region_boxes(dets, l.w*l.h*l.n, w, h, net_w, net_h, relative);
+	correct_yolo_boxes(dets, l.w*l.h*l.n, w, h, net_w, net_h, relative, letter);
+}
+
+void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets, int letter)
+{
+	int j;
+	for (j = 0; j < net->n; ++j) {
+		layer l = net->layers[j];
+		if (l.type == YOLO) {
+			int count = get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets, letter);
+			dets += count;
+		}
+		if (l.type == REGION) {
+			custom_get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets, letter);
+			//get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets);
+			dets += l.w*l.h*l.n;
+		}
+		if (l.type == DETECTION) {
+			get_detection_detections(l, w, h, thresh, dets);
+			dets += l.w*l.h*l.n;
+		}
+	}
+}
+
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num, int letter)
+{
+	detection *dets = make_network_boxes(net, thresh, num);
+	fill_network_boxes(net, w, h, thresh, hier, map, relative, dets, letter);
+	return dets;
+}
+
+void free_detections(detection *dets, int n)
+{
+	int i;
+	for (i = 0; i < n; ++i) {
+		free(dets[i].prob);
+		if (dets[i].mask) free(dets[i].mask);
+	}
+	free(dets);
+}
+
+float *network_predict_image(network *net, image im)
+{
+	image imr = letterbox_image(im, net->w, net->h);
+	set_batch_network(net, 1);
+	float *p = network_predict(*net, imr.data);
+	free_image(imr);
+	return p;
+}
+
+int network_width(network *net) { return net->w; }
+int network_height(network *net) { return net->h; }
+
 matrix network_predict_data_multi(network net, data test, int n)
 {
     int i,j,b,m;
@@ -560,7 +752,6 @@
     return acc;
 }
 
-
 float network_accuracy_multi(network net, data d, int n)
 {
     matrix guess = network_predict_data_multi(net, d, n);
@@ -571,15 +762,65 @@
 
 void free_network(network net)
 {
-    int i;
-    for(i = 0; i < net.n; ++i){
-        free_layer(net.layers[i]);
-    }
-    free(net.layers);
-    #ifdef GPU
-    if(*net.input_gpu) cuda_free(*net.input_gpu);
-    if(*net.truth_gpu) cuda_free(*net.truth_gpu);
-    if(net.input_gpu) free(net.input_gpu);
-    if(net.truth_gpu) free(net.truth_gpu);
-    #endif
+	int i;
+	for (i = 0; i < net.n; ++i) {
+		free_layer(net.layers[i]);
+	}
+	free(net.layers);
+#ifdef GPU
+	if (gpu_index >= 0) cuda_free(net.workspace);
+	else free(net.workspace);
+	if (*net.input_gpu) cuda_free(*net.input_gpu);
+	if (*net.truth_gpu) cuda_free(*net.truth_gpu);
+	if (net.input_gpu) free(net.input_gpu);
+	if (net.truth_gpu) free(net.truth_gpu);
+
+	if (*net.input16_gpu) cuda_free(*net.input16_gpu);
+	if (*net.output16_gpu) cuda_free(*net.output16_gpu);
+	if (net.input16_gpu) free(net.input16_gpu);
+	if (net.output16_gpu) free(net.output16_gpu);
+	if (net.max_input16_size) free(net.max_input16_size);
+	if (net.max_output16_size) free(net.max_output16_size);
+#else
+	free(net.workspace);
+#endif
+}
+
+
+void fuse_conv_batchnorm(network net)
+{
+	int j;
+	for (j = 0; j < net.n; ++j) {
+		layer *l = &net.layers[j];
+
+		if (l->type == CONVOLUTIONAL) {
+			//printf(" Merges Convolutional-%d and batch_norm \n", j);
+
+			if (l->batch_normalize) {
+				int f;
+				for (f = 0; f < l->n; ++f)
+				{
+					l->biases[f] = l->biases[f] - (double)l->scales[f] * l->rolling_mean[f] / (sqrt((double)l->rolling_variance[f]) + .000001f);
+
+					const size_t filter_size = l->size*l->size*l->c;
+					int i;
+					for (i = 0; i < filter_size; ++i) {
+						int w_index = f*filter_size + i;
+
+						l->weights[w_index] = (double)l->weights[w_index] * l->scales[f] / (sqrt((double)l->rolling_variance[f]) + .000001f);
+					}
+				}
+
+				l->batch_normalize = 0;
+#ifdef GPU
+				if (gpu_index >= 0) {
+					push_convolutional_layer(*l);
+				}
+#endif
+			}
+		}
+		else {
+			//printf(" Fusion skip layer type: %d \n", l->type);
+		}
+	}
 }

--
Gitblit v1.10.0