From 1b5afb45838e603fa6780762eb8cc59246dc2d81 Mon Sep 17 00:00:00 2001
From: IlyaOvodov <b@ovdv.ru>
Date: Tue, 08 May 2018 11:09:35 +0000
Subject: [PATCH] Output improvements for detector results

Previously, detector results were printed in random order, which made them
hard to interpret. Now:

1. Text output includes each rect's coordinates (left, right, top, bottom,
   in pixels) along with its label and score.
2. Text output is sorted by the rects' left coordinates, to simplify finding
   the corresponding rects in the image.
3. If several class probabilities exceed the threshold for a detection, the
   most probable one is written first and the coordinates are not repeated
   for the others.
4. Rects are drawn on the image in order of their best class probability, so
   the most probable rects are always on top and are not overlaid by less
   probable ones.
5. The most probable label for a rect is always written first.

Also:

6. The message about low GPU memory now includes the required amount.

---
 src/blas.c |  213 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 204 insertions(+), 9 deletions(-)

diff --git a/src/blas.c b/src/blas.c
index 556603c..4ff0b83 100644
--- a/src/blas.c
+++ b/src/blas.c
@@ -1,15 +1,92 @@
 #include "blas.h"
-#include "math.h"
 
-void shortcut_cpu(float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2)
+#include <math.h>
+#include <assert.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
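+
+// Reorg layer rearrangement: copies between x, of shape (out_c, out_h, out_w), and out,
+// of shape (out_c/(stride*stride), out_h*stride, out_w*stride), per batch item;
+// the forward flag selects the copy direction.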
+void reorg_cpu(float *x, int out_w, int out_h, int out_c, int batch, int stride, int forward, float *out)
 {
+    int b,i,j,k;
+    int in_c = out_c/(stride*stride);
+
+    //printf("\n out_c = %d, out_w = %d, out_h = %d, stride = %d, forward = %d \n", out_c, out_w, out_h, stride, forward);
+    //printf("  in_c = %d,  in_w = %d,  in_h = %d \n", in_c, out_w*stride, out_h*stride);
+
+    for(b = 0; b < batch; ++b){
+        for(k = 0; k < out_c; ++k){
+            for(j = 0; j < out_h; ++j){
+                for(i = 0; i < out_w; ++i){
+                    int in_index  = i + out_w*(j + out_h*(k + out_c*b));
+                    int c2 = k % in_c;
+                    int offset = k / in_c;
+                    int w2 = i*stride + offset % stride;
+                    int h2 = j*stride + offset / stride;
+                    int out_index = w2 + out_w*stride*(h2 + out_h*stride*(c2 + in_c*b));
+                    if(forward) out[out_index] = x[in_index];
+                    else out[in_index] = x[out_index];  // default path: the reorg layer calls reorg_cpu with forward = 0
+                }
+            }
+        }
+    }
+}
+
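+// Transposes each batch item in place between channel-major ([layers][size]) and
+// interleaved ([size][layers]) layouts, using a temporary buffer.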
+void flatten(float *x, int size, int layers, int batch, int forward)
+{
+    float *swap = calloc(size*layers*batch, sizeof(float));
+    int i,c,b;
+    for(b = 0; b < batch; ++b){
+        for(c = 0; c < layers; ++c){
+            for(i = 0; i < size; ++i){
+                int i1 = b*layers*size + c*size + i;
+                int i2 = b*layers*size + i*layers + c;
+                if (forward) swap[i2] = x[i1];
+                else swap[i1] = x[i2];
+            }
+        }
+    }
+    memcpy(x, swap, size*layers*batch*sizeof(float));
+    free(swap);
+}
+
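+// Element-wise weighted sum: c[i] = s[i]*a[i] + (1 - s[i])*b[i]; b may be NULL (treated as zero).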
+void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        c[i] = s[i]*a[i] + (1-s[i])*(b ? b[i] : 0);
+    }
+}
+
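+// Backward pass of weighted_sum_cpu: accumulates the incoming gradient dc into da, db and ds.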
+void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        if(da) da[i] += dc[i] * s[i];
+        if(db) db[i] += dc[i] * (1-s[i]);
+        ds[i] += dc[i] * (a[i] - b[i]);
+    }
+}
+
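+// Shortcut (residual) connection: out += add over the overlapping region; stride and sample
+// map corresponding positions when the two tensors differ in resolution.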
+void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+{
+    int stride = w1/w2;
+    int sample = w2/w1;
+    assert(stride == h1/h2);
+    assert(sample == h2/h1);
+    if(stride < 1) stride = 1;
+    if(sample < 1) sample = 1;
+    int minw = (w1 < w2) ? w1 : w2;
+    int minh = (h1 < h2) ? h1 : h2;
+    int minc = (c1 < c2) ? c1 : c2;
+
     int i,j,k,b;
     for(b = 0; b < batch; ++b){
-        for(k = 0; k < c && k < c2; ++k){
-            for(j = 0; j < h/sample; ++j){
-                for(i = 0; i < w/sample; ++i){
-                    int out_index = i*sample + w*(j*sample + h*(k + c*b));
-                    int add_index = b*w*stride/sample*h*stride/sample*c2 + i*stride + w*stride/sample*(j*stride + h*stride/sample*k);
+        for(k = 0; k < minc; ++k){
+            for(j = 0; j < minh; ++j){
+                for(i = 0; i < minw; ++i){
+                    int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
+                    int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
                     out[out_index] += add[add_index];
                 }
             }
@@ -35,7 +112,7 @@
 
 void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
 {
-    float scale = 1./(batch * spatial);
+    float scale = 1./(batch * spatial - 1);
     int i,j,k;
     for(i = 0; i < filters; ++i){
         variance[i] = 0;
@@ -56,7 +133,7 @@
         for(f = 0; f < filters; ++f){
             for(i = 0; i < spatial; ++i){
                 int index = b*filters*spatial + f*spatial + i;
-                x[index] = (x[index] - mean[f])/(sqrt(variance[f]));
+                x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
             }
         }
     }
@@ -98,12 +175,85 @@
     for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
 }
 
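+// Splits the per-batch concatenated buffer OUT back into X (NX floats) and Y (NY floats),
+// accumulating into them (the backward counterpart of inter_cpu below).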
+void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+{
+    int i, j;
+    int index = 0;
+    for(j = 0; j < B; ++j) {
+        for(i = 0; i < NX; ++i){
+            if(X) X[j*NX + i] += OUT[index];
+            ++index;
+        }
+        for(i = 0; i < NY; ++i){
+            if(Y) Y[j*NY + i] += OUT[index];
+            ++index;
+        }
+    }
+}
+
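+// Concatenates X (NX floats) and Y (NY floats) into OUT, batch item by batch item.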
+void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+{
+    int i, j;
+    int index = 0;
+    for(j = 0; j < B; ++j) {
+        for(i = 0; i < NX; ++i){
+            OUT[index++] = X[j*NX + i];
+        }
+        for(i = 0; i < NY; ++i){
+            OUT[index++] = Y[j*NY + i];
+        }
+    }
+}
+
 void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];
 }
 
+void mult_add_into_cpu(int N, float *X, float *Y, float *Z)
+{
+    int i;
+    for(i = 0; i < N; ++i) Z[i] += X[i]*Y[i];
+}
+
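+// Smooth L1 loss: quadratic where |truth - pred| < 1, linear elsewhere; delta is the backprop term.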
+void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        float diff = truth[i] - pred[i];
+        float abs_val = fabs(diff);
+        if(abs_val < 1) {
+            error[i] = diff * diff;
+            delta[i] = diff;
+        }
+        else {
+            error[i] = 2*abs_val - 1;
+            delta[i] = (diff > 0) ? 1 : -1;  // sign(truth - pred), consistent with the quadratic branch and with l1_cpu
+        }
+    }
+}
+
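+// L1 loss: error = |truth - pred|, delta = sign(truth - pred).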
+void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        float diff = truth[i] - pred[i];
+        error[i] = fabs(diff);
+        delta[i] = diff > 0 ? 1 : -1;
+    }
+}
+
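+// L2 loss: error = (truth - pred)^2, delta = truth - pred.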
+void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        float diff = truth[i] - pred[i];
+        error[i] = diff * diff;
+        delta[i] = diff;
+    }
+}
+
 float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
 {
     int i;
@@ -112,3 +262,48 @@
     return dot;
 }
 
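+// Softmax with temperature over n values spaced stride apart; subtracts the max
+// before exponentiating for numerical stability.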
+void softmax(float *input, int n, float temp, float *output, int stride)
+{
+    int i;
+    float sum = 0;
+    float largest = -FLT_MAX;
+    for(i = 0; i < n; ++i){
+        if(input[i*stride] > largest) largest = input[i*stride];
+    }
+    for(i = 0; i < n; ++i){
+        float e = exp(input[i*stride]/temp - largest/temp);
+        sum += e;
+        output[i*stride] = e;
+    }
+    for(i = 0; i < n; ++i){
+        output[i*stride] /= sum;
+    }
+}
+
+
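+// Applies softmax independently to each group within each batch item.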
+void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
+{
+    int g, b;
+    for(b = 0; b < batch; ++b){
+        for(g = 0; g < groups; ++g){
+            softmax(input + b*batch_offset + g*group_offset, n, temp, output + b*batch_offset + g*group_offset, stride);
+        }
+    }
+}
+
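+// Nearest-neighbor upsampling by factor stride, scaled by scale: forward writes out from in,
+// backward accumulates out back into in.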
+void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+    int i, j, k, b;
+    for (b = 0; b < batch; ++b) {
+        for (k = 0; k < c; ++k) {
+            for (j = 0; j < h*stride; ++j) {
+                for (i = 0; i < w*stride; ++i) {
+                    int in_index = b*w*h*c + k*w*h + (j / stride)*w + i / stride;
+                    int out_index = b*w*h*c*stride*stride + k*w*h*stride*stride + j*w*stride + i;
+                    if (forward) out[out_index] = scale*in[in_index];
+                    else in[in_index] += scale*out[out_index];
+                }
+            }
+        }
+    }
+}
\ No newline at end of file

--
Gitblit v1.10.0