From 989ab8c38a02fa7ea9c25108151736c62e81c972 Mon Sep 17 00:00:00 2001
From: Joseph Redmon <pjreddie@gmail.com>
Date: Fri, 24 Apr 2015 17:27:50 +0000
Subject: [PATCH] IOU loss function

---
 src/network.c         |    3 
 src/detection.c       |   60 +++++-----
 src/utils.h           |    4 
 src/imagenet.c        |    2 
 src/data.c            |   20 +-
 src/detection_layer.h |    2 
 src/darknet.c         |    1 
 src/detection_layer.c |  242 ++++++++++++++++++++++++++++++++++++---
 8 files changed, 272 insertions(+), 62 deletions(-)

diff --git a/src/darknet.c b/src/darknet.c
index 46a8c82..411efdf 100644
--- a/src/darknet.c
+++ b/src/darknet.c
@@ -93,6 +93,7 @@
 
 int main(int argc, char **argv)
 {
+    //test_box();
     //test_convolutional_layer();
     if(argc < 2){
         fprintf(stderr, "usage: %s <function>\n", argv[0]);
diff --git a/src/data.c b/src/data.c
index 2b74386..f1f5b80 100644
--- a/src/data.c
+++ b/src/data.c
@@ -65,22 +65,22 @@
     return X;
 }
 
-typedef struct box{
+typedef struct{ // one parsed label record; presumably renamed so it no longer clashes with the `box` typedef this patch adds to utils.h
     int id;
     float x,y,w,h;
     float left, right, top, bottom;
-} box;
+} box_label;
 
-box *read_boxes(char *filename, int *n)
+box_label *read_boxes(char *filename, int *n)
 {
-    box *boxes = calloc(1, sizeof(box));
+    box_label *boxes = calloc(1, sizeof(box_label));
     FILE *file = fopen(filename, "r");
     if(!file) file_error(filename);
     float x, y, h, w;
     int id;
     int count = 0;
     while(fscanf(file, "%d %f %f %f %f", &id, &x, &y, &w, &h) == 5){
-        boxes = realloc(boxes, (count+1)*sizeof(box));
+        boxes = realloc(boxes, (count+1)*sizeof(box_label));
         boxes[count].id = id;
         boxes[count].x = x;
         boxes[count].y = y;
@@ -97,11 +97,11 @@
     return boxes;
 }
 
-void randomize_boxes(box *b, int n)
+void randomize_boxes(box_label *b, int n)
 {
     int i;
     for(i = 0; i < n; ++i){
-        box swap = b[i];
+        box_label swap = b[i];
         int index = rand_r(&data_seed)%n;
         b[i] = b[index];
         b[index] = swap;
@@ -114,7 +114,7 @@
     labelpath = find_replace(labelpath, ".jpg", ".txt");
     labelpath = find_replace(labelpath, ".JPEG", ".txt");
     int count = 0;
-    box *boxes = read_boxes(labelpath, &count);
+    box_label *boxes = read_boxes(labelpath, &count);
     randomize_boxes(boxes, count);
     float x,y,w,h;
     float left, top, right, bot;
@@ -174,10 +174,10 @@
         if(background) truth[index++] = 0;
         truth[index+id] = 1;
         index += classes;
-        truth[index++] = y;
         truth[index++] = x;
-        truth[index++] = h;
+        truth[index++] = y;
         truth[index++] = w;
+        truth[index++] = h;
     }
     free(boxes);
 }
diff --git a/src/detection.c b/src/detection.c
index c61c799..f61da67 100644
--- a/src/detection.c
+++ b/src/detection.c
@@ -81,9 +81,9 @@
     if (imgnet){
         plist = get_paths("/home/pjreddie/data/imagenet/det.train.list");
     }else{
-        //plist = get_paths("/home/pjreddie/data/voc/trainall.txt");
+        plist = get_paths("/home/pjreddie/data/voc/trainall.txt");
         //plist = get_paths("/home/pjreddie/data/coco/trainval.txt");
-        plist = get_paths("/home/pjreddie/data/voc/all2007-2012.txt");
+        //plist = get_paths("/home/pjreddie/data/voc/all2007-2012.txt");
     }
     paths = (char **)list_to_array(plist);
     pthread_t load_thread = load_data_detection_thread(imgs, paths, plist->size, classes, net.w, net.h, side, side, background, &buffer);
@@ -95,12 +95,12 @@
         train = buffer;
         load_thread = load_data_detection_thread(imgs, paths, plist->size, classes, net.w, net.h, side, side, background, &buffer);
 
-/*
- image im = float_to_image(net.w, net.h, 3, train.X.vals[114]);
- image copy = copy_image(im);
- draw_detection(copy, train.y.vals[114], 7);
- free_image(copy);
- */
+        /*
+           image im = float_to_image(net.w, net.h, 3, train.X.vals[114]);
+           image copy = copy_image(im);
+           draw_detection(copy, train.y.vals[114], 7);
+           free_image(copy);
+         */
 
         printf("Loaded: %lf seconds\n", sec(clock()-time));
         time=clock();
@@ -120,30 +120,30 @@
 
 void predict_detections(network net, data d, float threshold, int offset, int classes, int nuisance, int background, int num_boxes, int per_box)
 {
-        matrix pred = network_predict_data(net, d);
-        int j, k, class;
-        for(j = 0; j < pred.rows; ++j){
-            for(k = 0; k < pred.cols; k += per_box){
-                float scale = 1.;
-                int index = k/per_box;
-                int row = index / num_boxes;
-                int col = index % num_boxes;
-                if (nuisance) scale = 1.-pred.vals[j][k];
-                for (class = 0; class < classes; ++class){
-                    int ci = k+classes+background+nuisance;
-                    float y = (pred.vals[j][ci + 0] + row)/num_boxes;
-                    float x = (pred.vals[j][ci + 1] + col)/num_boxes;
-                    float h = pred.vals[j][ci + 2]; //* distance_from_edge(row, num_boxes);
-                    h = h*h;
-                    float w = pred.vals[j][ci + 3]; //* distance_from_edge(col, num_boxes);
-                    w = w*w;
-                    float prob = scale*pred.vals[j][k+class+background+nuisance];
-                    if(prob < threshold) continue;
-                    printf("%d %d %f %f %f %f %f\n", offset +  j, class, prob, y, x, h, w);
-                }
+    matrix pred = network_predict_data(net, d); // one row per image, per_box values per grid cell
+    int j, k, class;
+    for(j = 0; j < pred.rows; ++j){
+        for(k = 0; k < pred.cols; k += per_box){
+            float scale = 1.;
+            int index = k/per_box;
+            int row = index / num_boxes;
+            int col = index % num_boxes;
+            if (nuisance) scale = 1.-pred.vals[j][k];
+            for (class = 0; class < classes; ++class){
+                int ci = k+classes+background+nuisance; // coords follow the class scores, stored as x, y, w, h
+                float x = (pred.vals[j][ci + 0] + col)/num_boxes; // slot 0 is x now that the truth vector is x, y, w, h
+                float y = (pred.vals[j][ci + 1] + row)/num_boxes; // slot 1 is y
+                float w = pred.vals[j][ci + 2]; //* distance_from_edge(col, num_boxes);
+                w = w*w;
+                float h = pred.vals[j][ci + 3]; //* distance_from_edge(row, num_boxes);
+                h = h*h;
+                float prob = scale*pred.vals[j][k+class+background+nuisance];
+                if(prob < threshold) continue;
+                printf("%d %d %f %f %f %f %f\n", offset +  j, class, prob, y, x, h, w); // column order kept as y x h w for existing consumers
             }
         }
-        free_matrix(pred);
+    }
+    free_matrix(pred);
 }
 
 void validate_detection(char *cfgfile, char *weightfile)
diff --git a/src/detection_layer.c b/src/detection_layer.c
index 73b2862..7eaabb4 100644
--- a/src/detection_layer.c
+++ b/src/detection_layer.c
@@ -3,7 +3,9 @@
 #include "softmax_layer.h"
 #include "blas.h"
 #include "cuda.h"
+#include "utils.h"
 #include <stdio.h>
+#include <string.h>
 #include <stdlib.h>
 
 int get_detection_layer_locations(detection_layer layer)
@@ -26,6 +28,8 @@
     layer->coords = coords;
     layer->rescore = rescore;
     layer->nuisance = nuisance;
+    layer->cost = calloc(1, sizeof(float));
+    layer->does_cost=1;
     layer->background = background;
     int outputs = get_detection_layer_output_size(*layer);
     layer->output = calloc(batch*outputs, sizeof(float));
@@ -63,6 +67,169 @@
     }
 }
 
+typedef struct{ // partial derivatives of a scalar box quantity with respect to a box's x, y, w and h
+    float dx, dy, dw, dh;
+} dbox;
+
+dbox derivative(box a, box b) // derivative of the per-axis overlap of a with b: x-axis in dx/dw, y-axis in dy/dh, all with respect to a
+{
+    dbox d;
+    d.dx = 0;
+    d.dw = 0;
+    float l1 = a.x - a.w/2; // left edges; x is used as the box center
+    float l2 = b.x - b.w/2;
+    if (l1 > l2){ // a's left edge bounds the overlap: shifting a right shrinks it, widening a grows it
+        d.dx -= 1;
+        d.dw += .5;
+    }
+    float r1 = a.x + a.w/2; // right edges
+    float r2 = b.x + b.w/2;
+    if(r1 < r2){ // a's right edge bounds the overlap: shifting a right or widening a grows it
+        d.dx += 1;
+        d.dw += .5;
+    }
+    if (l1 > r2) { // a lies entirely to the right of b: overrides the above, point left and ignore width
+        d.dx = -1;
+        d.dw = 0;
+    }
+    if (r1 < l2){ // a lies entirely to the left of b: point right and ignore width
+        d.dx = 1;
+        d.dw = 0;
+    }
+
+    d.dy = 0; // same scheme on the vertical axis, using y as the center and h as the extent
+    d.dh = 0;
+    float t1 = a.y - a.h/2;
+    float t2 = b.y - b.h/2;
+    if (t1 > t2){ // a's y - h/2 edge bounds the overlap
+        d.dy -= 1;
+        d.dh += .5;
+    }
+    float b1 = a.y + a.h/2;
+    float b2 = b.y + b.h/2;
+    if(b1 < b2){ // a's y + h/2 edge bounds the overlap
+        d.dy += 1;
+        d.dh += .5;
+    }
+    if (t1 > b2) { // a lies entirely past b on the y axis: overrides the above
+        d.dy = -1;
+        d.dh = 0;
+    }
+    if (b1 < t2){ // a lies entirely before b on the y axis
+        d.dy = 1;
+        d.dh = 0;
+    }
+    return d;
+}
+
+// Signed overlap length of two 1-D intervals, each given as (center, width).
+float overlap(float x1, float w1, float x2, float w2)
+{
+    float lo1 = x1 - w1/2, hi1 = x1 + w1/2;
+    float lo2 = x2 - w2/2, hi2 = x2 + w2/2;
+    float lo = (lo1 > lo2) ? lo1 : lo2; // start of the shared stretch
+    float hi = (hi1 < hi2) ? hi1 : hi2; // end of the shared stretch
+    // Negative when a gap separates the two intervals.
+    return hi - lo;
+}
+
+float box_intersection(box a, box b)
+{
+    // Overlap extent along each axis; a negative extent means the boxes are apart.
+    float span_x = overlap(a.x, a.w, b.x, b.w);
+    float span_y = overlap(a.y, a.h, b.y, b.h);
+    if(span_x < 0 || span_y < 0) return 0;
+    return span_x*span_y;
+}
+
+float box_union(box a, box b)
+{
+    // Inclusion-exclusion: both areas minus the part counted twice.
+    float shared = box_intersection(a, b);
+    return a.w*a.h + b.w*b.h - shared;
+}
+
+float box_iou(box a, box b) // intersection area divided by union area; the division is not guarded against a zero union
+{
+    return box_intersection(a, b)/box_union(a, b);
+}
+
+dbox dintersect(box a, box b) // derivative of the a/b intersection area with respect to a's x, y, w, h
+{
+    float w = overlap(a.x, a.w, b.x, b.w); // signed overlap extents; negative when the boxes are apart on that axis
+    float h = overlap(a.y, a.h, b.y, b.h);
+    dbox dover = derivative(a, b);
+    dbox di;
+
+    di.dw = dover.dw*h; // area is w*h, so x-axis terms are scaled by h ...
+    di.dx = dover.dx*h;
+    di.dh = dover.dh*w; // ... and y-axis terms by w
+    di.dy = dover.dy*w;
+    if(h < 0 || w < 0){ // no overlap: dx/dy fall back to the unscaled overlap derivatives; dw/dh keep the scaled values above
+        di.dx = dover.dx;
+        di.dy = dover.dy;
+    }
+    return di;
+}
+
+// Derivative of the union area (area(a) + area(b) - intersection) with respect to a's x, y, w, h.
+dbox dunion(box a, box b)
+{
+    dbox du = {0,0,0,0};
+    float w = overlap(a.x, a.w, b.x, b.w);
+    float h = overlap(a.y, a.h, b.y, b.h);
+    if(!(w > 0 && h > 0)) return du; // no overlap: the gradient is left at zero, as before
+    dbox di = dintersect(a, b);
+    du.dw = a.h - di.dw; // d(a.w*a.h)/d(a.w) is a's own height, not the overlap height
+    du.dh = a.w - di.dh; // previously subtracted di.dw; the height term needs the height derivative
+    du.dx = -di.dx; // a's own area does not depend on its position
+    du.dy = -di.dy;
+    return du;
+}
+
+// Returns 2*(1 - IOU) * d(IOU)/d(a): the descent direction for the (1 - IOU)^2 cost of box a.
+dbox diou(box a, box b)
+{
+    float i = box_intersection(a,b), u = box_union(a,b);
+    dbox di = dintersect(a,b), du = dunion(a,b);
+    dbox dd = {0,0,0,0};
+    if(i < 0) { // plain coordinate difference; only taken for a negative intersection
+        dd.dx = b.x - a.x; dd.dy = b.y - a.y;
+        dd.dw = b.w - a.w; dd.dh = b.h - a.h;
+        return dd;
+    }
+    // Quotient rule: d(i/u) = (di*u - du*i)/u^2, scaled by the shared factor 2*(1 - i/u).
+    double scale = 2*pow((1-(i/u)),1);
+    float uu = u*u;
+    dd.dx = scale*(di.dx*u - du.dx*i)/uu;
+    dd.dy = scale*(di.dy*u - du.dy*i)/uu;
+    dd.dw = scale*(di.dw*u - du.dw*i)/uu;
+    dd.dh = scale*(di.dh*u - du.dh*i)/uu;
+    return dd;
+}
+
+// Manual check: steps box a by 0.1*diou(a, b) for up to 300 rounds, logging the box and its IOU terms.
+void test_box()
+{
+    box a = {1, 1, 1, 1};
+    box b = {0, 0, .5, .2};
+    int step;
+    for(step = 0; step < 300; ++step){
+        dbox d = diou(a, b);
+        printf("%f %f %f %f\n", a.x, a.y, a.w, a.h);
+        a.x += .1*d.dx;
+        a.w += .1*d.dw;
+        a.y += .1*d.dy;
+        a.h += .1*d.dh;
+        printf("inter: %f\n", box_intersection(a, b));
+        printf("union: %f\n", box_union(a, b));
+        printf("IOU: %f\n", box_iou(a, b));
+        if(d.dx!=0 || d.dw!=0 || d.dy!=0 || d.dh!=0) continue; // still moving
+        printf("break!!!\n");
+        break;
+    }
+}
+
 void forward_detection_layer(const detection_layer layer, network_state state)
 {
     int in_i = 0;
@@ -92,31 +259,63 @@
             layer.output[out_i++] = mask*state.input[in_i++];
         }
     }
-    /*
-    int count = 0;
-    for(i = 0; i < layer.batch*locations; ++i){
-        for(j = 0; j < layer.classes+layer.background; ++j){
-            printf("%f, ", layer.output[count++]);
-        }
-        printf("\n");
-        for(j = 0; j < layer.coords; ++j){
-            printf("%f, ", layer.output[count++]);
-        }
-        printf("\n");
-    }
-    */
-    /*
-    if(layer.background || 1){
+    if(layer.does_cost){
+        *(layer.cost) = 0;
+        int size = get_detection_layer_output_size(layer) * layer.batch;
+        memset(layer.delta, 0, size * sizeof(float));
         for(i = 0; i < layer.batch*locations; ++i){
-            int index = i*(layer.classes+layer.coords+layer.background);
-            for(j= 0; j < layer.classes; ++j){
-                if(state.truth[index+j+layer.background]){
-                    //dark_zone(layer, j, index, state);
-                }
+            int classes = layer.nuisance+layer.classes;
+            int offset = i*(classes+layer.coords);
+            for(j = offset; j < offset+classes; ++j){
+                *(layer.cost) += pow(state.truth[j] - layer.output[j], 2);
+                layer.delta[j] =  state.truth[j] - layer.output[j];
             }
+            box truth;
+            truth.x = state.truth[j+0];
+            truth.y = state.truth[j+1];
+            truth.w = state.truth[j+2];
+            truth.h = state.truth[j+3];
+            box out;
+            out.x = layer.output[j+0];
+            out.y = layer.output[j+1];
+            out.w = layer.output[j+2];
+            out.h = layer.output[j+3];
+            if(!(truth.w*truth.h)) continue;
+            float iou = box_iou(truth, out);
+            //printf("iou: %f\n", iou);
+            *(layer.cost) += pow((1-iou), 2);
+            dbox d = diou(out, truth);
+            layer.delta[j+0] = d.dx;
+            layer.delta[j+1] = d.dy;
+            layer.delta[j+2] = d.dw;
+            layer.delta[j+3] = d.dh;
         }
     }
-    */
+    /*
+       int count = 0;
+       for(i = 0; i < layer.batch*locations; ++i){
+       for(j = 0; j < layer.classes+layer.background; ++j){
+       printf("%f, ", layer.output[count++]);
+       }
+       printf("\n");
+       for(j = 0; j < layer.coords; ++j){
+       printf("%f, ", layer.output[count++]);
+       }
+       printf("\n");
+       }
+     */
+    /*
+       if(layer.background || 1){
+       for(i = 0; i < layer.batch*locations; ++i){
+       int index = i*(layer.classes+layer.coords+layer.background);
+       for(j= 0; j < layer.classes; ++j){
+       if(state.truth[index+j+layer.background]){
+//dark_zone(layer, j, index, state);
+}
+}
+}
+}
+     */
 }
 
 void backward_detection_layer(const detection_layer layer, network_state state)
@@ -164,6 +363,7 @@
     cpu_state.input = in_cpu;
     forward_detection_layer(layer, cpu_state);
     cuda_push_array(layer.output_gpu, layer.output, layer.batch*outputs);
+    cuda_push_array(layer.delta_gpu, layer.delta, layer.batch*outputs);
     free(cpu_state.input);
     if(cpu_state.truth) free(cpu_state.truth);
 }
diff --git a/src/detection_layer.h b/src/detection_layer.h
index a56cb25..0aa5f66 100644
--- a/src/detection_layer.h
+++ b/src/detection_layer.h
@@ -11,6 +11,8 @@
     int background;
     int rescore;
     int nuisance;
+    int does_cost;
+    float *cost;
     float *output;
     float *delta;
     #ifdef GPU
diff --git a/src/imagenet.c b/src/imagenet.c
index 906dbd4..3f88b36 100644
--- a/src/imagenet.c
+++ b/src/imagenet.c
@@ -47,7 +47,7 @@
         printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
         free_data(train);
         //if(i%100 == 0 && net.learning_rate > .00001) net.learning_rate *= .97;
-        if(i%100==0){
+        if(i%1000==0){
             char buff[256];
             sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
             save_weights(net, buff);
diff --git a/src/network.c b/src/network.c
index 5571076..3247a31 100644
--- a/src/network.c
+++ b/src/network.c
@@ -186,6 +186,9 @@
     if(net.types[net.n-1] == COST){
         return ((cost_layer *)net.layers[net.n-1])->output[0];
     }
+    if(net.types[net.n-1] == DETECTION){
+        return ((detection_layer *)net.layers[net.n-1])->cost[0];
+    }
     return 0;
 }
 
diff --git a/src/utils.h b/src/utils.h
index 578abc3..0db16de 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -36,5 +36,9 @@
 float mag_array(float *a, int n);
 float **one_hot_encode(float *a, int n, int k);
 float sec(clock_t clocks);
+
+typedef struct{ // axis-aligned box; the IOU helpers in detection_layer.c read x, y as the center and w, h as the full extents
+    float x, y, w, h;
+} box;
 #endif
 

--
Gitblit v1.10.0