From d00f0a1ccd2a9b1c332bbf7754f291dd61dee14f Mon Sep 17 00:00:00 2001
From: Joseph Redmon <pjreddie@gmail.com>
Date: Tue, 21 Jul 2015 23:09:33 +0000
Subject: [PATCH] Changes to make routing work better
---
src/network.c | 3 +
src/softmax_layer_kernels.cu | 2
src/col2im_kernels.cu | 61 ------------------------------
src/normalization_layer.c | 1
src/cost_layer.c | 4 +-
Makefile | 4 +-
src/network_kernels.cu | 4 ++
src/connected_layer.c | 4 +-
src/softmax_layer.c | 2
src/deconvolutional_layer.c | 2 -
src/maxpool_layer_kernels.cu | 2
src/detection_layer.c | 13 +++---
src/route_layer.c | 4 +-
src/convolutional_layer.c | 2 -
src/convolutional_kernels.cu | 2 -
src/avgpool_layer.c | 2
src/avgpool_layer_kernels.cu | 2
src/maxpool_layer.c | 1
18 files changed, 29 insertions(+), 86 deletions(-)
diff --git a/Makefile b/Makefile
index 32ff4c0..1b42277 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-GPU=0
-OPENCV=0
+GPU=1
+OPENCV=1
DEBUG=0
ARCH= -arch=sm_52
diff --git a/src/avgpool_layer.c b/src/avgpool_layer.c
index ee56161..8eccde6 100644
--- a/src/avgpool_layer.c
+++ b/src/avgpool_layer.c
@@ -58,7 +58,7 @@
int out_index = k + b*l.c;
for(i = 0; i < l.h*l.w; ++i){
int in_index = i + l.h*l.w*(k + b*l.c);
- state.delta[in_index] = l.delta[out_index] / (l.h*l.w);
+ state.delta[in_index] += l.delta[out_index] / (l.h*l.w);
}
}
}
diff --git a/src/avgpool_layer_kernels.cu b/src/avgpool_layer_kernels.cu
index ca628c6..1bd2a2a 100644
--- a/src/avgpool_layer_kernels.cu
+++ b/src/avgpool_layer_kernels.cu
@@ -35,7 +35,7 @@
int out_index = (k + c*b);
for(i = 0; i < w*h; ++i){
int in_index = i + h*w*(k + b*c);
- in_delta[in_index] = out_delta[out_index] / (w*h);
+ in_delta[in_index] += out_delta[out_index] / (w*h);
}
}
diff --git a/src/col2im_kernels.cu b/src/col2im_kernels.cu
index 67c0b03..7262f92 100644
--- a/src/col2im_kernels.cu
+++ b/src/col2im_kernels.cu
@@ -33,7 +33,7 @@
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
- data_im[index] = val;
+ data_im[index] += val;
}
}
@@ -53,62 +53,3 @@
width_col, data_im);
}
-/*
- __global__ void col2im_kernel(float *data_col,
- int channels, int height, int width,
- int ksize, int stride, int pad, float *data_im)
- {
-
- int height_col = (height - ksize) / stride + 1;
- int width_col = (width - ksize) / stride + 1;
- if (pad){
- height_col = 1 + (height-1) / stride;
- width_col = 1 + (width-1) / stride;
- pad = ksize/2;
- }
-
- int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
- if(id >= channels*height*width) return;
-
- int index = id;
- int w = id%width + pad;
- id /= width;
- int h = id%height + pad;
- id /= height;
- int c = id%channels;
-
- int w_start = (w-ksize+stride)/stride;
- int w_end = w/stride + 1;
-
- int h_start = (h-ksize+stride)/stride;
- int h_end = h/stride + 1;
-
-// int rows = channels * ksize * ksize;
-// int cols = height_col*width_col;
-int col_offset = (c*ksize*ksize + h * ksize + w)*height_col*width_col;
-int h_coeff = (1-stride*ksize*height_col)*width_col;
-int w_coeff = 1-stride*height_col*width_col;
-float val = 0;
-int h_col, w_col;
-for(h_col = h_start; h_col < h_end; ++h_col){
-for(w_col = w_start; w_col < w_end; ++w_col){
-int col_index = col_offset +h_col*h_coeff + w_col*w_coeff;
-float part = (w_col < 0 || h_col < 0 || h_col >= height_col || w_col >= width_col) ? 0 : data_col[col_index];
-val += part;
-}
-}
-data_im[index] = val;
-}
-
-
-extern "C" void col2im_ongpu(float *data_col,
-int channels, int height, int width,
-int ksize, int stride, int pad, float *data_im)
-{
-
-size_t n = channels*height*width;
-
-col2im_kernel<<<cuda_gridsize(n), BLOCK>>>(data_col, channels, height, width, ksize, stride, pad, data_im);
-check_error(cudaPeekAtLastError());
-}
- */
diff --git a/src/connected_layer.c b/src/connected_layer.c
index 55d84ca..4323505 100644
--- a/src/connected_layer.c
+++ b/src/connected_layer.c
@@ -103,7 +103,7 @@
b = l.weights;
c = state.delta;
- if(c) gemm(0,1,m,n,k,1,a,k,b,k,0,c,n);
+ if(c) gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
}
#ifdef GPU
@@ -173,6 +173,6 @@
b = l.weights_gpu;
c = state.delta;
- if(c) gemm_ongpu(0,1,m,n,k,1,a,k,b,k,0,c,n);
+ if(c) gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
}
#endif
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index d260a95..a150c20 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -82,8 +82,6 @@
gradient_array_ongpu(layer.output_gpu, m*k*layer.batch, layer.activation, layer.delta_gpu);
backward_bias_gpu(layer.bias_updates_gpu, layer.delta_gpu, layer.batch, layer.n, k);
- if(state.delta) scal_ongpu(layer.batch*layer.h*layer.w*layer.c, 0, state.delta, 1);
-
for(i = 0; i < layer.batch; ++i){
float * a = layer.delta_gpu;
float * b = layer.col_image_gpu;
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index c266934..c3a3718 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -188,8 +188,6 @@
gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
- if(state.delta) memset(state.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));
-
for(i = 0; i < l.batch; ++i){
float *a = l.delta + i*m*k;
float *b = l.col_image;
diff --git a/src/cost_layer.c b/src/cost_layer.c
index 76aa17e..d1ae6e5 100644
--- a/src/cost_layer.c
+++ b/src/cost_layer.c
@@ -61,7 +61,7 @@
void backward_cost_layer(const cost_layer l, network_state state)
{
- copy_cpu(l.batch*l.inputs, l.delta, 1, state.delta, 1);
+ axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
}
#ifdef GPU
@@ -92,7 +92,7 @@
void backward_cost_layer_gpu(const cost_layer l, network_state state)
{
- copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, state.delta, 1);
+ axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1);
}
#endif
diff --git a/src/deconvolutional_layer.c b/src/deconvolutional_layer.c
index 524fc95..0f4e1e8 100644
--- a/src/deconvolutional_layer.c
+++ b/src/deconvolutional_layer.c
@@ -159,8 +159,6 @@
gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta);
backward_bias(l.bias_updates, l.delta, l.batch, l.n, size);
- if(state.delta) memset(state.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));
-
for(i = 0; i < l.batch; ++i){
int m = l.c;
int n = l.size*l.size*l.n;
diff --git a/src/detection_layer.c b/src/detection_layer.c
index 9ef89d9..6a25819 100644
--- a/src/detection_layer.c
+++ b/src/detection_layer.c
@@ -141,20 +141,20 @@
float scale = 1;
float latent_delta = 0;
if(l.joint) scale = state.input[in_i++];
- else if (l.objectness) state.delta[in_i++] = -l.delta[out_i++];
- else if (l.background) state.delta[in_i++] = scale*l.delta[out_i++];
+ else if (l.objectness) state.delta[in_i++] += -l.delta[out_i++];
+ else if (l.background) state.delta[in_i++] += scale*l.delta[out_i++];
for(j = 0; j < l.classes; ++j){
latent_delta += state.input[in_i]*l.delta[out_i];
- state.delta[in_i++] = scale*l.delta[out_i++];
+ state.delta[in_i++] += scale*l.delta[out_i++];
}
if (l.objectness) {
}else if (l.background) gradient_array(l.output + out_i, l.coords, LOGISTIC, l.delta + out_i);
for(j = 0; j < l.coords; ++j){
- state.delta[in_i++] = l.delta[out_i++];
+ state.delta[in_i++] += l.delta[out_i++];
}
- if(l.joint) state.delta[in_i-l.coords-l.classes-l.joint] = latent_delta;
+ if(l.joint) state.delta[in_i-l.coords-l.classes-l.joint] += latent_delta;
}
}
@@ -198,7 +198,8 @@
cpu_state.truth = truth_cpu;
cpu_state.delta = delta_cpu;
- cuda_pull_array(state.input, in_cpu, l.batch*l.inputs);
+ cuda_pull_array(state.input, in_cpu, l.batch*l.inputs);
+ cuda_pull_array(state.delta, delta_cpu, l.batch*l.inputs);
cuda_pull_array(l.delta_gpu, l.delta, l.batch*outputs);
backward_detection_layer(l, cpu_state);
cuda_push_array(state.delta, delta_cpu, l.batch*l.inputs);
diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c
index bc3aa68..ef06175 100644
--- a/src/maxpool_layer.c
+++ b/src/maxpool_layer.c
@@ -114,7 +114,6 @@
int h = (l.h-1)/l.stride + 1;
int w = (l.w-1)/l.stride + 1;
int c = l.c;
- memset(state.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));
for(i = 0; i < h*w*c*l.batch; ++i){
int index = l.indexes[i];
state.delta[index] += l.delta[i];
diff --git a/src/maxpool_layer_kernels.cu b/src/maxpool_layer_kernels.cu
index 6c633a9..8f69f90 100644
--- a/src/maxpool_layer_kernels.cu
+++ b/src/maxpool_layer_kernels.cu
@@ -77,7 +77,7 @@
d += (valid && indexes[out_index] == index) ? delta[out_index] : 0;
}
}
- prev_delta[index] = d;
+ prev_delta[index] += d;
}
extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
diff --git a/src/network.c b/src/network.c
index e85dfe9..5b52da9 100644
--- a/src/network.c
+++ b/src/network.c
@@ -68,6 +68,9 @@
int i;
for(i = 0; i < net.n; ++i){
layer l = net.layers[i];
+ if(l.delta){
+ scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
+ }
if(l.type == CONVOLUTIONAL){
forward_convolutional_layer(l, state);
} else if(l.type == DECONVOLUTIONAL){
diff --git a/src/network_kernels.cu b/src/network_kernels.cu
index 3a4f0bf..6562590 100644
--- a/src/network_kernels.cu
+++ b/src/network_kernels.cu
@@ -21,6 +21,7 @@
#include "softmax_layer.h"
#include "dropout_layer.h"
#include "route_layer.h"
+#include "blas.h"
}
float * get_network_output_gpu_layer(network net, int i);
@@ -32,6 +33,9 @@
int i;
for(i = 0; i < net.n; ++i){
layer l = net.layers[i];
+ if(l.delta){
+ scal_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
+ }
if(l.type == CONVOLUTIONAL){
forward_convolutional_layer_gpu(l, state);
} else if(l.type == DECONVOLUTIONAL){
diff --git a/src/normalization_layer.c b/src/normalization_layer.c
index d080559..587ece7 100644
--- a/src/normalization_layer.c
+++ b/src/normalization_layer.c
@@ -90,6 +90,7 @@
void backward_normalization_layer(const layer layer, network_state state)
{
// TODO This is approximate ;-)
+ // Also this should add in to delta instead of overwritting.
int w = layer.w;
int h = layer.h;
diff --git a/src/route_layer.c b/src/route_layer.c
index e3802b7..67b606c 100644
--- a/src/route_layer.c
+++ b/src/route_layer.c
@@ -54,7 +54,7 @@
float *delta = net.layers[index].delta;
int input_size = l.input_sizes[i];
for(j = 0; j < l.batch; ++j){
- copy_cpu(input_size, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1);
+ axpy_cpu(input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1);
}
offset += input_size;
}
@@ -85,7 +85,7 @@
float *delta = net.layers[index].delta_gpu;
int input_size = l.input_sizes[i];
for(j = 0; j < l.batch; ++j){
- copy_ongpu(input_size, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
+ axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
}
offset += input_size;
}
diff --git a/src/softmax_layer.c b/src/softmax_layer.c
index ea22d05..0d19aca 100644
--- a/src/softmax_layer.c
+++ b/src/softmax_layer.c
@@ -58,7 +58,7 @@
{
int i;
for(i = 0; i < l.inputs*l.batch; ++i){
- state.delta[i] = l.delta[i];
+ state.delta[i] += l.delta[i];
}
}
diff --git a/src/softmax_layer_kernels.cu b/src/softmax_layer_kernels.cu
index 0529f75..8fbaf19 100644
--- a/src/softmax_layer_kernels.cu
+++ b/src/softmax_layer_kernels.cu
@@ -42,7 +42,7 @@
extern "C" void backward_softmax_layer_gpu(const softmax_layer layer, network_state state)
{
- copy_ongpu(layer.batch*layer.inputs, layer.delta_gpu, 1, state.delta, 1);
+ axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, state.delta, 1);
}
/* This is if you want softmax w/o log-loss classification. You probably don't.
--
Gitblit v1.10.0