Changes to make routing work better
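The change that repeats throughout this diff: backward passes now accumulate into the upstream delta (a '+=' write, an axpy instead of a copy, or a GEMM call with BETA set to 1) instead of overwriting it, and each layer's own delta is cleared once during the forward pass. That way a layer whose output feeds more than one downstream consumer, which is exactly the situation route layers create, ends up with the sum of all incoming gradients rather than only the last one written. A minimal sketch of the before/after pattern, with a hypothetical helper name (the real changes are spread across the hunks below):

    /* Hypothetical helper, not part of the diff: the old backward passes
     * clobbered whatever gradient was already in state.delta, the new ones
     * add to it. */
    void propagate_delta(float *state_delta, const float *layer_delta, int n, int accumulate)
    {
        int i;
        for(i = 0; i < n; ++i){
            if(accumulate) state_delta[i] += layer_delta[i];  /* new behaviour */
            else           state_delta[i]  = layer_delta[i];  /* old behaviour */
        }
    }

Accumulation only works if every delta starts each iteration at zero; the forward-pass hunks near the end of the diff appear to handle that with scal_cpu / scal_ongpu.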
...
- GPU=0
- OPENCV=0
+ GPU=1
+ OPENCV=1
DEBUG=0

ARCH= -arch=sm_52
...
        int out_index = k + b*l.c;
        for(i = 0; i < l.h*l.w; ++i){
            int in_index = i + l.h*l.w*(k + b*l.c);
-           state.delta[in_index] = l.delta[out_index] / (l.h*l.w);
+           state.delta[in_index] += l.delta[out_index] / (l.h*l.w);
        }
    }
}
...
    int out_index = (k + c*b);
    for(i = 0; i < w*h; ++i){
        int in_index = i + h*w*(k + b*c);
-       in_delta[in_index] = out_delta[out_index] / (w*h);
+       in_delta[in_index] += out_delta[out_index] / (w*h);
    }
}

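These two hunks look like the CPU and GPU backward passes of the average-pooling layer: every one of the h*w input positions in a channel receives an equal 1/(h*w) share of that channel's output gradient, and that share is now added into the destination instead of assigned. A stand-alone sketch of the per-channel arithmetic (hypothetical function, same math as the hunks above):

    /* One channel of one batch item: spread the single output delta
     * evenly over the h*w input positions, accumulating into in_delta. */
    void avgpool_backward_channel(int h, int w, float out_delta, float *in_delta)
    {
        int i;
        for(i = 0; i < h*w; ++i){
            in_delta[i] += out_delta / (h*w);
        }
    }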
...
                val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
            }
        }
-       data_im[index] = val;
+       data_im[index] += val;
    }
}

...
            width_col, data_im);
}

/*
__global__ void col2im_kernel(float *data_col,
        int channels, int height, int width,
        int ksize, int stride, int pad, float *data_im)
{

    int height_col = (height - ksize) / stride + 1;
    int width_col = (width - ksize) / stride + 1;
    if (pad){
        height_col = 1 + (height-1) / stride;
        width_col = 1 + (width-1) / stride;
        pad = ksize/2;
    }

    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= channels*height*width) return;

    int index = id;
    int w = id%width + pad;
    id /= width;
    int h = id%height + pad;
    id /= height;
    int c = id%channels;

    int w_start = (w-ksize+stride)/stride;
    int w_end = w/stride + 1;

    int h_start = (h-ksize+stride)/stride;
    int h_end = h/stride + 1;

    // int rows = channels * ksize * ksize;
    // int cols = height_col*width_col;
    int col_offset = (c*ksize*ksize + h * ksize + w)*height_col*width_col;
    int h_coeff = (1-stride*ksize*height_col)*width_col;
    int w_coeff = 1-stride*height_col*width_col;
    float val = 0;
    int h_col, w_col;
    for(h_col = h_start; h_col < h_end; ++h_col){
        for(w_col = w_start; w_col < w_end; ++w_col){
            int col_index = col_offset + h_col*h_coeff + w_col*w_coeff;
            float part = (w_col < 0 || h_col < 0 || h_col >= height_col || w_col >= width_col) ? 0 : data_col[col_index];
            val += part;
        }
    }
    data_im[index] = val;
}


extern "C" void col2im_ongpu(float *data_col,
        int channels, int height, int width,
        int ksize, int stride, int pad, float *data_im)
{

    size_t n = channels*height*width;

    col2im_kernel<<<cuda_gridsize(n), BLOCK>>>(data_col, channels, height, width, ksize, stride, pad, data_im);
    check_error(cudaPeekAtLastError());
}
*/
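The coefficient arithmetic in this commented-out kernel, and the matching coeff_h_col / coeff_w_col walk in the live col2im hunk further up, follows from one identity: with kh = h - h_col*stride and kw = w - w_col*stride, the flat column index (c*ksize*ksize + kh*ksize + kw)*height_col*width_col + h_col*width_col + w_col expands to col_offset + h_col*h_coeff + w_col*w_coeff. A small self-contained check of that identity, with hypothetical helper names; it only restates the kernel's own math:

    /* Direct index of column entry (c, kh, kw, h_col, w_col). */
    int col_index_direct(int c, int h, int w, int h_col, int w_col,
            int ksize, int stride, int height_col, int width_col)
    {
        int kh = h - h_col*stride;
        int kw = w - w_col*stride;
        return (c*ksize*ksize + kh*ksize + kw)*height_col*width_col
            + h_col*width_col + w_col;
    }

    /* Same index written the way the kernel walks it. */
    int col_index_coeff(int c, int h, int w, int h_col, int w_col,
            int ksize, int stride, int height_col, int width_col)
    {
        int col_offset = (c*ksize*ksize + h*ksize + w)*height_col*width_col;
        int h_coeff = (1 - stride*ksize*height_col)*width_col;
        int w_coeff = 1 - stride*height_col*width_col;
        return col_offset + h_col*h_coeff + w_col*w_coeff;
    }

Both functions return the same value for any (c, h, w, h_col, w_col), which is why the double loop can step through data_col with two additive coefficients instead of recomputing kh and kw each time.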
...
    b = l.weights;
    c = state.delta;

-   if(c) gemm(0,1,m,n,k,1,a,k,b,k,0,c,n);
+   if(c) gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
}

#ifdef GPU
...
    b = l.weights_gpu;
    c = state.delta;

-   if(c) gemm_ongpu(0,1,m,n,k,1,a,k,b,k,0,c,n);
+   if(c) gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
}
#endif
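The only difference between the old and new gemm / gemm_ongpu calls above is the BETA argument going from 0 to 1. Assuming darknet's gemm follows the usual BLAS-style convention C = ALPHA*op(A)*op(B) + BETA*C with the argument order gemm(TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc), BETA = 0 overwrites state.delta with the new gradient while BETA = 1 adds into it. A reference sketch of the A*B^T case used here (TA = 0, TB = 1), hypothetical name:

    /* C (MxN) = ALPHA * A (MxK) * B^T (B is NxK) + BETA * C, row-major. */
    void gemm_nt_sketch(int M, int N, int K, float ALPHA,
            const float *A, int lda, const float *B, int ldb,
            float BETA, float *C, int ldc)
    {
        int i, j, k;
        for(i = 0; i < M; ++i){
            for(j = 0; j < N; ++j){
                float sum = 0;
                for(k = 0; k < K; ++k) sum += A[i*lda + k]*B[j*ldb + k];
                /* BETA == 0: replace (old call); BETA == 1: accumulate (new call) */
                C[i*ldc + j] = BETA*C[i*ldc + j] + ALPHA*sum;
            }
        }
    }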
...
    gradient_array_ongpu(layer.output_gpu, m*k*layer.batch, layer.activation, layer.delta_gpu);
    backward_bias_gpu(layer.bias_updates_gpu, layer.delta_gpu, layer.batch, layer.n, k);

    if(state.delta) scal_ongpu(layer.batch*layer.h*layer.w*layer.c, 0, state.delta, 1);

    for(i = 0; i < layer.batch; ++i){
        float * a = layer.delta_gpu;
        float * b = layer.col_image_gpu;
...
    gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
    backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);

    if(state.delta) memset(state.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));

    for(i = 0; i < l.batch; ++i){
        float *a = l.delta + i*m*k;
        float *b = l.col_image;
...

void backward_cost_layer(const cost_layer l, network_state state)
{
-   copy_cpu(l.batch*l.inputs, l.delta, 1, state.delta, 1);
+   axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
}

#ifdef GPU
...

void backward_cost_layer_gpu(const cost_layer l, network_state state)
{
-   copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, state.delta, 1);
+   axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1);
}
#endif
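backward_cost_layer and its GPU twin switch from copy to axpy. Assuming BLAS-style semantics for darknet's helpers, copy does Y = X while axpy does Y += ALPHA*X, so the cost layer's delta is now added on top of whatever is already in state.delta. Reference semantics with unit strides (sketch only; the real helpers also take INCX / INCY arguments, visible in the calls above):

    void copy_sketch(int N, const float *X, float *Y)
    {
        int i;
        for(i = 0; i < N; ++i) Y[i] = X[i];          /* old: overwrite */
    }

    void axpy_sketch(int N, float ALPHA, const float *X, float *Y)
    {
        int i;
        for(i = 0; i < N; ++i) Y[i] += ALPHA*X[i];   /* new: accumulate */
    }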

...
    gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta);
    backward_bias(l.bias_updates, l.delta, l.batch, l.n, size);

    if(state.delta) memset(state.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));

    for(i = 0; i < l.batch; ++i){
        int m = l.c;
        int n = l.size*l.size*l.n;
...
        float scale = 1;
        float latent_delta = 0;
        if(l.joint) scale = state.input[in_i++];
-       else if (l.objectness) state.delta[in_i++] = -l.delta[out_i++];
-       else if (l.background) state.delta[in_i++] = scale*l.delta[out_i++];
+       else if (l.objectness) state.delta[in_i++] += -l.delta[out_i++];
+       else if (l.background) state.delta[in_i++] += scale*l.delta[out_i++];
        for(j = 0; j < l.classes; ++j){
            latent_delta += state.input[in_i]*l.delta[out_i];
-           state.delta[in_i++] = scale*l.delta[out_i++];
+           state.delta[in_i++] += scale*l.delta[out_i++];
        }

        if (l.objectness) {

        }else if (l.background) gradient_array(l.output + out_i, l.coords, LOGISTIC, l.delta + out_i);
        for(j = 0; j < l.coords; ++j){
-           state.delta[in_i++] = l.delta[out_i++];
+           state.delta[in_i++] += l.delta[out_i++];
        }
-       if(l.joint) state.delta[in_i-l.coords-l.classes-l.joint] = latent_delta;
+       if(l.joint) state.delta[in_i-l.coords-l.classes-l.joint] += latent_delta;
    }
}

...
    cpu_state.delta = delta_cpu;

    cuda_pull_array(state.input, in_cpu, l.batch*l.inputs);
    cuda_pull_array(state.delta, delta_cpu, l.batch*l.inputs);
    cuda_pull_array(l.delta_gpu, l.delta, l.batch*outputs);
    backward_detection_layer(l, cpu_state);
    cuda_push_array(state.delta, delta_cpu, l.batch*l.inputs);
...
    int h = (l.h-1)/l.stride + 1;
    int w = (l.w-1)/l.stride + 1;
    int c = l.c;
    memset(state.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));
    for(i = 0; i < h*w*c*l.batch; ++i){
        int index = l.indexes[i];
        state.delta[index] += l.delta[i];
...
            d += (valid && indexes[out_index] == index) ? delta[out_index] : 0;
        }
    }
-   prev_delta[index] = d;
+   prev_delta[index] += d;
}
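Max-pooling backward is a scatter: each output delta goes back to the single input position that won the max during the forward pass (the CPU hunk reads it from l.indexes; the GPU kernel does the equivalent gather per input element), and that write now adds instead of overwriting. A minimal sketch of the CPU-style scatter, assuming indexes[i] holds the flat input index for output i as in the hunk above:

    void maxpool_backward_sketch(int n_outputs, const int *indexes,
            const float *out_delta, float *in_delta)
    {
        int i;
        for(i = 0; i < n_outputs; ++i){
            in_delta[indexes[i]] += out_delta[i];   /* accumulate at the argmax */
        }
    }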

extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
...
    int i;
    for(i = 0; i < net.n; ++i){
        layer l = net.layers[i];
        if(l.delta){
            scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
        }
        if(l.type == CONVOLUTIONAL){
            forward_convolutional_layer(l, state);
        } else if(l.type == DECONVOLUTIONAL){
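This hunk shows the forward-pass loop clearing each layer's own delta before running it; with the clearing done here, the backward passes above can accumulate freely instead of each one resetting state.delta itself. Assumed BLAS-style scal semantics (unit stride shown), hypothetical name:

    /* X[i] *= ALPHA for i = 0..N-1; ALPHA == 0 clears the buffer. */
    void scal_sketch(int N, float ALPHA, float *X)
    {
        int i;
        for(i = 0; i < N; ++i) X[i] *= ALPHA;
    }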
...
#include "softmax_layer.h"
#include "dropout_layer.h"
#include "route_layer.h"
#include "blas.h"
}

float * get_network_output_gpu_layer(network net, int i);
...
    int i;
    for(i = 0; i < net.n; ++i){
        layer l = net.layers[i];
        if(l.delta){
            scal_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
        }
        if(l.type == CONVOLUTIONAL){
            forward_convolutional_layer_gpu(l, state);
        } else if(l.type == DECONVOLUTIONAL){
...
void backward_normalization_layer(const layer layer, network_state state)
{
    // TODO This is approximate ;-)
    // Also this should add into delta instead of overwriting.

    int w = layer.w;
    int h = layer.h;
...
        float *delta = net.layers[index].delta;
        int input_size = l.input_sizes[i];
        for(j = 0; j < l.batch; ++j){
-           copy_cpu(input_size, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1);
+           axpy_cpu(input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1);
        }
        offset += input_size;
    }
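The route layer is where accumulation pays off. Its backward pass hands each slice of its own delta back to the layer that produced that slice; with axpy instead of copy, a source layer that feeds several consumers (another route, or a route plus its regular successor) ends up with the sum of all of their gradients. A toy illustration of why overwriting loses information (made-up buffers, not darknet code):

    #include <stdio.h>

    int main(void)
    {
        /* Layer X feeds two consumers; its delta must end up as d1 + d2. */
        float x_delta[4] = {0};              /* cleared during the forward pass */
        float d1[4] = {1, 1, 1, 1};
        float d2[4] = {2, 2, 2, 2};
        int i;
        for(i = 0; i < 4; ++i) x_delta[i] += d1[i];   /* first consumer  */
        for(i = 0; i < 4; ++i) x_delta[i] += d2[i];   /* second consumer */
        printf("%g\n", x_delta[0]);   /* prints 3; with '=' only d2 would survive */
        return 0;
    }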
...
        float *delta = net.layers[index].delta_gpu;
        int input_size = l.input_sizes[i];
        for(j = 0; j < l.batch; ++j){
-           copy_ongpu(input_size, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
+           axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
        }
        offset += input_size;
    }
...
{
    int i;
    for(i = 0; i < l.inputs*l.batch; ++i){
-       state.delta[i] = l.delta[i];
+       state.delta[i] += l.delta[i];
    }
}

...

extern "C" void backward_softmax_layer_gpu(const softmax_layer layer, network_state state)
{
-   copy_ongpu(layer.batch*layer.inputs, layer.delta_gpu, 1, state.delta, 1);
+   axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, state.delta, 1);
}

/* This is if you want softmax w/o log-loss classification. You probably don't.