~speedprog/mtg/mtg_card_detector.git

parent: 9802287b | patch | commit | show whitespace

Joseph Redmon

2016-01-18 1578ec70d751231218c869d345404ea68be9e5e8

idk

31 files modified

2 files added

	Makefile	2 ●●●●● patch \| view \| raw \| blame \| history
	cfg/msr_152.cfg	306 ●●●●● patch \| view \| raw \| blame \| history
	cfg/msr_34.cfg	5 ●●●●● patch \| view \| raw \| blame \| history
	cfg/msr_50.cfg	114 ●●●●● patch \| view \| raw \| blame \| history
	data/dog.jpg	patch \| view \| raw \| blame \| history
	src/activation_layer.c	58 ●●●●● patch \| view \| raw \| blame \| history
	src/activation_layer.h	19 ●●●●● patch \| view \| raw \| blame \| history
	src/avgpool_layer.c	3 ●●●●● patch \| view \| raw \| blame \| history
	src/blas.c	23 ●●●●● patch \| view \| raw \| blame \| history
	src/blas.h	4 ●●●●● patch \| view \| raw \| blame \| history
	src/blas_kernels.cu	40 ●●●●● patch \| view \| raw \| blame \| history
	src/classifier.c	144 ●●●●● patch \| view \| raw \| blame \| history
	src/connected_layer.c	2 ●●●●● patch \| view \| raw \| blame \| history
	src/convolutional_layer.c	2 ●●●●● patch \| view \| raw \| blame \| history
	src/crop_layer.c	38 ●●●●● patch \| view \| raw \| blame \| history
	src/crop_layer.h	1 ●●●●● patch \| view \| raw \| blame \| history
	src/crop_layer_kernels.cu	4 ●●●●● patch \| view \| raw \| blame \| history
	src/data.c	24 ●●●●● patch \| view \| raw \| blame \| history
	src/dropout_layer.c	2 ●●●●● patch \| view \| raw \| blame \| history
	src/image.c	34 ●●●●● patch \| view \| raw \| blame \| history
	src/image.h	1 ●●●●● patch \| view \| raw \| blame \| history
	src/imagenet.c	7 ●●●●● patch \| view \| raw \| blame \| history
	src/layer.h	5 ●●●●● patch \| view \| raw \| blame \| history
	src/local_layer.c	2 ●●●●● patch \| view \| raw \| blame \| history
	src/maxpool_layer.c	1 ●●●●● patch \| view \| raw \| blame \| history
	src/network.c	11 ●●●●● patch \| view \| raw \| blame \| history
	src/network_kernels.cu	5 ●●●●● patch \| view \| raw \| blame \| history
	src/nightmare.c	88 ●●●●● patch \| view \| raw \| blame \| history
	src/parser.c	29 ●●●●● patch \| view \| raw \| blame \| history
	src/shortcut_layer.c	31 ●●●●● patch \| view \| raw \| blame \| history
	src/utils.c	4 ●●●●● patch \| view \| raw \| blame \| history
	src/utils.h	2 ●●●●● patch \| view \| raw \| blame \| history
	src/yolo.c	2 ●●●●● patch \| view \| raw \| blame \| history

 Makefile

@@ -34,7 +34,7 @@
LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
endif

OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o
OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o
ifeq ($(GPU), 1) 
OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o yolo_kernels.o coco_kernels.o
endif

 cfg/msr_152.cfg

@@ -1,13 +1,16 @@
[net]
batch=256
subdivisions=16
batch=128
subdivisions=8
height=256
width=256
channels=3
momentum=0.9
decay=0.0005
decay=0.0001

learning_rate=0.02
learning_rate=0.1
policy=poly
power=4
max_batches=500000

[crop]
crop_height=224
@@ -57,10 +60,22 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[route]
layers=-4

[convolutional]
batch_normalize=1
size=1
stride=1
pad=1
activation=linear
filters=256

[shortcut]
from = -4
from = -3
activation=leaky

[convolutional]
batch_normalize=1
@@ -84,11 +99,13 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky

[convolutional]
batch_normalize=1
filters=64
@@ -111,11 +128,13 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


##### Conv 3_x #####

@@ -141,10 +160,24 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear


[route]
layers=-4

[convolutional]
batch_normalize=1
size=1
stride=2
pad=1
activation=linear
filters=512

[shortcut]
from = -4
from = -3
activation=leaky


[convolutional]
batch_normalize=1
@@ -168,11 +201,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=128
@@ -195,11 +231,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=128
@@ -222,11 +261,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=128
@@ -249,11 +291,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=128
@@ -276,11 +321,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=128
@@ -303,11 +351,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=128
@@ -330,11 +381,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky



##### Conv 4_x #####

@@ -360,10 +414,24 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear


[route]
layers=-4

[convolutional]
batch_normalize=1
size=1
stride=2
pad=1
activation=linear
filters=1024

[shortcut]
from = -4
from = -3
activation=leaky


[convolutional]
batch_normalize=1
@@ -387,11 +455,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -414,11 +485,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -441,11 +515,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -468,11 +545,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -495,11 +575,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -522,11 +605,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -549,11 +635,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -576,11 +665,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -603,11 +695,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -630,11 +725,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -657,11 +755,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -684,11 +785,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -711,11 +815,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -738,11 +845,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -765,11 +875,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -792,11 +905,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -819,11 +935,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -846,11 +965,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -873,11 +995,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -900,11 +1025,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -927,11 +1055,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -954,11 +1085,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -981,11 +1115,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1008,11 +1145,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1035,11 +1175,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1062,11 +1205,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1089,11 +1235,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1116,11 +1265,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1143,11 +1295,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1170,11 +1325,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1197,11 +1355,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1224,11 +1385,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1251,11 +1415,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1278,11 +1445,14 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


[convolutional]
batch_normalize=1
filters=256
@@ -1305,11 +1475,13 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky


##### Conv 5_x #####

@@ -1335,10 +1507,24 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear


[route]
layers=-4

[convolutional]
batch_normalize=1
size=1
stride=2
pad=1
activation=linear
filters=2048

[shortcut]
from = -4
from = -3
activation=leaky


[convolutional]
batch_normalize=1
@@ -1362,11 +1548,13 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky

[convolutional]
batch_normalize=1
filters=512
@@ -1389,11 +1577,13 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4

activation=leaky

[avgpool]

[connected]

 cfg/msr_34.cfg

@@ -12,11 +12,6 @@
power=4
max_batches=500000

#policy=sigmoid
#gamma=.00008
#step=100000
#max_batches=200000

[crop]
crop_height=224
crop_width=224

 cfg/msr_50.cfg

@@ -1,13 +1,18 @@
[net]
batch=128
subdivisions=4
subdivisions=8
height=256
width=256
channels=3
momentum=0.9
decay=0.0005
decay=0.0001

learning_rate=0.01
learning_rate=0.05
policy=poly
power=4
max_batches=500000



[crop]
crop_height=224
@@ -57,10 +62,22 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[route]
layers=-4

[convolutional]
batch_normalize=1
size=1
stride=1
pad=1
activation=linear
filters=256

[shortcut]
from = -4
from = -3
activation=leaky

[convolutional]
batch_normalize=1
@@ -84,10 +101,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky

[convolutional]
batch_normalize=1
@@ -111,10 +129,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky


##### Conv 3_x #####
@@ -141,10 +160,22 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[route]
layers=-4

[convolutional]
batch_normalize=1
size=1
stride=2
pad=1
activation=linear
filters=512

[shortcut]
from = -4
from = -3
activation=leaky

[convolutional]
batch_normalize=1
@@ -168,10 +199,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky

[convolutional]
batch_normalize=1
@@ -195,10 +227,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky

[convolutional]
batch_normalize=1
@@ -222,10 +255,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky


##### Conv 4_x #####
@@ -252,10 +286,23 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[route]
layers=-4

[convolutional]
batch_normalize=1
size=1
stride=2
pad=1
activation=linear
filters=1024

[shortcut]
from = -4
from = -3
activation=leaky


[convolutional]
batch_normalize=1
@@ -279,10 +326,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky

[convolutional]
batch_normalize=1
@@ -306,10 +354,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky

[convolutional]
batch_normalize=1
@@ -333,10 +382,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky

[convolutional]
batch_normalize=1
@@ -360,10 +410,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky

[convolutional]
batch_normalize=1
@@ -387,10 +438,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky


##### Conv 5_x #####
@@ -417,10 +469,24 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear


[route]
layers=-4

[convolutional]
batch_normalize=1
size=1
stride=2
pad=1
activation=linear
filters=2048

[shortcut]
from = -4
from = -3
activation=leaky


[convolutional]
batch_normalize=1
@@ -444,10 +510,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky

[convolutional]
batch_normalize=1
@@ -471,10 +538,11 @@
size=1
stride=1
pad=1
activation=leaky
activation=linear

[shortcut]
from = -4
activation=leaky

[avgpool]


 data/dog.jpg



 src/activation_layer.c

New file
@@ -0,0 +1,58 @@
#include "activation_layer.h"
#include "utils.h"
#include "cuda.h"
#include "blas.h"
#include "gemm.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Build a standalone activation layer.
 *
 * batch      - number of samples processed per pass
 * inputs     - number of values per sample; the layer produces the same
 *              number of outputs
 * activation - activation function applied by the forward/backward passes
 *
 * Returns the layer by value. Its output and delta buffers are
 * zero-initialised and hold batch*inputs floats each.
 */
layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
{
    layer l = {0};
    l.type = ACTIVE;

    l.inputs = inputs;
    l.outputs = inputs;
    l.batch = batch;

    // The buffers store floats, so each element is sizeof(float) bytes
    // (the previous sizeof(float*) over-allocated on 64-bit targets).
    l.output = calloc(batch*inputs, sizeof(float));
    l.delta = calloc(batch*inputs, sizeof(float));

#ifdef GPU
    // Device-side buffers sized to match the host ones.
    l.output_gpu = cuda_make_array(l.output, inputs*batch);
    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
#endif
    l.activation = activation;
    fprintf(stderr, "Activation Layer: %d inputs\n", inputs);
    return l;
}

/*
 * CPU forward pass: copy the incoming activations (state.input) into the
 * layer's output buffer, then apply l.activation to that buffer in place.
 */
void forward_activation_layer(layer l, network_state state)
{
    int count = l.outputs*l.batch;  // total values across the whole batch

    copy_cpu(count, state.input, 1, l.output, 1);
    activate_array(l.output, count, l.activation);
}

/*
 * CPU backward pass: run gradient_array over l.delta using the stored
 * outputs and l.activation, then copy l.delta into state.delta.
 */
void backward_activation_layer(layer l, network_state state)
{
    int count = l.outputs*l.batch;  // total values across the whole batch

    gradient_array(l.output, count, l.activation, l.delta);
    copy_cpu(count, l.delta, 1, state.delta, 1);
}

#ifdef GPU

/*
 * GPU forward pass: copy state.input into l.output_gpu, then apply
 * l.activation to l.output_gpu in place.
 */
void forward_activation_layer_gpu(layer l, network_state state)
{
    int count = l.outputs*l.batch;  // total values across the whole batch

    copy_ongpu(count, state.input, 1, l.output_gpu, 1);
    activate_array_ongpu(l.output_gpu, count, l.activation);
}

/*
 * GPU backward pass: run gradient_array_ongpu over l.delta_gpu using the
 * stored outputs and l.activation, then copy l.delta_gpu into state.delta.
 */
void backward_activation_layer_gpu(layer l, network_state state)
{
    int count = l.outputs*l.batch;  // total values across the whole batch

    gradient_array_ongpu(l.output_gpu, count, l.activation, l.delta_gpu);
    copy_ongpu(count, l.delta_gpu, 1, state.delta, 1);
}
#endif

 src/activation_layer.h

New file
@@ -0,0 +1,19 @@
// Public interface of the standalone activation layer.
#ifndef ACTIVATION_LAYER_H
#define ACTIVATION_LAYER_H

#include "activations.h"
#include "layer.h"
#include "network.h"

// Creates an activation layer for `batch` samples of `inputs` values each;
// the layer has as many outputs as inputs.
layer make_activation_layer(int batch, int inputs, ACTIVATION activation);

// CPU forward and backward passes.
void forward_activation_layer(layer l, network_state state);
void backward_activation_layer(layer l, network_state state);

#ifdef GPU
// GPU counterparts; only declared when GPU is defined.
void forward_activation_layer_gpu(layer l, network_state state);
void backward_activation_layer_gpu(layer l, network_state state);
#endif

#endif


 src/avgpool_layer.c

@@ -28,8 +28,9 @@

void resize_avgpool_layer(avgpool_layer *l, int w, int h)
{
    l->h = h;
    l->w = w;
    l->h = h;
    l->inputs = h*w*l->c;
}

void forward_avgpool_layer(const avgpool_layer l, network_state state)

 src/blas.c

@@ -1,15 +1,26 @@
#include "blas.h"
#include "math.h"
#include <assert.h>

void shortcut_cpu(float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2)
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
{
    int stride = w1/w2;
    int sample = w2/w1;
    assert(stride == h1/h2);
    assert(sample == h2/h1);
    if(stride < 1) stride = 1;
    if(sample < 1) sample = 1;
    int minw = (w1 < w2) ? w1 : w2;
    int minh = (h1 < h2) ? h1 : h2;
    int minc = (c1 < c2) ? c1 : c2;

    int i,j,k,b;
    for(b = 0; b < batch; ++b){
        for(k = 0; k < c && k < c2; ++k){
            for(j = 0; j < h/sample; ++j){
                for(i = 0; i < w/sample; ++i){
                    int out_index = i*sample + w*(j*sample + h*(k + c*b));
                    int add_index = b*w*stride/sample*h*stride/sample*c2 + i*stride + w*stride/sample*(j*stride + h*stride/sample*k);
        for(k = 0; k < minc; ++k){
            for(j = 0; j < minh; ++j){
                for(i = 0; i < minw; ++i){
                    int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
                    int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
                    out[out_index] += add[add_index];
                }
            }

 src/blas.h

@@ -16,7 +16,7 @@
void fill_cpu(int N, float ALPHA, float * X, int INCX);
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
void test_gpu_blas();
void shortcut_cpu(float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2);
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);

void mean_cpu(float *x, int batch, int filters, int spatial, float *mean);
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
@@ -45,6 +45,6 @@

void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
void shortcut_gpu(float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2);
void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
#endif
#endif

 src/blas_kernels.cu

@@ -1,6 +1,7 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"
#include <assert.h>

extern "C" {
#include "blas.h"
@@ -374,26 +375,37 @@
    check_error(cudaPeekAtLastError());
}

__global__ void shortcut_kernel(int size, float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2, int min_c)
__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= size) return;
    int i = id % (w/sample);
    id /= (w/sample);
    int j = id % (h/sample);
    id /= (h/sample);
    int k = id % min_c;
    id /= min_c;
    int b = id;
    int out_index = i*sample + w*(j*sample + h*(k + c*b));
    int add_index = b*w*stride/sample*h*stride/sample*c2 + i*stride + w*stride/sample*(j*stride + h*stride/sample*k);
    int i = id % minw;
    id /= minw;
    int j = id % minh;
    id /= minh;
    int k = id % minc;
    id /= minc;
    int b = id % batch;

    int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
    int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
    out[out_index] += add[add_index];
}

extern "C" void shortcut_gpu(float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2)
extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
{
    int min_c = (c < c2) ? c : c2;
    int size = batch * w/sample * h/sample * min_c;
    shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, out, w, h, c, batch, sample, add, stride, c2, min_c);
    int minw = (w1 < w2) ? w1 : w2;
    int minh = (h1 < h2) ? h1 : h2;
    int minc = (c1 < c2) ? c1 : c2;

    int stride = w1/w2;
    int sample = w2/w1;
    assert(stride == h1/h2);
    assert(sample == h2/h1);
    if(stride < 1) stride = 1;
    if(sample < 1) sample = 1;

    int size = batch * minw * minh * minc;
    shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, out);
    check_error(cudaPeekAtLastError());
}

 src/classifier.c

@@ -2,6 +2,7 @@
#include "utils.h"
#include "parser.h"
#include "option_list.h"
#include "blas.h"

#ifdef OPENCV
#include "opencv2/highgui/highgui_c.h"
@@ -183,6 +184,145 @@
    }
}

void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
{
    int i, j;
    network net = parse_network_cfg(filename);
    set_batch_network(&net, 1);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    srand(time(0));

    list *options = read_data_cfg(datacfg);

    char *label_list = option_find_str(options, "labels", "data/labels.list");
    char *valid_list = option_find_str(options, "valid", "data/train.list");
    int classes = option_find_int(options, "classes", 2);
    int topk = option_find_int(options, "top", 1);

    char **labels = get_labels(label_list);
    list *plist = get_paths(valid_list);

    char **paths = (char **)list_to_array(plist);
    int m = plist->size;
    free_list(plist);

    float avg_acc = 0;
    float avg_topk = 0;
    int *indexes = calloc(topk, sizeof(int));

    for(i = 0; i < m; ++i){
        int class = -1;
        char *path = paths[i];
        for(j = 0; j < classes; ++j){
            if(strstr(path, labels[j])){
                class = j;
                break;
            }
        }
        image im = load_image_color(paths[i], 256, 256);
        image images[10];
        images[0] = crop_image(im, -16, -16, 256, 256);
        images[1] = crop_image(im, 16, -16, 256, 256);
        images[2] = crop_image(im, 0, 0, 256, 256);
        images[3] = crop_image(im, -16, 16, 256, 256);
        images[4] = crop_image(im, 16, 16, 256, 256);
        flip_image(im);
        images[5] = crop_image(im, -16, -16, 256, 256);
        images[6] = crop_image(im, 16, -16, 256, 256);
        images[7] = crop_image(im, 0, 0, 256, 256);
        images[8] = crop_image(im, -16, 16, 256, 256);
        images[9] = crop_image(im, 16, 16, 256, 256);
        float *pred = calloc(classes, sizeof(float));
        for(j = 0; j < 10; ++j){
            float *p = network_predict(net, images[j].data);
            axpy_cpu(classes, 1, p, 1, pred, 1);
            free_image(images[j]);
        }
        free_image(im);
        top_k(pred, classes, topk, indexes);
        free(pred);
        if(indexes[0] == class) avg_acc += 1;
        for(j = 0; j < topk; ++j){
            if(indexes[j] == class) avg_topk += 1;
        }

        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
    }
}

void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
{
    int i, j;
    network net = parse_network_cfg(filename);
    set_batch_network(&net, 1);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    srand(time(0));

    list *options = read_data_cfg(datacfg);

    char *label_list = option_find_str(options, "labels", "data/labels.list");
    char *valid_list = option_find_str(options, "valid", "data/train.list");
    int classes = option_find_int(options, "classes", 2);
    int topk = option_find_int(options, "top", 1);

    char **labels = get_labels(label_list);
    list *plist = get_paths(valid_list);
    int scales[] = {224, 256, 384, 480, 640};
    int nscales = sizeof(scales)/sizeof(scales[0]);

    char **paths = (char **)list_to_array(plist);
    int m = plist->size;
    free_list(plist);

    float avg_acc = 0;
    float avg_topk = 0;
    int *indexes = calloc(topk, sizeof(int));

    for(i = 0; i < m; ++i){
        int class = -1;
        char *path = paths[i];
        for(j = 0; j < classes; ++j){
            if(strstr(path, labels[j])){
                class = j;
                break;
            }
        }
        float *pred = calloc(classes, sizeof(float));
        image im = load_image_color(paths[i], 0, 0);
        for(j = 0; j < nscales; ++j){
            int w, h;
            if(im.w < im.h){
                w = scales[j];
                h = (im.h*w)/im.w;
            } else {
                h = scales[j];
                w = (im.w * h) / im.h;
            }
            resize_network(&net, w, h);
            image r = resize_image(im, w, h);
            float *p = network_predict(net, r.data);
            axpy_cpu(classes, 1, p, 1, pred, 1);
            flip_image(r);
            p = network_predict(net, r.data);
            axpy_cpu(classes, 1, p, 1, pred, 1);
            free_image(r);
        }
        free_image(im);
        top_k(pred, classes, topk, indexes);
        free(pred);
        if(indexes[0] == class) avg_acc += 1;
        for(j = 0; j < topk; ++j){
            if(indexes[j] == class) avg_topk += 1;
        }

        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
    }
}

void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename)
{
    network net = parse_network_cfg(cfgfile);
@@ -296,7 +436,7 @@

        free_matrix(pred);

        fprintf(stderr, "%lf seconds, %d images\n", sec(clock()-time), val.X.rows);
        fprintf(stderr, "%lf seconds, %d images, %d total\n", sec(clock()-time), val.X.rows, curr);
        free_data(val);
    }
}
@@ -319,6 +459,8 @@
    else if(0==strcmp(argv[2], "train")) train_classifier(data, cfg, weights);
    else if(0==strcmp(argv[2], "test")) test_classifier(data, cfg, weights, layer);
    else if(0==strcmp(argv[2], "valid")) validate_classifier(data, cfg, weights);
    else if(0==strcmp(argv[2], "valid10")) validate_classifier_10(data, cfg, weights);
    else if(0==strcmp(argv[2], "validmulti")) validate_classifier_multi(data, cfg, weights);
}



 src/connected_layer.c

@@ -32,7 +32,7 @@
    //float scale = 1./sqrt(inputs);
    float scale = sqrt(2./inputs);
    for(i = 0; i < outputs*inputs; ++i){
        l.weights[i] = 2*scale*rand_uniform() - scale;
        l.weights[i] = scale*rand_uniform(-1, 1);
    }

    for(i = 0; i < outputs; ++i){

 src/convolutional_layer.c

@@ -65,7 +65,7 @@

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c));
    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = 2*scale*rand_uniform() - scale;
    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1);
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    l.out_h = out_h;

 src/crop_layer.c

@@ -19,33 +19,51 @@
    l.h = h;
    l.w = w;
    l.c = c;
    l.scale = (float)crop_height / h;
    l.flip = flip;
    l.angle = angle;
    l.saturation = saturation;
    l.exposure = exposure;
    l.crop_width = crop_width;
    l.crop_height = crop_height;
    l.out_w = crop_width;
    l.out_h = crop_height;
    l.out_c = c;
    l.inputs = l.w * l.h * l.c;
    l.outputs = l.out_w * l.out_h * l.out_c;
    l.output = calloc(crop_width*crop_height * c*batch, sizeof(float));
    l.output = calloc(l.outputs*batch, sizeof(float));
    #ifdef GPU
    l.output_gpu = cuda_make_array(l.output, crop_width*crop_height*c*batch);
    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
    l.rand_gpu   = cuda_make_array(0, l.batch*8);
    #endif
    return l;
}

/*
 * Resize a crop layer to a new input width and height.
 *
 * The output size is the new input size multiplied by l->scale; the input and
 * output element counts are recomputed and the output buffer (and, when GPU
 * is defined, its device copy) is reallocated to the new size.
 */
void resize_crop_layer(layer *l, int w, int h)
{
    // New input geometry.
    l->w = w;
    l->h = h;
    l->inputs = l->w * l->h * l->c;

    // Output geometry follows the input through the stored scale factor.
    l->out_w = l->scale*w;
    l->out_h = l->scale*h;
    l->outputs = l->out_h * l->out_w * l->out_c;

    // Resize the host output buffer to hold one output per batch entry.
    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
    #ifdef GPU
    // Replace the device buffer with one of the new size.
    cuda_free(l->output_gpu);
    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
    #endif
}


void forward_crop_layer(const crop_layer l, network_state state)
{
    int i,j,c,b,row,col;
    int index;
    int count = 0;
    int flip = (l.flip && rand()%2);
    int dh = rand()%(l.h - l.crop_height + 1);
    int dw = rand()%(l.w - l.crop_width + 1);
    int dh = rand()%(l.h - l.out_h + 1);
    int dw = rand()%(l.w - l.out_w + 1);
    float scale = 2;
    float trans = -1;
    if(l.noadjust){
@@ -54,13 +72,13 @@
    }
    if(!state.train){
        flip = 0;
        dh = (l.h - l.crop_height)/2;
        dw = (l.w - l.crop_width)/2;
        dh = (l.h - l.out_h)/2;
        dw = (l.w - l.out_w)/2;
    }
    for(b = 0; b < l.batch; ++b){
        for(c = 0; c < l.c; ++c){
            for(i = 0; i < l.crop_height; ++i){
                for(j = 0; j < l.crop_width; ++j){
            for(i = 0; i < l.out_h; ++i){
                for(j = 0; j < l.out_w; ++j){
                    if(flip){
                        col = l.w - dw - j - 1;    
                    }else{

 src/crop_layer.h

@@ -11,6 +11,7 @@
image get_crop_image(crop_layer l);
crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure);
void forward_crop_layer(const crop_layer l, network_state state);
void resize_crop_layer(layer *l, int w, int h);

#ifdef GPU
void forward_crop_layer_gpu(crop_layer l, network_state state);

 src/crop_layer_kernels.cu

@@ -198,9 +198,9 @@
    levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, layer.batch, layer.w, layer.h, state.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
    check_error(cudaPeekAtLastError());

    size = layer.batch*layer.c*layer.crop_width*layer.crop_height;
    size = layer.batch*layer.c*layer.out_w*layer.out_h;

    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.crop_height, layer.crop_width, state.train, layer.flip, radians, layer.output_gpu);
    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, state.train, layer.flip, radians, layer.output_gpu);
    check_error(cudaPeekAtLastError());

/*

 src/data.c

@@ -427,10 +427,10 @@
        int dw = (ow*jitter);
        int dh = (oh*jitter);

        int pleft  = (rand_uniform() * 2*dw - dw);
        int pright = (rand_uniform() * 2*dw - dw);
        int ptop   = (rand_uniform() * 2*dh - dh);
        int pbot   = (rand_uniform() * 2*dh - dh);
        int pleft  = rand_uniform(-dw, dw);
        int pright = rand_uniform(-dw, dw);
        int ptop   = rand_uniform(-dh, dh);
        int pbot   = rand_uniform(-dh, dh);

        int swidth =  ow - pleft - pright;
        int sheight = oh - ptop - pbot;
@@ -543,10 +543,10 @@
    int dw = w*jitter;
    int dh = h*jitter;

    int pleft  = (rand_uniform() * 2*dw - dw);
    int pright = (rand_uniform() * 2*dw - dw);
    int ptop   = (rand_uniform() * 2*dh - dh);
    int pbot   = (rand_uniform() * 2*dh - dh);
    int pleft  = rand_uniform(-dw, dw);
    int pright = rand_uniform(-dw, dw);
    int ptop   = rand_uniform(-dh, dh);
    int pbot   = rand_uniform(-dh, dh);

    int swidth =  w - pleft - pright;
    int sheight = h - ptop - pbot;
@@ -594,10 +594,10 @@
        int dw = ow/10;
        int dh = oh/10;

        int pleft  = (rand_uniform() * 2*dw - dw);
        int pright = (rand_uniform() * 2*dw - dw);
        int ptop   = (rand_uniform() * 2*dh - dh);
        int pbot   = (rand_uniform() * 2*dh - dh);
        int pleft  = rand_uniform(-dw, dw);
        int pright = rand_uniform(-dw, dw);
        int ptop   = rand_uniform(-dh, dh);
        int pbot   = rand_uniform(-dh, dh);

        int swidth =  ow - pleft - pright;
        int sheight = oh - ptop - pbot;

 src/dropout_layer.c

@@ -37,7 +37,7 @@
    int i;
    if (!state.train) return;
    for(i = 0; i < l.batch * l.inputs; ++i){
        float r = rand_uniform();
        float r = rand_uniform(0, 1);
        l.rand[i] = r;
        if(r < l.probability) state.input[i] = 0;
        else state.input[i] *= l.scale;

 src/image.c

@@ -390,6 +390,17 @@
    return out;
}

/*
 * Build a w x h x c image whose values are drawn as rand_normal()*.25 + .5.
 * The data buffer is allocated here.
 */
image make_random_image(int w, int h, int c)
{
    image noise = make_empty_image(w,h,c);
    int idx = 0;

    noise.data = calloc(h*w*c, sizeof(float));
    while(idx < w*h*c){
        noise.data[idx] = (rand_normal() * .25) + .5;
        ++idx;
    }
    return noise;
}

image float_to_image(int w, int h, int c, float *data)
{
    image out = make_empty_image(w,h,c);
@@ -692,6 +703,8 @@
    return resized;
}

#include "cuda.h"

void test_resize(char *filename)
{
    image im = load_image(filename, 0,0, 3);
@@ -709,14 +722,27 @@
    image exp5 = copy_image(im);
    exposure_image(exp5, .5);

    image r = resize_image(im, im.w/2, im.h/2);
    #ifdef GPU
    image r = resize_image(im, im.w, im.h);
    image black = make_image(im.w*2 + 3, im.h*2 + 3, 9);
    image black2 = make_image(im.w, im.h, 3);

    image black = make_image(im.w, im.h, im.c);
    shortcut_cpu(black.data, im.w, im.h, im.c, 1, 2, r.data, 1, r.c);
    float *r_gpu = cuda_make_array(r.data, r.w*r.h*r.c);
    float *black_gpu = cuda_make_array(black.data, black.w*black.h*black.c);
    float *black2_gpu = cuda_make_array(black2.data, black2.w*black2.h*black2.c);
    shortcut_gpu(3, r.w, r.h, 1, r_gpu, black.w, black.h, 3, black_gpu);
    //flip_image(r);
    //shortcut_gpu(3, r.w, r.h, 1, r.data, black.w, black.h, 3, black.data);

    shortcut_gpu(3, black.w, black.h, 3, black_gpu, black2.w, black2.h, 1, black2_gpu);
    cuda_pull_array(black_gpu, black.data, black.w*black.h*black.c);
    cuda_pull_array(black2_gpu, black2.data, black2.w*black2.h*black2.c);
    show_image_layers(black, "Black");
    show_image(black2, "Recreate");
    #endif

    show_image(im, "Original");
    show_image(gray, "Gray");
    show_image(black, "Black");
    show_image(sat2, "Saturation-2");
    show_image(sat5, "Saturation-.5");
    show_image(exp2, "Exposure-2");

 src/image.h

@@ -58,6 +58,7 @@
void print_image(image m);

image make_image(int w, int h, int c);
image make_random_image(int w, int h, int c);
image make_empty_image(int w, int h, int c);
image float_to_image(int w, int h, int c, float *data);
image copy_image(image p);

 src/imagenet.c

@@ -21,7 +21,7 @@
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = 1024;
    char **labels = get_labels("data/inet.labels.list");
    list *plist = get_paths("/data/imagenet/cls.train.list");
    list *plist = get_paths("data/inet.train.list");
    char **paths = (char **)list_to_array(plist);
    printf("%d\n", plist->size);
    int N = plist->size;
@@ -62,6 +62,11 @@
            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
            save_weights(net, buff);
        }
        if(*net.seen%1000 == 0){
            char buff[256];
            sprintf(buff, "%s/%s.backup",backup_directory,base);
            save_weights(net, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s.weights", backup_directory, base);

 src/layer.h

@@ -20,7 +20,8 @@
    NORMALIZATION,
    AVGPOOL,
    LOCAL,
    SHORTCUT
    SHORTCUT,
    ACTIVE
} LAYER_TYPE;

typedef enum{
@@ -46,8 +47,6 @@
    int side;
    int stride;
    int pad;
    int crop_width;
    int crop_height;
    int sqrt;
    int flip;
    int index;

 src/local_layer.c

@@ -55,7 +55,7 @@

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c));
    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = 2*scale*rand_uniform() - scale;
    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1,1);

    l.col_image = calloc(out_h*out_w*size*size*c, sizeof(float));
    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));

 src/maxpool_layer.c

@@ -51,6 +51,7 @@
    int stride = l->stride;
    l->h = h;
    l->w = w;
    l->inputs = h*w*l->c;

    l->out_w = (w-1)/stride + 1;
    l->out_h = (h-1)/stride + 1;

 src/network.c

@@ -10,6 +10,7 @@
#include "connected_layer.h"
#include "local_layer.h"
#include "convolutional_layer.h"
#include "activation_layer.h"
#include "deconvolutional_layer.h"
#include "detection_layer.h"
#include "normalization_layer.h"
@@ -73,6 +74,8 @@
    switch(a){
        case CONVOLUTIONAL:
            return "convolutional";
        case ACTIVE:
            return "activation";
        case LOCAL:
            return "local";
        case DECONVOLUTIONAL:
@@ -131,6 +134,8 @@
            forward_convolutional_layer(l, state);
        } else if(l.type == DECONVOLUTIONAL){
            forward_deconvolutional_layer(l, state);
        } else if(l.type == ACTIVE){
            forward_activation_layer(l, state);
        } else if(l.type == LOCAL){
            forward_local_layer(l, state);
        } else if(l.type == NORMALIZATION){
@@ -231,6 +236,8 @@
            backward_convolutional_layer(l, state);
        } else if(l.type == DECONVOLUTIONAL){
            backward_deconvolutional_layer(l, state);
        } else if(l.type == ACTIVE){
            backward_activation_layer(l, state);
        } else if(l.type == NORMALIZATION){
            backward_normalization_layer(l, state);
        } else if(l.type == MAXPOOL){
@@ -360,11 +367,12 @@
        layer l = net->layers[i];
        if(l.type == CONVOLUTIONAL){
            resize_convolutional_layer(&l, w, h);
        }else if(l.type == CROP){
            resize_crop_layer(&l, w, h);
        }else if(l.type == MAXPOOL){
            resize_maxpool_layer(&l, w, h);
        }else if(l.type == AVGPOOL){
            resize_avgpool_layer(&l, w, h);
            break;
        }else if(l.type == NORMALIZATION){
            resize_normalization_layer(&l, w, h);
        }else if(l.type == COST){
@@ -376,6 +384,7 @@
        net->layers[i] = l;
        w = l.out_w;
        h = l.out_h;
        if(l.type == AVGPOOL) break;
    }
    //fprintf(stderr, " Done!\n");
    return 0;

 src/network_kernels.cu

@@ -18,6 +18,7 @@
#include "connected_layer.h"
#include "detection_layer.h"
#include "convolutional_layer.h"
#include "activation_layer.h"
#include "deconvolutional_layer.h"
#include "maxpool_layer.h"
#include "avgpool_layer.h"
@@ -48,6 +49,8 @@
            forward_convolutional_layer_gpu(l, state);
        } else if(l.type == DECONVOLUTIONAL){
            forward_deconvolutional_layer_gpu(l, state);
        } else if(l.type == ACTIVE){
            forward_activation_layer_gpu(l, state);
        } else if(l.type == LOCAL){
            forward_local_layer_gpu(l, state);
        } else if(l.type == DETECTION){
@@ -97,6 +100,8 @@
            backward_convolutional_layer_gpu(l, state);
        } else if(l.type == DECONVOLUTIONAL){
            backward_deconvolutional_layer_gpu(l, state);
        } else if(l.type == ACTIVE){
            backward_activation_layer_gpu(l, state);
        } else if(l.type == LOCAL){
            backward_local_layer_gpu(l, state);
        } else if(l.type == MAXPOOL){

 src/nightmare.c

@@ -108,6 +108,69 @@

}

/*
 * Accumulate a neighbourhood-difference term into `update`.
 *
 * For every value of `recon`, each in-bounds neighbour within `num` rows and
 * `num` columns on the same channel adds
 *     lambda * (neighbour - value)
 * to the matching entry of update.data. `recon` itself is only read.
 */
void smooth(image recon, image update, float lambda, int num)
{
    int ch, y, x;
    for(ch = 0; ch < recon.c; ++ch){
        for(y = 0; y < recon.h; ++y){
            /* Clamp the row window to the image once per row. */
            int y_lo = y - num;
            int y_hi = y + num;
            if(y_lo < 0) y_lo = 0;
            if(y_hi > recon.h - 1) y_hi = recon.h - 1;

            for(x = 0; x < recon.w; ++x){
                /* Clamp the column window to the image. */
                int x_lo = x - num;
                int x_hi = x + num;
                if(x_lo < 0) x_lo = 0;
                if(x_hi > recon.w - 1) x_hi = recon.w - 1;

                int center = x + recon.w*(y + recon.h*ch);
                int ny, nx;
                for(ny = y_lo; ny <= y_hi; ++ny){
                    for(nx = x_lo; nx <= x_hi; ++nx){
                        int neighbour = nx + recon.w*(ny + recon.h*ch);
                        update.data[center] += lambda * (recon.data[neighbour] - recon.data[center]);
                    }
                }
            }
        }
    }
}

/*
 * Perform one update step on `recon`, driven by the network's backward pass
 * against `features`.
 *
 * net:         network run forward and backward.
 * features:    passed to the network as state.truth.
 * recon:       image being optimised; modified in place.
 * update:      running update buffer; modified in place and carried between
 *              calls by the caller.
 * rate:        multiplier applied to `update` when stepping `recon`.
 * momentum:    factor `update` is scaled by after the step.
 * lambda:      weight forwarded to smooth().
 * smooth_size: neighbourhood radius forwarded to smooth().
 */
void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size)
{
    /* Rescale and shift before the network pass; undone at the end of the
     * function. Presumably maps [0,1] pixel values to [-1,1] -- confirm
     * against scale_image/translate_image. */
    scale_image(recon, 2);
    translate_image(recon, -1);

    /* Scratch image that receives the input delta from the backward pass. */
    image delta = make_image(recon.w, recon.h, recon.c);

    network_state state = {0};
#ifdef GPU
    /* GPU build: stage input, delta and truth in device arrays. */
    state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
    state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
    state.truth = cuda_make_array(features, get_network_output_size(net));

    forward_network_gpu(net, state);
    backward_network_gpu(net, state);

    /* Copy the computed delta back into the host-side scratch image. */
    cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);

    /* Release the temporary device arrays created above. */
    cuda_free(state.input);
    cuda_free(state.delta);
    cuda_free(state.truth);
#else
    /* CPU build: the network works directly on the host buffers. */
    state.input = recon.data;
    state.delta = delta.data;
    state.truth = features;

    forward_network(net, state);
    backward_network(net, state);
#endif

    /* Fold the delta into the running update, then add the smoothing term.
     * NOTE(review): assumes axpy_cpu(n, a, x, 1, y, 1) computes y += a*x -- confirm. */
    axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
    smooth(recon, update, lambda, smooth_size);

    /* Step recon by rate * update, then scale update by momentum so it
     * carries over to the next call.
     * NOTE(review): assumes scal_cpu scales the array in place -- confirm. */
    axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
    scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);

    /* Undo the rescale/shift applied at the top, then constrain the result
     * (presumably clamps to the valid pixel range -- confirm). */
    translate_image(recon, 1);
    scale_image(recon, .5);
    constrain_image(recon);
    free_image(delta);
}


void run_nightmare(int argc, char **argv)
{
@@ -131,7 +194,11 @@
    float rate = find_float_arg(argc, argv, "-rate", .04);
    float thresh = find_float_arg(argc, argv, "-thresh", 1.);
    float rotate = find_float_arg(argc, argv, "-rotate", 0);
    float momentum = find_float_arg(argc, argv, "-momentum", .9);
    float lambda = find_float_arg(argc, argv, "-lambda", .01);
    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
    int reconstruct = find_arg(argc, argv, "-reconstruct");
    int smooth_size = find_int_arg(argc, argv, "-smooth", 1);

    network net = parse_network_cfg(cfg);
    load_weights(&net, weights);
@@ -151,6 +218,19 @@
        im = resized;
    }

    float *features;
    image update;
    if (reconstruct){
        resize_network(&net, im.w, im.h);
        int size = get_network_output_size(net);
        features = calloc(size, sizeof(float));
        float *out = network_predict(net, im.data);
        copy_cpu(size, out, 1, features, 1);
        free_image(im);
        im = make_random_image(im.w, im.h, im.c);
        update = make_image(im.w, im.h, im.c);
    }

    int e;
    int n;
    for(e = 0; e < rounds; ++e){
@@ -159,10 +239,18 @@
        for(n = 0; n < iters; ++n){  
            fprintf(stderr, "%d, ", n);
            fflush(stderr);
            if(reconstruct){
                reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size);
                show_image(im, "reconstruction");
                #ifdef OPENCV
                cvWaitKey(10);
                #endif
            }else{
            int layer = max_layer + rand()%range - range/2;
            int octave = rand()%octaves;
            optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
        }
        }
        fprintf(stderr, "done\n");
        if(0){
            image g = grayscale_image(im);

 src/parser.c

@@ -7,6 +7,7 @@
#include "crop_layer.h"
#include "cost_layer.h"
#include "convolutional_layer.h"
#include "activation_layer.h"
#include "normalization_layer.h"
#include "deconvolutional_layer.h"
#include "connected_layer.h"
@@ -29,6 +30,7 @@

int is_network(section *s);
int is_convolutional(section *s);
int is_activation(section *s);
int is_local(section *s);
int is_deconvolutional(section *s);
int is_connected(section *s);
@@ -301,10 +303,31 @@
    layer from = net.layers[index];

    layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c);

    char *activation_s = option_find_str(options, "activation", "linear");
    ACTIVATION activation = get_activation(activation_s);
    s.activation = activation;
    return s;
}


/*
 * Build an activation layer from an [activation] config section.
 *
 * options: key/value options of the section; "activation" selects the
 *          function and defaults to "linear".
 * params:  current size parameters; batch and inputs size the layer, and
 *          h/w/c are copied to both the input and output dimensions.
 */
layer parse_activation(list *options, size_params params)
{
    char *name = option_find_str(options, "activation", "linear");
    layer l = make_activation_layer(params.batch, params.inputs, get_activation(name));

    /* Input dimensions. */
    l.h = params.h;
    l.w = params.w;
    l.c = params.c;

    /* Output dimensions are set to the same values. */
    l.out_h = params.h;
    l.out_w = params.w;
    l.out_c = params.c;

    return l;
}

route_layer parse_route(list *options, size_params params, network net)
{
    char *l = option_find(options, "layers");   
@@ -447,6 +470,8 @@
            l = parse_convolutional(options, params);
        }else if(is_local(s)){
            l = parse_local(options, params);
        }else if(is_activation(s)){
            l = parse_activation(options, params);
        }else if(is_deconvolutional(s)){
            l = parse_deconvolutional(options, params);
        }else if(is_connected(s)){
@@ -530,6 +555,10 @@
    return (strcmp(s->type, "[conv]")==0
            || strcmp(s->type, "[convolutional]")==0);
}
/* Returns 1 when the section header is exactly "[activation]", otherwise 0. */
int is_activation(section *s)
{
    return !strcmp(s->type, "[activation]");
}
int is_network(section *s)
{
    return (strcmp(s->type, "[net]")==0

 src/shortcut_layer.c

@@ -10,22 +10,15 @@
    layer l = {0};
    l.type = SHORTCUT;
    l.batch = batch;
    l.w = w;
    l.h = h;
    l.c = c;
    l.w = w2;
    l.h = h2;
    l.c = c2;
    l.out_w = w;
    l.out_h = h;
    l.out_c = c;
    l.outputs = w*h*c;
    l.inputs = w*h*c;
    int stride = w2 / w;
    l.inputs = l.outputs;

    assert(stride * w == w2);
    assert(stride * h == h2);
    assert(c >= c2);

    l.stride = stride;
    l.n = c2;
    l.index = index;

    l.delta =  calloc(l.outputs*batch, sizeof(float));
@@ -40,25 +33,29 @@
void forward_shortcut_layer(const layer l, network_state state)
{
    copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    shortcut_cpu(l.output, l.w, l.h, l.c, l.batch, 1, state.net.layers[l.index].output, l.stride, l.n);
    shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
    activate_array(l.output, l.outputs*l.batch, l.activation);
}

void backward_shortcut_layer(const layer l, network_state state)
{
    copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
    shortcut_cpu(state.net.layers[l.index].delta, l.w*l.stride, l.h*l.stride, l.n, l.batch, l.stride, l.delta, 1, l.c);
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, state.delta, 1);
    shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta);
}

#ifdef GPU
void forward_shortcut_layer_gpu(const layer l, network_state state)
{
    copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
    shortcut_gpu(l.output_gpu, l.w, l.h, l.c, l.batch, 1, state.net.layers[l.index].output_gpu, l.stride, l.n);
    shortcut_gpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}

void backward_shortcut_layer_gpu(const layer l, network_state state)
{
    copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
    shortcut_gpu(state.net.layers[l.index].delta_gpu, l.w*l.stride, l.h*l.stride, l.n, l.batch, l.stride, l.delta_gpu, 1, l.c);
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
    axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, state.delta, 1);
    shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, state.net.layers[l.index].delta_gpu);
}
#endif

 src/utils.c

@@ -485,9 +485,9 @@
   }
 */

float rand_uniform()
float rand_uniform(float min, float max)
{
    return (float)rand()/RAND_MAX;
    return ((float)rand()/RAND_MAX * (max - min)) + min;
}

float **one_hot_encode(float *a, int n, int k)

 src/utils.h

@@ -34,7 +34,7 @@
float constrain(float min, float max, float a);
float mse_array(float *a, int n);
float rand_normal();
float rand_uniform();
float rand_uniform(float min, float max);
float sum_array(float *a, int n);
float mean_array(float *a, int n);
void mean_arrays(float **a, int n, int els, float *avg);

 src/yolo.c

@@ -14,7 +14,7 @@

void train_yolo(char *cfgfile, char *weightfile)
{
    char *train_images = "data/voc.0712.trainval";
    char *train_images = "/data/voc/train.txt";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    data_seed = time(0);

			@@ -34,7 +34,7 @@
			LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
			endif

			OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o
			OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o
			ifeq ($(GPU), 1)
			OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o yolo_kernels.o coco_kernels.o
			endif

			@@ -1,13 +1,16 @@
			[net]
			batch=256
			subdivisions=16
			batch=128
			subdivisions=8
			height=256
			width=256
			channels=3
			momentum=0.9
			decay=0.0005
			decay=0.0001

			learning_rate=0.02
			learning_rate=0.1
			policy=poly
			power=4
			max_batches=500000

			[crop]
			crop_height=224
			@@ -57,10 +60,22 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[route]
			layers=-4

			[convolutional]
			batch_normalize=1
			size=1
			stride=1
			pad=1
			activation=linear
			filters=256

			[shortcut]
			from = -4
			from = -3
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -84,11 +99,13 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky

			[convolutional]
			batch_normalize=1
			filters=64
			@@ -111,11 +128,13 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			##### Conv 3_x #####

			@@ -141,10 +160,24 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear


			[route]
			layers=-4

			[convolutional]
			batch_normalize=1
			size=1
			stride=2
			pad=1
			activation=linear
			filters=512

			[shortcut]
			from = -4
			from = -3
			activation=leaky


			[convolutional]
			batch_normalize=1
			@@ -168,11 +201,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=128
			@@ -195,11 +231,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=128
			@@ -222,11 +261,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=128
			@@ -249,11 +291,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=128
			@@ -276,11 +321,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=128
			@@ -303,11 +351,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=128
			@@ -330,11 +381,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky



			##### Conv 4_x #####

			@@ -360,10 +414,24 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear


			[route]
			layers=-4

			[convolutional]
			batch_normalize=1
			size=1
			stride=2
			pad=1
			activation=linear
			filters=1024

			[shortcut]
			from = -4
			from = -3
			activation=leaky


			[convolutional]
			batch_normalize=1
			@@ -387,11 +455,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -414,11 +485,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -441,11 +515,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -468,11 +545,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -495,11 +575,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -522,11 +605,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -549,11 +635,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -576,11 +665,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -603,11 +695,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -630,11 +725,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -657,11 +755,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -684,11 +785,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -711,11 +815,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -738,11 +845,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -765,11 +875,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -792,11 +905,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -819,11 +935,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -846,11 +965,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -873,11 +995,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -900,11 +1025,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -927,11 +1055,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -954,11 +1085,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -981,11 +1115,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1008,11 +1145,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1035,11 +1175,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1062,11 +1205,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1089,11 +1235,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1116,11 +1265,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1143,11 +1295,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1170,11 +1325,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1197,11 +1355,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1224,11 +1385,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1251,11 +1415,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1278,11 +1445,14 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			[convolutional]
			batch_normalize=1
			filters=256
			@@ -1305,11 +1475,13 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky


			##### Conv 5_x #####

			@@ -1335,10 +1507,24 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear


			[route]
			layers=-4

			[convolutional]
			batch_normalize=1
			size=1
			stride=2
			pad=1
			activation=linear
			filters=2048

			[shortcut]
			from = -4
			from = -3
			activation=leaky


			[convolutional]
			batch_normalize=1
			@@ -1362,11 +1548,13 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky

			[convolutional]
			batch_normalize=1
			filters=512
			@@ -1389,11 +1577,13 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4

			activation=leaky

			[avgpool]

			[connected]

			@@ -12,11 +12,6 @@
			power=4
			max_batches=500000

			#policy=sigmoid
			#gamma=.00008
			#step=100000
			#max_batches=200000

			[crop]
			crop_height=224
			crop_width=224

			@@ -1,13 +1,18 @@
			[net]
			batch=128
			subdivisions=4
			subdivisions=8
			height=256
			width=256
			channels=3
			momentum=0.9
			decay=0.0005
			decay=0.0001

			learning_rate=0.01
			learning_rate=0.05
			policy=poly
			power=4
			max_batches=500000



			[crop]
			crop_height=224
			@@ -57,10 +62,22 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[route]
			layers=-4

			[convolutional]
			batch_normalize=1
			size=1
			stride=1
			pad=1
			activation=linear
			filters=256

			[shortcut]
			from = -4
			from = -3
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -84,10 +101,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -111,10 +129,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky


			##### Conv 3_x #####
			@@ -141,10 +160,22 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[route]
			layers=-4

			[convolutional]
			batch_normalize=1
			size=1
			stride=2
			pad=1
			activation=linear
			filters=512

			[shortcut]
			from = -4
			from = -3
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -168,10 +199,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -195,10 +227,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -222,10 +255,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky


			##### Conv 4_x #####
			@@ -252,10 +286,23 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[route]
			layers=-4

			[convolutional]
			batch_normalize=1
			size=1
			stride=2
			pad=1
			activation=linear
			filters=1024

			[shortcut]
			from = -4
			from = -3
			activation=leaky


			[convolutional]
			batch_normalize=1
			@@ -279,10 +326,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -306,10 +354,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -333,10 +382,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -360,10 +410,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -387,10 +438,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky


			##### Conv 5_x #####
			@@ -417,10 +469,24 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear


			[route]
			layers=-4

			[convolutional]
			batch_normalize=1
			size=1
			stride=2
			pad=1
			activation=linear
			filters=2048

			[shortcut]
			from = -4
			from = -3
			activation=leaky


			[convolutional]
			batch_normalize=1
			@@ -444,10 +510,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky

			[convolutional]
			batch_normalize=1
			@@ -471,10 +538,11 @@
			size=1
			stride=1
			pad=1
			activation=leaky
			activation=linear

			[shortcut]
			from = -4
			activation=leaky

			[avgpool]

New file
			@@ -0,0 +1,58 @@
			#include "activation_layer.h"
			#include "utils.h"
			#include "cuda.h"
			#include "blas.h"
			#include "gemm.h"

			#include <math.h>
			#include <stdio.h>
			#include <stdlib.h>
			#include <string.h>

			/*
			 * Construct a standalone activation layer (type ACTIVE).
			 *
			 * batch      - number of items per batch
			 * inputs     - number of values per item; the layer has the same
			 *              number of outputs as inputs
			 * activation - activation function stored in the layer
			 *
			 * Allocates zeroed host buffers for output and delta (batch*inputs
			 * floats each) and, when built with GPU, matching device arrays.
			 * Returns the layer by value.
			 */
			layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
			{
			    layer l = {0};
			    l.type = ACTIVE;

			    l.inputs = inputs;
			    l.outputs = inputs;
			    l.batch = batch;

			    /* One float per input value per batch item. */
			    l.output = calloc(batch*inputs, sizeof(float));
			    l.delta = calloc(batch*inputs, sizeof(float));

			#ifdef GPU
			    l.output_gpu = cuda_make_array(l.output, inputs*batch);
			    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
			#endif
			    l.activation = activation;
			    fprintf(stderr, "Activation Layer: %d inputs\n", inputs);
			    return l;
			}

			/*
			 * CPU forward pass: copy the incoming activations into the layer's
			 * output buffer, then apply the layer's activation in place.
			 */
			void forward_activation_layer(layer l, network_state state)
			{
			    const int count = l.outputs * l.batch;

			    copy_cpu(count, state.input, 1, l.output, 1);
			    activate_array(l.output, count, l.activation);
			}

			/*
			 * CPU backward pass: apply the activation gradient to l.delta using
			 * the stored outputs, then copy l.delta into state.delta.
			 */
			void backward_activation_layer(layer l, network_state state)
			{
			    const int count = l.outputs * l.batch;

			    gradient_array(l.output, count, l.activation, l.delta);
			    copy_cpu(count, l.delta, 1, state.delta, 1);
			}

			#ifdef GPU

			/*
			 * GPU forward pass: copy state.input into l.output_gpu, then apply
			 * the layer's activation to l.output_gpu in place.
			 */
			void forward_activation_layer_gpu(layer l, network_state state)
			{
			    const int count = l.outputs * l.batch;

			    copy_ongpu(count, state.input, 1, l.output_gpu, 1);
			    activate_array_ongpu(l.output_gpu, count, l.activation);
			}

			/*
			 * GPU backward pass: apply the activation gradient to l.delta_gpu
			 * using l.output_gpu, then copy l.delta_gpu into state.delta.
			 */
			void backward_activation_layer_gpu(layer l, network_state state)
			{
			    const int count = l.outputs * l.batch;

			    gradient_array_ongpu(l.output_gpu, count, l.activation, l.delta_gpu);
			    copy_ongpu(count, l.delta_gpu, 1, state.delta, 1);
			}
			#endif

New file
			@@ -0,0 +1,19 @@
			/* Public interface of the standalone activation layer. */
			#ifndef ACTIVATION_LAYER_H
			#define ACTIVATION_LAYER_H

			#include "activations.h"
			#include "layer.h"
			#include "network.h"

			/* Build an activation layer for `batch` items of `inputs` values each. */
			layer make_activation_layer(int batch, int inputs, ACTIVATION activation);

			/* CPU forward and backward passes. */
			void forward_activation_layer(layer l, network_state state);
			void backward_activation_layer(layer l, network_state state);

			/* GPU variants are only declared when the build defines GPU. */
			#ifdef GPU
			void forward_activation_layer_gpu(layer l, network_state state);
			void backward_activation_layer_gpu(layer l, network_state state);
			#endif

			#endif

			@@ -28,8 +28,9 @@

			/*
			 * Record a new input width/height on the average-pool layer and
			 * recompute its input count as h*w*c.
			 */
			void resize_avgpool_layer(avgpool_layer *l, int w, int h)
			{
			    l->w = w;
			    l->h = h;
			    l->inputs = h*w*l->c;
			}

			void forward_avgpool_layer(const avgpool_layer l, network_state state)

			@@ -1,15 +1,26 @@
			#include "blas.h"
			#include "math.h"
			#include <assert.h>

			void shortcut_cpu(float out, int w, int h, int c, int batch, int sample, float add, int stride, int c2)
			void shortcut_cpu(int batch, int w1, int h1, int c1, float add, int w2, int h2, int c2, float out)
			{
			int stride = w1/w2;
			int sample = w2/w1;
			assert(stride == h1/h2);
			assert(sample == h2/h1);
			if(stride < 1) stride = 1;
			if(sample < 1) sample = 1;
			int minw = (w1 < w2) ? w1 : w2;
			int minh = (h1 < h2) ? h1 : h2;
			int minc = (c1 < c2) ? c1 : c2;

			int i,j,k,b;
			for(b = 0; b < batch; ++b){
			for(k = 0; k < c && k < c2; ++k){
			for(j = 0; j < h/sample; ++j){
			for(i = 0; i < w/sample; ++i){
			int out_index = isample + w(jsample + h(k + c*b));
			int add_index = bwstride/samplehstride/samplec2 + istride + wstride/sample(jstride + hstride/sample*k);
			for(k = 0; k < minc; ++k){
			for(j = 0; j < minh; ++j){
			for(i = 0; i < minw; ++i){
			int out_index = isample + w2(jsample + h2(k + c2*b));
			int add_index = istride + w1(jstride + h1(k + c1*b));
			out[out_index] += add[add_index];
			}
			}

			@@ -16,7 +16,7 @@
			void fill_cpu(int N, float ALPHA, float * X, int INCX);
			float dot_cpu(int N, float X, int INCX, float Y, int INCY);
			void test_gpu_blas();
			void shortcut_cpu(float out, int w, int h, int c, int batch, int sample, float add, int stride, int c2);
			void shortcut_cpu(int batch, int w1, int h1, int c1, float add, int w2, int h2, int c2, float out);

			void mean_cpu(float x, int batch, int filters, int spatial, float mean);
			void variance_cpu(float x, float mean, int batch, int filters, int spatial, float *variance);
			@@ -45,6 +45,6 @@

			void fast_variance_gpu(float x, float mean, int batch, int filters, int spatial, float *variance);
			void fast_mean_gpu(float x, int batch, int filters, int spatial, float mean);
			void shortcut_gpu(float out, int w, int h, int c, int batch, int sample, float add, int stride, int c2);
			void shortcut_gpu(int batch, int w1, int h1, int c1, float add, int w2, int h2, int c2, float out);
			#endif
			#endif

			@@ -1,6 +1,7 @@
			#include "cuda_runtime.h"
			#include "curand.h"
			#include "cublas_v2.h"
			#include <assert.h>

			extern "C" {
			#include "blas.h"
			@@ -374,26 +375,37 @@
			check_error(cudaPeekAtLastError());
			}

			__global__ void shortcut_kernel(int size, float out, int w, int h, int c, int batch, int sample, float add, int stride, int c2, int min_c)
			__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float add, int w2, int h2, int c2, float out)
			{
			int id = (blockIdx.x + blockIdx.ygridDim.x) blockDim.x + threadIdx.x;
			if (id >= size) return;
			int i = id % (w/sample);
			id /= (w/sample);
			int j = id % (h/sample);
			id /= (h/sample);
			int k = id % min_c;
			id /= min_c;
			int b = id;
			int out_index = isample + w(jsample + h(k + c*b));
			int add_index = bwstride/samplehstride/samplec2 + istride + wstride/sample(jstride + hstride/sample*k);
			int i = id % minw;
			id /= minw;
			int j = id % minh;
			id /= minh;
			int k = id % minc;
			id /= minc;
			int b = id % batch;

			int out_index = isample + w2(jsample + h2(k + c2*b));
			int add_index = istride + w1(jstride + h1(k + c1*b));
			out[out_index] += add[add_index];
			}

			extern "C" void shortcut_gpu(float out, int w, int h, int c, int batch, int sample, float add, int stride, int c2)
			extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float add, int w2, int h2, int c2, float out)
			{
			int min_c = (c < c2) ? c : c2;
			int size = batch * w/sample * h/sample * min_c;
			shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, out, w, h, c, batch, sample, add, stride, c2, min_c);
			int minw = (w1 < w2) ? w1 : w2;
			int minh = (h1 < h2) ? h1 : h2;
			int minc = (c1 < c2) ? c1 : c2;

			int stride = w1/w2;
			int sample = w2/w1;
			assert(stride == h1/h2);
			assert(sample == h2/h1);
			if(stride < 1) stride = 1;
			if(sample < 1) sample = 1;

			int size = batch * minw * minh * minc;
			shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, out);
			check_error(cudaPeekAtLastError());
			}

			@@ -2,6 +2,7 @@
			#include "utils.h"
			#include "parser.h"
			#include "option_list.h"
			#include "blas.h"

			#ifdef OPENCV
			#include "opencv2/highgui/highgui_c.h"
			@@ -183,6 +184,145 @@
			}
			}

			/*
			 * Validate a classifier by summing predictions over ten views of
			 * each image.
			 *
			 * datacfg    - data config; read for "labels", "valid", "classes", "top"
			 * filename   - network cfg file
			 * weightfile - optional weights file (skipped when NULL)
			 *
			 * Each image is loaded at 256x256. Five crop_image() windows at
			 * offsets (+/-16, +/-16) and (0, 0) are taken from the image and
			 * again from its flipped copy. The ten predictions are summed, and
			 * the running top-1 / top-k accuracy is printed after every image.
			 * The true class is the first label that occurs as a substring of
			 * the image path, or -1 when none matches.
			 */
			void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
			{
			    int i, j;
			    network net = parse_network_cfg(filename);
			    set_batch_network(&net, 1);
			    if(weightfile){
			        load_weights(&net, weightfile);
			    }
			    srand(time(0));

			    list *options = read_data_cfg(datacfg);

			    char *label_list = option_find_str(options, "labels", "data/labels.list");
			    char *valid_list = option_find_str(options, "valid", "data/train.list");
			    int classes = option_find_int(options, "classes", 2);
			    int topk = option_find_int(options, "top", 1);

			    char **labels = get_labels(label_list);
			    list *plist = get_paths(valid_list);

			    char **paths = (char **)list_to_array(plist);
			    int m = plist->size;   /* read the size before the list is freed */
			    free_list(plist);

			    float avg_acc = 0;
			    float avg_topk = 0;
			    int *indexes = calloc(topk, sizeof(int));

			    for(i = 0; i < m; ++i){
			        int class = -1;
			        char *path = paths[i];
			        for(j = 0; j < classes; ++j){
			            if(strstr(path, labels[j])){
			                class = j;
			                break;
			            }
			        }
			        image im = load_image_color(paths[i], 256, 256);
			        image images[10];
			        images[0] = crop_image(im, -16, -16, 256, 256);
			        images[1] = crop_image(im, 16, -16, 256, 256);
			        images[2] = crop_image(im, 0, 0, 256, 256);
			        images[3] = crop_image(im, -16, 16, 256, 256);
			        images[4] = crop_image(im, 16, 16, 256, 256);
			        flip_image(im);
			        images[5] = crop_image(im, -16, -16, 256, 256);
			        images[6] = crop_image(im, 16, -16, 256, 256);
			        images[7] = crop_image(im, 0, 0, 256, 256);
			        images[8] = crop_image(im, -16, 16, 256, 256);
			        images[9] = crop_image(im, 16, 16, 256, 256);

			        /* Sum the per-view predictions into pred. */
			        float *pred = calloc(classes, sizeof(float));
			        for(j = 0; j < 10; ++j){
			            float *p = network_predict(net, images[j].data);
			            axpy_cpu(classes, 1, p, 1, pred, 1);
			            free_image(images[j]);
			        }
			        free_image(im);
			        top_k(pred, classes, topk, indexes);
			        free(pred);
			        if(indexes[0] == class) avg_acc += 1;
			        for(j = 0; j < topk; ++j){
			            if(indexes[j] == class) avg_topk += 1;
			        }

			        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
			    }
			    free(indexes);   /* scratch buffer for top_k, owned by this function */
			}

			/*
			 * Validate a classifier by summing predictions over several image
			 * scales.
			 *
			 * datacfg    - data config; read for "labels", "valid", "classes", "top"
			 * filename   - network cfg file
			 * weightfile - optional weights file (skipped when NULL)
			 *
			 * For every scale in `scales`, the shorter image side is set to that
			 * scale and the other side is scaled proportionally (integer math).
			 * The network is resized to match, and predictions for the resized
			 * image and its flipped copy are added to the running sum. The
			 * running top-1 / top-k accuracy is printed after every image. The
			 * true class is the first label that occurs as a substring of the
			 * image path, or -1 when none matches.
			 */
			void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
			{
			    int i, j;
			    network net = parse_network_cfg(filename);
			    set_batch_network(&net, 1);
			    if(weightfile){
			        load_weights(&net, weightfile);
			    }
			    srand(time(0));

			    list *options = read_data_cfg(datacfg);

			    char *label_list = option_find_str(options, "labels", "data/labels.list");
			    char *valid_list = option_find_str(options, "valid", "data/train.list");
			    int classes = option_find_int(options, "classes", 2);
			    int topk = option_find_int(options, "top", 1);

			    char **labels = get_labels(label_list);
			    list *plist = get_paths(valid_list);
			    int scales[] = {224, 256, 384, 480, 640};
			    int nscales = sizeof(scales)/sizeof(scales[0]);

			    char **paths = (char **)list_to_array(plist);
			    int m = plist->size;   /* read the size before the list is freed */
			    free_list(plist);

			    float avg_acc = 0;
			    float avg_topk = 0;
			    int *indexes = calloc(topk, sizeof(int));

			    for(i = 0; i < m; ++i){
			        int class = -1;
			        char *path = paths[i];
			        for(j = 0; j < classes; ++j){
			            if(strstr(path, labels[j])){
			                class = j;
			                break;
			            }
			        }
			        float *pred = calloc(classes, sizeof(float));
			        image im = load_image_color(paths[i], 0, 0);
			        for(j = 0; j < nscales; ++j){
			            int w, h;
			            /* Shorter side becomes scales[j]; keep the aspect ratio. */
			            if(im.w < im.h){
			                w = scales[j];
			                h = (im.h*w)/im.w;
			            } else {
			                h = scales[j];
			                w = (im.w * h) / im.h;
			            }
			            resize_network(&net, w, h);
			            image r = resize_image(im, w, h);
			            float *p = network_predict(net, r.data);
			            axpy_cpu(classes, 1, p, 1, pred, 1);
			            flip_image(r);
			            p = network_predict(net, r.data);
			            axpy_cpu(classes, 1, p, 1, pred, 1);
			            free_image(r);
			        }
			        free_image(im);
			        top_k(pred, classes, topk, indexes);
			        free(pred);
			        if(indexes[0] == class) avg_acc += 1;
			        for(j = 0; j < topk; ++j){
			            if(indexes[j] == class) avg_topk += 1;
			        }

			        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
			    }
			    free(indexes);   /* scratch buffer for top_k, owned by this function */
			}

			void predict_classifier(char datacfg, char cfgfile, char weightfile, char filename)
			{
			network net = parse_network_cfg(cfgfile);
			@@ -296,7 +436,7 @@

			free_matrix(pred);

			fprintf(stderr, "%lf seconds, %d images\n", sec(clock()-time), val.X.rows);
			fprintf(stderr, "%lf seconds, %d images, %d total\n", sec(clock()-time), val.X.rows, curr);
			free_data(val);
			}
			}
			@@ -319,6 +459,8 @@
			else if(0==strcmp(argv[2], "train")) train_classifier(data, cfg, weights);
			else if(0==strcmp(argv[2], "test")) test_classifier(data, cfg, weights, layer);
			else if(0==strcmp(argv[2], "valid")) validate_classifier(data, cfg, weights);
			else if(0==strcmp(argv[2], "valid10")) validate_classifier_10(data, cfg, weights);
			else if(0==strcmp(argv[2], "validmulti")) validate_classifier_multi(data, cfg, weights);
			}

			@@ -32,7 +32,7 @@
			//float scale = 1./sqrt(inputs);
			float scale = sqrt(2./inputs);
			for(i = 0; i < outputs*inputs; ++i){
			l.weights[i] = 2scalerand_uniform() - scale;
			l.weights[i] = scale*rand_uniform(-1, 1);
			}

			for(i = 0; i < outputs; ++i){

			@@ -65,7 +65,7 @@

			// float scale = 1./sqrt(sizesizec);
			float scale = sqrt(2./(sizesizec));
			for(i = 0; i < cnsizesize; ++i) l.filters[i] = 2scale*rand_uniform() - scale;
			for(i = 0; i < cnsizesize; ++i) l.filters[i] = scalerand_uniform(-1, 1);
			int out_h = convolutional_out_height(l);
			int out_w = convolutional_out_width(l);
			l.out_h = out_h;

			@@ -19,33 +19,51 @@
			l.h = h;
			l.w = w;
			l.c = c;
			l.scale = (float)crop_height / h;
			l.flip = flip;
			l.angle = angle;
			l.saturation = saturation;
			l.exposure = exposure;
			l.crop_width = crop_width;
			l.crop_height = crop_height;
			l.out_w = crop_width;
			l.out_h = crop_height;
			l.out_c = c;
			l.inputs = l.w * l.h * l.c;
			l.outputs = l.out_w * l.out_h * l.out_c;
			l.output = calloc(crop_widthcrop_height c*batch, sizeof(float));
			l.output = calloc(l.outputs*batch, sizeof(float));
			#ifdef GPU
			l.output_gpu = cuda_make_array(l.output, crop_widthcrop_heightc*batch);
			l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
			l.rand_gpu = cuda_make_array(0, l.batch*8);
			#endif
			return l;
			}

			/*
			 * Resize a crop layer for a new input width/height.
			 *
			 * The output size is the new input size multiplied by l->scale.
			 * Input and output counts are recomputed and the host output buffer
			 * is reallocated to batch*outputs floats. With GPU, the device
			 * output array is freed and recreated at the new size.
			 */
			void resize_crop_layer(layer *l, int w, int h)
			{
			    l->w = w;
			    l->h = h;

			    l->out_w = l->scale*w;
			    l->out_h = l->scale*h;

			    l->inputs = l->w * l->h * l->c;
			    l->outputs = l->out_h * l->out_w * l->out_c;

			    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
			#ifdef GPU
			    cuda_free(l->output_gpu);
			    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
			#endif
			}


			void forward_crop_layer(const crop_layer l, network_state state)
			{
			int i,j,c,b,row,col;
			int index;
			int count = 0;
			int flip = (l.flip && rand()%2);
			int dh = rand()%(l.h - l.crop_height + 1);
			int dw = rand()%(l.w - l.crop_width + 1);
			int dh = rand()%(l.h - l.out_h + 1);
			int dw = rand()%(l.w - l.out_w + 1);
			float scale = 2;
			float trans = -1;
			if(l.noadjust){
			@@ -54,13 +72,13 @@
			}
			if(!state.train){
			flip = 0;
			dh = (l.h - l.crop_height)/2;
			dw = (l.w - l.crop_width)/2;
			dh = (l.h - l.out_h)/2;
			dw = (l.w - l.out_w)/2;
			}
			for(b = 0; b < l.batch; ++b){
			for(c = 0; c < l.c; ++c){
			for(i = 0; i < l.crop_height; ++i){
			for(j = 0; j < l.crop_width; ++j){
			for(i = 0; i < l.out_h; ++i){
			for(j = 0; j < l.out_w; ++j){
			if(flip){
			col = l.w - dw - j - 1;
			}else{

			@@ -11,6 +11,7 @@
			image get_crop_image(crop_layer l);
			crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure);
			void forward_crop_layer(const crop_layer l, network_state state);
			void resize_crop_layer(layer *l, int w, int h);

			#ifdef GPU
			void forward_crop_layer_gpu(crop_layer l, network_state state);

			@@ -198,9 +198,9 @@
			levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, layer.batch, layer.w, layer.h, state.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
			check_error(cudaPeekAtLastError());

			size = layer.batchlayer.clayer.crop_width*layer.crop_height;
			size = layer.batchlayer.clayer.out_w*layer.out_h;

			forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.crop_height, layer.crop_width, state.train, layer.flip, radians, layer.output_gpu);
			forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, state.train, layer.flip, radians, layer.output_gpu);
			check_error(cudaPeekAtLastError());

			/*

			@@ -427,10 +427,10 @@
			int dw = (ow*jitter);
			int dh = (oh*jitter);

			int pleft = (rand_uniform() * 2*dw - dw);
			int pright = (rand_uniform() * 2*dw - dw);
			int ptop = (rand_uniform() * 2*dh - dh);
			int pbot = (rand_uniform() * 2*dh - dh);
			int pleft = rand_uniform(-dw, dw);
			int pright = rand_uniform(-dw, dw);
			int ptop = rand_uniform(-dh, dh);
			int pbot = rand_uniform(-dh, dh);

			int swidth = ow - pleft - pright;
			int sheight = oh - ptop - pbot;
			@@ -543,10 +543,10 @@
			int dw = w*jitter;
			int dh = h*jitter;

			int pleft = (rand_uniform() * 2*dw - dw);
			int pright = (rand_uniform() * 2*dw - dw);
			int ptop = (rand_uniform() * 2*dh - dh);
			int pbot = (rand_uniform() * 2*dh - dh);
			int pleft = rand_uniform(-dw, dw);
			int pright = rand_uniform(-dw, dw);
			int ptop = rand_uniform(-dh, dh);
			int pbot = rand_uniform(-dh, dh);

			int swidth = w - pleft - pright;
			int sheight = h - ptop - pbot;
			@@ -594,10 +594,10 @@
			int dw = ow/10;
			int dh = oh/10;

			int pleft = (rand_uniform() * 2*dw - dw);
			int pright = (rand_uniform() * 2*dw - dw);
			int ptop = (rand_uniform() * 2*dh - dh);
			int pbot = (rand_uniform() * 2*dh - dh);
			int pleft = rand_uniform(-dw, dw);
			int pright = rand_uniform(-dw, dw);
			int ptop = rand_uniform(-dh, dh);
			int pbot = rand_uniform(-dh, dh);

			int swidth = ow - pleft - pright;
			int sheight = oh - ptop - pbot;

			@@ -37,7 +37,7 @@
			int i;
			if (!state.train) return;
			for(i = 0; i < l.batch * l.inputs; ++i){
			float r = rand_uniform();
			float r = rand_uniform(0, 1);
			l.rand[i] = r;
			if(r < l.probability) state.input[i] = 0;
			else state.input[i] *= l.scale;

			@@ -390,6 +390,17 @@
			return out;
			}

			/*
			 * Allocate a w x h x c image and fill every value with
			 * rand_normal() * .25 + .5.
			 */
			image make_random_image(int w, int h, int c)
			{
			    image out = make_empty_image(w,h,c);
			    out.data = calloc(h*w*c, sizeof(float));
			    int i;
			    for(i = 0; i < w*h*c; ++i){
			        out.data[i] = (rand_normal() * .25) + .5;
			    }
			    return out;
			}

			image float_to_image(int w, int h, int c, float *data)
			{
			image out = make_empty_image(w,h,c);
			@@ -692,6 +703,8 @@
			return resized;
			}

			#include "cuda.h"

			void test_resize(char *filename)
			{
			image im = load_image(filename, 0,0, 3);
			@@ -709,14 +722,27 @@
			image exp5 = copy_image(im);
			exposure_image(exp5, .5);

			image r = resize_image(im, im.w/2, im.h/2);
			#ifdef GPU
			image r = resize_image(im, im.w, im.h);
			image black = make_image(im.w2 + 3, im.h2 + 3, 9);
			image black2 = make_image(im.w, im.h, 3);

			image black = make_image(im.w, im.h, im.c);
			shortcut_cpu(black.data, im.w, im.h, im.c, 1, 2, r.data, 1, r.c);
			float r_gpu = cuda_make_array(r.data, r.wr.h*r.c);
			float black_gpu = cuda_make_array(black.data, black.wblack.h*black.c);
			float black2_gpu = cuda_make_array(black2.data, black2.wblack2.h*black2.c);
			shortcut_gpu(3, r.w, r.h, 1, r_gpu, black.w, black.h, 3, black_gpu);
			//flip_image(r);
			//shortcut_gpu(3, r.w, r.h, 1, r.data, black.w, black.h, 3, black.data);

			shortcut_gpu(3, black.w, black.h, 3, black_gpu, black2.w, black2.h, 1, black2_gpu);
			cuda_pull_array(black_gpu, black.data, black.wblack.hblack.c);
			cuda_pull_array(black2_gpu, black2.data, black2.wblack2.hblack2.c);
			show_image_layers(black, "Black");
			show_image(black2, "Recreate");
			#endif

			show_image(im, "Original");
			show_image(gray, "Gray");
			show_image(black, "Black");
			show_image(sat2, "Saturation-2");
			show_image(sat5, "Saturation-.5");
			show_image(exp2, "Exposure-2");

			@@ -58,6 +58,7 @@
			void print_image(image m);

			image make_image(int w, int h, int c);
			image make_random_image(int w, int h, int c);
			image make_empty_image(int w, int h, int c);
			image float_to_image(int w, int h, int c, float *data);
			image copy_image(image p);

			@@ -21,7 +21,7 @@
			printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
			int imgs = 1024;
			char **labels = get_labels("data/inet.labels.list");
			list *plist = get_paths("/data/imagenet/cls.train.list");
			list *plist = get_paths("data/inet.train.list");
			char paths = (char )list_to_array(plist);
			printf("%d\n", plist->size);
			int N = plist->size;
			@@ -62,6 +62,11 @@
			sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
			save_weights(net, buff);
			}
			if(*net.seen%1000 == 0){
			char buff[256];
			sprintf(buff, "%s/%s.backup",backup_directory,base);
			save_weights(net, buff);
			}
			}
			char buff[256];
			sprintf(buff, "%s/%s.weights", backup_directory, base);

			@@ -20,7 +20,8 @@
			NORMALIZATION,
			AVGPOOL,
			LOCAL,
			SHORTCUT
			SHORTCUT,
			ACTIVE
			} LAYER_TYPE;

			typedef enum{
			@@ -46,8 +47,6 @@
			int side;
			int stride;
			int pad;
			int crop_width;
			int crop_height;
			int sqrt;
			int flip;
			int index;

			@@ -55,7 +55,7 @@

			// float scale = 1./sqrt(sizesizec);
			float scale = sqrt(2./(sizesizec));
			for(i = 0; i < cnsizesize; ++i) l.filters[i] = 2scale*rand_uniform() - scale;
			for(i = 0; i < cnsizesize; ++i) l.filters[i] = scalerand_uniform(-1,1);

			l.col_image = calloc(out_hout_wsizesizec, sizeof(float));
			l.output = calloc(l.batchout_h out_w * n, sizeof(float));

			@@ -51,6 +51,7 @@
			int stride = l->stride;
			l->h = h;
			l->w = w;
			l->inputs = hwl->c;

			l->out_w = (w-1)/stride + 1;
			l->out_h = (h-1)/stride + 1;

			@@ -10,6 +10,7 @@
			#include "connected_layer.h"
			#include "local_layer.h"
			#include "convolutional_layer.h"
			#include "activation_layer.h"
			#include "deconvolutional_layer.h"
			#include "detection_layer.h"
			#include "normalization_layer.h"
			@@ -73,6 +74,8 @@
			switch(a){
			case CONVOLUTIONAL:
			return "convolutional";
			case ACTIVE:
			return "activation";
			case LOCAL:
			return "local";
			case DECONVOLUTIONAL:
			@@ -131,6 +134,8 @@
			forward_convolutional_layer(l, state);
			} else if(l.type == DECONVOLUTIONAL){
			forward_deconvolutional_layer(l, state);
			} else if(l.type == ACTIVE){
			forward_activation_layer(l, state);
			} else if(l.type == LOCAL){
			forward_local_layer(l, state);
			} else if(l.type == NORMALIZATION){
			@@ -231,6 +236,8 @@
			backward_convolutional_layer(l, state);
			} else if(l.type == DECONVOLUTIONAL){
			backward_deconvolutional_layer(l, state);
			} else if(l.type == ACTIVE){
			backward_activation_layer(l, state);
			} else if(l.type == NORMALIZATION){
			backward_normalization_layer(l, state);
			} else if(l.type == MAXPOOL){
			@@ -360,11 +367,12 @@
			layer l = net->layers[i];
			if(l.type == CONVOLUTIONAL){
			resize_convolutional_layer(&l, w, h);
			}else if(l.type == CROP){
			resize_crop_layer(&l, w, h);
			}else if(l.type == MAXPOOL){
			resize_maxpool_layer(&l, w, h);
			}else if(l.type == AVGPOOL){
			resize_avgpool_layer(&l, w, h);
			break;
			}else if(l.type == NORMALIZATION){
			resize_normalization_layer(&l, w, h);
			}else if(l.type == COST){
			@@ -376,6 +384,7 @@
			net->layers[i] = l;
			w = l.out_w;
			h = l.out_h;
			if(l.type == AVGPOOL) break;
			}
			//fprintf(stderr, " Done!\n");
			return 0;

			@@ -18,6 +18,7 @@
			#include "connected_layer.h"
			#include "detection_layer.h"
			#include "convolutional_layer.h"
			#include "activation_layer.h"
			#include "deconvolutional_layer.h"
			#include "maxpool_layer.h"
			#include "avgpool_layer.h"
			@@ -48,6 +49,8 @@
			forward_convolutional_layer_gpu(l, state);
			} else if(l.type == DECONVOLUTIONAL){
			forward_deconvolutional_layer_gpu(l, state);
			} else if(l.type == ACTIVE){
			forward_activation_layer_gpu(l, state);
			} else if(l.type == LOCAL){
			forward_local_layer_gpu(l, state);
			} else if(l.type == DETECTION){
			@@ -97,6 +100,8 @@
			backward_convolutional_layer_gpu(l, state);
			} else if(l.type == DECONVOLUTIONAL){
			backward_deconvolutional_layer_gpu(l, state);
			} else if(l.type == ACTIVE){
			backward_activation_layer_gpu(l, state);
			} else if(l.type == LOCAL){
			backward_local_layer_gpu(l, state);
			} else if(l.type == MAXPOOL){

			@@ -108,6 +108,69 @@

			}

			/*
			 * Accumulate a smoothing term into `update`.
			 *
			 * For every pixel of `recon` (per channel), visit the neighbours
			 * within `num` pixels horizontally and vertically, clipped to the
			 * image bounds, and add lambda * (neighbour - pixel) to the
			 * corresponding entry of update.data. `recon` is only read.
			 */
			void smooth(image recon, image update, float lambda, int num)
			{
			    int i, j, k;
			    int ii, jj;
			    for(k = 0; k < recon.c; ++k){
			        for(j = 0; j < recon.h; ++j){
			            for(i = 0; i < recon.w; ++i){
			                int out_index = i + recon.w*(j + recon.h*k);
			                for(jj = j-num; jj <= j + num && jj < recon.h; ++jj){
			                    if (jj < 0) continue;
			                    for(ii = i-num; ii <= i + num && ii < recon.w; ++ii){
			                        if (ii < 0) continue;
			                        int in_index = ii + recon.w*(jj + recon.h*k);
			                        update.data[out_index] += lambda * (recon.data[in_index] - recon.data[out_index]);
			                    }
			                }
			            }
			        }
			    }
			}

			/*
			 * Run one update step that moves `recon` toward an image whose
			 * network output matches `features`.
			 *
			 * net         - network run forward and backward
			 * features    - target values passed to the network as state.truth
			 * recon       - image being optimized; modified in place
			 * update      - running update buffer; modified in place
			 * rate        - step size applied when adding update to recon
			 * momentum    - factor update is scaled by after the step
			 * lambda      - weight of the smoothing term (see smooth())
			 * smooth_size - neighbourhood radius passed to smooth()
			 *
			 * recon is scaled by 2 and shifted by -1 for the pass, and shifted
			 * by +1, scaled by .5 and constrained afterwards.
			 */
			void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size)
			{
			    scale_image(recon, 2);
			    translate_image(recon, -1);

			    image delta = make_image(recon.w, recon.h, recon.c);

			    network_state state = {0};
			#ifdef GPU
			    state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
			    state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
			    state.truth = cuda_make_array(features, get_network_output_size(net));

			    forward_network_gpu(net, state);
			    backward_network_gpu(net, state);

			    /* Bring the input gradient back to the host before freeing. */
			    cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);

			    cuda_free(state.input);
			    cuda_free(state.delta);
			    cuda_free(state.truth);
			#else
			    state.input = recon.data;
			    state.delta = delta.data;
			    state.truth = features;

			    forward_network(net, state);
			    backward_network(net, state);
			#endif

			    /* update += delta, then add the smoothing term. */
			    axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
			    smooth(recon, update, lambda, smooth_size);

			    /* recon += rate * update; update *= momentum. */
			    axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
			    scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);

			    translate_image(recon, 1);
			    scale_image(recon, .5);
			    constrain_image(recon);
			    free_image(delta);
			}


			void run_nightmare(int argc, char **argv)
			{
			@@ -131,7 +194,11 @@
			float rate = find_float_arg(argc, argv, "-rate", .04);
			float thresh = find_float_arg(argc, argv, "-thresh", 1.);
			float rotate = find_float_arg(argc, argv, "-rotate", 0);
			float momentum = find_float_arg(argc, argv, "-momentum", .9);
			float lambda = find_float_arg(argc, argv, "-lambda", .01);
			char *prefix = find_char_arg(argc, argv, "-prefix", 0);
			int reconstruct = find_arg(argc, argv, "-reconstruct");
			int smooth_size = find_int_arg(argc, argv, "-smooth", 1);

			network net = parse_network_cfg(cfg);
			load_weights(&net, weights);
			@@ -151,6 +218,19 @@
			im = resized;
			}

			float *features;
			image update;
			if (reconstruct){
			resize_network(&net, im.w, im.h);
			int size = get_network_output_size(net);
			features = calloc(size, sizeof(float));
			float *out = network_predict(net, im.data);
			copy_cpu(size, out, 1, features, 1);
			free_image(im);
			im = make_random_image(im.w, im.h, im.c);
			update = make_image(im.w, im.h, im.c);
			}

			int e;
			int n;
			for(e = 0; e < rounds; ++e){
			@@ -159,10 +239,18 @@
			for(n = 0; n < iters; ++n){
			fprintf(stderr, "%d, ", n);
			fflush(stderr);
			if(reconstruct){
			reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size);
			show_image(im, "reconstruction");
			#ifdef OPENCV
			cvWaitKey(10);
			#endif
			}else{
			int layer = max_layer + rand()%range - range/2;
			int octave = rand()%octaves;
			optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
			}
			}
			fprintf(stderr, "done\n");
			if(0){
			image g = grayscale_image(im);

			@@ -7,6 +7,7 @@
			#include "crop_layer.h"
			#include "cost_layer.h"
			#include "convolutional_layer.h"
			#include "activation_layer.h"
			#include "normalization_layer.h"
			#include "deconvolutional_layer.h"
			#include "connected_layer.h"
			@@ -29,6 +30,7 @@

			int is_network(section *s);
			int is_convolutional(section *s);
			int is_activation(section *s);
			int is_local(section *s);
			int is_deconvolutional(section *s);
			int is_connected(section *s);
			@@ -301,10 +303,31 @@
			layer from = net.layers[index];

			layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c);

			char *activation_s = option_find_str(options, "activation", "linear");
			ACTIVATION activation = get_activation(activation_s);
			s.activation = activation;
			return s;
			}


			/*
			 * Build an activation layer from an [activation] cfg section.
			 * Reads the "activation" option (default "linear") and copies the
			 * incoming h/w/c from params into both the input and output
			 * dimensions.
			 */
			layer parse_activation(list *options, size_params params)
			{
			    char *name = option_find_str(options, "activation", "linear");
			    ACTIVATION kind = get_activation(name);
			    layer result = make_activation_layer(params.batch, params.inputs, kind);

			    /* Output dimensions equal the input dimensions. */
			    result.h = params.h;
			    result.w = params.w;
			    result.c = params.c;
			    result.out_h = params.h;
			    result.out_w = params.w;
			    result.out_c = params.c;

			    return result;
			}

			route_layer parse_route(list *options, size_params params, network net)
			{
			char *l = option_find(options, "layers");
			@@ -447,6 +470,8 @@
			l = parse_convolutional(options, params);
			}else if(is_local(s)){
			l = parse_local(options, params);
			}else if(is_activation(s)){
			l = parse_activation(options, params);
			}else if(is_deconvolutional(s)){
			l = parse_deconvolutional(options, params);
			}else if(is_connected(s)){
			@@ -530,6 +555,10 @@
			return (strcmp(s->type, "[conv]")==0
			\|\| strcmp(s->type, "[convolutional]")==0);
			}
			/*
			 * Reports whether a config section is an activation section.
			 * Returns 1 when s->type compares equal to "[activation]", otherwise 0.
			 */
			int is_activation(section *s)
			{
			    static const char wanted[] = "[activation]";

			    if(strcmp(s->type, wanted) != 0){
			        return 0;
			    }
			    return 1;
			}
			int is_network(section *s)
			{
			return (strcmp(s->type, "[net]")==0

			@@ -10,22 +10,15 @@
			layer l = {0};
			l.type = SHORTCUT;
			l.batch = batch;
			l.w = w;
			l.h = h;
			l.c = c;
			l.w = w2;
			l.h = h2;
			l.c = c2;
			l.out_w = w;
			l.out_h = h;
			l.out_c = c;
			l.outputs = whc;
			l.inputs = whc;
			int stride = w2 / w;
			l.inputs = l.outputs;

			assert(stride * w == w2);
			assert(stride * h == h2);
			assert(c >= c2);

			l.stride = stride;
			l.n = c2;
			l.index = index;

			l.delta = calloc(l.outputs*batch, sizeof(float));
			@@ -40,25 +33,29 @@
			/*
			 * CPU forward pass of a shortcut layer.
			 *
			 * Calls copy_cpu over l.outputs*l.batch elements with state.input and
			 * l.output (presumably source then destination), calls shortcut_cpu with
			 * the output of layer l.index, then calls activate_array on l.output with
			 * l.activation.
			 *
			 * NOTE(review): this listing follows a diff hunk header and contains two
			 * consecutive shortcut_cpu calls whose argument orders are incompatible
			 * (buffer-first vs. batch-first). Presumably the first is the removed line
			 * and the second its replacement -- confirm against the committed file.
			 */
			void forward_shortcut_layer(const layer l, network_state state)
			{
			copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
			shortcut_cpu(l.output, l.w, l.h, l.c, l.batch, 1, state.net.layers[l.index].output, l.stride, l.n);
			shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
			activate_array(l.output, l.outputs*l.batch, l.activation);
			}

			/*
			 * CPU backward pass of a shortcut layer.
			 *
			 * The lines below reference l.delta, state.delta and the delta buffer of
			 * layer l.index, via copy_cpu, gradient_array, axpy_cpu and shortcut_cpu.
			 *
			 * NOTE(review): this listing follows a diff hunk header and appears to
			 * interleave two revisions: a copy_cpu + buffer-first shortcut_cpu pair,
			 * and a gradient_array + axpy_cpu + batch-first shortcut_cpu sequence.
			 * Presumably the first pair is the removed code -- confirm against the
			 * committed file.
			 * NOTE(review): `l.wl.stride` and `l.hl.stride` look like `l.w*l.stride`
			 * and `l.h*l.stride` with the `*` lost in extraction -- confirm.
			 */
			void backward_shortcut_layer(const layer l, network_state state)
			{
			copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
			shortcut_cpu(state.net.layers[l.index].delta, l.wl.stride, l.hl.stride, l.n, l.batch, l.stride, l.delta, 1, l.c);
			gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
			axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, state.delta, 1);
			shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta);
			}

			#ifdef GPU
			/*
			 * GPU counterpart of forward_shortcut_layer (compiled only under GPU).
			 *
			 * Calls copy_ongpu with state.input and l.output_gpu, shortcut_gpu with the
			 * output_gpu buffer of layer l.index, then activate_array_ongpu on
			 * l.output_gpu with l.activation.
			 *
			 * NOTE(review): as in the CPU version, two shortcut_gpu calls with
			 * incompatible argument orders appear back to back. Presumably the first is
			 * the removed diff line and the second its replacement -- confirm against
			 * the committed file.
			 */
			void forward_shortcut_layer_gpu(const layer l, network_state state)
			{
			copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
			shortcut_gpu(l.output_gpu, l.w, l.h, l.c, l.batch, 1, state.net.layers[l.index].output_gpu, l.stride, l.n);
			shortcut_gpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
			activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
			}

			/*
			 * GPU counterpart of backward_shortcut_layer (compiled only under GPU).
			 *
			 * The lines below reference l.delta_gpu, state.delta and the delta_gpu
			 * buffer of layer l.index, via copy_ongpu, gradient_array_ongpu, axpy_ongpu
			 * and shortcut_gpu.
			 *
			 * NOTE(review): this listing appears to interleave two revisions: a
			 * copy_ongpu + buffer-first shortcut_gpu pair, and a gradient_array_ongpu +
			 * axpy_ongpu + batch-first shortcut_gpu sequence. Presumably the first pair
			 * is the removed code -- confirm against the committed file.
			 * NOTE(review): `l.wl.stride` and `l.hl.stride` look like `l.w*l.stride`
			 * and `l.h*l.stride` with the `*` lost in extraction -- confirm.
			 */
			void backward_shortcut_layer_gpu(const layer l, network_state state)
			{
			copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
			shortcut_gpu(state.net.layers[l.index].delta_gpu, l.wl.stride, l.hl.stride, l.n, l.batch, l.stride, l.delta_gpu, 1, l.c);
			gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
			axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, state.delta, 1);
			shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, state.net.layers[l.index].delta_gpu);
			}
			#endif

			@@ -485,9 +485,9 @@
			}
			*/

			/*
			 * Pseudo-random float helper built on rand().
			 *
			 * Two variants are visible below:
			 *   - rand_uniform():          returns (float)rand()/RAND_MAX.
			 *   - rand_uniform(min, max):  scales that ratio by (max - min) and adds min.
			 *
			 * NOTE(review): both signature lines and both return statements appear in
			 * one body here, which is not valid C as written. Presumably this is a diff
			 * listing where the no-argument form is the removed code and the (min, max)
			 * form its replacement -- confirm against the committed file.
			 */
			float rand_uniform()
			float rand_uniform(float min, float max)
			{
			return (float)rand()/RAND_MAX;
			return ((float)rand()/RAND_MAX * (max - min)) + min;
			}

			float *one_hot_encode(float a, int n, int k)