good chance I didn't break anything
[net]
batch=128
subdivisions=1
-height=256
-width=256
+height=227
+width=227
channels=3
learning_rate=0.01
momentum=0.9
decay=0.0005
max_crop=256

-[crop]
-crop_height=224
-crop_width=224
-flip=1
-angle=0
-saturation=1
-exposure=1
learning_rate=0.01
policy=poly
power=4
max_batches=800000

+angle=7
+hue = .1
+saturation=.75
+exposure=.75
+aspect=.75

[convolutional]
-filters=64
+filters=96
size=11
stride=4
pad=0
-activation=ramp
+activation=relu

[maxpool]
size=3
stride=2
padding=0

[convolutional]
-filters=192
+filters=256
size=5
stride=1
pad=1
-activation=ramp
+activation=relu

[maxpool]
size=3
stride=2
padding=0

[convolutional]
filters=384
size=3
stride=1
pad=1
-activation=ramp
+activation=relu

[convolutional]
filters=384
size=3
stride=1
pad=1
activation=relu

[convolutional]
filters=256
size=3
stride=1
pad=1
activation=ramp

[convolutional]
filters=256
size=3
stride=1
pad=1
-activation=ramp
+activation=relu

[maxpool]
size=3
stride=2
padding=0

[connected]
output=4096
-activation=ramp
+activation=relu

[dropout]
probability=.5

[connected]
output=4096
-activation=ramp
+activation=relu

[dropout]
probability=.5

[connected]
output=1000
-activation=ramp
+activation=linear

[softmax]
groups=1
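
As a sanity check on the new geometry (this arithmetic is not part of the config itself): the first convolutional layer's output size follows

    out = (in + 2*pad - size) / stride + 1 = (227 + 0 - 11) / 4 + 1 = 55,

so an 11x11, stride-4, pad-0 layer over a 227x227 input gives 55x55 maps with the new 96 filters, the standard AlexNet conv1 shape.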
...
#include "blas.h"
#include "assert.h"
#include "classifier.h"
#include "cuda.h"
#include <sys/time.h>

#ifdef OPENCV
...
    return v;
}

void train_classifier_multi(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
{
#ifdef GPU
    int nthreads = 8;
    int i;

    data_seed = time(0);
    srand(time(0));
    float avg_loss = -1;
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    printf("%d\n", ngpus);
    network *nets = calloc(ngpus, sizeof(network));
    for(i = 0; i < ngpus; ++i){
        cuda_set_device(gpus[i]);
        nets[i] = parse_network_cfg(cfgfile);
        if(weightfile){
            load_weights(&(nets[i]), weightfile);
        }
        if(clear) *nets[i].seen = 0;
    }
    network net = nets[0];

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = net.batch*ngpus/nthreads;
    assert(net.batch*ngpus % nthreads == 0);

    list *options = read_data_cfg(datacfg);

    char *backup_directory = option_find_str(options, "backup", "/backup/");
    char *label_list = option_find_str(options, "labels", "data/labels.list");
    char *train_list = option_find_str(options, "train", "data/train.list");
    int classes = option_find_int(options, "classes", 2);

    char **labels = get_labels(label_list);
    list *plist = get_paths(train_list);
    char **paths = (char **)list_to_array(plist);
    printf("%d\n", plist->size);
    int N = plist->size;
    clock_t time;

    pthread_t *load_threads = calloc(nthreads, sizeof(pthread_t));
    data *trains = calloc(nthreads, sizeof(data));
    data *buffers = calloc(nthreads, sizeof(data));

    load_args args = {0};
    args.w = net.w;
    args.h = net.h;

    args.min = net.min_crop;
    args.max = net.max_crop;
    args.angle = net.angle;
    args.aspect = net.aspect;
    args.exposure = net.exposure;
    args.saturation = net.saturation;
    args.hue = net.hue;
    args.size = net.w;

    args.paths = paths;
    args.classes = classes;
    args.n = imgs;
    args.m = N;
    args.labels = labels;
    args.type = CLASSIFICATION_DATA;

    for(i = 0; i < nthreads; ++i){
        args.d = buffers + i;
        load_threads[i] = load_data_in_thread(args);
    }

    int epoch = (*net.seen)/N;
    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
        time=clock();
        for(i = 0; i < nthreads; ++i){
            pthread_join(load_threads[i], 0);
            trains[i] = buffers[i];
        }
        data train = concat_datas(trains, nthreads);

        for(i = 0; i < nthreads; ++i){
            args.d = buffers + i;
            load_threads[i] = load_data_in_thread(args);
        }

        printf("Loaded: %lf seconds\n", sec(clock()-time));
        time=clock();

        float loss = train_networks(nets, ngpus, train);
        if(avg_loss == -1) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;
        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
        free_data(train);
        for(i = 0; i < nthreads; ++i){
            free_data(trains[i]);
        }
        if(*net.seen/N > epoch){
            epoch = *net.seen/N;
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
            save_weights(net, buff);
        }
        if(get_current_batch(net)%100 == 0){
            char buff[256];
            sprintf(buff, "%s/%s.backup",backup_directory,base);
            save_weights(net, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s.weights", backup_directory, base);
    save_weights(net, buff);

    for(i = 0; i < nthreads; ++i){
        pthread_join(load_threads[i], 0);
        free_data(buffers[i]);
    }
    free(buffers);
    free(trains);
    free(load_threads);

    free_network(net);
    free_ptrs((void**)labels, classes);
    free_ptrs((void**)paths, plist->size);
    free_list(plist);
    free(base);
#endif
}
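
A worked example of the loader arithmetic above, with illustrative numbers (128 and 2 are not fixed by the code): with net.batch = 128, ngpus = 2 and nthreads = 8,

    imgs = batch * ngpus / nthreads = 128 * 2 / 8 = 32,

so each of the 8 loader threads fills 32 images and one join-and-concat cycle supplies nthreads * imgs = 256 images, exactly one batch per GPU; the assert guarantees the division is exact.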


void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int clear)
{
    int nthreads = 8;
...
    return;
}

    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
    int *gpus = 0;
    int ngpus = 0;
    if(gpu_list){
        printf("%s\n", gpu_list);
        int len = strlen(gpu_list);
        ngpus = 1;
        int i;
        for(i = 0; i < len; ++i){
            if (gpu_list[i] == ',') ++ngpus;
        }
        gpus = calloc(ngpus, sizeof(int));
        for(i = 0; i < ngpus; ++i){
            gpus[i] = atoi(gpu_list);
            gpu_list = strchr(gpu_list, ',')+1;
        }
    }

    int cam_index = find_int_arg(argc, argv, "-c", 0);
    int clear = find_arg(argc, argv, "-clear");
    char *data = argv[3];
...
    if(0==strcmp(argv[2], "predict")) predict_classifier(data, cfg, weights, filename);
    else if(0==strcmp(argv[2], "try")) try_classifier(data, cfg, weights, filename, atoi(layer_s));
    else if(0==strcmp(argv[2], "train")) train_classifier(data, cfg, weights, clear);
    else if(0==strcmp(argv[2], "trainm")) train_classifier_multi(data, cfg, weights, gpus, ngpus, clear);
    else if(0==strcmp(argv[2], "demo")) demo_classifier(data, cfg, weights, cam_index, filename);
    else if(0==strcmp(argv[2], "threat")) threat_classifier(data, cfg, weights, cam_index, filename);
    else if(0==strcmp(argv[2], "test")) test_classifier(data, cfg, weights, layer);
...
}


-__global__ void binarize_filters_kernel(float *filters, int n, int size, float *binary)
+__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary)
{
    int f = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (f >= n) return;
    int i = 0;
    float mean = 0;
    for(i = 0; i < size; ++i){
-        mean += abs(filters[f*size + i]);
+        mean += abs(weights[f*size + i]);
    }
    mean = mean / size;
    for(i = 0; i < size; ++i){
-        binary[f*size + i] = (filters[f*size + i] > 0) ? mean : -mean;
-        //binary[f*size + i] = filters[f*size + i];
+        binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
+        //binary[f*size + i] = weights[f*size + i];
    }
}

-void binarize_filters_gpu(float *filters, int n, int size, float *binary)
+void binarize_weights_gpu(float *weights, int n, int size, float *binary)
{
-    binarize_filters_kernel<<<cuda_gridsize(n), BLOCK>>>(filters, n, size, binary);
+    binarize_weights_kernel<<<cuda_gridsize(n), BLOCK>>>(weights, n, size, binary);
    check_error(cudaPeekAtLastError());
}
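
The kernel above is the usual per-filter weight binarization: each output value is the sign of the weight scaled by the mean absolute weight of that filter. In symbols (notation mine):

    alpha_f = (1/size) * sum_i |w_{f,i}|,    binary_{f,i} = alpha_f * sign(w_{f,i}).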

...
{
    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
    if(l.binary){
-        binarize_filters_gpu(l.filters_gpu, l.n, l.c*l.size*l.size, l.binary_filters_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
        swap_binary(&l);
    }

    if(l.xnor){
-        binarize_filters_gpu(l.filters_gpu, l.n, l.c*l.size*l.size, l.binary_filters_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
        swap_binary(&l);
        binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
        state.input = l.binary_input_gpu;
...
            &one,
            l.srcTensorDesc,
            state.input,
-            l.filterDesc,
-            l.filters_gpu,
+            l.weightDesc,
+            l.weights_gpu,
            l.convDesc,
            l.fw_algo,
            state.workspace,
...
    int n = l.out_w*l.out_h;
    for(i = 0; i < l.batch; ++i){
        im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
-        float * a = l.filters_gpu;
+        float * a = l.weights_gpu;
        float * b = state.workspace;
        float * c = l.output_gpu;
        gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
...
            state.workspace,
            l.workspace_size,
            &one,
-            l.dfilterDesc,
-            l.filter_updates_gpu);
+            l.dweightDesc,
+            l.weight_updates_gpu);

    if(state.delta){
        if(l.binary || l.xnor) swap_binary(&l);
        cudnnConvolutionBackwardData(cudnn_handle(),
                &one,
-                l.filterDesc,
-                l.filters_gpu,
+                l.weightDesc,
+                l.weights_gpu,
                l.ddstTensorDesc,
                l.delta_gpu,
                l.convDesc,
...
    for(i = 0; i < l.batch; ++i){
        float * a = l.delta_gpu;
        float * b = state.workspace;
-        float * c = l.filter_updates_gpu;
+        float * c = l.weight_updates_gpu;

        im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
        gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);

        if(state.delta){
            if(l.binary || l.xnor) swap_binary(&l);
-            float * a = l.filters_gpu;
+            float * a = l.weights_gpu;
            float * b = l.delta_gpu;
            float * c = state.workspace;
...

void pull_convolutional_layer(convolutional_layer layer)
{
-    cuda_pull_array(layer.filters_gpu, layer.filters, layer.c*layer.n*layer.size*layer.size);
+    cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
    cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_pull_array(layer.filter_updates_gpu, layer.filter_updates, layer.c*layer.n*layer.size*layer.size);
+    cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
    cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
    if (layer.batch_normalize){
        cuda_pull_array(layer.scales_gpu, layer.scales, layer.n);
...

void push_convolutional_layer(convolutional_layer layer)
{
-    cuda_push_array(layer.filters_gpu, layer.filters, layer.c*layer.n*layer.size*layer.size);
+    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_push_array(layer.filter_updates_gpu, layer.filter_updates, layer.c*layer.n*layer.size*layer.size);
+    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
    if (layer.batch_normalize){
        cuda_push_array(layer.scales_gpu, layer.scales, layer.n);
...
    axpy_ongpu(layer.n, learning_rate/batch, layer.scale_updates_gpu, 1, layer.scales_gpu, 1);
    scal_ongpu(layer.n, momentum, layer.scale_updates_gpu, 1);

-    axpy_ongpu(size, -decay*batch, layer.filters_gpu, 1, layer.filter_updates_gpu, 1);
-    axpy_ongpu(size, learning_rate/batch, layer.filter_updates_gpu, 1, layer.filters_gpu, 1);
-    scal_ongpu(size, momentum, layer.filter_updates_gpu, 1);
+    axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
+    axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
+    scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);
}


...

void swap_binary(convolutional_layer *l)
{
-    float *swap = l->filters;
-    l->filters = l->binary_filters;
-    l->binary_filters = swap;
+    float *swap = l->weights;
+    l->weights = l->binary_weights;
+    l->binary_weights = swap;

#ifdef GPU
-    swap = l->filters_gpu;
-    l->filters_gpu = l->binary_filters_gpu;
-    l->binary_filters_gpu = swap;
+    swap = l->weights_gpu;
+    l->weights_gpu = l->binary_weights_gpu;
+    l->binary_weights_gpu = swap;
#endif
}

-void binarize_filters(float *filters, int n, int size, float *binary)
+void binarize_weights(float *weights, int n, int size, float *binary)
{
    int i, f;
    for(f = 0; f < n; ++f){
        float mean = 0;
        for(i = 0; i < size; ++i){
-            mean += fabs(filters[f*size + i]);
+            mean += fabs(weights[f*size + i]);
        }
        mean = mean / size;
        for(i = 0; i < size; ++i){
-            binary[f*size + i] = (filters[f*size + i] > 0) ? mean : -mean;
+            binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
        }
    }
}
...
    size_t s = 0;
    cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc,
-            l.filterDesc,
+            l.weightDesc,
            l.convDesc,
            l.dstTensorDesc,
            l.fw_algo,
...
            l.srcTensorDesc,
            l.ddstTensorDesc,
            l.convDesc,
-            l.dfilterDesc,
+            l.dweightDesc,
            l.bf_algo,
            &s);
    if (s > most) most = s;
    cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
-            l.filterDesc,
+            l.weightDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dsrcTensorDesc,
...
{
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
-    cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
-    cudnnSetFilter4dDescriptor(l->filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
-            l->filterDesc,
+            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
            0,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
-            l->filterDesc,
+            l->weightDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
...
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
-            l->dfilterDesc,
+            l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l->bf_algo);
...
    l.pad = padding;
    l.batch_normalize = batch_normalize;

-    l.filters = calloc(c*n*size*size, sizeof(float));
-    l.filter_updates = calloc(c*n*size*size, sizeof(float));
+    l.weights = calloc(c*n*size*size, sizeof(float));
+    l.weight_updates = calloc(c*n*size*size, sizeof(float));

    l.biases = calloc(n, sizeof(float));
    l.bias_updates = calloc(n, sizeof(float));

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c));
-    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1);
+    for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    l.out_h = out_h;
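
The active initialization line is the He-style scheme for ReLU networks (the name is not used in the source), replacing the commented 1/sqrt(fan_in) variant; with fan_in = size*size*c,

    scale = sqrt(2 / (size*size*c)),    w ~ scale * U(-1, 1).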
...
    l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float));

    if(binary){
-        l.binary_filters = calloc(c*n*size*size, sizeof(float));
-        l.cfilters = calloc(c*n*size*size, sizeof(char));
+        l.binary_weights = calloc(c*n*size*size, sizeof(float));
+        l.cweights = calloc(c*n*size*size, sizeof(char));
        l.scales = calloc(n, sizeof(float));
    }
    if(xnor){
-        l.binary_filters = calloc(c*n*size*size, sizeof(float));
+        l.binary_weights = calloc(c*n*size*size, sizeof(float));
        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
    }

...

#ifdef GPU
    if(gpu_index >= 0){
-        l.filters_gpu = cuda_make_array(l.filters, c*n*size*size);
-        l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size);
+        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);

        l.biases_gpu = cuda_make_array(l.biases, n);
        l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
...
        l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);

        if(binary){
-            l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
+            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
        }
        if(xnor){
-            l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
+            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
            l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
        }

...
#ifdef CUDNN
        cudnnCreateTensorDescriptor(&l.srcTensorDesc);
        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
-        cudnnCreateFilterDescriptor(&l.filterDesc);
+        cudnnCreateFilterDescriptor(&l.weightDesc);
        cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
        cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
-        cudnnCreateFilterDescriptor(&l.dfilterDesc);
+        cudnnCreateFilterDescriptor(&l.dweightDesc);
        cudnnCreateConvolutionDescriptor(&l.convDesc);
        cudnn_convolutional_setup(&l);
#endif
...
    for(i = 0; i < l.n; ++i){
        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
        for(j = 0; j < l.c*l.size*l.size; ++j){
-            l.filters[i*l.c*l.size*l.size + j] *= scale;
+            l.weights[i*l.c*l.size*l.size + j] *= scale;
        }
        l.biases[i] -= l.rolling_mean[i] * scale;
        l.scales[i] = 1;
...

    /*
    if(l.binary){
-        binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
-        binarize_filters2(l.filters, l.n, l.c*l.size*l.size, l.cfilters, l.scales);
+        binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
+        binarize_weights2(l.weights, l.n, l.c*l.size*l.size, l.cweights, l.scales);
        swap_binary(&l);
    }
    */
...
    int k = l.size*l.size*l.c;
    int n = out_h*out_w;

-    char *a = l.cfilters;
+    char *a = l.cweights;
    float *b = state.workspace;
    float *c = l.output;

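For orientation, the im2col + GEMM path computes each image's output as one matrix product; with the dimensions used around this fragment (m = l.n is presumably set in the elided lines),

    C (n x out_h*out_w) = A (n x size*size*c) * B (size*size*c x out_h*out_w),

i.e. M = l.n, K = l.size*l.size*l.c, N = out_h*out_w per image in the batch.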
...
    */

    if(l.xnor){
-        binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
+        binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
        swap_binary(&l);
        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
        state.input = l.binary_input;
...
        printf("xnor\n");
    } else {

-        float *a = l.filters;
+        float *a = l.weights;
        float *b = state.workspace;
        float *c = l.output;

...
    for(i = 0; i < l.batch; ++i){
        float *a = l.delta + i*m*k;
        float *b = state.workspace;
-        float *c = l.filter_updates;
+        float *c = l.weight_updates;

        float *im = state.input+i*l.c*l.h*l.w;

...
        gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);

        if(state.delta){
-            a = l.filters;
+            a = l.weights;
            b = l.delta + i*m*k;
            c = state.workspace;

...
    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

-    axpy_cpu(size, -decay*batch, l.filters, 1, l.filter_updates, 1);
-    axpy_cpu(size, learning_rate/batch, l.filter_updates, 1, l.filters, 1);
-    scal_cpu(size, momentum, l.filter_updates, 1);
+    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(size, momentum, l.weight_updates, 1);
}


-image get_convolutional_filter(convolutional_layer l, int i)
+image get_convolutional_weight(convolutional_layer l, int i)
{
    int h = l.size;
    int w = l.size;
    int c = l.c;
-    return float_to_image(w,h,c,l.filters+i*h*w*c);
+    return float_to_image(w,h,c,l.weights+i*h*w*c);
}

-void rgbgr_filters(convolutional_layer l)
+void rgbgr_weights(convolutional_layer l)
{
    int i;
    for(i = 0; i < l.n; ++i){
-        image im = get_convolutional_filter(l, i);
+        image im = get_convolutional_weight(l, i);
        if (im.c == 3) {
            rgbgr_image(im);
        }
    }
}

-void rescale_filters(convolutional_layer l, float scale, float trans)
+void rescale_weights(convolutional_layer l, float scale, float trans)
{
    int i;
    for(i = 0; i < l.n; ++i){
-        image im = get_convolutional_filter(l, i);
+        image im = get_convolutional_weight(l, i);
        if (im.c == 3) {
            scale_image(im, scale);
            float sum = sum_array(im.data, im.w*im.h*im.c);
...
    }
}

-image *get_filters(convolutional_layer l)
+image *get_weights(convolutional_layer l)
{
-    image *filters = calloc(l.n, sizeof(image));
+    image *weights = calloc(l.n, sizeof(image));
    int i;
    for(i = 0; i < l.n; ++i){
-        filters[i] = copy_image(get_convolutional_filter(l, i));
-        //normalize_image(filters[i]);
+        weights[i] = copy_image(get_convolutional_weight(l, i));
+        //normalize_image(weights[i]);
    }
-    return filters;
+    return weights;
}

-image *visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_filters)
+image *visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_weights)
{
-    image *single_filters = get_filters(l);
-    show_images(single_filters, l.n, window);
+    image *single_weights = get_weights(l);
+    show_images(single_weights, l.n, window);

    image delta = get_convolutional_image(l);
    image dc = collapse_image_layers(delta, 1);
...
    //show_image(dc, buff);
    //save_image(dc, buff);
    free_image(dc);
-    return single_filters;
+    return single_weights;
}

...
void resize_convolutional_layer(convolutional_layer *layer, int w, int h);
void forward_convolutional_layer(const convolutional_layer layer, network_state state);
void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay);
-image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_filters);
-void binarize_filters(float *filters, int n, int size, float *binary);
+image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights);
+void binarize_weights(float *weights, int n, int size, float *binary);
void swap_binary(convolutional_layer *l);
-void binarize_filters2(float *filters, int n, int size, char *binary, float *scales);
+void binarize_weights2(float *weights, int n, int size, char *binary, float *scales);

void backward_convolutional_layer(convolutional_layer layer, network_state state);

...

image get_convolutional_image(convolutional_layer layer);
image get_convolutional_delta(convolutional_layer layer);
-image get_convolutional_filter(convolutional_layer layer, int i);
+image get_convolutional_weight(convolutional_layer layer, int i);

int convolutional_out_height(convolutional_layer layer);
int convolutional_out_width(convolutional_layer layer);
-void rescale_filters(convolutional_layer l, float scale, float trans);
-void rgbgr_filters(convolutional_layer l);
+void rescale_weights(convolutional_layer l, float scale, float trans);
+void rgbgr_weights(convolutional_layer l);

#endif

...
#include <stdlib.h>
#include <time.h>

void cuda_set_device(int n)
{
    gpu_index = n;
    cudaError_t status = cudaSetDevice(n);
    check_error(status);
}

int cuda_get_device()
{
    int n = 0;
    cudaError_t status = cudaGetDevice(&n);
    check_error(status);
    return n;
}

void check_error(cudaError_t status)
{
...
#ifdef CUDNN
cudnnHandle_t cudnn_handle()
{
-    static int init = 0;
-    static cudnnHandle_t handle;
-    if(!init) {
-        cudnnCreate(&handle);
-        init = 1;
+    static int init[16] = {0};
+    static cudnnHandle_t handle[16];
+    int i = cuda_get_device();
+    if(!init[i]) {
+        cudnnCreate(&handle[i]);
+        init[i] = 1;
    }
-    return handle;
+    return handle[i];
}
#endif

cublasHandle_t blas_handle()
{
-    static int init = 0;
-    static cublasHandle_t handle;
-    if(!init) {
-        cublasCreate(&handle);
-        init = 1;
+    static int init[16] = {0};
+    static cublasHandle_t handle[16];
+    int i = cuda_get_device();
+    if(!init[i]) {
+        cublasCreate(&handle[i]);
+        init[i] = 1;
    }
-    return handle;
+    return handle[i];
}

float *cuda_make_array(float *x, size_t n)
...

void cuda_random(float *x_gpu, size_t n)
{
-    static curandGenerator_t gen;
-    static int init = 0;
-    if(!init){
-        curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
-        curandSetPseudoRandomGeneratorSeed(gen, time(0));
-        init = 1;
+    static curandGenerator_t gen[16];
+    static int init[16] = {0};
+    int i = cuda_get_device();
+    if(!init[i]){
+        curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT);
+        curandSetPseudoRandomGeneratorSeed(gen[i], time(0));
+        init[i] = 1;
    }
-    curandGenerateUniform(gen, x_gpu, n);
+    curandGenerateUniform(gen[i], x_gpu, n);
    check_error(cudaPeekAtLastError());
}

...
int *cuda_make_int_array(size_t n);
void cuda_push_array(float *x_gpu, float *x, size_t n);
void cuda_pull_array(float *x_gpu, float *x, size_t n);
void cuda_set_device(int n);
void cuda_free(float *x_gpu);
void cuda_random(float *x_gpu, size_t n);
float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
...
    if(l.type == CONVOLUTIONAL){
        int num = l.n*l.c*l.size*l.size;
        axpy_cpu(l.n, 1, l.biases, 1, out.biases, 1);
-        axpy_cpu(num, 1, l.filters, 1, out.filters, 1);
+        axpy_cpu(num, 1, l.weights, 1, out.weights, 1);
    }
    if(l.type == CONNECTED){
        axpy_cpu(l.outputs, 1, l.biases, 1, out.biases, 1);
...
    if(l.type == CONVOLUTIONAL){
        int num = l.n*l.c*l.size*l.size;
        scal_cpu(l.n, 1./n, l.biases, 1);
-        scal_cpu(num, 1./n, l.filters, 1);
+        scal_cpu(num, 1./n, l.weights, 1);
    }
    if(l.type == CONNECTED){
        scal_cpu(l.outputs, 1./n, l.biases, 1);
...
    for(i = 0; i < net.n; ++i){
        layer l = net.layers[i];
        if(l.type == CONVOLUTIONAL){
-            rescale_filters(l, 2, -.5);
+            rescale_weights(l, 2, -.5);
            break;
        }
    }
...
    for(i = 0; i < net.n; ++i){
        layer l = net.layers[i];
        if(l.type == CONVOLUTIONAL){
-            rgbgr_filters(l);
+            rgbgr_weights(l);
            break;
        }
    }
...
    gpu_index = -1;
#else
    if(gpu_index >= 0){
-        cudaError_t status = cudaSetDevice(gpu_index);
-        check_error(status);
+        cuda_set_device(gpu_index);
    }
#endif

...
    fill_ongpu(layer.outputs*layer.batch, 0, layer.output_gpu, 1);

    for(i = 0; i < layer.batch; ++i){
-        float *a = layer.filters_gpu;
+        float *a = layer.weights_gpu;
        float *b = state.input + i*layer.c*layer.h*layer.w;
        float *c = layer.col_image_gpu;

...

        float *a = state.input + i*m*n;
        float *b = layer.col_image_gpu;
-        float *c = layer.filter_updates_gpu;
+        float *c = layer.weight_updates_gpu;

        im2col_ongpu(layer.delta_gpu + i*layer.n*size, layer.n, out_h, out_w,
                layer.size, layer.stride, 0, b);
...
        int n = layer.h*layer.w;
        int k = layer.size*layer.size*layer.n;

-        float *a = layer.filters_gpu;
+        float *a = layer.weights_gpu;
        float *b = layer.col_image_gpu;
        float *c = state.delta + i*n*m;

...

extern "C" void pull_deconvolutional_layer(deconvolutional_layer layer)
{
-    cuda_pull_array(layer.filters_gpu, layer.filters, layer.c*layer.n*layer.size*layer.size);
+    cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
    cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_pull_array(layer.filter_updates_gpu, layer.filter_updates, layer.c*layer.n*layer.size*layer.size);
+    cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
    cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
}

extern "C" void push_deconvolutional_layer(deconvolutional_layer layer)
{
-    cuda_push_array(layer.filters_gpu, layer.filters, layer.c*layer.n*layer.size*layer.size);
+    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_push_array(layer.filter_updates_gpu, layer.filter_updates, layer.c*layer.n*layer.size*layer.size);
+    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
}

...
    axpy_ongpu(layer.n, learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1);
    scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1);

-    axpy_ongpu(size, -decay, layer.filters_gpu, 1, layer.filter_updates_gpu, 1);
-    axpy_ongpu(size, learning_rate, layer.filter_updates_gpu, 1, layer.filters_gpu, 1);
-    scal_ongpu(size, momentum, layer.filter_updates_gpu, 1);
+    axpy_ongpu(size, -decay, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
+    axpy_ongpu(size, learning_rate, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
+    scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);
}

...
    l.stride = stride;
    l.size = size;

-    l.filters = calloc(c*n*size*size, sizeof(float));
-    l.filter_updates = calloc(c*n*size*size, sizeof(float));
+    l.weights = calloc(c*n*size*size, sizeof(float));
+    l.weight_updates = calloc(c*n*size*size, sizeof(float));

    l.biases = calloc(n, sizeof(float));
    l.bias_updates = calloc(n, sizeof(float));
    float scale = 1./sqrt(size*size*c);
-    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_normal();
+    for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal();
    for(i = 0; i < n; ++i){
        l.biases[i] = scale;
    }
...
    l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float));

#ifdef GPU
-    l.filters_gpu = cuda_make_array(l.filters, c*n*size*size);
-    l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size);
+    l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+    l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);

    l.biases_gpu = cuda_make_array(l.biases, n);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
...
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    for(i = 0; i < l.batch; ++i){
-        float *a = l.filters;
+        float *a = l.weights;
        float *b = state.input + i*l.c*l.h*l.w;
        float *c = l.col_image;

...

        float *a = state.input + i*m*n;
        float *b = l.col_image;
-        float *c = l.filter_updates;
+        float *c = l.weight_updates;

        im2col_cpu(l.delta + i*l.n*size, l.n, out_h, out_w,
                l.size, l.stride, 0, b);
...
        int n = l.h*l.w;
        int k = l.size*l.size*l.n;

-        float *a = l.filters;
+        float *a = l.weights;
        float *b = l.col_image;
        float *c = state.delta + i*n*m;

...
    axpy_cpu(l.n, learning_rate, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

-    axpy_cpu(size, -decay, l.filters, 1, l.filter_updates, 1);
-    axpy_cpu(size, learning_rate, l.filter_updates, 1, l.filters, 1);
-    scal_cpu(size, momentum, l.filter_updates, 1);
+    axpy_cpu(size, -decay, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(size, learning_rate, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(size, momentum, l.weight_updates, 1);
}

...
    int box_index = index * (classes + 5);
    boxes[index].x = (predictions[box_index + 0] + col + .5) / side * w;
    boxes[index].y = (predictions[box_index + 1] + row + .5) / side * h;
-    if(1){
+    if(0){
        boxes[index].x = (logistic_activate(predictions[box_index + 0]) + col) / side * w;
        boxes[index].y = (logistic_activate(predictions[box_index + 1]) + row) / side * h;
    }
    boxes[index].w = pow(logistic_activate(predictions[box_index + 2]), (square?2:1)) * w;
    boxes[index].h = pow(logistic_activate(predictions[box_index + 3]), (square?2:1)) * h;
    if(1){
        boxes[index].x = ((col + .5)/side + predictions[box_index + 0] * .5) * w;
        boxes[index].y = ((row + .5)/side + predictions[box_index + 1] * .5) * h;
        boxes[index].w = (exp(predictions[box_index + 2]) * .5) * w;
        boxes[index].h = (exp(predictions[box_index + 3]) * .5) * h;
    }
    for(j = 0; j < classes; ++j){
        int class_index = index * (classes + 5) + 5;
        float prob = scale*predictions[class_index+j];
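Decoding formulas for the branch guarded by if(1) above, written out (p0..p3 are the four raw box predictions at box_index; notation mine):

    x = ((col + 0.5)/side + 0.5*p0) * w,    y = ((row + 0.5)/side + 0.5*p1) * h,
    bw = 0.5*exp(p2) * w,                   bh = 0.5*exp(p3) * h.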
...
    if(l.indexes) free(l.indexes);
    if(l.rand) free(l.rand);
    if(l.cost) free(l.cost);
-    if(l.filters) free(l.filters);
-    if(l.filter_updates) free(l.filter_updates);
    if(l.biases) free(l.biases);
    if(l.bias_updates) free(l.bias_updates);
    if(l.weights) free(l.weights);
...

#ifdef GPU
    if(l.indexes_gpu) cuda_free((float *)l.indexes_gpu);
-    if(l.filters_gpu) cuda_free(l.filters_gpu);
-    if(l.filter_updates_gpu) cuda_free(l.filter_updates_gpu);
+    if(l.weights_gpu) cuda_free(l.weights_gpu);
+    if(l.weight_updates_gpu) cuda_free(l.weight_updates_gpu);
    if(l.col_image_gpu) cuda_free(l.col_image_gpu);
-    if(l.weights_gpu) cuda_free(l.weights_gpu);
    if(l.biases_gpu) cuda_free(l.biases_gpu);
...
    int *indexes;
    float *rand;
    float *cost;
-    float *filters;
-    char *cfilters;
-    float *filter_updates;
+    char *cweights;
    float *state;
    float *prev_state;
    float *forgot_state;
...
    float *concat;
    float *concat_delta;

-    float *binary_filters;
+    float *binary_weights;

    float *biases;
    float *bias_updates;
...
    float * save_delta_gpu;
    float * concat_gpu;
    float * concat_delta_gpu;
-    float * filters_gpu;
-    float * filter_updates_gpu;

    float *binary_input_gpu;
-    float *binary_filters_gpu;
+    float *binary_weights_gpu;

    float * mean_gpu;
    float * variance_gpu;
...
#ifdef CUDNN
    cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
    cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
-    cudnnFilterDescriptor_t filterDesc;
-    cudnnFilterDescriptor_t dfilterDesc;
+    cudnnFilterDescriptor_t weightDesc;
+    cudnnFilterDescriptor_t dweightDesc;
    cudnnConvolutionDescriptor_t convDesc;
    cudnnConvolutionFwdAlgo_t fw_algo;
    cudnnConvolutionBwdDataAlgo_t bd_algo;
...
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

-    l.filters = calloc(c*n*size*size*locations, sizeof(float));
-    l.filter_updates = calloc(c*n*size*size*locations, sizeof(float));
+    l.weights = calloc(c*n*size*size*locations, sizeof(float));
+    l.weight_updates = calloc(c*n*size*size*locations, sizeof(float));

    l.biases = calloc(l.outputs, sizeof(float));
    l.bias_updates = calloc(l.outputs, sizeof(float));

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c));
-    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1,1);
+    for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1,1);

    l.col_image = calloc(out_h*out_w*size*size*c, sizeof(float));
    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
    l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float));

#ifdef GPU
-    l.filters_gpu = cuda_make_array(l.filters, c*n*size*size*locations);
-    l.filter_updates_gpu = cuda_make_array(l.filter_updates, c*n*size*size*locations);
+    l.weights_gpu = cuda_make_array(l.weights, c*n*size*size*locations);
+    l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size*locations);

    l.biases_gpu = cuda_make_array(l.biases, l.outputs);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, l.outputs);
...
                l.size, l.stride, l.pad, l.col_image);
        float *output = l.output + i*l.outputs;
        for(j = 0; j < locations; ++j){
-            float *a = l.filters + j*l.size*l.size*l.c*l.n;
+            float *a = l.weights + j*l.size*l.size*l.c*l.n;
            float *b = l.col_image + j;
            float *c = output + j;

...
        for(j = 0; j < locations; ++j){
            float *a = l.delta + i*l.outputs + j;
            float *b = l.col_image + j;
-            float *c = l.filter_updates + j*l.size*l.size*l.c*l.n;
+            float *c = l.weight_updates + j*l.size*l.size*l.c*l.n;
            int m = l.n;
            int n = l.size*l.size*l.c;
            int k = 1;
...

        if(state.delta){
            for(j = 0; j < locations; ++j){
-                float *a = l.filters + j*l.size*l.size*l.c*l.n;
+                float *a = l.weights + j*l.size*l.size*l.c*l.n;
                float *b = l.delta + i*l.outputs + j;
                float *c = l.col_image + j;

...
    axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

-    axpy_cpu(size, -decay*batch, l.filters, 1, l.filter_updates, 1);
-    axpy_cpu(size, learning_rate/batch, l.filter_updates, 1, l.filters, 1);
-    scal_cpu(size, momentum, l.filter_updates, 1);
+    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(size, momentum, l.weight_updates, 1);
}

#ifdef GPU
...
                l.size, l.stride, l.pad, l.col_image_gpu);
        float *output = l.output_gpu + i*l.outputs;
        for(j = 0; j < locations; ++j){
-            float *a = l.filters_gpu + j*l.size*l.size*l.c*l.n;
+            float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
            float *b = l.col_image_gpu + j;
            float *c = output + j;

...
        for(j = 0; j < locations; ++j){
            float *a = l.delta_gpu + i*l.outputs + j;
            float *b = l.col_image_gpu + j;
-            float *c = l.filter_updates_gpu + j*l.size*l.size*l.c*l.n;
+            float *c = l.weight_updates_gpu + j*l.size*l.size*l.c*l.n;
            int m = l.n;
            int n = l.size*l.size*l.c;
            int k = 1;
...

        if(state.delta){
            for(j = 0; j < locations; ++j){
-                float *a = l.filters_gpu + j*l.size*l.size*l.c*l.n;
+                float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
                float *b = l.delta_gpu + i*l.outputs + j;
                float *c = l.col_image_gpu + j;

...
    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);

-    axpy_ongpu(size, -decay*batch, l.filters_gpu, 1, l.filter_updates_gpu, 1);
-    axpy_ongpu(size, learning_rate/batch, l.filter_updates_gpu, 1, l.filters_gpu, 1);
-    scal_ongpu(size, momentum, l.filter_updates_gpu, 1);
+    axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+    axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+    scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
}

void pull_local_layer(local_layer l)
{
    int locations = l.out_w*l.out_h;
    int size = l.size*l.size*l.c*l.n*locations;
-    cuda_pull_array(l.filters_gpu, l.filters, size);
+    cuda_pull_array(l.weights_gpu, l.weights, size);
    cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
}

...
{
    int locations = l.out_w*l.out_h;
    int size = l.size*l.size*l.c*l.n*locations;
-    cuda_push_array(l.filters_gpu, l.filters, size);
+    cuda_push_array(l.weights_gpu, l.weights, size);
    cuda_push_array(l.biases_gpu, l.biases, l.outputs);
}
#endif
...

float train_network_datum(network net, float *x, float *y)
{
    *net.seen += net.batch;
#ifdef GPU
    if(gpu_index >= 0) return train_network_datum_gpu(net, x, y);
#endif
    network_state state;
    *net.seen += net.batch;
    state.index = 0;
    state.net = net;
    state.input = x;
...
} network_state;

#ifdef GPU
float train_networks(network *nets, int n, data d);
float train_network_datum_gpu(network net, float *x, float *y);
float *network_predict_gpu(network net, float *input);
float * get_network_output_gpu_layer(network net, int i);
...

float train_network_datum_gpu(network net, float *x, float *y)
{
    *net.seen += net.batch;
    forward_backward_network_gpu(net, x, y);
    float error = get_network_cost(net);
    if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net);
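
Note the update cadence: update_network_gpu only runs once every net.subdivisions calls, so gradients accumulate and the effective batch size is

    effective_batch = batch * subdivisions

(with batch=128 and subdivisions=1, as in the cfg above, that is an update every 128 images).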
...
{
    train_args args = *(train_args*)ptr;

-    cudaError_t status = cudaSetDevice(args.net.gpu_index);
-    check_error(status);
+    cuda_set_device(args.net.gpu_index);
    forward_backward_network_gpu(args.net, args.X, args.y);
    free(ptr);
    return 0;
}

-pthread_t train_network_in_thread(train_args args)
+pthread_t train_network_in_thread(network net, float *X, float *y)
{
    pthread_t thread;
    train_args *ptr = (train_args *)calloc(1, sizeof(train_args));
-    *ptr = args;
+    ptr->net = net;
+    ptr->X = X;
+    ptr->y = y;
    if(pthread_create(&thread, 0, train_thread, ptr)) error("Thread creation failed");
    return thread;
}

void pull_updates(layer l)
{
#ifdef GPU
    if(l.type == CONVOLUTIONAL){
        cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
        cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c);
        if(l.scale_updates) cuda_pull_array(l.scale_updates_gpu, l.scale_updates, l.n);
    } else if(l.type == CONNECTED){
        cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
        cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
    }
#endif
}

void push_updates(layer l)
{
#ifdef GPU
    if(l.type == CONVOLUTIONAL){
        cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
        cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c);
        if(l.scale_updates) cuda_push_array(l.scale_updates_gpu, l.scale_updates, l.n);
    } else if(l.type == CONNECTED){
        cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
        cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
    }
#endif
}

void merge_updates(layer l, layer base)
{
    if (l.type == CONVOLUTIONAL) {
        axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1);
        axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weight_updates, 1, base.weight_updates, 1);
        if (l.scale_updates) {
            axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1);
        }
    } else if(l.type == CONNECTED) {
        axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.bias_updates, 1);
        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weight_updates, 1);
    }
}

void distribute_updates(layer l, layer base)
{
    if (l.type == CONVOLUTIONAL) {
        copy_cpu(l.n, base.bias_updates, 1, l.bias_updates, 1);
        copy_cpu(l.n*l.size*l.size*l.c, base.weight_updates, 1, l.weight_updates, 1);
        if (l.scale_updates) {
            copy_cpu(l.n, base.scale_updates, 1, l.scale_updates, 1);
        }
    } else if(l.type == CONNECTED) {
        copy_cpu(l.outputs, base.bias_updates, 1, l.bias_updates, 1);
        copy_cpu(l.outputs*l.inputs, base.weight_updates, 1, l.weight_updates, 1);
    }
}

void sync_updates(network *nets, int n)
{
    int i,j;
    int layers = nets[0].n;
    network net = nets[0];
    for (j = 0; j < layers; ++j) {
        layer base = net.layers[j];
        cuda_set_device(net.gpu_index);
        pull_updates(base);
        for (i = 1; i < n; ++i) {
            cuda_set_device(nets[i].gpu_index);
            layer l = nets[i].layers[j];
            pull_updates(l);
            merge_updates(l, base);
        }
        for (i = 1; i < n; ++i) {
            cuda_set_device(nets[i].gpu_index);
            layer l = nets[i].layers[j];
            distribute_updates(l, base);
            push_updates(l);
        }
        cuda_set_device(net.gpu_index);
        push_updates(base);
    }
    for (i = 0; i < n; ++i) {
        cuda_set_device(nets[i].gpu_index);
        if(i > 0) nets[i].momentum = 0;
        update_network_gpu(nets[i]);
    }
}

float train_networks(network *nets, int n, data d)
{
    int batch = nets[0].batch;
    assert(batch * n == d.X.rows);
    assert(nets[0].subdivisions % n == 0);
    float **X = (float **) calloc(n, sizeof(float *));
    float **y = (float **) calloc(n, sizeof(float *));
    pthread_t *threads = (pthread_t *) calloc(n, sizeof(pthread_t));
...
        X[i] = (float *) calloc(batch*d.X.cols, sizeof(float));
        y[i] = (float *) calloc(batch*d.y.cols, sizeof(float));
        get_next_batch(d, batch, i*batch, X[i], y[i]);
-        float err = train_network_datum(nets[i], X[i], y[i]);
-        sum += err;
+        threads[i] = train_network_in_thread(nets[i], X[i], y[i]);
    }
    for(i = 0; i < n; ++i){
        pthread_join(threads[i], 0);
        *nets[i].seen += n*nets[i].batch;
        printf("%f\n", get_network_cost(nets[i]) / batch);
        sum += get_network_cost(nets[i]);
        free(X[i]);
        free(y[i]);
    }
    if (((*nets[0].seen) / nets[0].batch) % nets[0].subdivisions == 0) sync_updates(nets, n);
    free(X);
    free(y);
    free(threads);
    return (float)sum/(n*batch);
}
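
Bookkeeping for the multi-GPU step above: each of the n networks trains on one batch, so d.X.rows must equal n*batch (the first assert), every net's seen counter advances by n*batch, and the return value

    loss = (sum of per-net costs) / (n * batch)

is the average cost per image across all GPUs.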

...
    node *n = sections->front;
    if(!n) error("Config file has no sections");
    network net = make_network(sections->size - 1);
    net.gpu_index = gpu_index;
    size_params params;

    section *s = (section *)n->val;
...

    for (j = 0; j < l.n; ++j){
        int index = j*l.c*l.size*l.size;
-        fwrite(l.filters+index, sizeof(float), l.c*l.size*l.size, fp);
+        fwrite(l.weights+index, sizeof(float), l.c*l.size*l.size, fp);
        for (k = 0; k < l.c*l.size*l.size; ++k) fwrite(&zero, sizeof(float), 1, fp);
    }
    for (j = 0; j < l.n; ++j){
        int index = j*l.c*l.size*l.size;
        for (k = 0; k < l.c*l.size*l.size; ++k) fwrite(&zero, sizeof(float), 1, fp);
-        fwrite(l.filters+index, sizeof(float), l.c*l.size*l.size, fp);
+        fwrite(l.weights+index, sizeof(float), l.c*l.size*l.size, fp);
    }
}
}
...
        pull_convolutional_layer(l);
    }
#endif
-    binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
+    binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
    int size = l.c*l.size*l.size;
    int i, j, k;
    fwrite(l.biases, sizeof(float), l.n, fp);
...
        fwrite(l.rolling_variance, sizeof(float), l.n, fp);
    }
    for(i = 0; i < l.n; ++i){
-        float mean = l.binary_filters[i*size];
+        float mean = l.binary_weights[i*size];
        if(mean < 0) mean = -mean;
        fwrite(&mean, sizeof(float), 1, fp);
        for(j = 0; j < size/8; ++j){
...
            unsigned char c = 0;
            for(k = 0; k < 8; ++k){
                if (j*8 + k >= size) break;
-                if (l.binary_filters[index + k] > 0) c = (c | 1<<k);
+                if (l.binary_weights[index + k] > 0) c = (c | 1<<k);
            }
            fwrite(&c, sizeof(char), 1, fp);
        }
...
        fwrite(l.rolling_mean, sizeof(float), l.n, fp);
        fwrite(l.rolling_variance, sizeof(float), l.n, fp);
    }
-    fwrite(l.filters, sizeof(float), num, fp);
+    fwrite(l.weights, sizeof(float), num, fp);
}

void save_batchnorm_weights(layer l, FILE *fp)
...

void save_weights_upto(network net, char *filename, int cutoff)
{
#ifdef GPU
    cuda_set_device(net.gpu_index);
#endif
    fprintf(stderr, "Saving weights to %s\n", filename);
    FILE *fp = fopen(filename, "w");
    if(!fp) file_error(filename);
...
        int locations = l.out_w*l.out_h;
        int size = l.size*l.size*l.c*l.n*locations;
        fwrite(l.biases, sizeof(float), l.outputs, fp);
-        fwrite(l.filters, sizeof(float), size, fp);
+        fwrite(l.weights, sizeof(float), size, fp);
    }
}
fclose(fp);
...
            fread(&c, sizeof(char), 1, fp);
            for(k = 0; k < 8; ++k){
                if (j*8 + k >= size) break;
-                l.filters[index + k] = (c & 1<<k) ? mean : -mean;
+                l.weights[index + k] = (c & 1<<k) ? mean : -mean;
            }
        }
...
        fread(l.rolling_mean, sizeof(float), l.n, fp);
        fread(l.rolling_variance, sizeof(float), l.n, fp);
    }
-    fread(l.filters, sizeof(float), num, fp);
-    //if(l.c == 3) scal_cpu(num, 1./256, l.filters, 1);
+    fread(l.weights, sizeof(float), num, fp);
+    //if(l.c == 3) scal_cpu(num, 1./256, l.weights, 1);
    if (l.flipped) {
-        transpose_matrix(l.filters, l.c*l.size*l.size, l.n);
+        transpose_matrix(l.weights, l.c*l.size*l.size, l.n);
    }
-    //if (l.binary) binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.filters);
+    //if (l.binary) binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.weights);
#ifdef GPU
    if(gpu_index >= 0){
        push_convolutional_layer(l);
...

void load_weights_upto(network *net, char *filename, int cutoff)
{
#ifdef GPU
    cuda_set_device(net->gpu_index);
#endif
    fprintf(stderr, "Loading weights from %s...", filename);
    fflush(stdout);
    FILE *fp = fopen(filename, "rb");
...
    if(l.type == DECONVOLUTIONAL){
        int num = l.n*l.c*l.size*l.size;
        fread(l.biases, sizeof(float), l.n, fp);
-        fread(l.filters, sizeof(float), num, fp);
+        fread(l.weights, sizeof(float), num, fp);
#ifdef GPU
        if(gpu_index >= 0){
            push_deconvolutional_layer(l);
...
        int locations = l.out_w*l.out_h;
        int size = l.size*l.size*l.c*l.n*locations;
        fread(l.biases, sizeof(float), l.outputs, fp);
-        fread(l.filters, sizeof(float), size, fp);
+        fread(l.weights, sizeof(float), size, fp);
#ifdef GPU
        if(gpu_index >= 0){
            push_local_layer(l);
...
    l.classes = classes;
    l.coords = coords;
    l.cost = calloc(1, sizeof(float));
    l.biases = calloc(n*2, sizeof(float));
    l.bias_updates = calloc(n*2, sizeof(float));
    l.outputs = h*w*n*(classes + coords + 1);
    l.inputs = l.outputs;
    l.truths = 30*(5);
    l.delta = calloc(batch*l.outputs, sizeof(float));
    l.output = calloc(batch*l.outputs, sizeof(float));
    int i;
    for(i = 0; i < n*2; ++i){
        l.biases[i] = .5;
    }

#ifdef GPU
    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
...
    return l;
}

-box get_region_box2(float *x, int index, int i, int j, int w, int h)
+box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h)
{
-    float aspect = exp(x[index+0]);
-    float scale = logistic_activate(x[index+1]);
-    float move_x = x[index+2];
-    float move_y = x[index+3];

    box b;
-    b.w = sqrt(scale * aspect);
-    b.h = b.w * 1./aspect;
-    b.x = move_x * b.w + (i + .5)/w;
-    b.y = move_y * b.h + (j + .5)/h;
+    b.x = (i + .5)/w + x[index + 0] * biases[2*n];
+    b.y = (j + .5)/h + x[index + 1] * biases[2*n + 1];
+    b.w = exp(x[index + 2]) * biases[2*n];
+    b.h = exp(x[index + 3]) * biases[2*n+1];
    return b;
}

-float delta_region_box2(box truth, float *output, int index, int i, int j, int w, int h, float *delta)
+float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale)
{
-    box pred = get_region_box2(output, index, i, j, w, h);
-    float iou = box_iou(pred, truth);
-    float true_aspect = truth.w/truth.h;
-    float true_scale = truth.w*truth.h;

-    float true_dx = (truth.x - (i+.5)/w) / truth.w;
-    float true_dy = (truth.y - (j+.5)/h) / truth.h;
-    delta[index + 0] = (true_aspect - exp(output[index + 0])) * exp(output[index + 0]);
-    delta[index + 1] = (true_scale - logistic_activate(output[index + 1])) * logistic_gradient(logistic_activate(output[index + 1]));
-    delta[index + 2] = true_dx - output[index + 2];
-    delta[index + 3] = true_dy - output[index + 3];
-    return iou;
-}

-box get_region_box(float *x, int index, int i, int j, int w, int h, int adjust, int logistic)
-{
-    box b;
-    b.x = (x[index + 0] + i + .5)/w;
-    b.y = (x[index + 1] + j + .5)/h;
-    b.w = x[index + 2];
-    b.h = x[index + 3];
-    if(logistic){
-        b.w = logistic_activate(x[index + 2]);
-        b.h = logistic_activate(x[index + 3]);
-    }
-    if(adjust && b.w < .01) b.w = .01;
-    if(adjust && b.h < .01) b.h = .01;
-    return b;
-}

-float delta_region_box(box truth, float *output, int index, int i, int j, int w, int h, float *delta, int logistic, float scale)
-{
-    box pred = get_region_box(output, index, i, j, w, h, 0, logistic);
+    box pred = get_region_box(x, biases, n, index, i, j, w, h);
    float iou = box_iou(pred, truth);

-    delta[index + 0] = scale * (truth.x - pred.x);
-    delta[index + 1] = scale * (truth.y - pred.y);
-    delta[index + 2] = scale * ((truth.w - pred.w)*(logistic ? logistic_gradient(pred.w) : 1));
-    delta[index + 3] = scale * ((truth.h - pred.h)*(logistic ? logistic_gradient(pred.h) : 1));
+    float tx = (truth.x - (i + .5)/w) / biases[2*n];
+    float ty = (truth.y - (j + .5)/h) / biases[2*n + 1];
+    float tw = log(truth.w / biases[2*n]);
+    float th = log(truth.h / biases[2*n + 1]);

+    delta[index + 0] = scale * (tx - x[index + 0]);
+    delta[index + 1] = scale * (ty - x[index + 1]);
+    delta[index + 2] = scale * (tw - x[index + 2]);
+    delta[index + 3] = scale * (th - x[index + 3]);
    return iou;
}
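
The new parameterization predicts offsets relative to per-anchor priors p_w = biases[2n], p_h = biases[2n+1] (notation mine):

    b_x = (i + 0.5)/w + t_x * p_w,    b_y = (j + 0.5)/h + t_y * p_h,
    b_w = p_w * exp(t_w),             b_h = p_h * exp(t_h),

and delta_region_box inverts these to get targets (tx, ty, tw, th) from the ground truth, penalizing scale * (target - prediction) per coordinate.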

...
    return (x != x);
}

-#define LOG 1
+#define LOG 0

void forward_region_layer(const region_layer l, network_state state)
{
...
    if(!state.train) return;
    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
    float avg_iou = 0;
    float recall = 0;
    float avg_cat = 0;
    float avg_obj = 0;
    float avg_anyobj = 0;
...
        for (i = 0; i < l.w; ++i) {
            for (n = 0; n < l.n; ++n) {
                int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
-                box pred = get_region_box(l.output, index, i, j, l.w, l.h, 1, LOG);
+                box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
                float best_iou = 0;
                for(t = 0; t < 30; ++t){
                    box truth = float_to_box(state.truth + t*5 + b*l.truths);
...
                    truth.y = (j + .5)/l.h;
                    truth.w = .5;
                    truth.h = .5;
-                    delta_region_box(truth, l.output, index, i, j, l.w, l.h, l.delta, LOG, 1);
+                    delta_region_box(truth, l.output, l.biases, n, index, i, j, l.w, l.h, l.delta, .01);
                    //l.delta[index + 0] = .1 * (0 - l.output[index + 0]);
                    //l.delta[index + 1] = .1 * (0 - l.output[index + 1]);
                    //l.delta[index + 2] = .1 * (0 - l.output[index + 2]);
                    //l.delta[index + 3] = .1 * (0 - l.output[index + 3]);
                }
            }
        }
...
            printf("index %d %d\n",i, j);
            for(n = 0; n < l.n; ++n){
                int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
-                box pred = get_region_box(l.output, index, i, j, l.w, l.h, 1, LOG);
-                printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h);
+                box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
+                printf("pred: (%f, %f) %f x %f\n", pred.x*l.w - i - .5, pred.y * l.h - j - .5, pred.w, pred.h);
                pred.x = 0;
                pred.y = 0;
                float iou = box_iou(pred, truth_shift);
...
                    best_n = n;
                }
            }
-            printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h);
+            printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x * l.w - i - .5, truth.y*l.h - j - .5, truth.w, truth.h);

-            float iou = delta_region_box(truth, l.output, best_index, i, j, l.w, l.h, l.delta, LOG, l.coord_scale);
+            float iou = delta_region_box(truth, l.output, l.biases, best_n, best_index, i, j, l.w, l.h, l.delta, l.coord_scale);
            if(iou > .5) recall += 1;
            avg_iou += iou;

            //l.delta[best_index + 4] = iou - l.output[best_index + 4];
...
    printf("\n");
    reorg(l.delta, l.w*l.h, size*l.n, l.batch, 0);
    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
-    printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), count);
+    printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count);
}

void backward_region_layer(const region_layer l, network_state state)