~speedprog/mtg/mtg_card_detector.git

parent: d790f21c | patch | commit | ignore whitespace

Joseph Redmon

2016-06-06 8a767f106677b78a389e1ceffc066501015ec51a

stuff for carlo

4 files modified

8 files added

	ai2.mk	79 ●●●●● patch \| view \| raw \| blame \| history
	cfg/xyolo.test.cfg	148 ●●●●● patch \| view \| raw \| blame \| history
	src/binary_convolution.c	598 ●●●●● patch \| view \| raw \| blame \| history
	src/binary_convolution.h	218 ●●●●● patch \| view \| raw \| blame \| history
	src/common.c	81 ●●●●● patch \| view \| raw \| blame \| history
	src/common.h	50 ●●●●● patch \| view \| raw \| blame \| history
	src/convolutional_layer.c	127 ●●●●● patch \| view \| raw \| blame \| history
	src/layer.h	2 ●●●●● patch \| view \| raw \| blame \| history
	src/parser.c	3 ●●●●● patch \| view \| raw \| blame \| history
	src/xnor_layer.c	86 ●●●●● patch \| view \| raw \| blame \| history
	src/xnor_layer.h	11 ●●●●● patch \| view \| raw \| blame \| history
	src/yolo.c	2 ●●●●● patch \| view \| raw \| blame \| history

 ai2.mk

New file
@@ -0,0 +1,79 @@
GPU=0
CUDNN=0
OPENCV=0
DEBUG=0
AI2=1

ARCH= --gpu-architecture=compute_52 --gpu-code=compute_52 

VPATH=./src/
EXEC=darknet
OBJDIR=./obj/

CC=gcc -std=gnu11
NVCC=nvcc
OPTS=-Ofast
LDFLAGS= -lm -pthread 
COMMON= 
CFLAGS=-Wall -Wfatal-errors 

ifeq ($(DEBUG), 1) 
OPTS=-O0 -g
endif

CFLAGS+=$(OPTS)

ifeq ($(OPENCV), 1) 
COMMON+= -DOPENCV
CFLAGS+= -DOPENCV
LDFLAGS+= `pkg-config --libs opencv` 
COMMON+= `pkg-config --cflags opencv` 
endif

ifeq ($(AI2), 1) 
COMMON+= -DAI2
CFLAGS+= -DAI2
endif

ifeq ($(GPU), 1) 
COMMON+= -DGPU -I/usr/local/cuda/include/
CFLAGS+= -DGPU
LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
endif

ifeq ($(CUDNN), 1) 
COMMON+= -DCUDNN 
CFLAGS+= -DCUDNN
LDFLAGS+= -lcudnn
endif

OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o coco_demo.o tag.o cifar.o yolo_demo.o go.o batchnorm_layer.o art.o xnor_layer.o common.o binary_convolution.o
ifeq ($(GPU), 1) 
LDFLAGS+= -lstdc++ 
OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o
endif

OBJS = $(addprefix $(OBJDIR), $(OBJ))
DEPS = $(wildcard src/*.h) Makefile

all: obj results $(EXEC)

$(EXEC): $(OBJS)
    $(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS)

$(OBJDIR)%.o: %.c $(DEPS)
    $(CC) $(COMMON) $(CFLAGS) -c $< -o $@

$(OBJDIR)%.o: %.cu $(DEPS)
    $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@

obj:
    mkdir -p obj
results:
    mkdir -p results

.PHONY: clean

clean:
    rm -rf $(OBJS) $(EXEC)


 cfg/xyolo.test.cfg

New file
@@ -0,0 +1,148 @@
[net]
batch=1
subdivisions=1
height=448
width=448
channels=3
momentum=0.9
decay=0.0005

learning_rate=0.0001
policy=steps
steps=20,40,60,80,20000,30000
scales=5,5,2,2,.1,.1
max_batches = 40000

[crop]
crop_width=448
crop_height=448
flip=0
angle=0
saturation = 1.5
exposure = 1.5
noadjust=1

[convolutional]
batch_normalize=1
filters=16
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[batchnorm]

[convolutional]
xnor = 1
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[batchnorm]

[convolutional]
xnor = 1
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[batchnorm]

[convolutional]
xnor = 1
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[batchnorm]

[convolutional]
xnor = 1
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[batchnorm]

[convolutional]
xnor = 1
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[batchnorm]

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=128
activation=leaky

[connected]
output= 1470
activation=linear

[detection]
classes=20
coords=4
rescore=1
side=7
num=2
softmax=0
sqrt=1
jitter=.2

object_scale=1
noobject_scale=.5
class_scale=1
coord_scale=5


 src/binary_convolution.c

New file
@@ -0,0 +1,598 @@
#include "binary_convolution.h"

int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim) {     // TODO unroll
    int accumulator = 0;
    for (int z = 0; z < vdim.z / BITS_PER_BINARY_WORD; z++) {
        for (int y = 0; y < vdim.y; y++) {
            for (int x = 0; x < vdim.x; x++) {
                int idx = z*vdim.y*vdim.x + y*vdim.x + x;
                accumulator += __builtin_popcount(~(a[idx] ^ b[idx]));   // count the XNOR of the two bit vectors
            }
        }
    }

    return accumulator;
}

/** 
 * Pre-conditions: 
 *                  alpha_volume is an array of size x*y*z.
 *                  alpha_plane is an array of size x*y.
 *                  alpha_volume (x,y,z) is transposed to (z,x,y).
 */
void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim) {
    for (int y = 0; y < vdim.y; ++y) {
        for (int x = 0; x < vdim.x; ++x) {
            int out = y * vdim.x + x;
            double accum = 0.0;
            for (int z = 0; z < vdim.z; ++z) {
                accum += alpha_volume[out * vdim.z + z];
            }

            alpha_plane[out] = accum / vdim.z;
        }
    }
}

/** @brief Wrapper function for generating the beta scaling factor */
void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim) {
    ai2_calc_alpha(beta_plane, beta_volume, vdim);
}

/** @brief Set the bit in a binary word */
void ai2_bitset(BINARY_WORD *bword, unsigned int position) {
    BINARY_WORD mask = (1 << position);
    *bword = *bword | mask;
}

/** @brief Checks that the bit is set in a binary word */
int ai2_is_set(BINARY_WORD bword, unsigned int position) {
    unsigned int position_complement = (BITS_PER_BINARY_WORD - 1) - position;   // number of leading bits before the bit position of interest
    bword = (bword << position_complement);                                     // zero out leading bits
    bword = (bword >> (BITS_PER_BINARY_WORD - 1));                              // shift bit position of interest to the 0th position
    return (bword & 0x1);                                                       // test if bit position of interest is set
}

void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 dim) {
    ai2_transpose3D(real_vol, dim); // (x,y,z) -> (z,x,y)

    int sz = dim.x * dim.y * dim.z;
    for (int i = 0; i < sz; i += BITS_PER_BINARY_WORD) {
        BINARY_WORD tmp = 0x00000000;
        for (int x = 0; x < BITS_PER_BINARY_WORD; ++x) {
            int waddr = x + i;
            if (signbit(real_vol[waddr]) == 0)
                ai2_bitset(&tmp, (BITS_PER_BINARY_WORD - 1) - x);
        }
        binary_vol[i / BITS_PER_BINARY_WORD] = tmp;
    }
}

void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 dim) {   // TODO unit tests
    for (int z = 0; z < dim.z; z++) {
        for (int y = 0; y < dim.y; y++) {
            for (int x = 0; x < dim.x / BITS_PER_BINARY_WORD; x++) {    // TODO boundary checks, for uneven input
                BINARY_WORD word = binary_vol[z*dim.y*dim.x + y*dim.x + x];
                for (int t = 0; t < BITS_PER_BINARY_WORD; ++t) {
                    int oidx = z*dim.y*dim.x + y*dim.x + x * BITS_PER_BINARY_WORD + t;
                    if (ai2_is_set(word, t))
                        real_vol[oidx] = 1.f;
                    else
                        real_vol[oidx] = -1.f;
                }
            }
        }
    }

    // Transpose channels back to output
    ai2_transpose3D(real_vol, dim); // (z,y,x) -> (x,y,z)
}

/* @brief: input is padded.
 */
void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride) {

    int r, rd, c, cd;
    int wx_2 = wx / 2;
    int wy_2 = wy / 2;

    // Indexing for output pixels. x = [wx_2, ix + wx_2 - 1], y = [wy_2, iy + wy_2 - 1]
    int sx = pad;               // start x
    int ex = ix + pad - 1;      // end x
    int sy = pad;               // start y
    int ey = iy + pad - 1;      // end y

    // Indexing for weights
    int wsx, wex, wsy, wey;
    if (wx % 2 == 1) {  // odd weights
        wsx = -wx_2; wex = wx_2 + 1;
        wsy = -wy_2; wey = wy_2 + 1;    
    }
    else {
        wsx = -wx_2; wex = wx_2;
        wsy = -wy_2; wey = wy_2;    
    }

    int px = ix + 2*pad;
    //int py = iy + 2*pad;

    for (r = sy; r <= ey; ++r) {
        for (c = sx; c <= ex; ++c) {
            int accumulator = 0;
            for (rd = wsy; rd < wey; ++rd) {
                for (cd = wsx; cd < wex; ++cd) {
                    int iidx = (r+rd)*px + (c+cd);
                    BINARY_WORD pixel = input[iidx];
                    //BINARY_WORD pixel = 0xFFFFFFFF;
                    //BINARY_WORD weight = 0xFFFFFFFF;
                    int widx = (rd + wy_2)*wx + (cd+wx_2);
                    BINARY_WORD weight = weights[widx];
                    accumulator += __builtin_popcount(~(pixel ^ weight));
                }
            }

            // Padded space
            int oidx = r*px + c;
            output[oidx] += (float) accumulator;
        }
    }

    //for (r = sy; r <= ey; ++r) {
    //  for (c = sx; c <= ex; ++c) {
    //      int accumulator = 0;
    //      for (rd = -wy_2; rd < wy_2; ++rd) {
    //          for (cd = -wx_2; cd < wx_2; ++cd) {
    //              int iidx = (r+rd)*px + (c+cd);
    //              BINARY_WORD pixel = input[iidx];
    //              //BINARY_WORD pixel = 0xFFFFFFFF;
    //              //BINARY_WORD weight = 0xFFFFFFFF;
    //              int widx = (rd + wy_2)*wx + (cd+wx_2);
    //              BINARY_WORD weight = weights[widx];
    //              accumulator += __builtin_popcount(~(pixel ^ weight));
    //          }
    //      }

    //      // Padded space
    //      int oidx = r*px + c;
    //      output[oidx] += (float) accumulator;
    //  }
    //}
    
    //ai2_bin_conv_within_boundary(output, input, weights, ix, iy, wx, wy, stride);
    //ai2_bin_conv_borders(output, input, weights, ix, iy, wx, wy, stride);
}

void ai2_pointwise_mul_mm(float *output, const float *input, int N) {
    int i = 0;

    while (i + 8 <= N) {
        output[i+0] *= input[i+0];
        output[i+1] *= input[i+1];
        output[i+2] *= input[i+2];
        output[i+3] *= input[i+3];
        output[i+4] *= input[i+4];
        output[i+5] *= input[i+5];
        output[i+6] *= input[i+6];
        output[i+7] *= input[i+7];

        i += 8;
    }

    while (++i < N) // Finish iteration that's leftover (e.g., last batch not divisible by 8 exactly)
         output[i] *= input[i];
}

/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors
 *  Pre-conditions: wx < ix, and wy < iy
 */
void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad) {
    // Slower version
//      for (int y = 0; y < iy; ++y) 
//          for (int x = 0; x < ix; x++)
//              output[y*ix+x] *= input[(y % wy)*wx + (x % wx)];

    // Stride prefetch optimized
    for (int s = 0; s < wy; ++s) {  // for each strip
        const float *strip_ptr = &alpha[s*wx];
        for (int y = pad; y < pad + (iy / wy); ++y) {   //
            int stride = y*((ix+2*pad)*wy) + s*(ix+2*pad);
            float *output_ptr = &output[stride];

            for (int x = 0; x < ix; ++x) {
                output_ptr[x] *= strip_ptr[x % wx];
            }
        }
    }
}

void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input) {
    if (new_input != NULL) {
        if (layer->input != NULL)
            free(layer->input);
        layer->input = new_input;

        dim3 dim;
        dim.x = layer->px;
        dim.y = layer->py;
        dim.z = layer->c;

        // Binarize input
        ai2_flt_to_bin(layer->binary_input, layer->input, dim);

        float *new_beta = (float *) calloc (dim.x * dim.y, sizeof(float));
        ai2_setFltBeta(layer, new_beta);

        // layer->input is transposed to (z,x,y) already
        ai2_calc_beta(layer->beta, layer->input, dim);
    }
}

void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input) {
    if (new_input != NULL) {
        if (layer->binary_input != NULL)
            free(layer->binary_input);
        layer->binary_input = new_input;
    }
}

void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights) {
    if (new_weights != NULL) {
        if (layer->weights != NULL)
            free(layer->weights);
        layer->weights = new_weights;

        dim3 dim;
        dim.x = layer->wx;
        dim.y = layer->wy;
        dim.z = layer->c;

        ai2_flt_to_bin(layer->binary_weights, layer->weights, dim);

        // Calculate alpha
        if (layer->alpha != NULL)
            free(layer->alpha);

        layer->alpha = (float *) calloc (dim.x * dim.y, sizeof(float));
        // layer->weights is already transposed to (z,x,y) from ai2_flt_to_bin()
        ai2_calc_alpha(layer->alpha, layer->weights, dim);
    }
}

void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights) {
    if (new_weights != NULL) {
        if (layer->binary_weights != NULL)
            free(layer->binary_weights);
        layer->binary_weights = new_weights;
    }
}

void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output) {
    if (new_output != NULL) {
        if (layer->output != NULL)
            free(layer->output);
        layer->output = new_output;
    }
}

void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output) {
    if (new_output != NULL) {
        if (layer->binary_output != NULL)
            free(layer->binary_output);
        layer->binary_output = new_output;
    }
}

void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha) {
    if (new_alpha != NULL) {
        if (layer->alpha != NULL)
            free(layer->alpha);
        layer->alpha = new_alpha;
    }
}

void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta) {
    if (new_beta != NULL) {
        if (layer->beta != NULL)
            free(layer->beta);
        layer->beta = new_beta;
    }
}

void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta) {
    if (new_new_beta != NULL) {
        if (layer->new_beta != NULL)
            free(layer->new_beta);
        layer->new_beta = new_new_beta;
    }
}

float* ai2_getFltOutput(ai2_bin_conv_layer *layer) {
    //if (layer->output != NULL && layer->binary_output != NULL) {
    if (layer->output != NULL) {

        // The idea here was that all intermediate states are stored in the binary output. 
        // Whenever the user needs the real-valued output, the conversion happens at this function call.
        //dim3 dim;
        //dim.x = layer->px;
        //dim.y = layer->py;
        //dim.z = layer->batch;
        //ai2_bin_to_flt(layer->output, layer->binary_output, dim);

        return layer->output;
    }
    else
        return NULL;
}

void ai2_transpose3D(float *data, dim3 d) {
    // Slow transpose for correctness

    // (x,y,z) becomes (z,x,y). Requires two transposes:
    //  (x,y,z) -> (x,z,y).
    //  (x,z,y) -> (z,x,y).

    // Intermediate buffer
    float *new_data = (float *) calloc (d.x * d.y * d.z, sizeof(float));

    // Transpose y and z axis.
    // (x,y,z) -> (x,z,y);
    for (int y = 0; y < d.y; ++y) {
        for (int z = 0; z < d.z; ++z) {
            for (int x = 0; x < d.x; ++x) {
                new_data[y*d.x*d.z + z*d.x + x] = data[z*d.x*d.y + y*d.x + x];
                //new_data[z*d.y*d.x + y*d.x + x] = data[y*d.x*d.z + z*d.x + x];
            }
        }
    }

    // Transpose x and z axis.
    //  (x,z,y) -> (z,x,y)
    for (int y = 0; y < d.y; ++y) {
        for (int x = 0; x < d.x; ++x) {
            for (int z = 0; z < d.z; ++z) {
                data[y*d.z*d.x + x*d.z + z] = new_data[y*d.x*d.z + x + z*d.x];
            }
        }
    }

    free(new_data);
}

int ai2_isFloatWhole(float f) { // TODO unit test
    return (ceilf(f) == f) ? 1 : 0;
}

/* @brief Initialize and create all memory arrays for this layer
 * b - batches (number of filter batches)
 * c - input channels
 * ix - input width
 * iy - input height
 * wx - weight/filter width
 * wy - weight/filter height
 * s - stride between sliding windows
 * pad - the amount of padding
 */
ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad) {
    // http://cs231n.github.io/convolutional-networks/
    //  See: spatial arrangement section for determining what the output size will be
    float output_size = ((ix - wx + 2 * pad) / s) + 1;
    if (ai2_isFloatWhole(output_size) == 0) {
        fprintf(stderr, "ERROR! conv layer of (b,c,ix,iy,s,pad) = (%d, %d, %d, %d, %d, %d) will give "
            " invalid output dimension: %fx%f\n", b, c, ix, iy, s, pad, output_size, output_size);
        exit(1);
    }

    // TODO: Support strided output
    if (s != 1) {
        fprintf(stderr, "ERROR! Only stride values of 1 is supported\n");
        exit(1);
    }

    // padded input size
    int px = (int) ix + 2*pad; 
    int py = (int) iy + 2*pad;

    ai2_bin_conv_layer l = {0}; // initialize all to 0
    l.input = (float *) calloc (c * px * py, sizeof(float));        // is padded
    l.binary_input =   (BINARY_WORD *) calloc (c * px * py / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD));     // is padded

    dim3 dim;
    dim.x = px;
    dim.y = py;
    dim.z = c;
    ai2_flt_to_bin(l.binary_input, l.input, dim);

    l.weights = (float *) calloc (b * c * wx * wy, sizeof(float));  
    l.binary_weights = (BINARY_WORD *) calloc (b * c * wx * wy / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD));

    l.output = (float *) calloc (c * px * py, sizeof(float));   // is padded
    l.new_beta = (float *) calloc(px * py, sizeof(float));      // is padded

    l.batch = b;
    l.c = c;
    l.h = iy;
    l.w = ix;
    l.stride = s;
    l.pad = pad;
    l.px = px;
    l.py = py;
    l.wx = wx;
    l.wy = wy;

    // The following parameters are uninitialized and should be set elsewhere:
    //  l.beta  - padded
    //  l.alpha - not padded

    return l;
}

void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer) {
    if (layer->input) free (layer->input);
    if (layer->binary_input) free(layer->binary_input);
    if (layer->weights) free (layer->weights);
    if (layer->binary_weights) free(layer->binary_weights);
    if (layer->output) free(layer->output);
    if (layer->binary_output) free (layer->binary_output);
    if (layer->alpha) free(layer->alpha);
    if (layer->beta) free(layer->beta);
    if (layer->new_beta) free(layer->new_beta);
}

void ai2_throw_error(char *str) {
    fprintf(stderr, "ERROR: %s\n", str);
    exit(1);
}

void ai2_bin_forward(ai2_bin_conv_layer *l) {
    if (l->input == NULL) ai2_throw_error("Input was not allocated and set in this layer");
    if (l->weights == NULL) ai2_throw_error("Weights was not allocated and set in this layer");
    if (l->output == NULL) ai2_throw_error("Output was not allocated and set in this layer");
    if (l->alpha == NULL) ai2_throw_error("Alpha was not allocated and set in this layer");
    if (l->beta == NULL) ai2_throw_error("Beta was not allocated and set in this layer");

    if (l->c % 32 != 0) ai2_throw_error("Channel is not divisible by 32. Need to implement mask "
                                        "before supporting arbitrary channel size. For now, "
                                        "set the channel size to the nearest multiple of 32 "
                                        "and ignore any ''extra'' channels unused.");

    l->c /= BITS_PER_BINARY_WORD;   // For compensating with doing more work per word

    float *output = l->output;
    float *alpha = l->alpha;
    float *beta = l->beta;
    int px = l->px;
    int py = l->py;
    BINARY_WORD *binary_weights = l->binary_weights;

    for (int z = 0; z < l->batch; ++z) {    // for each filter map
        BINARY_WORD *binary_input = l->binary_input;
        for (int c = 0; c < l->c; ++c) {    // for each input channel
            ai2_bin_conv2D(output, binary_input, binary_weights, l->w, l->h, l->wx, l->wy, l->pad, l->stride);
            binary_input += px*py;   // increment with next 2D plane
            binary_weights += l->wx*l->wy;       // increment with next 2D plane

            ai2_pointwise_mul_mm(output, beta, px*py);  
            ai2_pointwise_mul_mm_2d(output, alpha, l->w, l->h, l->wx, l->wy, l->pad);
        }
    }
}

// Deprecated
//double ai2_bin_conv_benchmark(ConvolutionArgs conv_args) {
//    printf("Running Binary Convolution test!\n");
//
//    size_t ix, iy, iz, wx, wy, wz, L, stride;
//    ix = conv_args.input.x;
//    iy = conv_args.input.y;
//    iz = conv_args.input.z;
//    wx = conv_args.weights.x;
//    wy = conv_args.weights.y;
//    wz = conv_args.weights.z;
//  L = BITS_PER_BINARY_WORD;
//  stride = 1;
//
//    printf("Input size (num elements, xyz): %zu %zu %zu\n", ix, iy, iz);
//    printf("Weights size (num elements. xyz): %zu %zu %zu\n", wx, wy, wz);
//
//    double sz_input_elements = ix * iy * iz;
//    double sz_input_bytes = getSizeBytesBinaryArray(conv_args.input);
//    double sz_weight_bytes = getSizeBytesBinaryArray(conv_args.weights);
//
//    printf("Input Size (MB): %f\n", sz_input_bytes / (1 << 20));
//    printf("Weight Size (MB): %f\n", sz_weight_bytes / (1 << 20));
//
//    BINARY_WORD *binary_input = mallocBinaryVolume(conv_args.input);
//    BINARY_WORD *binary_weights = mallocBinaryVolume(conv_args.weights);
//    BINARY_WORD *b_input = binary_input;    // alias
//    BINARY_WORD *b_weight = binary_weights; // alias
//    float *output = mallocFloatVolume(conv_args.output);
//  float *output_ptr = output;
//  float *beta =  (float *) malloc(sizeof(float) * ix * iy);   // we assume beta is given to us
//  float *alpha = (float *) malloc(sizeof(float) * wx * wy);   // we assume alpha is given to us
//  float *new_output = mallocFloatVolume(conv_args.output);
//  //float *new_output_ptr = new_output;
//  float *new_beta = (float *) malloc(sizeof(float) * ix * iy);
//  //float *new_beta_ptr = new_beta;
//
//    // Scale number of computations because we're packing.
//    // After this point, you should not have to reason about input dimensions for input and weights.
//    iz /= BITS_PER_BINARY_WORD;
//    wz /= BITS_PER_BINARY_WORD;
//
//    // Calculate time taken by a request
//    struct timeval start_time;
//    gettimeofday(&start_time, NULL);
//
//  // Preprocessing
//  int pad = wx/2;
//
//    for (int z = 0; z < iz; ++z) {    // number of channels
//        ai2_bin_conv2D(output_ptr, b_input, b_weight, ix, iy, wx, wy, pad, stride);
//        b_input += ix*iy;   // increment with next 2D plane
//        b_weight += wx*wy;  // increment with next 2D plane
//
//      ai2_pointwise_mul_mm(output_ptr, beta, ix*iy);
//      ai2_pointwise_mul_mm_2d(output_ptr, alpha, ix, iy, wx, wy, pad);
//    }
//
//  // copy to new array (need to wrap this around); TODO.
//    struct timeval end_time;
//    gettimeofday(&end_time, NULL);
//
//    struct timeval diff_time;
//    timersub(&end_time, &start_time, &diff_time);
//    double time_conv_s = diff_time.tv_sec + diff_time.tv_usec * 1e-6;
//    double time_conv_ms = time_conv_s * 1000.0;
//
//  double model_ops = (3*ix*iy*wx*wy*wz/L) + 2*ix*iy + ix*iy*iz;
//  double conv_ops_s = 1e-9 * model_ops / time_conv_s;
//    double conv_bandwidth_gb_s = 1e-9 * sz_input_bytes / (time_conv_ms / 1000.0);
//    double conv_bandwidth_gelement_s = 1e-9 * sz_input_elements / (time_conv_ms / 1000.0);
//
//    printf("Execution Time (ms): %f\n", time_conv_ms);
//    printf("Binary Convolution OPS/s (GOPS/s): %f\n", conv_ops_s);
//    printf("Binary Convolution Bandwidth (GB/s): %f\n", conv_bandwidth_gb_s);
//    printf("Binary Convolution Bandwidth (GElements/s): %f\n\n", conv_bandwidth_gelement_s);
//
//    free(binary_input);
//    free(binary_weights);
//    free(output);
//  free(beta);
//  free(alpha);
//  free(new_output);
//  free(new_beta);
//
//    return time_conv_ms;
//}

// double ai2_bin_conv_benchmark(ConvolutionArgs conv_args);

//void benchmark() {
//    int ix, iy, iz, wx, wy, wz;
//    iz = (1 << 9) * BITS_PER_BINARY_WORD;
//    ix = 227; // x == y for square face
//    iy = 227;
//    wx = 3;    // x == y for a square face
//    wy = 3;
//    wz = iz;
//
//  int runs = 1;
//  double accum_binary = 0;
//  double accum_real = 0;
//    ConvolutionArgs conv_args = initArgs(ix, iy, iz, wx, wy, wz);
//  for (int i = 0; i < runs; ++i) {
//      double t_binary_convolve = ai2_bin_conv_benchmark(conv_args);
//      double t_real_convolve = run_convolve2D_real(conv_args);
//      printf("t binary = %lf\n", t_binary_convolve);
//      printf("t real = %lf\n", t_real_convolve);
//      accum_binary += t_binary_convolve;
//      accum_real += t_real_convolve;
//  }
//
//  accum_binary /= runs;
//  accum_real /= runs;
//  printf("Average convolution pass binary (ms): %lf\n", accum_binary);
//  printf("Average convolution pass flt (ms): %lf\n", accum_real);
//  printf("Speedup (Binary over Real): %lfx\n", accum_real / accum_binary);    
//  exit(1);
//}

 src/binary_convolution.h

New file
@@ -0,0 +1,218 @@
#ifndef AI2_BINARY_CONVOLUTION_H
#define AI2_BINARY_CONVOLUTION_H

/** @file binary_convolution.h
 *  @brief Routines related for approximating convolutions using binary operations
 *      
 *  @author Carlo C. del Mundo (carlom)
 *  @date 05/23/2016
 */

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <assert.h>
#include <limits.h>
#include <tgmath.h>
#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include "common.h"

typedef struct {
    int batch;   // number of filter batches
    int c;       // channels, z
    int h;       // height, y
    int w;       // width, x
    int stride;
    int pad;

    int px;     // padded x (use this for striding in padded input and output arrays)
    int py;     // padded y (use this for striding in padded input and output arrays)
    int wx;
    int wy;

    float *input;       // input values
    BINARY_WORD *binary_input;

    float *weights;     // weight or filter values
    BINARY_WORD *binary_weights;

    float *output;      // output values
    BINARY_WORD *binary_output;

    float *alpha;       // we assume alpha is calculated at the beginning of initialization
    float *beta;        // we assume beta is given to us
    float *new_beta;    // we calculate the new beta for the next layer

    struct ai2_bin_conv_layer *next;
} ai2_bin_conv_layer;

/** @brief Performs a binary convolution using XNOR and POPCOUNT between input and weights
 *
 *  @param output A 2D real-valued plane to store the outputs
 *  @param input A 2D binary-valued plane that holds the inputs
 *  @param weights A 2D binary-valued plane that holds the weights 
 *  @param ix   the input's x dimension 
 *  @param iy   the input's y dimensions
 *  @param wx   the weight's x dimension
 *  @param wy   the weight's y dimension
 *  @param pad  the amount of padding applied to input. (ix+2*pad is the x dimension of the input
 *  @param stride NOP. TODO: implement stride. the stride between sliding windows
 *  @return the count of all overlapping set bits between the two volumes.
 */
void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride);

/** @brief Performs a binary dot product (XNOR and POPCOUNT) for two equal sized volumes.
 *
 *  @param a A 3D binary tensor
 *  @param b A 3D binary tensor 
 *  @param vdim the dimensionality of the data. Note: we pack 32 elements in the Z element.
 *  @return the count of all overlapping set bits between the two volumes.
 */
int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim);

/** @brief Calculates the alpha plane given an alpha volume. 
 *
 *  Each point in the yz alpha plane
 *  is the average sum of the absolute value of all elements in the z-direction.
 *
 * Pre-conditions: 
 *                  alpha_volume is an array of size x*y*z.
 *                  alpha_plane is an array of size x*y.
 *                  alpha_volume (x,y,z) is transposed to (z,x,y).
 *
 *  @param alpha_plane The 2D real-valued output plane
 *  @param alpha_volume The 3D real-valued output volume
 *  @param vdim the dimensionality of alpha_volume.
 */
void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim);

/** @brief Wrapper function for generating the beta scaling factor */
void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim); 

/** @brief Set the bit in a binary word */
void ai2_bitset(BINARY_WORD *bword, unsigned int position);

/** @brief Checks that the bit is set in a binary word */
int ai2_is_set(BINARY_WORD bword, unsigned int position) ;

/** @brief Converts a 3D float tensor into a 3D binary tensor.
 *
 *  The value of the ith element in the binary tensor is the sign
 *  of the ith element in the floating tensor.
 *
 *  @param binary_vol the binary tensor
 *  @param real_vol the real tensor
 *  @param vdim the size of the 3D tensor
 */
void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 vdim) ;

/** @brief Converts a 3D binary tensor into a 3D float tensor.
 *
 * The ith float element will be '1' if the ith binary element is '1'.
 * Otherwise, the float element will be '-1'.
 *
 *  @param real_vol the output real tensor
 *  @param binary_vol the input binary tensor
 *  @param vdim the dimension of both binary_vol and real_vol
 */
void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 vdim); 

/** @brief Performs a pointwise matrix multication between two 2D tensors
 *  @param output A 2D real-valued plane to store the outputs
 *  @param input A 2D binary-valued plane that holds the inputs
 *  @param N the number of elements between the arrays
 */
void ai2_pointwise_mul_mm(float *output, const float *input, int N);

/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors
 *  
 *  Pre-conditions: wx < ix, and wy < iy
 *
 *  @param output A 2D real-valued plane of size ix, iy
 *  @param alpha A 2D binary-valued plane of size wx, wy
 *  @param ix   the output's x dimension 
 *  @param iy   the output's y dimensions
 *  @param wx   the alpha's x dimension
 *  @param wy   the alpha's y dimension
 *  @param pad  how many cells are padded, adds 2*pad to the borders of the image 
 */
void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad);

// --------------------------------------
//  SETTER FUNCTIONS
// --------------------------------------
/** @brief Safe function to set the float input of a conv_layer
 */
void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input);

/** @brief Safe function to set the binary input of a conv_layer
 */
void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input);

/** @brief Safe function to set the binary weights of a conv_layer
 */
void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights);

/** @brief Safe function to set the binary weights of a conv_layer
 */
void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights);

/** @brief Safe function to set the binary outputs of a conv_layer
 */
void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output);

/** @brief Safe function to set the binary outputs of a conv_layer
 */
void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output);

/** @brief Safe function to set the alpha of a conv_layer
 */
void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha);

/** @brief Safe function to set the beta of a conv_layer
 */
void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta);

/** @brief Safe function to set the new_beta of a conv_layer
 */
void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta);

// --------------------------------------
//  GETTER FUNCTIONS
// --------------------------------------
/** @brief Safe function to get the float outputs of a conv_layer
 */
float * ai2_getFltOutput(ai2_bin_conv_layer *layer);

/** @brief 3D tranpose from (x,y,z) to (z,y,x)
 *  @return a new pointer with the transposed matrix
 */
void ai2_transpose3D(float *data, dim3 d);

/** @brief Checks if a float is a whole number (e.g., an int)
 */
int ai2_isFloatWhole(float f);

/* @brief Allocates all memory objects in an ai2_bin_conv_layer
 * b - batches (number of filter batches)
 * c - input channels
 * ix - input width
 * iy - input height
 * wx - weight/filter width
 * wy - weight/filter height
 * s - stride between sliding windows
 * pad - the amount of padding
 */
ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad);

/* @brief Safe deallocation of  all memory objects in an ai2_bin_conv_layer
 */
void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer);

/* @brief Given real-valued filter data and a conv layer, performs a forward pass
 */
void ai2_bin_forward(ai2_bin_conv_layer *layer);

#endif

 src/common.c

New file
@@ -0,0 +1,81 @@
#include "common.h" 

// Returns the time in ms
double getElapsedTime(Timer *timer) {
    // Calculate time it took in seconds
    double accum_ms = ( timer->requestEnd.tv_sec - timer->requestStart.tv_sec )
      + ( timer->requestEnd.tv_nsec - timer->requestStart.tv_nsec )
      / 1e6;
    return accum_ms;
}

void start_timer(Timer *timer) {
    clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestStart));
}

void stop_timer(Timer *timer) {
    clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestEnd));
}


BINARY_WORD * mallocBinaryVolume(dim3 vol) {
    return (BINARY_WORD *) malloc (vol.x * vol.y * vol.z / BITS_PER_BINARY_WORD * sizeof(BINARY_WORD));
}

float * mallocFloatVolume(dim3 vol) {
    return (float *) malloc (vol.x * vol.y * vol.z * sizeof(float));
}

// Returns the size (in bytes) of a binary array with dimensions stored in conv_args
double getSizeBytesBinaryArray(dim3 conv_args) {
    return conv_args.x * conv_args.y * conv_args.z * sizeof(BINARY_WORD) / (BITS_PER_BINARY_WORD);
}


ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz) {
    ConvolutionArgs conv_args;
    // Input Volume
    conv_args.input.x = ix;    // x == y for a square face
    conv_args.input.y = iy;
    conv_args.input.z = iz;
    conv_args.weights.x = wx; // x == y for square face
    conv_args.weights.y = wy;
    conv_args.weights.z = wz;

    // <!-- DO NOT MODIFY -->
    // Intermediate Volumes
    conv_args.alpha_plane.x = conv_args.weights.x;
    conv_args.alpha_plane.y = conv_args.weights.y;
    conv_args.alpha_plane.z = 1;

    conv_args.beta_plane.x = 1;
    conv_args.beta_plane.y = conv_args.input.y;
    conv_args.beta_plane.z = conv_args.input.z;

    conv_args.gamma_plane.x = conv_args.input.x * conv_args.weights.x;
    conv_args.gamma_plane.y = conv_args.input.y * conv_args.weights.y;
    conv_args.gamma_plane.z = 1;

    conv_args.zeta_plane.x = conv_args.gamma_plane.x;
    conv_args.zeta_plane.y = conv_args.gamma_plane.y;
    conv_args.zeta_plane.z = 1;

    // Output Volume
    conv_args.output.x = conv_args.input.x;
    conv_args.output.y = conv_args.input.y;
    conv_args.output.z = 1; // Output should be a 2D plane

    // Verify dimensions
    //assert(conv_args.weights.x % 32 == 0);  // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
//    assert(conv_args.weights.y % 32 == 0);  // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
    assert(conv_args.weights.z % 32 == 0);  // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
    //assert(conv_args.input.x % 32 == 0);    // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
//    assert(conv_args.input.y % 32 == 0);    // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
    assert(conv_args.input.z % 32 == 0);    // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
    assert(conv_args.weights.x <= conv_args.input.x);
    assert(conv_args.weights.y <= conv_args.input.y);
    assert(conv_args.weights.z <= conv_args.input.z);
    // <!-- DO NOT MODIFY -->

    return conv_args;
}

 src/common.h

New file
@@ -0,0 +1,50 @@
#ifndef AI2_COMMON_H 
#define AI2_COMMON_H

#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <inttypes.h>
#include <assert.h>
#include <limits.h>
#include <tgmath.h>
#include <unistd.h>
#include <stdint.h>
//#include <gperftools/profiler.h>
#include <sys/time.h>

typedef uint32_t BINARY_WORD;
#define BITS_PER_BINARY_WORD (sizeof(BINARY_WORD) * CHAR_BIT)

typedef struct{
    struct timespec requestStart;
    struct timespec requestEnd;
} Timer;

typedef struct {
    size_t x;
    size_t y;
    size_t z;
} dim3;

typedef struct {
    dim3 weights;
    dim3 input;
    dim3 output;
    dim3 alpha_plane;
    dim3 beta_plane;
    dim3 gamma_plane;
    dim3 zeta_plane;
} ConvolutionArgs;

// Timer stuff
double getElapsedTime(Timer *timer); // Returns the time in ms
void start_timer(Timer *timer);
void stop_timer(Timer *timer);

BINARY_WORD * mallocBinaryVolume(dim3 vol);
float * mallocFloatVolume(dim3 vol);
ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz);
double getSizeBytesBinaryArray(dim3 conv_args);

#endif

 src/convolutional_layer.c

@@ -8,6 +8,10 @@
#include <stdio.h>
#include <time.h>

#ifndef AI2
#define AI2 0
#endif

void swap_binary(convolutional_layer *l)
{
    float *swap = l->filters;
@@ -21,24 +25,6 @@
    #endif
}

void binarize_filters2(float *filters, int n, int size, char *binary, float *scales)
{
    int i, k, f;
    for(f = 0; f < n; ++f){
        float mean = 0;
        for(i = 0; i < size; ++i){
            mean += fabs(filters[f*size + i]);
        }
        mean = mean / size;
        scales[f] = mean;
        for(i = 0; i < size/8; ++i){
            binary[f*size + i] = (filters[f*size + i] > 0) ? 1 : 0;
            for(k = 0; k < 8; ++k){
            }
        }
    }
}

void binarize_filters(float *filters, int n, int size, float *binary)
{
    int i, f;
@@ -54,6 +40,21 @@
    }
}

void binarize_input(float *input, int n, int size, float *binary)
{
    int i, s;
    for(s = 0; s < size; ++s){
        float mean = 0;
        for(i = 0; i < n; ++i){
            mean += fabs(input[i*size + s]);
        }
        mean = mean / n;
        for(i = 0; i < n; ++i){
            binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
        }
    }
}

int convolutional_out_height(convolutional_layer l)
{
    int h = l.h;
@@ -89,7 +90,7 @@
}

size_t get_workspace_size(layer l){
    #ifdef CUDNN
#ifdef CUDNN
    size_t most = 0;
    size_t s = 0;
    cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
@@ -117,9 +118,9 @@
            &s);
    if (s > most) most = s;
    return most;
    #else
#else
    return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
    #endif
#endif
}

convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor)
@@ -133,6 +134,7 @@
    l.c = c;
    l.n = n;
    l.binary = binary;
    l.xnor = xnor;
    l.batch = batch;
    l.stride = stride;
    l.size = size;
@@ -164,6 +166,10 @@
        l.cfilters = calloc(c*n*size*size, sizeof(char));
        l.scales = calloc(n, sizeof(float));
    }
    if(xnor){
        l.binary_filters = calloc(c*n*size*size, sizeof(float));
        l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
    }

    if(batch_normalize){
        l.scales = calloc(n, sizeof(float));
@@ -199,7 +205,6 @@
        l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
        l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
    }
    l.xnor = xnor;

    if(batch_normalize){
        l.mean_gpu = cuda_make_array(l.mean, n);
@@ -325,7 +330,7 @@

    l->delta_gpu =     cuda_make_array(l->delta, l->batch*out_h*out_w*l->n);
    l->output_gpu =    cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
    #ifdef CUDNN
#ifdef CUDNN
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
    cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
@@ -359,7 +364,7 @@
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l->bf_algo);
    #endif
#endif
#endif
    l->workspace_size = get_workspace_size(*l);
}
@@ -404,7 +409,9 @@
    int out_w = convolutional_out_width(l);
    int i;


    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    /*
       if(l.binary){
       binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
@@ -413,44 +420,59 @@
       }
     */

/*
    if(l.binary){
        int m = l.n;
        int k = l.size*l.size*l.c;
        int n = out_h*out_w;
    /*
       if(l.binary){
       int m = l.n;
       int k = l.size*l.size*l.c;
       int n = out_h*out_w;

        char  *a = l.cfilters;
       char  *a = l.cfilters;
       float *b = state.workspace;
       float *c = l.output;

       for(i = 0; i < l.batch; ++i){
       im2col_cpu(state.input, l.c, l.h, l.w, 
       l.size, l.stride, l.pad, b);
       gemm_bin(m,n,k,1,a,k,b,n,c,n);
       c += n*m;
       state.input += l.c*l.h*l.w;
       }
       scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
       add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
       activate_array(l.output, m*n*l.batch, l.activation);
       return;
       }
     */

    if(l.xnor && (l.c%32 != 0 || !AI2)){
        binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
        swap_binary(&l);
        for(i = 0; i < l.batch; ++i){
            binarize_input(state.input + i*l.inputs, l.c, l.h*l.w, l.binary_input + i*l.inputs);
        }
        state.input = l.binary_input;
    }

    int m = l.n;
    int k = l.size*l.size*l.c;
    int n = out_h*out_w;

    if (l.xnor && l.c%32 == 0 && AI2) {
        forward_xnor_layer(l, state);
        printf("xnor\n");
    } else {

        float *a = l.filters;
        float *b = state.workspace;
        float *c = l.output;

        for(i = 0; i < l.batch; ++i){
            im2col_cpu(state.input, l.c, l.h, l.w, 
                    l.size, l.stride, l.pad, b);
            gemm_bin(m,n,k,1,a,k,b,n,c,n);
            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
            c += n*m;
            state.input += l.c*l.h*l.w;
        }
        scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
        activate_array(l.output, m*n*l.batch, l.activation);
        return;
    }
    */

    int m = l.n;
    int k = l.size*l.size*l.c;
    int n = out_h*out_w;

    float *a = l.filters;
    float *b = state.workspace;
    float *c = l.output;

    for(i = 0; i < l.batch; ++i){
        im2col_cpu(state.input, l.c, l.h, l.w, 
                l.size, l.stride, l.pad, b);
        gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
        c += n*m;
        state.input += l.c*l.h*l.w;
    }

    if(l.batch_normalize){
@@ -459,6 +481,7 @@
    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);

    activate_array(l.output, m*n*l.batch, l.activation);
    if(l.binary || l.xnor) swap_binary(&l);
}

void backward_convolutional_layer(convolutional_layer l, network_state state)

 src/layer.h

@@ -167,6 +167,8 @@
    float *r_cpu;
    float *h_cpu;

    float *binary_input;

    size_t workspace_size;

    #ifdef GPU

 src/parser.c

@@ -1021,7 +1021,6 @@
            }
        }
    }
    binarize_filters2(l.filters, l.n, l.c*l.size*l.size, l.cfilters, l.scales);
#ifdef GPU
    if(gpu_index >= 0){
        push_convolutional_layer(l);
@@ -1046,7 +1045,7 @@
    if (l.flipped) {
        transpose_matrix(l.filters, l.c*l.size*l.size, l.n);
    }
    if (l.binary) binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.filters);
    //if (l.binary) binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.filters);
#ifdef GPU
    if(gpu_index >= 0){
        push_convolutional_layer(l);

 src/xnor_layer.c

New file
@@ -0,0 +1,86 @@
#include "xnor_layer.h"
#include "binary_convolution.h"
#include "convolutional_layer.h"

layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize)
{
    int i;
    layer l = {0};
    l.type = XNOR;

    l.h = h;
    l.w = w;
    l.c = c;
    l.n = n;
    l.batch = batch;
    l.stride = stride;
    l.size = size;
    l.pad = pad;
    l.batch_normalize = batch_normalize;

    l.filters = calloc(c*n*size*size, sizeof(float));
    l.biases = calloc(n, sizeof(float));

    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));

    if(batch_normalize){
        l.scales = calloc(n, sizeof(float));
        for(i = 0; i < n; ++i){
            l.scales[i] = 1;
        }

        l.mean = calloc(n, sizeof(float));
        l.variance = calloc(n, sizeof(float));

        l.rolling_mean = calloc(n, sizeof(float));
        l.rolling_variance = calloc(n, sizeof(float));
    }

    l.activation = activation;

    fprintf(stderr, "XNOR Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);

    return l;
}

void forward_xnor_layer(const layer l, network_state state)
{
    int b = l.n;
    int c = l.c;
    int ix = l.w;
    int iy = l.h;
    int wx = l.size;
    int wy = l.size;
    int s = l.stride;
    int pad = l.pad * (l.size/2);

    // MANDATORY: Make the binary layer
    ai2_bin_conv_layer al = ai2_make_bin_conv_layer(b, c, ix, iy, wx, wy, s, pad);

    // OPTIONAL: You need to set the real-valued input like:
    ai2_setFltInput(&al, state.input);
    // The above function will automatically binarize the input for the layer (channel wise).
    // If commented: using the default 0-valued input.

    ai2_setFltWeights(&al, l.filters);
    // The above function will automatically binarize the input for the layer (channel wise).
    // If commented: using the default 0-valued weights.

    // MANDATORY: Call forward
    ai2_bin_forward(&al);

    // OPTIONAL: Inspect outputs
    float *output = ai2_getFltOutput(&al);  // output is of size l.px * l.py where px and py are the padded outputs

    memcpy(l.output, output, l.outputs*sizeof(float));
    // MANDATORY: Free layer
    ai2_free_bin_conv_layer(&al);
}

 src/xnor_layer.h

New file
@@ -0,0 +1,11 @@
#ifndef XNOR_LAYER_H
#define XNOR_LAYER_H

#include "layer.h"
#include "network.h"

layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalization);
void forward_xnor_layer(const layer l, network_state state);

#endif


 src/yolo.c

@@ -346,8 +346,8 @@
        if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms);
        //draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, voc_labels, 20);
        draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, voc_labels, 20);
        show_image(im, "predictions");
        save_image(im, "predictions");
        show_image(im, "predictions");

        show_image(sized, "resized");
        free_image(im);

New file
			@@ -0,0 +1,79 @@
			GPU=0
			CUDNN=0
			OPENCV=0
			DEBUG=0
			AI2=1

			ARCH= --gpu-architecture=compute_52 --gpu-code=compute_52

			VPATH=./src/
			EXEC=darknet
			OBJDIR=./obj/

			CC=gcc -std=gnu11
			NVCC=nvcc
			OPTS=-Ofast
			LDFLAGS= -lm -pthread
			COMMON=
			CFLAGS=-Wall -Wfatal-errors

			ifeq ($(DEBUG), 1)
			OPTS=-O0 -g
			endif

			CFLAGS+=$(OPTS)

			ifeq ($(OPENCV), 1)
			COMMON+= -DOPENCV
			CFLAGS+= -DOPENCV
			LDFLAGS+= `pkg-config --libs opencv`
			COMMON+= `pkg-config --cflags opencv`
			endif

			ifeq ($(AI2), 1)
			COMMON+= -DAI2
			CFLAGS+= -DAI2
			endif

			ifeq ($(GPU), 1)
			COMMON+= -DGPU -I/usr/local/cuda/include/
			CFLAGS+= -DGPU
			LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
			endif

			ifeq ($(CUDNN), 1)
			COMMON+= -DCUDNN
			CFLAGS+= -DCUDNN
			LDFLAGS+= -lcudnn
			endif

			OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o coco_demo.o tag.o cifar.o yolo_demo.o go.o batchnorm_layer.o art.o xnor_layer.o common.o binary_convolution.o
			ifeq ($(GPU), 1)
			LDFLAGS+= -lstdc++
			OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o
			endif

			OBJS = $(addprefix $(OBJDIR), $(OBJ))
			DEPS = $(wildcard src/*.h) Makefile

			all: obj results $(EXEC)

			$(EXEC): $(OBJS)
			$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS)

			$(OBJDIR)%.o: %.c $(DEPS)
			$(CC) $(COMMON) $(CFLAGS) -c $< -o $@

			$(OBJDIR)%.o: %.cu $(DEPS)
			$(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@

			obj:
			mkdir -p obj
			results:
			mkdir -p results

			.PHONY: clean

			clean:
			rm -rf $(OBJS) $(EXEC)

New file
			@@ -0,0 +1,148 @@
			[net]
			batch=1
			subdivisions=1
			height=448
			width=448
			channels=3
			momentum=0.9
			decay=0.0005

			learning_rate=0.0001
			policy=steps
			steps=20,40,60,80,20000,30000
			scales=5,5,2,2,.1,.1
			max_batches = 40000

			[crop]
			crop_width=448
			crop_height=448
			flip=0
			angle=0
			saturation = 1.5
			exposure = 1.5
			noadjust=1

			[convolutional]
			batch_normalize=1
			filters=16
			size=3
			stride=1
			pad=1
			activation=leaky

			[maxpool]
			size=2
			stride=2

			[batchnorm]

			[convolutional]
			xnor = 1
			batch_normalize=1
			filters=32
			size=3
			stride=1
			pad=1
			activation=leaky

			[maxpool]
			size=2
			stride=2

			[batchnorm]

			[convolutional]
			xnor = 1
			batch_normalize=1
			filters=64
			size=3
			stride=1
			pad=1
			activation=leaky

			[maxpool]
			size=2
			stride=2

			[batchnorm]

			[convolutional]
			xnor = 1
			batch_normalize=1
			filters=128
			size=3
			stride=1
			pad=1
			activation=leaky

			[maxpool]
			size=2
			stride=2

			[batchnorm]

			[convolutional]
			xnor = 1
			batch_normalize=1
			filters=256
			size=3
			stride=1
			pad=1
			activation=leaky

			[maxpool]
			size=2
			stride=2

			[batchnorm]

			[convolutional]
			xnor = 1
			batch_normalize=1
			filters=512
			size=3
			stride=1
			pad=1
			activation=leaky

			[maxpool]
			size=2
			stride=2

			[batchnorm]

			[convolutional]
			batch_normalize=1
			filters=1024
			size=3
			stride=1
			pad=1
			activation=leaky

			[convolutional]
			batch_normalize=1
			size=3
			stride=1
			pad=1
			filters=128
			activation=leaky

			[connected]
			output= 1470
			activation=linear

			[detection]
			classes=20
			coords=4
			rescore=1
			side=7
			num=2
			softmax=0
			sqrt=1
			jitter=.2

			object_scale=1
			noobject_scale=.5
			class_scale=1
			coord_scale=5

New file
			@@ -0,0 +1,598 @@
			#include "binary_convolution.h"

			int ai2_bin_dp(BINARY_WORD a, BINARY_WORD b, dim3 vdim) { // TODO unroll
			int accumulator = 0;
			for (int z = 0; z < vdim.z / BITS_PER_BINARY_WORD; z++) {
			for (int y = 0; y < vdim.y; y++) {
			for (int x = 0; x < vdim.x; x++) {
			int idx = zvdim.yvdim.x + y*vdim.x + x;
			accumulator += __builtin_popcount(~(a[idx] ^ b[idx])); // count the XNOR of the two bit vectors
			}
			}
			}

			return accumulator;
			}

			/**
			* Pre-conditions:
			* alpha_volume is an array of size xyz.
			* alpha_plane is an array of size x*y.
			* alpha_volume (x,y,z) is transposed to (z,x,y).
			*/
			void ai2_calc_alpha(float alpha_plane, float alpha_volume, dim3 vdim) {
			for (int y = 0; y < vdim.y; ++y) {
			for (int x = 0; x < vdim.x; ++x) {
			int out = y * vdim.x + x;
			double accum = 0.0;
			for (int z = 0; z < vdim.z; ++z) {
			accum += alpha_volume[out * vdim.z + z];
			}

			alpha_plane[out] = accum / vdim.z;
			}
			}
			}

			/** @brief Wrapper function for generating the beta scaling factor */
			void ai2_calc_beta(float beta_plane, float beta_volume, dim3 vdim) {
			ai2_calc_alpha(beta_plane, beta_volume, vdim);
			}

			/** @brief Set the bit in a binary word */
			void ai2_bitset(BINARY_WORD *bword, unsigned int position) {
			BINARY_WORD mask = (1 << position);
			bword = bword \| mask;
			}

			/** @brief Checks that the bit is set in a binary word */
			int ai2_is_set(BINARY_WORD bword, unsigned int position) {
			unsigned int position_complement = (BITS_PER_BINARY_WORD - 1) - position; // number of leading bits before the bit position of interest
			bword = (bword << position_complement); // zero out leading bits
			bword = (bword >> (BITS_PER_BINARY_WORD - 1)); // shift bit position of interest to the 0th position
			return (bword & 0x1); // test if bit position of interest is set
			}

			void ai2_flt_to_bin(BINARY_WORD binary_vol, float real_vol, dim3 dim) {
			ai2_transpose3D(real_vol, dim); // (x,y,z) -> (z,x,y)

			int sz = dim.x * dim.y * dim.z;
			for (int i = 0; i < sz; i += BITS_PER_BINARY_WORD) {
			BINARY_WORD tmp = 0x00000000;
			for (int x = 0; x < BITS_PER_BINARY_WORD; ++x) {
			int waddr = x + i;
			if (signbit(real_vol[waddr]) == 0)
			ai2_bitset(&tmp, (BITS_PER_BINARY_WORD - 1) - x);
			}
			binary_vol[i / BITS_PER_BINARY_WORD] = tmp;
			}
			}

			void ai2_bin_to_flt(float real_vol, BINARY_WORD binary_vol, dim3 dim) { // TODO unit tests
			for (int z = 0; z < dim.z; z++) {
			for (int y = 0; y < dim.y; y++) {
			for (int x = 0; x < dim.x / BITS_PER_BINARY_WORD; x++) { // TODO boundary checks, for uneven input
			BINARY_WORD word = binary_vol[zdim.ydim.x + y*dim.x + x];
			for (int t = 0; t < BITS_PER_BINARY_WORD; ++t) {
			int oidx = zdim.ydim.x + ydim.x + x BITS_PER_BINARY_WORD + t;
			if (ai2_is_set(word, t))
			real_vol[oidx] = 1.f;
			else
			real_vol[oidx] = -1.f;
			}
			}
			}
			}

			// Transpose channels back to output
			ai2_transpose3D(real_vol, dim); // (z,y,x) -> (x,y,z)
			}

			/* @brief: input is padded.
			*/
			void ai2_bin_conv2D(float output, const BINARY_WORD input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride) {

			int r, rd, c, cd;
			int wx_2 = wx / 2;
			int wy_2 = wy / 2;

			// Indexing for output pixels. x = [wx_2, ix + wx_2 - 1], y = [wy_2, iy + wy_2 - 1]
			int sx = pad; // start x
			int ex = ix + pad - 1; // end x
			int sy = pad; // start y
			int ey = iy + pad - 1; // end y

			// Indexing for weights
			int wsx, wex, wsy, wey;
			if (wx % 2 == 1) { // odd weights
			wsx = -wx_2; wex = wx_2 + 1;
			wsy = -wy_2; wey = wy_2 + 1;
			}
			else {
			wsx = -wx_2; wex = wx_2;
			wsy = -wy_2; wey = wy_2;
			}

			int px = ix + 2*pad;
			//int py = iy + 2*pad;

			for (r = sy; r <= ey; ++r) {
			for (c = sx; c <= ex; ++c) {
			int accumulator = 0;
			for (rd = wsy; rd < wey; ++rd) {
			for (cd = wsx; cd < wex; ++cd) {
			int iidx = (r+rd)*px + (c+cd);
			BINARY_WORD pixel = input[iidx];
			//BINARY_WORD pixel = 0xFFFFFFFF;
			//BINARY_WORD weight = 0xFFFFFFFF;
			int widx = (rd + wy_2)*wx + (cd+wx_2);
			BINARY_WORD weight = weights[widx];
			accumulator += __builtin_popcount(~(pixel ^ weight));
			}
			}

			// Padded space
			int oidx = r*px + c;
			output[oidx] += (float) accumulator;
			}
			}

			//for (r = sy; r <= ey; ++r) {
			// for (c = sx; c <= ex; ++c) {
			// int accumulator = 0;
			// for (rd = -wy_2; rd < wy_2; ++rd) {
			// for (cd = -wx_2; cd < wx_2; ++cd) {
			// int iidx = (r+rd)*px + (c+cd);
			// BINARY_WORD pixel = input[iidx];
			// //BINARY_WORD pixel = 0xFFFFFFFF;
			// //BINARY_WORD weight = 0xFFFFFFFF;
			// int widx = (rd + wy_2)*wx + (cd+wx_2);
			// BINARY_WORD weight = weights[widx];
			// accumulator += __builtin_popcount(~(pixel ^ weight));
			// }
			// }

			// // Padded space
			// int oidx = r*px + c;
			// output[oidx] += (float) accumulator;
			// }
			//}

			//ai2_bin_conv_within_boundary(output, input, weights, ix, iy, wx, wy, stride);
			//ai2_bin_conv_borders(output, input, weights, ix, iy, wx, wy, stride);
			}

			void ai2_pointwise_mul_mm(float output, const float input, int N) {
			int i = 0;

			while (i + 8 <= N) {
			output[i+0] *= input[i+0];
			output[i+1] *= input[i+1];
			output[i+2] *= input[i+2];
			output[i+3] *= input[i+3];
			output[i+4] *= input[i+4];
			output[i+5] *= input[i+5];
			output[i+6] *= input[i+6];
			output[i+7] *= input[i+7];

			i += 8;
			}

			while (++i < N) // Finish iteration that's leftover (e.g., last batch not divisible by 8 exactly)
			output[i] *= input[i];
			}

			/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors
			* Pre-conditions: wx < ix, and wy < iy
			*/
			void ai2_pointwise_mul_mm_2d(float output, const float alpha, int ix, int iy, int wx, int wy, int pad) {
			// Slower version
			// for (int y = 0; y < iy; ++y)
			// for (int x = 0; x < ix; x++)
			// output[yix+x] = input[(y % wy)*wx + (x % wx)];

			// Stride prefetch optimized
			for (int s = 0; s < wy; ++s) { // for each strip
			const float strip_ptr = &alpha[swx];
			for (int y = pad; y < pad + (iy / wy); ++y) { //
			int stride = y((ix+2pad)wy) + s(ix+2*pad);
			float *output_ptr = &output[stride];

			for (int x = 0; x < ix; ++x) {
			output_ptr[x] *= strip_ptr[x % wx];
			}
			}
			}
			}

			void ai2_setFltInput(ai2_bin_conv_layer layer, float new_input) {
			if (new_input != NULL) {
			if (layer->input != NULL)
			free(layer->input);
			layer->input = new_input;

			dim3 dim;
			dim.x = layer->px;
			dim.y = layer->py;
			dim.z = layer->c;

			// Binarize input
			ai2_flt_to_bin(layer->binary_input, layer->input, dim);

			float new_beta = (float ) calloc (dim.x * dim.y, sizeof(float));
			ai2_setFltBeta(layer, new_beta);

			// layer->input is transposed to (z,x,y) already
			ai2_calc_beta(layer->beta, layer->input, dim);
			}
			}

			void ai2_setBinInput(ai2_bin_conv_layer layer, BINARY_WORD new_input) {
			if (new_input != NULL) {
			if (layer->binary_input != NULL)
			free(layer->binary_input);
			layer->binary_input = new_input;
			}
			}

			void ai2_setFltWeights(ai2_bin_conv_layer layer, float new_weights) {
			if (new_weights != NULL) {
			if (layer->weights != NULL)
			free(layer->weights);
			layer->weights = new_weights;

			dim3 dim;
			dim.x = layer->wx;
			dim.y = layer->wy;
			dim.z = layer->c;

			ai2_flt_to_bin(layer->binary_weights, layer->weights, dim);

			// Calculate alpha
			if (layer->alpha != NULL)
			free(layer->alpha);

			layer->alpha = (float ) calloc (dim.x dim.y, sizeof(float));
			// layer->weights is already transposed to (z,x,y) from ai2_flt_to_bin()
			ai2_calc_alpha(layer->alpha, layer->weights, dim);
			}
			}

			void ai2_setBinWeights(ai2_bin_conv_layer layer, BINARY_WORD new_weights) {
			if (new_weights != NULL) {
			if (layer->binary_weights != NULL)
			free(layer->binary_weights);
			layer->binary_weights = new_weights;
			}
			}

			void ai2_setFltOutput(ai2_bin_conv_layer layer, float new_output) {
			if (new_output != NULL) {
			if (layer->output != NULL)
			free(layer->output);
			layer->output = new_output;
			}
			}

			void ai2_setBinOutput(ai2_bin_conv_layer layer, BINARY_WORD new_output) {
			if (new_output != NULL) {
			if (layer->binary_output != NULL)
			free(layer->binary_output);
			layer->binary_output = new_output;
			}
			}

			void ai2_setFltAlpha(ai2_bin_conv_layer layer, float new_alpha) {
			if (new_alpha != NULL) {
			if (layer->alpha != NULL)
			free(layer->alpha);
			layer->alpha = new_alpha;
			}
			}

			void ai2_setFltBeta(ai2_bin_conv_layer layer, float new_beta) {
			if (new_beta != NULL) {
			if (layer->beta != NULL)
			free(layer->beta);
			layer->beta = new_beta;
			}
			}

			void ai2_setFltNewBeta(ai2_bin_conv_layer layer, float new_new_beta) {
			if (new_new_beta != NULL) {
			if (layer->new_beta != NULL)
			free(layer->new_beta);
			layer->new_beta = new_new_beta;
			}
			}

			float* ai2_getFltOutput(ai2_bin_conv_layer *layer) {
			//if (layer->output != NULL && layer->binary_output != NULL) {
			if (layer->output != NULL) {

			// The idea here was that all intermediate states are stored in the binary output.
			// Whenever the user needs the real-valued output, the conversion happens at this function call.
			//dim3 dim;
			//dim.x = layer->px;
			//dim.y = layer->py;
			//dim.z = layer->batch;
			//ai2_bin_to_flt(layer->output, layer->binary_output, dim);

			return layer->output;
			}
			else
			return NULL;
			}

			void ai2_transpose3D(float *data, dim3 d) {
			// Slow transpose for correctness

			// (x,y,z) becomes (z,x,y). Requires two transposes:
			// (x,y,z) -> (x,z,y).
			// (x,z,y) -> (z,x,y).

			// Intermediate buffer
			float new_data = (float ) calloc (d.x * d.y * d.z, sizeof(float));

			// Transpose y and z axis.
			// (x,y,z) -> (x,z,y);
			for (int y = 0; y < d.y; ++y) {
			for (int z = 0; z < d.z; ++z) {
			for (int x = 0; x < d.x; ++x) {
			new_data[yd.xd.z + zd.x + x] = data[zd.xd.y + yd.x + x];
			//new_data[zd.yd.x + yd.x + x] = data[yd.xd.z + zd.x + x];
			}
			}
			}

			// Transpose x and z axis.
			// (x,z,y) -> (z,x,y)
			for (int y = 0; y < d.y; ++y) {
			for (int x = 0; x < d.x; ++x) {
			for (int z = 0; z < d.z; ++z) {
			data[yd.zd.x + xd.z + z] = new_data[yd.xd.z + x + zd.x];
			}
			}
			}

			free(new_data);
			}

			int ai2_isFloatWhole(float f) { // TODO unit test
			return (ceilf(f) == f) ? 1 : 0;
			}

			/* @brief Initialize and create all memory arrays for this layer
			* b - batches (number of filter batches)
			* c - input channels
			* ix - input width
			* iy - input height
			* wx - weight/filter width
			* wy - weight/filter height
			* s - stride between sliding windows
			* pad - the amount of padding
			*/
			ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad) {
			// http://cs231n.github.io/convolutional-networks/
			// See: spatial arrangement section for determining what the output size will be
			float output_size = ((ix - wx + 2 * pad) / s) + 1;
			if (ai2_isFloatWhole(output_size) == 0) {
			fprintf(stderr, "ERROR! conv layer of (b,c,ix,iy,s,pad) = (%d, %d, %d, %d, %d, %d) will give "
			" invalid output dimension: %fx%f\n", b, c, ix, iy, s, pad, output_size, output_size);
			exit(1);
			}

			// TODO: Support strided output
			if (s != 1) {
			fprintf(stderr, "ERROR! Only stride values of 1 is supported\n");
			exit(1);
			}

			// padded input size
			int px = (int) ix + 2*pad;
			int py = (int) iy + 2*pad;

			ai2_bin_conv_layer l = {0}; // initialize all to 0
			l.input = (float ) calloc (c px * py, sizeof(float)); // is padded
			l.binary_input = (BINARY_WORD ) calloc (c px * py / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD)); // is padded

			dim3 dim;
			dim.x = px;
			dim.y = py;
			dim.z = c;
			ai2_flt_to_bin(l.binary_input, l.input, dim);

			l.weights = (float ) calloc (b c * wx * wy, sizeof(float));
			l.binary_weights = (BINARY_WORD ) calloc (b c * wx * wy / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD));

			l.output = (float ) calloc (c px * py, sizeof(float)); // is padded
			l.new_beta = (float ) calloc(px py, sizeof(float)); // is padded

			l.batch = b;
			l.c = c;
			l.h = iy;
			l.w = ix;
			l.stride = s;
			l.pad = pad;
			l.px = px;
			l.py = py;
			l.wx = wx;
			l.wy = wy;

			// The following parameters are uninitialized and should be set elsewhere:
			// l.beta - padded
			// l.alpha - not padded

			return l;
			}

			void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer) {
			if (layer->input) free (layer->input);
			if (layer->binary_input) free(layer->binary_input);
			if (layer->weights) free (layer->weights);
			if (layer->binary_weights) free(layer->binary_weights);
			if (layer->output) free(layer->output);
			if (layer->binary_output) free (layer->binary_output);
			if (layer->alpha) free(layer->alpha);
			if (layer->beta) free(layer->beta);
			if (layer->new_beta) free(layer->new_beta);
			}

			void ai2_throw_error(char *str) {
			fprintf(stderr, "ERROR: %s\n", str);
			exit(1);
			}

			void ai2_bin_forward(ai2_bin_conv_layer *l) {
			if (l->input == NULL) ai2_throw_error("Input was not allocated and set in this layer");
			if (l->weights == NULL) ai2_throw_error("Weights was not allocated and set in this layer");
			if (l->output == NULL) ai2_throw_error("Output was not allocated and set in this layer");
			if (l->alpha == NULL) ai2_throw_error("Alpha was not allocated and set in this layer");
			if (l->beta == NULL) ai2_throw_error("Beta was not allocated and set in this layer");

			if (l->c % 32 != 0) ai2_throw_error("Channel is not divisible by 32. Need to implement mask "
			"before supporting arbitrary channel size. For now, "
			"set the channel size to the nearest multiple of 32 "
			"and ignore any ''extra'' channels unused.");

			l->c /= BITS_PER_BINARY_WORD; // For compensating with doing more work per word

			float *output = l->output;
			float *alpha = l->alpha;
			float *beta = l->beta;
			int px = l->px;
			int py = l->py;
			BINARY_WORD *binary_weights = l->binary_weights;

			for (int z = 0; z < l->batch; ++z) { // for each filter map
			BINARY_WORD *binary_input = l->binary_input;
			for (int c = 0; c < l->c; ++c) { // for each input channel
			ai2_bin_conv2D(output, binary_input, binary_weights, l->w, l->h, l->wx, l->wy, l->pad, l->stride);
			binary_input += px*py; // increment with next 2D plane
			binary_weights += l->wx*l->wy; // increment with next 2D plane

			ai2_pointwise_mul_mm(output, beta, px*py);
			ai2_pointwise_mul_mm_2d(output, alpha, l->w, l->h, l->wx, l->wy, l->pad);
			}
			}
			}

			// Deprecated
			//double ai2_bin_conv_benchmark(ConvolutionArgs conv_args) {
			// printf("Running Binary Convolution test!\n");
			//
			// size_t ix, iy, iz, wx, wy, wz, L, stride;
			// ix = conv_args.input.x;
			// iy = conv_args.input.y;
			// iz = conv_args.input.z;
			// wx = conv_args.weights.x;
			// wy = conv_args.weights.y;
			// wz = conv_args.weights.z;
			// L = BITS_PER_BINARY_WORD;
			// stride = 1;
			//
			// printf("Input size (num elements, xyz): %zu %zu %zu\n", ix, iy, iz);
			// printf("Weights size (num elements. xyz): %zu %zu %zu\n", wx, wy, wz);
			//
			// double sz_input_elements = ix * iy * iz;
			// double sz_input_bytes = getSizeBytesBinaryArray(conv_args.input);
			// double sz_weight_bytes = getSizeBytesBinaryArray(conv_args.weights);
			//
			// printf("Input Size (MB): %f\n", sz_input_bytes / (1 << 20));
			// printf("Weight Size (MB): %f\n", sz_weight_bytes / (1 << 20));
			//
			// BINARY_WORD *binary_input = mallocBinaryVolume(conv_args.input);
			// BINARY_WORD *binary_weights = mallocBinaryVolume(conv_args.weights);
			// BINARY_WORD *b_input = binary_input; // alias
			// BINARY_WORD *b_weight = binary_weights; // alias
			// float *output = mallocFloatVolume(conv_args.output);
			// float *output_ptr = output;
			// float beta = (float ) malloc(sizeof(float) * ix * iy); // we assume beta is given to us
			// float alpha = (float ) malloc(sizeof(float) * wx * wy); // we assume alpha is given to us
			// float *new_output = mallocFloatVolume(conv_args.output);
			// //float *new_output_ptr = new_output;
			// float new_beta = (float ) malloc(sizeof(float) * ix * iy);
			// //float *new_beta_ptr = new_beta;
			//
			// // Scale number of computations because we're packing.
			// // After this point, you should not have to reason about input dimensions for input and weights.
			// iz /= BITS_PER_BINARY_WORD;
			// wz /= BITS_PER_BINARY_WORD;
			//
			// // Calculate time taken by a request
			// struct timeval start_time;
			// gettimeofday(&start_time, NULL);
			//
			// // Preprocessing
			// int pad = wx/2;
			//
			// for (int z = 0; z < iz; ++z) { // number of channels
			// ai2_bin_conv2D(output_ptr, b_input, b_weight, ix, iy, wx, wy, pad, stride);
			// b_input += ix*iy; // increment with next 2D plane
			// b_weight += wx*wy; // increment with next 2D plane
			//
			// ai2_pointwise_mul_mm(output_ptr, beta, ix*iy);
			// ai2_pointwise_mul_mm_2d(output_ptr, alpha, ix, iy, wx, wy, pad);
			// }
			//
			// // copy to new array (need to wrap this around); TODO.
			// struct timeval end_time;
			// gettimeofday(&end_time, NULL);
			//
			// struct timeval diff_time;
			// timersub(&end_time, &start_time, &diff_time);
			// double time_conv_s = diff_time.tv_sec + diff_time.tv_usec * 1e-6;
			// double time_conv_ms = time_conv_s * 1000.0;
			//
			// double model_ops = (3ixiywxwywz/L) + 2ixiy + ixiy*iz;
			// double conv_ops_s = 1e-9 * model_ops / time_conv_s;
			// double conv_bandwidth_gb_s = 1e-9 * sz_input_bytes / (time_conv_ms / 1000.0);
			// double conv_bandwidth_gelement_s = 1e-9 * sz_input_elements / (time_conv_ms / 1000.0);
			//
			// printf("Execution Time (ms): %f\n", time_conv_ms);
			// printf("Binary Convolution OPS/s (GOPS/s): %f\n", conv_ops_s);
			// printf("Binary Convolution Bandwidth (GB/s): %f\n", conv_bandwidth_gb_s);
			// printf("Binary Convolution Bandwidth (GElements/s): %f\n\n", conv_bandwidth_gelement_s);
			//
			// free(binary_input);
			// free(binary_weights);
			// free(output);
			// free(beta);
			// free(alpha);
			// free(new_output);
			// free(new_beta);
			//
			// return time_conv_ms;
			//}

			// double ai2_bin_conv_benchmark(ConvolutionArgs conv_args);

			//void benchmark() {
			// int ix, iy, iz, wx, wy, wz;
			// iz = (1 << 9) * BITS_PER_BINARY_WORD;
			// ix = 227; // x == y for square face
			// iy = 227;
			// wx = 3; // x == y for a square face
			// wy = 3;
			// wz = iz;
			//
			// int runs = 1;
			// double accum_binary = 0;
			// double accum_real = 0;
			// ConvolutionArgs conv_args = initArgs(ix, iy, iz, wx, wy, wz);
			// for (int i = 0; i < runs; ++i) {
			// double t_binary_convolve = ai2_bin_conv_benchmark(conv_args);
			// double t_real_convolve = run_convolve2D_real(conv_args);
			// printf("t binary = %lf\n", t_binary_convolve);
			// printf("t real = %lf\n", t_real_convolve);
			// accum_binary += t_binary_convolve;
			// accum_real += t_real_convolve;
			// }
			//
			// accum_binary /= runs;
			// accum_real /= runs;
			// printf("Average convolution pass binary (ms): %lf\n", accum_binary);
			// printf("Average convolution pass flt (ms): %lf\n", accum_real);
			// printf("Speedup (Binary over Real): %lfx\n", accum_real / accum_binary);
			// exit(1);
			//}

New file
			@@ -0,0 +1,218 @@
			#ifndef AI2_BINARY_CONVOLUTION_H
			#define AI2_BINARY_CONVOLUTION_H

			/** @file binary_convolution.h
			* @brief Routines related for approximating convolutions using binary operations
			*
			* @author Carlo C. del Mundo (carlom)
			* @date 05/23/2016
			*/

			#include <stdio.h>
			#include <stdlib.h>
			#include <inttypes.h>
			#include <assert.h>
			#include <limits.h>
			#include <tgmath.h>
			#include <unistd.h>
			#include <stdint.h>
			#include <string.h>
			#include "common.h"

			typedef struct {
			int batch; // number of filter batches
			int c; // channels, z
			int h; // height, y
			int w; // width, x
			int stride;
			int pad;

			int px; // padded x (use this for striding in padded input and output arrays)
			int py; // padded y (use this for striding in padded input and output arrays)
			int wx;
			int wy;

			float *input; // input values
			BINARY_WORD *binary_input;

			float *weights; // weight or filter values
			BINARY_WORD *binary_weights;

			float *output; // output values
			BINARY_WORD *binary_output;

			float *alpha; // we assume alpha is calculated at the beginning of initialization
			float *beta; // we assume beta is given to us
			float *new_beta; // we calculate the new beta for the next layer

			struct ai2_bin_conv_layer *next;
			} ai2_bin_conv_layer;

			/** @brief Performs a binary convolution using XNOR and POPCOUNT between input and weights
			*
			* @param output A 2D real-valued plane to store the outputs
			* @param input A 2D binary-valued plane that holds the inputs
			* @param weights A 2D binary-valued plane that holds the weights
			* @param ix the input's x dimension
			* @param iy the input's y dimensions
			* @param wx the weight's x dimension
			* @param wy the weight's y dimension
			* @param pad the amount of padding applied to input. (ix+2*pad is the x dimension of the input
			* @param stride NOP. TODO: implement stride. the stride between sliding windows
			* @return the count of all overlapping set bits between the two volumes.
			*/
			void ai2_bin_conv2D(float output, const BINARY_WORD input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride);

			/** @brief Performs a binary dot product (XNOR and POPCOUNT) for two equal sized volumes.
			*
			* @param a A 3D binary tensor
			* @param b A 3D binary tensor
			* @param vdim the dimensionality of the data. Note: we pack 32 elements in the Z element.
			* @return the count of all overlapping set bits between the two volumes.
			*/
			int ai2_bin_dp(BINARY_WORD a, BINARY_WORD b, dim3 vdim);

			/** @brief Calculates the alpha plane given an alpha volume.
			*
			* Each point in the yz alpha plane
			* is the average sum of the absolute value of all elements in the z-direction.
			*
			* Pre-conditions:
			* alpha_volume is an array of size xyz.
			* alpha_plane is an array of size x*y.
			* alpha_volume (x,y,z) is transposed to (z,x,y).
			*
			* @param alpha_plane The 2D real-valued output plane
			* @param alpha_volume The 3D real-valued output volume
			* @param vdim the dimensionality of alpha_volume.
			*/
			void ai2_calc_alpha(float alpha_plane, float alpha_volume, dim3 vdim);

			/** @brief Wrapper function for generating the beta scaling factor */
			void ai2_calc_beta(float beta_plane, float beta_volume, dim3 vdim);

			/** @brief Set the bit in a binary word */
			void ai2_bitset(BINARY_WORD *bword, unsigned int position);

			/** @brief Checks that the bit is set in a binary word */
			int ai2_is_set(BINARY_WORD bword, unsigned int position) ;

			/** @brief Converts a 3D float tensor into a 3D binary tensor.
			*
			* The value of the ith element in the binary tensor is the sign
			* of the ith element in the floating tensor.
			*
			* @param binary_vol the binary tensor
			* @param real_vol the real tensor
			* @param vdim the size of the 3D tensor
			*/
			void ai2_flt_to_bin(BINARY_WORD binary_vol, float real_vol, dim3 vdim) ;

			/** @brief Converts a 3D binary tensor into a 3D float tensor.
			*
			* The ith float element will be '1' if the ith binary element is '1'.
			* Otherwise, the float element will be '-1'.
			*
			* @param real_vol the output real tensor
			* @param binary_vol the input binary tensor
			* @param vdim the dimension of both binary_vol and real_vol
			*/
			void ai2_bin_to_flt(float real_vol, BINARY_WORD binary_vol, dim3 vdim);

			/** @brief Performs a pointwise matrix multication between two 2D tensors
			* @param output A 2D real-valued plane to store the outputs
			* @param input A 2D binary-valued plane that holds the inputs
			* @param N the number of elements between the arrays
			*/
			void ai2_pointwise_mul_mm(float output, const float input, int N);

			/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors
			*
			* Pre-conditions: wx < ix, and wy < iy
			*
			* @param output A 2D real-valued plane of size ix, iy
			* @param alpha A 2D binary-valued plane of size wx, wy
			* @param ix the output's x dimension
			* @param iy the output's y dimensions
			* @param wx the alpha's x dimension
			* @param wy the alpha's y dimension
			* @param pad how many cells are padded, adds 2*pad to the borders of the image
			*/
			void ai2_pointwise_mul_mm_2d(float output, const float alpha, int ix, int iy, int wx, int wy, int pad);

			// --------------------------------------
			// SETTER FUNCTIONS
			// --------------------------------------
			/** @brief Safe function to set the float input of a conv_layer
			*/
			void ai2_setFltInput(ai2_bin_conv_layer layer, float new_input);

			/** @brief Safe function to set the binary input of a conv_layer
			*/
			void ai2_setBinInput(ai2_bin_conv_layer layer, BINARY_WORD new_input);

			/** @brief Safe function to set the binary weights of a conv_layer
			*/
			void ai2_setFltWeights(ai2_bin_conv_layer layer, float new_weights);

			/** @brief Safe function to set the binary weights of a conv_layer
			*/
			void ai2_setBinWeights(ai2_bin_conv_layer layer, BINARY_WORD new_weights);

			/** @brief Safe function to set the binary outputs of a conv_layer
			*/
			void ai2_setFltOutput(ai2_bin_conv_layer layer, float new_output);

			/** @brief Safe function to set the binary outputs of a conv_layer
			*/
			void ai2_setBinOutput(ai2_bin_conv_layer layer, BINARY_WORD new_output);

			/** @brief Safe function to set the alpha of a conv_layer
			*/
			void ai2_setFltAlpha(ai2_bin_conv_layer layer, float new_alpha);

			/** @brief Safe function to set the beta of a conv_layer
			*/
			void ai2_setFltBeta(ai2_bin_conv_layer layer, float new_beta);

			/** @brief Safe function to set the new_beta of a conv_layer
			*/
			void ai2_setFltNewBeta(ai2_bin_conv_layer layer, float new_new_beta);

			// --------------------------------------
			// GETTER FUNCTIONS
			// --------------------------------------
			/** @brief Safe function to get the float outputs of a conv_layer
			*/
			float * ai2_getFltOutput(ai2_bin_conv_layer *layer);

			/** @brief 3D tranpose from (x,y,z) to (z,y,x)
			* @return a new pointer with the transposed matrix
			*/
			void ai2_transpose3D(float *data, dim3 d);

			/** @brief Checks if a float is a whole number (e.g., an int)
			*/
			int ai2_isFloatWhole(float f);

			/* @brief Allocates all memory objects in an ai2_bin_conv_layer
			* b - batches (number of filter batches)
			* c - input channels
			* ix - input width
			* iy - input height
			* wx - weight/filter width
			* wy - weight/filter height
			* s - stride between sliding windows
			* pad - the amount of padding
			*/
			ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad);

			/* @brief Safe deallocation of all memory objects in an ai2_bin_conv_layer
			*/
			void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer);

			/* @brief Given real-valued filter data and a conv layer, performs a forward pass
			*/
			void ai2_bin_forward(ai2_bin_conv_layer *layer);

			#endif

New file
			@@ -0,0 +1,81 @@
			#include "common.h"

			// Returns the time in ms
			double getElapsedTime(Timer *timer) {
			// Calculate time it took in seconds
			double accum_ms = ( timer->requestEnd.tv_sec - timer->requestStart.tv_sec )
			+ ( timer->requestEnd.tv_nsec - timer->requestStart.tv_nsec )
			/ 1e6;
			return accum_ms;
			}

			void start_timer(Timer *timer) {
			clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestStart));
			}

			void stop_timer(Timer *timer) {
			clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestEnd));
			}


			BINARY_WORD * mallocBinaryVolume(dim3 vol) {
			return (BINARY_WORD ) malloc (vol.x vol.y * vol.z / BITS_PER_BINARY_WORD * sizeof(BINARY_WORD));
			}

			float * mallocFloatVolume(dim3 vol) {
			return (float ) malloc (vol.x vol.y * vol.z * sizeof(float));
			}

			// Returns the size (in bytes) of a binary array with dimensions stored in conv_args
			double getSizeBytesBinaryArray(dim3 conv_args) {
			return conv_args.x * conv_args.y * conv_args.z * sizeof(BINARY_WORD) / (BITS_PER_BINARY_WORD);
			}


			ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz) {
			ConvolutionArgs conv_args;
			// Input Volume
			conv_args.input.x = ix; // x == y for a square face
			conv_args.input.y = iy;
			conv_args.input.z = iz;
			conv_args.weights.x = wx; // x == y for square face
			conv_args.weights.y = wy;
			conv_args.weights.z = wz;

			// <!-- DO NOT MODIFY -->
			// Intermediate Volumes
			conv_args.alpha_plane.x = conv_args.weights.x;
			conv_args.alpha_plane.y = conv_args.weights.y;
			conv_args.alpha_plane.z = 1;

			conv_args.beta_plane.x = 1;
			conv_args.beta_plane.y = conv_args.input.y;
			conv_args.beta_plane.z = conv_args.input.z;

			conv_args.gamma_plane.x = conv_args.input.x * conv_args.weights.x;
			conv_args.gamma_plane.y = conv_args.input.y * conv_args.weights.y;
			conv_args.gamma_plane.z = 1;

			conv_args.zeta_plane.x = conv_args.gamma_plane.x;
			conv_args.zeta_plane.y = conv_args.gamma_plane.y;
			conv_args.zeta_plane.z = 1;

			// Output Volume
			conv_args.output.x = conv_args.input.x;
			conv_args.output.y = conv_args.input.y;
			conv_args.output.z = 1; // Output should be a 2D plane

			// Verify dimensions
			//assert(conv_args.weights.x % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
			// assert(conv_args.weights.y % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
			assert(conv_args.weights.z % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
			//assert(conv_args.input.x % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
			// assert(conv_args.input.y % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
			assert(conv_args.input.z % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
			assert(conv_args.weights.x <= conv_args.input.x);
			assert(conv_args.weights.y <= conv_args.input.y);
			assert(conv_args.weights.z <= conv_args.input.z);
			// <!-- DO NOT MODIFY -->

			return conv_args;
			}

New file
			@@ -0,0 +1,50 @@
			#ifndef AI2_COMMON_H
			#define AI2_COMMON_H

			#include <time.h>
			#include <stdlib.h>
			#include <stdio.h>
			#include <inttypes.h>
			#include <assert.h>
			#include <limits.h>
			#include <tgmath.h>
			#include <unistd.h>
			#include <stdint.h>
			//#include <gperftools/profiler.h>
			#include <sys/time.h>

			typedef uint32_t BINARY_WORD;
			#define BITS_PER_BINARY_WORD (sizeof(BINARY_WORD) * CHAR_BIT)

			typedef struct{
			struct timespec requestStart;
			struct timespec requestEnd;
			} Timer;

			typedef struct {
			size_t x;
			size_t y;
			size_t z;
			} dim3;

			typedef struct {
			dim3 weights;
			dim3 input;
			dim3 output;
			dim3 alpha_plane;
			dim3 beta_plane;
			dim3 gamma_plane;
			dim3 zeta_plane;
			} ConvolutionArgs;

			// Timer stuff
			double getElapsedTime(Timer *timer); // Returns the time in ms
			void start_timer(Timer *timer);
			void stop_timer(Timer *timer);

			BINARY_WORD * mallocBinaryVolume(dim3 vol);
			float * mallocFloatVolume(dim3 vol);
			ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz);
			double getSizeBytesBinaryArray(dim3 conv_args);

			#endif

			@@ -8,6 +8,10 @@
			#include <stdio.h>
			#include <time.h>

			#ifndef AI2
			#define AI2 0
			#endif

			void swap_binary(convolutional_layer *l)
			{
			float *swap = l->filters;
			@@ -21,24 +25,6 @@
			#endif
			}

			void binarize_filters2(float filters, int n, int size, char binary, float *scales)
			{
			int i, k, f;
			for(f = 0; f < n; ++f){
			float mean = 0;
			for(i = 0; i < size; ++i){
			mean += fabs(filters[f*size + i]);
			}
			mean = mean / size;
			scales[f] = mean;
			for(i = 0; i < size/8; ++i){
			binary[fsize + i] = (filters[fsize + i] > 0) ? 1 : 0;
			for(k = 0; k < 8; ++k){
			}
			}
			}
			}

			void binarize_filters(float filters, int n, int size, float binary)
			{
			int i, f;
			@@ -54,6 +40,21 @@
			}
			}

			void binarize_input(float input, int n, int size, float binary)
			{
			int i, s;
			for(s = 0; s < size; ++s){
			float mean = 0;
			for(i = 0; i < n; ++i){
			mean += fabs(input[i*size + s]);
			}
			mean = mean / n;
			for(i = 0; i < n; ++i){
			binary[isize + s] = (input[isize + s] > 0) ? mean : -mean;
			}
			}
			}

			int convolutional_out_height(convolutional_layer l)
			{
			int h = l.h;
			@@ -89,7 +90,7 @@
			}

			size_t get_workspace_size(layer l){
			#ifdef CUDNN
			#ifdef CUDNN
			size_t most = 0;
			size_t s = 0;
			cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
			@@ -117,9 +118,9 @@
			&s);
			if (s > most) most = s;
			return most;
			#else
			#else
			return (size_t)l.out_hl.out_wl.sizel.sizel.c*sizeof(float);
			#endif
			#endif
			}

			convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor)
			@@ -133,6 +134,7 @@
			l.c = c;
			l.n = n;
			l.binary = binary;
			l.xnor = xnor;
			l.batch = batch;
			l.stride = stride;
			l.size = size;
			@@ -164,6 +166,10 @@
			l.cfilters = calloc(cnsize*size, sizeof(char));
			l.scales = calloc(n, sizeof(float));
			}
			if(xnor){
			l.binary_filters = calloc(cnsize*size, sizeof(float));
			l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
			}

			if(batch_normalize){
			l.scales = calloc(n, sizeof(float));
			@@ -199,7 +205,6 @@
			l.binary_filters_gpu = cuda_make_array(l.filters, cnsize*size);
			l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
			}
			l.xnor = xnor;

			if(batch_normalize){
			l.mean_gpu = cuda_make_array(l.mean, n);
			@@ -325,7 +330,7 @@

			l->delta_gpu = cuda_make_array(l->delta, l->batchout_hout_w*l->n);
			l->output_gpu = cuda_make_array(l->output, l->batchout_hout_w*l->n);
			#ifdef CUDNN
			#ifdef CUDNN
			cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
			cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
			cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
			@@ -359,7 +364,7 @@
			CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
			0,
			&l->bf_algo);
			#endif
			#endif
			#endif
			l->workspace_size = get_workspace_size(*l);
			}
			@@ -404,7 +409,9 @@
			int out_w = convolutional_out_width(l);
			int i;


			fill_cpu(l.outputs*l.batch, 0, l.output, 1);

			/*
			if(l.binary){
			binarize_filters(l.filters, l.n, l.cl.sizel.size, l.binary_filters);
			@@ -413,44 +420,59 @@
			}
			*/

			/*
			if(l.binary){
			int m = l.n;
			int k = l.sizel.sizel.c;
			int n = out_h*out_w;
			/*
			if(l.binary){
			int m = l.n;
			int k = l.sizel.sizel.c;
			int n = out_h*out_w;

			char *a = l.cfilters;
			char *a = l.cfilters;
			float *b = state.workspace;
			float *c = l.output;

			for(i = 0; i < l.batch; ++i){
			im2col_cpu(state.input, l.c, l.h, l.w,
			l.size, l.stride, l.pad, b);
			gemm_bin(m,n,k,1,a,k,b,n,c,n);
			c += n*m;
			state.input += l.cl.hl.w;
			}
			scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
			add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
			activate_array(l.output, mnl.batch, l.activation);
			return;
			}
			*/

			if(l.xnor && (l.c%32 != 0 \|\| !AI2)){
			binarize_filters(l.filters, l.n, l.cl.sizel.size, l.binary_filters);
			swap_binary(&l);
			for(i = 0; i < l.batch; ++i){
			binarize_input(state.input + il.inputs, l.c, l.hl.w, l.binary_input + i*l.inputs);
			}
			state.input = l.binary_input;
			}

			int m = l.n;
			int k = l.sizel.sizel.c;
			int n = out_h*out_w;

			if (l.xnor && l.c%32 == 0 && AI2) {
			forward_xnor_layer(l, state);
			printf("xnor\n");
			} else {

			float *a = l.filters;
			float *b = state.workspace;
			float *c = l.output;

			for(i = 0; i < l.batch; ++i){
			im2col_cpu(state.input, l.c, l.h, l.w,
			l.size, l.stride, l.pad, b);
			gemm_bin(m,n,k,1,a,k,b,n,c,n);
			gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
			c += n*m;
			state.input += l.cl.hl.w;
			}
			scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
			add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
			activate_array(l.output, mnl.batch, l.activation);
			return;
			}
			*/

			int m = l.n;
			int k = l.sizel.sizel.c;
			int n = out_h*out_w;

			float *a = l.filters;
			float *b = state.workspace;
			float *c = l.output;

			for(i = 0; i < l.batch; ++i){
			im2col_cpu(state.input, l.c, l.h, l.w,
			l.size, l.stride, l.pad, b);
			gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
			c += n*m;
			state.input += l.cl.hl.w;
			}

			if(l.batch_normalize){
			@@ -459,6 +481,7 @@
			add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);

			activate_array(l.output, mnl.batch, l.activation);
			if(l.binary \|\| l.xnor) swap_binary(&l);
			}

			void backward_convolutional_layer(convolutional_layer l, network_state state)

			@@ -167,6 +167,8 @@
			float *r_cpu;
			float *h_cpu;

			float *binary_input;

			size_t workspace_size;

			#ifdef GPU

			@@ -1021,7 +1021,6 @@
			}
			}
			}
			binarize_filters2(l.filters, l.n, l.cl.sizel.size, l.cfilters, l.scales);
			#ifdef GPU
			if(gpu_index >= 0){
			push_convolutional_layer(l);
			@@ -1046,7 +1045,7 @@
			if (l.flipped) {
			transpose_matrix(l.filters, l.cl.sizel.size, l.n);
			}
			if (l.binary) binarize_filters(l.filters, l.n, l.cl.sizel.size, l.filters);
			//if (l.binary) binarize_filters(l.filters, l.n, l.cl.sizel.size, l.filters);
			#ifdef GPU
			if(gpu_index >= 0){
			push_convolutional_layer(l);

New file
			@@ -0,0 +1,86 @@
			#include "xnor_layer.h"
			#include "binary_convolution.h"
			#include "convolutional_layer.h"

			layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize)
			{
			int i;
			layer l = {0};
			l.type = XNOR;

			l.h = h;
			l.w = w;
			l.c = c;
			l.n = n;
			l.batch = batch;
			l.stride = stride;
			l.size = size;
			l.pad = pad;
			l.batch_normalize = batch_normalize;

			l.filters = calloc(cnsize*size, sizeof(float));
			l.biases = calloc(n, sizeof(float));

			int out_h = convolutional_out_height(l);
			int out_w = convolutional_out_width(l);
			l.out_h = out_h;
			l.out_w = out_w;
			l.out_c = n;
			l.outputs = l.out_h * l.out_w * l.out_c;
			l.inputs = l.w * l.h * l.c;

			l.output = calloc(l.batchout_h out_w * n, sizeof(float));

			if(batch_normalize){
			l.scales = calloc(n, sizeof(float));
			for(i = 0; i < n; ++i){
			l.scales[i] = 1;
			}

			l.mean = calloc(n, sizeof(float));
			l.variance = calloc(n, sizeof(float));

			l.rolling_mean = calloc(n, sizeof(float));
			l.rolling_variance = calloc(n, sizeof(float));
			}

			l.activation = activation;

			fprintf(stderr, "XNOR Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);

			return l;
			}

			void forward_xnor_layer(const layer l, network_state state)
			{
			int b = l.n;
			int c = l.c;
			int ix = l.w;
			int iy = l.h;
			int wx = l.size;
			int wy = l.size;
			int s = l.stride;
			int pad = l.pad * (l.size/2);

			// MANDATORY: Make the binary layer
			ai2_bin_conv_layer al = ai2_make_bin_conv_layer(b, c, ix, iy, wx, wy, s, pad);

			// OPTIONAL: You need to set the real-valued input like:
			ai2_setFltInput(&al, state.input);
			// The above function will automatically binarize the input for the layer (channel wise).
			// If commented: using the default 0-valued input.

			ai2_setFltWeights(&al, l.filters);
			// The above function will automatically binarize the input for the layer (channel wise).
			// If commented: using the default 0-valued weights.

			// MANDATORY: Call forward
			ai2_bin_forward(&al);

			// OPTIONAL: Inspect outputs
			float output = ai2_getFltOutput(&al); // output is of size l.px l.py where px and py are the padded outputs

			memcpy(l.output, output, l.outputs*sizeof(float));
			// MANDATORY: Free layer
			ai2_free_bin_conv_layer(&al);
			}

New file
			@@ -0,0 +1,11 @@
			#ifndef XNOR_LAYER_H
			#define XNOR_LAYER_H

			#include "layer.h"
			#include "network.h"

			layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalization);
			void forward_xnor_layer(const layer l, network_state state);

			#endif

			@@ -346,8 +346,8 @@
			if (nms) do_nms_sort(boxes, probs, l.sidel.sidel.n, l.classes, nms);
			//draw_detections(im, l.sidel.sidel.n, thresh, boxes, probs, voc_names, voc_labels, 20);
			draw_detections(im, l.sidel.sidel.n, thresh, boxes, probs, voc_names, voc_labels, 20);
			show_image(im, "predictions");
			save_image(im, "predictions");
			show_image(im, "predictions");

			show_image(sized, "resized");
			free_image(im);