From 8a767f106677b78a389e1ceffc066501015ec51a Mon Sep 17 00:00:00 2001
From: Joseph Redmon <pjreddie@gmail.com>
Date: Mon, 06 Jun 2016 22:48:52 +0000
Subject: [PATCH] stuff for carlo
---
cfg/xyolo.test.cfg | 148 ++++++
src/yolo.c | 2
src/xnor_layer.c | 86 +++
src/common.c | 81 +++
src/convolutional_layer.c | 127 +++--
ai2.mk | 79 +++
src/parser.c | 3
src/common.h | 50 ++
src/binary_convolution.c | 598 ++++++++++++++++++++++++
src/xnor_layer.h | 11
src/binary_convolution.h | 218 +++++++++
src/layer.h | 2
12 files changed, 1,350 insertions(+), 55 deletions(-)
diff --git a/ai2.mk b/ai2.mk
new file mode 100644
index 0000000..b72e97b
--- /dev/null
+++ b/ai2.mk
@@ -0,0 +1,79 @@
+GPU=0
+CUDNN=0
+OPENCV=0
+DEBUG=0
+AI2=1
+
+ARCH= --gpu-architecture=compute_52 --gpu-code=compute_52
+
+VPATH=./src/
+EXEC=darknet
+OBJDIR=./obj/
+
+CC=gcc -std=gnu11
+NVCC=nvcc
+OPTS=-Ofast
+LDFLAGS= -lm -pthread
+COMMON=
+CFLAGS=-Wall -Wfatal-errors
+
+ifeq ($(DEBUG), 1)
+OPTS=-O0 -g
+endif
+
+CFLAGS+=$(OPTS)
+
+ifeq ($(OPENCV), 1)
+COMMON+= -DOPENCV
+CFLAGS+= -DOPENCV
+LDFLAGS+= `pkg-config --libs opencv`
+COMMON+= `pkg-config --cflags opencv`
+endif
+
+ifeq ($(AI2), 1)
+COMMON+= -DAI2
+CFLAGS+= -DAI2
+endif
+
+ifeq ($(GPU), 1)
+COMMON+= -DGPU -I/usr/local/cuda/include/
+CFLAGS+= -DGPU
+LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
+endif
+
+ifeq ($(CUDNN), 1)
+COMMON+= -DCUDNN
+CFLAGS+= -DCUDNN
+LDFLAGS+= -lcudnn
+endif
+
+OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o coco_demo.o tag.o cifar.o yolo_demo.o go.o batchnorm_layer.o art.o xnor_layer.o common.o binary_convolution.o
+ifeq ($(GPU), 1)
+LDFLAGS+= -lstdc++
+OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o
+endif
+
+OBJS = $(addprefix $(OBJDIR), $(OBJ))
+DEPS = $(wildcard src/*.h) Makefile
+
+all: obj results $(EXEC)
+
+$(EXEC): $(OBJS)
+ $(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS)
+
+$(OBJDIR)%.o: %.c $(DEPS)
+ $(CC) $(COMMON) $(CFLAGS) -c $< -o $@
+
+$(OBJDIR)%.o: %.cu $(DEPS)
+ $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@
+
+obj:
+ mkdir -p obj
+results:
+ mkdir -p results
+
+.PHONY: clean
+
+clean:
+ rm -rf $(OBJS) $(EXEC)
+
diff --git a/cfg/xyolo.test.cfg b/cfg/xyolo.test.cfg
new file mode 100644
index 0000000..2259679
--- /dev/null
+++ b/cfg/xyolo.test.cfg
@@ -0,0 +1,148 @@
+[net]
+batch=1
+subdivisions=1
+height=448
+width=448
+channels=3
+momentum=0.9
+decay=0.0005
+
+learning_rate=0.0001
+policy=steps
+steps=20,40,60,80,20000,30000
+scales=5,5,2,2,.1,.1
+max_batches = 40000
+
+[crop]
+crop_width=448
+crop_height=448
+flip=0
+angle=0
+saturation = 1.5
+exposure = 1.5
+noadjust=1
+
+[convolutional]
+batch_normalize=1
+filters=16
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[batchnorm]
+
+[convolutional]
+xnor = 1
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[batchnorm]
+
+[convolutional]
+xnor = 1
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[batchnorm]
+
+[convolutional]
+xnor = 1
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[batchnorm]
+
+[convolutional]
+xnor = 1
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[batchnorm]
+
+[convolutional]
+xnor = 1
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[batchnorm]
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=128
+activation=leaky
+
+[connected]
+output= 1470
+activation=linear
+
+[detection]
+classes=20
+coords=4
+rescore=1
+side=7
+num=2
+softmax=0
+sqrt=1
+jitter=.2
+
+object_scale=1
+noobject_scale=.5
+class_scale=1
+coord_scale=5
+
diff --git a/src/binary_convolution.c b/src/binary_convolution.c
new file mode 100644
index 0000000..dfededa
--- /dev/null
+++ b/src/binary_convolution.c
@@ -0,0 +1,598 @@
+#include "binary_convolution.h"
+
+int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim) { // TODO unroll
+ int accumulator = 0;
+ for (int z = 0; z < vdim.z / BITS_PER_BINARY_WORD; z++) {
+ for (int y = 0; y < vdim.y; y++) {
+ for (int x = 0; x < vdim.x; x++) {
+ int idx = z*vdim.y*vdim.x + y*vdim.x + x;
+ accumulator += __builtin_popcount(~(a[idx] ^ b[idx])); // count the XNOR of the two bit vectors
+ }
+ }
+ }
+
+ return accumulator;
+}
+
+/**
+ * Pre-conditions:
+ * alpha_volume is an array of size x*y*z.
+ * alpha_plane is an array of size x*y.
+ * alpha_volume (x,y,z) is transposed to (z,x,y).
+ */
+void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim) {
+ for (int y = 0; y < vdim.y; ++y) {
+ for (int x = 0; x < vdim.x; ++x) {
+ int out = y * vdim.x + x;
+ double accum = 0.0;
+ for (int z = 0; z < vdim.z; ++z) {
+ accum += alpha_volume[out * vdim.z + z];
+ }
+
+ alpha_plane[out] = accum / vdim.z;
+ }
+ }
+}
+
+/** @brief Wrapper function for generating the beta scaling factor */
+void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim) {
+ ai2_calc_alpha(beta_plane, beta_volume, vdim);
+}
+
+/** @brief Set the bit in a binary word */
+void ai2_bitset(BINARY_WORD *bword, unsigned int position) {
+ BINARY_WORD mask = (1 << position);
+ *bword = *bword | mask;
+}
+
+/** @brief Checks that the bit is set in a binary word */
+int ai2_is_set(BINARY_WORD bword, unsigned int position) {
+ unsigned int position_complement = (BITS_PER_BINARY_WORD - 1) - position; // number of leading bits before the bit position of interest
+ bword = (bword << position_complement); // zero out leading bits
+ bword = (bword >> (BITS_PER_BINARY_WORD - 1)); // shift bit position of interest to the 0th position
+ return (bword & 0x1); // test if bit position of interest is set
+}
+
+void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 dim) {
+ ai2_transpose3D(real_vol, dim); // (x,y,z) -> (z,x,y)
+
+ int sz = dim.x * dim.y * dim.z;
+ for (int i = 0; i < sz; i += BITS_PER_BINARY_WORD) {
+ BINARY_WORD tmp = 0x00000000;
+ for (int x = 0; x < BITS_PER_BINARY_WORD; ++x) {
+ int waddr = x + i;
+ if (signbit(real_vol[waddr]) == 0)
+ ai2_bitset(&tmp, (BITS_PER_BINARY_WORD - 1) - x);
+ }
+ binary_vol[i / BITS_PER_BINARY_WORD] = tmp;
+ }
+}
+
+void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 dim) { // TODO unit tests
+ for (int z = 0; z < dim.z; z++) {
+ for (int y = 0; y < dim.y; y++) {
+ for (int x = 0; x < dim.x / BITS_PER_BINARY_WORD; x++) { // TODO boundary checks, for uneven input
+ BINARY_WORD word = binary_vol[z*dim.y*dim.x + y*dim.x + x];
+ for (int t = 0; t < BITS_PER_BINARY_WORD; ++t) {
+ int oidx = z*dim.y*dim.x + y*dim.x + x * BITS_PER_BINARY_WORD + t;
+ if (ai2_is_set(word, t))
+ real_vol[oidx] = 1.f;
+ else
+ real_vol[oidx] = -1.f;
+ }
+ }
+ }
+ }
+
+ // Transpose channels back to output
+ ai2_transpose3D(real_vol, dim); // (z,y,x) -> (x,y,z)
+}
+
+/* @brief: input is padded.
+ */
+void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride) {
+
+ int r, rd, c, cd;
+ int wx_2 = wx / 2;
+ int wy_2 = wy / 2;
+
+ // Indexing for output pixels. x = [wx_2, ix + wx_2 - 1], y = [wy_2, iy + wy_2 - 1]
+ int sx = pad; // start x
+ int ex = ix + pad - 1; // end x
+ int sy = pad; // start y
+ int ey = iy + pad - 1; // end y
+
+ // Indexing for weights
+ int wsx, wex, wsy, wey;
+ if (wx % 2 == 1) { // odd weights
+ wsx = -wx_2; wex = wx_2 + 1;
+ wsy = -wy_2; wey = wy_2 + 1;
+ }
+ else {
+ wsx = -wx_2; wex = wx_2;
+ wsy = -wy_2; wey = wy_2;
+ }
+
+ int px = ix + 2*pad;
+ //int py = iy + 2*pad;
+
+ for (r = sy; r <= ey; ++r) {
+ for (c = sx; c <= ex; ++c) {
+ int accumulator = 0;
+ for (rd = wsy; rd < wey; ++rd) {
+ for (cd = wsx; cd < wex; ++cd) {
+ int iidx = (r+rd)*px + (c+cd);
+ BINARY_WORD pixel = input[iidx];
+ //BINARY_WORD pixel = 0xFFFFFFFF;
+ //BINARY_WORD weight = 0xFFFFFFFF;
+ int widx = (rd + wy_2)*wx + (cd+wx_2);
+ BINARY_WORD weight = weights[widx];
+ accumulator += __builtin_popcount(~(pixel ^ weight));
+ }
+ }
+
+ // Padded space
+ int oidx = r*px + c;
+ output[oidx] += (float) accumulator;
+ }
+ }
+
+ //for (r = sy; r <= ey; ++r) {
+ // for (c = sx; c <= ex; ++c) {
+ // int accumulator = 0;
+ // for (rd = -wy_2; rd < wy_2; ++rd) {
+ // for (cd = -wx_2; cd < wx_2; ++cd) {
+ // int iidx = (r+rd)*px + (c+cd);
+ // BINARY_WORD pixel = input[iidx];
+ // //BINARY_WORD pixel = 0xFFFFFFFF;
+ // //BINARY_WORD weight = 0xFFFFFFFF;
+ // int widx = (rd + wy_2)*wx + (cd+wx_2);
+ // BINARY_WORD weight = weights[widx];
+ // accumulator += __builtin_popcount(~(pixel ^ weight));
+ // }
+ // }
+
+ // // Padded space
+ // int oidx = r*px + c;
+ // output[oidx] += (float) accumulator;
+ // }
+ //}
+
+ //ai2_bin_conv_within_boundary(output, input, weights, ix, iy, wx, wy, stride);
+ //ai2_bin_conv_borders(output, input, weights, ix, iy, wx, wy, stride);
+}
+
+void ai2_pointwise_mul_mm(float *output, const float *input, int N) {
+ int i = 0;
+
+ while (i + 8 <= N) {
+ output[i+0] *= input[i+0];
+ output[i+1] *= input[i+1];
+ output[i+2] *= input[i+2];
+ output[i+3] *= input[i+3];
+ output[i+4] *= input[i+4];
+ output[i+5] *= input[i+5];
+ output[i+6] *= input[i+6];
+ output[i+7] *= input[i+7];
+
+ i += 8;
+ }
+
+ while (++i < N) // Finish iteration that's leftover (e.g., last batch not divisible by 8 exactly)
+ output[i] *= input[i];
+}
+
+/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors
+ * Pre-conditions: wx < ix, and wy < iy
+ */
+void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad) {
+ // Slower version
+// for (int y = 0; y < iy; ++y)
+// for (int x = 0; x < ix; x++)
+// output[y*ix+x] *= input[(y % wy)*wx + (x % wx)];
+
+ // Stride prefetch optimized
+ for (int s = 0; s < wy; ++s) { // for each strip
+ const float *strip_ptr = &alpha[s*wx];
+ for (int y = pad; y < pad + (iy / wy); ++y) { //
+ int stride = y*((ix+2*pad)*wy) + s*(ix+2*pad);
+ float *output_ptr = &output[stride];
+
+ for (int x = 0; x < ix; ++x) {
+ output_ptr[x] *= strip_ptr[x % wx];
+ }
+ }
+ }
+}
+
+void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input) {
+ if (new_input != NULL) {
+ if (layer->input != NULL)
+ free(layer->input);
+ layer->input = new_input;
+
+ dim3 dim;
+ dim.x = layer->px;
+ dim.y = layer->py;
+ dim.z = layer->c;
+
+ // Binarize input
+ ai2_flt_to_bin(layer->binary_input, layer->input, dim);
+
+ float *new_beta = (float *) calloc (dim.x * dim.y, sizeof(float));
+ ai2_setFltBeta(layer, new_beta);
+
+ // layer->input is transposed to (z,x,y) already
+ ai2_calc_beta(layer->beta, layer->input, dim);
+ }
+}
+
+void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input) {
+ if (new_input != NULL) {
+ if (layer->binary_input != NULL)
+ free(layer->binary_input);
+ layer->binary_input = new_input;
+ }
+}
+
+void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights) {
+ if (new_weights != NULL) {
+ if (layer->weights != NULL)
+ free(layer->weights);
+ layer->weights = new_weights;
+
+ dim3 dim;
+ dim.x = layer->wx;
+ dim.y = layer->wy;
+ dim.z = layer->c;
+
+ ai2_flt_to_bin(layer->binary_weights, layer->weights, dim);
+
+ // Calculate alpha
+ if (layer->alpha != NULL)
+ free(layer->alpha);
+
+ layer->alpha = (float *) calloc (dim.x * dim.y, sizeof(float));
+ // layer->weights is already transposed to (z,x,y) from ai2_flt_to_bin()
+ ai2_calc_alpha(layer->alpha, layer->weights, dim);
+ }
+}
+
+void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights) {
+ if (new_weights != NULL) {
+ if (layer->binary_weights != NULL)
+ free(layer->binary_weights);
+ layer->binary_weights = new_weights;
+ }
+}
+
+void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output) {
+ if (new_output != NULL) {
+ if (layer->output != NULL)
+ free(layer->output);
+ layer->output = new_output;
+ }
+}
+
+void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output) {
+ if (new_output != NULL) {
+ if (layer->binary_output != NULL)
+ free(layer->binary_output);
+ layer->binary_output = new_output;
+ }
+}
+
+void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha) {
+ if (new_alpha != NULL) {
+ if (layer->alpha != NULL)
+ free(layer->alpha);
+ layer->alpha = new_alpha;
+ }
+}
+
+void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta) {
+ if (new_beta != NULL) {
+ if (layer->beta != NULL)
+ free(layer->beta);
+ layer->beta = new_beta;
+ }
+}
+
+void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta) {
+ if (new_new_beta != NULL) {
+ if (layer->new_beta != NULL)
+ free(layer->new_beta);
+ layer->new_beta = new_new_beta;
+ }
+}
+
+float* ai2_getFltOutput(ai2_bin_conv_layer *layer) {
+ //if (layer->output != NULL && layer->binary_output != NULL) {
+ if (layer->output != NULL) {
+
+ // The idea here was that all intermediate states are stored in the binary output.
+ // Whenever the user needs the real-valued output, the conversion happens at this function call.
+ //dim3 dim;
+ //dim.x = layer->px;
+ //dim.y = layer->py;
+ //dim.z = layer->batch;
+ //ai2_bin_to_flt(layer->output, layer->binary_output, dim);
+
+ return layer->output;
+ }
+ else
+ return NULL;
+}
+
+void ai2_transpose3D(float *data, dim3 d) {
+ // Slow transpose for correctness
+
+ // (x,y,z) becomes (z,x,y). Requires two transposes:
+ // (x,y,z) -> (x,z,y).
+ // (x,z,y) -> (z,x,y).
+
+ // Intermediate buffer
+ float *new_data = (float *) calloc (d.x * d.y * d.z, sizeof(float));
+
+ // Transpose y and z axis.
+ // (x,y,z) -> (x,z,y);
+ for (int y = 0; y < d.y; ++y) {
+ for (int z = 0; z < d.z; ++z) {
+ for (int x = 0; x < d.x; ++x) {
+ new_data[y*d.x*d.z + z*d.x + x] = data[z*d.x*d.y + y*d.x + x];
+ //new_data[z*d.y*d.x + y*d.x + x] = data[y*d.x*d.z + z*d.x + x];
+ }
+ }
+ }
+
+ // Transpose x and z axis.
+ // (x,z,y) -> (z,x,y)
+ for (int y = 0; y < d.y; ++y) {
+ for (int x = 0; x < d.x; ++x) {
+ for (int z = 0; z < d.z; ++z) {
+ data[y*d.z*d.x + x*d.z + z] = new_data[y*d.x*d.z + x + z*d.x];
+ }
+ }
+ }
+
+ free(new_data);
+}
+
+int ai2_isFloatWhole(float f) { // TODO unit test
+ return (ceilf(f) == f) ? 1 : 0;
+}
+
+/* @brief Initialize and create all memory arrays for this layer
+ * b - batches (number of filter batches)
+ * c - input channels
+ * ix - input width
+ * iy - input height
+ * wx - weight/filter width
+ * wy - weight/filter height
+ * s - stride between sliding windows
+ * pad - the amount of padding
+ */
+ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad) {
+ // http://cs231n.github.io/convolutional-networks/
+ // See: spatial arrangement section for determining what the output size will be
+ float output_size = ((ix - wx + 2 * pad) / s) + 1;
+ if (ai2_isFloatWhole(output_size) == 0) {
+ fprintf(stderr, "ERROR! conv layer of (b,c,ix,iy,s,pad) = (%d, %d, %d, %d, %d, %d) will give "
+ " invalid output dimension: %fx%f\n", b, c, ix, iy, s, pad, output_size, output_size);
+ exit(1);
+ }
+
+ // TODO: Support strided output
+ if (s != 1) {
+ fprintf(stderr, "ERROR! Only stride values of 1 is supported\n");
+ exit(1);
+ }
+
+ // padded input size
+ int px = (int) ix + 2*pad;
+ int py = (int) iy + 2*pad;
+
+ ai2_bin_conv_layer l = {0}; // initialize all to 0
+ l.input = (float *) calloc (c * px * py, sizeof(float)); // is padded
+ l.binary_input = (BINARY_WORD *) calloc (c * px * py / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD)); // is padded
+
+ dim3 dim;
+ dim.x = px;
+ dim.y = py;
+ dim.z = c;
+ ai2_flt_to_bin(l.binary_input, l.input, dim);
+
+ l.weights = (float *) calloc (b * c * wx * wy, sizeof(float));
+ l.binary_weights = (BINARY_WORD *) calloc (b * c * wx * wy / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD));
+
+ l.output = (float *) calloc (c * px * py, sizeof(float)); // is padded
+ l.new_beta = (float *) calloc(px * py, sizeof(float)); // is padded
+
+ l.batch = b;
+ l.c = c;
+ l.h = iy;
+ l.w = ix;
+ l.stride = s;
+ l.pad = pad;
+ l.px = px;
+ l.py = py;
+ l.wx = wx;
+ l.wy = wy;
+
+ // The following parameters are uninitialized and should be set elsewhere:
+ // l.beta - padded
+ // l.alpha - not padded
+
+ return l;
+}
+
+void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer) {
+ if (layer->input) free (layer->input);
+ if (layer->binary_input) free(layer->binary_input);
+ if (layer->weights) free (layer->weights);
+ if (layer->binary_weights) free(layer->binary_weights);
+ if (layer->output) free(layer->output);
+ if (layer->binary_output) free (layer->binary_output);
+ if (layer->alpha) free(layer->alpha);
+ if (layer->beta) free(layer->beta);
+ if (layer->new_beta) free(layer->new_beta);
+}
+
+void ai2_throw_error(char *str) {
+ fprintf(stderr, "ERROR: %s\n", str);
+ exit(1);
+}
+
+void ai2_bin_forward(ai2_bin_conv_layer *l) {
+ if (l->input == NULL) ai2_throw_error("Input was not allocated and set in this layer");
+ if (l->weights == NULL) ai2_throw_error("Weights was not allocated and set in this layer");
+ if (l->output == NULL) ai2_throw_error("Output was not allocated and set in this layer");
+ if (l->alpha == NULL) ai2_throw_error("Alpha was not allocated and set in this layer");
+ if (l->beta == NULL) ai2_throw_error("Beta was not allocated and set in this layer");
+
+ if (l->c % 32 != 0) ai2_throw_error("Channel is not divisible by 32. Need to implement mask "
+ "before supporting arbitrary channel size. For now, "
+ "set the channel size to the nearest multiple of 32 "
+ "and ignore any ''extra'' channels unused.");
+
+ l->c /= BITS_PER_BINARY_WORD; // For compensating with doing more work per word
+
+ float *output = l->output;
+ float *alpha = l->alpha;
+ float *beta = l->beta;
+ int px = l->px;
+ int py = l->py;
+ BINARY_WORD *binary_weights = l->binary_weights;
+
+ for (int z = 0; z < l->batch; ++z) { // for each filter map
+ BINARY_WORD *binary_input = l->binary_input;
+ for (int c = 0; c < l->c; ++c) { // for each input channel
+ ai2_bin_conv2D(output, binary_input, binary_weights, l->w, l->h, l->wx, l->wy, l->pad, l->stride);
+ binary_input += px*py; // increment with next 2D plane
+ binary_weights += l->wx*l->wy; // increment with next 2D plane
+
+ ai2_pointwise_mul_mm(output, beta, px*py);
+ ai2_pointwise_mul_mm_2d(output, alpha, l->w, l->h, l->wx, l->wy, l->pad);
+ }
+ }
+}
+
+// Deprecated
+//double ai2_bin_conv_benchmark(ConvolutionArgs conv_args) {
+// printf("Running Binary Convolution test!\n");
+//
+// size_t ix, iy, iz, wx, wy, wz, L, stride;
+// ix = conv_args.input.x;
+// iy = conv_args.input.y;
+// iz = conv_args.input.z;
+// wx = conv_args.weights.x;
+// wy = conv_args.weights.y;
+// wz = conv_args.weights.z;
+// L = BITS_PER_BINARY_WORD;
+// stride = 1;
+//
+// printf("Input size (num elements, xyz): %zu %zu %zu\n", ix, iy, iz);
+// printf("Weights size (num elements. xyz): %zu %zu %zu\n", wx, wy, wz);
+//
+// double sz_input_elements = ix * iy * iz;
+// double sz_input_bytes = getSizeBytesBinaryArray(conv_args.input);
+// double sz_weight_bytes = getSizeBytesBinaryArray(conv_args.weights);
+//
+// printf("Input Size (MB): %f\n", sz_input_bytes / (1 << 20));
+// printf("Weight Size (MB): %f\n", sz_weight_bytes / (1 << 20));
+//
+// BINARY_WORD *binary_input = mallocBinaryVolume(conv_args.input);
+// BINARY_WORD *binary_weights = mallocBinaryVolume(conv_args.weights);
+// BINARY_WORD *b_input = binary_input; // alias
+// BINARY_WORD *b_weight = binary_weights; // alias
+// float *output = mallocFloatVolume(conv_args.output);
+// float *output_ptr = output;
+// float *beta = (float *) malloc(sizeof(float) * ix * iy); // we assume beta is given to us
+// float *alpha = (float *) malloc(sizeof(float) * wx * wy); // we assume alpha is given to us
+// float *new_output = mallocFloatVolume(conv_args.output);
+// //float *new_output_ptr = new_output;
+// float *new_beta = (float *) malloc(sizeof(float) * ix * iy);
+// //float *new_beta_ptr = new_beta;
+//
+// // Scale number of computations because we're packing.
+// // After this point, you should not have to reason about input dimensions for input and weights.
+// iz /= BITS_PER_BINARY_WORD;
+// wz /= BITS_PER_BINARY_WORD;
+//
+// // Calculate time taken by a request
+// struct timeval start_time;
+// gettimeofday(&start_time, NULL);
+//
+// // Preprocessing
+// int pad = wx/2;
+//
+// for (int z = 0; z < iz; ++z) { // number of channels
+// ai2_bin_conv2D(output_ptr, b_input, b_weight, ix, iy, wx, wy, pad, stride);
+// b_input += ix*iy; // increment with next 2D plane
+// b_weight += wx*wy; // increment with next 2D plane
+//
+// ai2_pointwise_mul_mm(output_ptr, beta, ix*iy);
+// ai2_pointwise_mul_mm_2d(output_ptr, alpha, ix, iy, wx, wy, pad);
+// }
+//
+// // copy to new array (need to wrap this around); TODO.
+// struct timeval end_time;
+// gettimeofday(&end_time, NULL);
+//
+// struct timeval diff_time;
+// timersub(&end_time, &start_time, &diff_time);
+// double time_conv_s = diff_time.tv_sec + diff_time.tv_usec * 1e-6;
+// double time_conv_ms = time_conv_s * 1000.0;
+//
+// double model_ops = (3*ix*iy*wx*wy*wz/L) + 2*ix*iy + ix*iy*iz;
+// double conv_ops_s = 1e-9 * model_ops / time_conv_s;
+// double conv_bandwidth_gb_s = 1e-9 * sz_input_bytes / (time_conv_ms / 1000.0);
+// double conv_bandwidth_gelement_s = 1e-9 * sz_input_elements / (time_conv_ms / 1000.0);
+//
+// printf("Execution Time (ms): %f\n", time_conv_ms);
+// printf("Binary Convolution OPS/s (GOPS/s): %f\n", conv_ops_s);
+// printf("Binary Convolution Bandwidth (GB/s): %f\n", conv_bandwidth_gb_s);
+// printf("Binary Convolution Bandwidth (GElements/s): %f\n\n", conv_bandwidth_gelement_s);
+//
+// free(binary_input);
+// free(binary_weights);
+// free(output);
+// free(beta);
+// free(alpha);
+// free(new_output);
+// free(new_beta);
+//
+// return time_conv_ms;
+//}
+
+// double ai2_bin_conv_benchmark(ConvolutionArgs conv_args);
+
+//void benchmark() {
+// int ix, iy, iz, wx, wy, wz;
+// iz = (1 << 9) * BITS_PER_BINARY_WORD;
+// ix = 227; // x == y for square face
+// iy = 227;
+// wx = 3; // x == y for a square face
+// wy = 3;
+// wz = iz;
+//
+// int runs = 1;
+// double accum_binary = 0;
+// double accum_real = 0;
+// ConvolutionArgs conv_args = initArgs(ix, iy, iz, wx, wy, wz);
+// for (int i = 0; i < runs; ++i) {
+// double t_binary_convolve = ai2_bin_conv_benchmark(conv_args);
+// double t_real_convolve = run_convolve2D_real(conv_args);
+// printf("t binary = %lf\n", t_binary_convolve);
+// printf("t real = %lf\n", t_real_convolve);
+// accum_binary += t_binary_convolve;
+// accum_real += t_real_convolve;
+// }
+//
+// accum_binary /= runs;
+// accum_real /= runs;
+// printf("Average convolution pass binary (ms): %lf\n", accum_binary);
+// printf("Average convolution pass flt (ms): %lf\n", accum_real);
+// printf("Speedup (Binary over Real): %lfx\n", accum_real / accum_binary);
+// exit(1);
+//}
diff --git a/src/binary_convolution.h b/src/binary_convolution.h
new file mode 100644
index 0000000..602677e
--- /dev/null
+++ b/src/binary_convolution.h
@@ -0,0 +1,218 @@
+#ifndef AI2_BINARY_CONVOLUTION_H
+#define AI2_BINARY_CONVOLUTION_H
+
+/** @file binary_convolution.h
+ * @brief Routines related for approximating convolutions using binary operations
+ *
+ * @author Carlo C. del Mundo (carlom)
+ * @date 05/23/2016
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <limits.h>
+#include <tgmath.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include "common.h"
+
+typedef struct {
+ int batch; // number of filter batches
+ int c; // channels, z
+ int h; // height, y
+ int w; // width, x
+ int stride;
+ int pad;
+
+ int px; // padded x (use this for striding in padded input and output arrays)
+ int py; // padded y (use this for striding in padded input and output arrays)
+ int wx;
+ int wy;
+
+ float *input; // input values
+ BINARY_WORD *binary_input;
+
+ float *weights; // weight or filter values
+ BINARY_WORD *binary_weights;
+
+ float *output; // output values
+ BINARY_WORD *binary_output;
+
+ float *alpha; // we assume alpha is calculated at the beginning of initialization
+ float *beta; // we assume beta is given to us
+ float *new_beta; // we calculate the new beta for the next layer
+
+ struct ai2_bin_conv_layer *next;
+} ai2_bin_conv_layer;
+
+/** @brief Performs a binary convolution using XNOR and POPCOUNT between input and weights
+ *
+ * @param output A 2D real-valued plane to store the outputs
+ * @param input A 2D binary-valued plane that holds the inputs
+ * @param weights A 2D binary-valued plane that holds the weights
+ * @param ix the input's x dimension
+ * @param iy the input's y dimensions
+ * @param wx the weight's x dimension
+ * @param wy the weight's y dimension
+ * @param pad the amount of padding applied to input. (ix+2*pad is the x dimension of the input
+ * @param stride NOP. TODO: implement stride. the stride between sliding windows
+ * @return the count of all overlapping set bits between the two volumes.
+ */
+void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride);
+
+/** @brief Performs a binary dot product (XNOR and POPCOUNT) for two equal sized volumes.
+ *
+ * @param a A 3D binary tensor
+ * @param b A 3D binary tensor
+ * @param vdim the dimensionality of the data. Note: we pack 32 elements in the Z element.
+ * @return the count of all overlapping set bits between the two volumes.
+ */
+int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim);
+
+/** @brief Calculates the alpha plane given an alpha volume.
+ *
+ * Each point in the yz alpha plane
+ * is the average sum of the absolute value of all elements in the z-direction.
+ *
+ * Pre-conditions:
+ * alpha_volume is an array of size x*y*z.
+ * alpha_plane is an array of size x*y.
+ * alpha_volume (x,y,z) is transposed to (z,x,y).
+ *
+ * @param alpha_plane The 2D real-valued output plane
+ * @param alpha_volume The 3D real-valued output volume
+ * @param vdim the dimensionality of alpha_volume.
+ */
+void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim);
+
+/** @brief Wrapper function for generating the beta scaling factor */
+void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim);
+
+/** @brief Set the bit in a binary word */
+void ai2_bitset(BINARY_WORD *bword, unsigned int position);
+
+/** @brief Checks that the bit is set in a binary word */
+int ai2_is_set(BINARY_WORD bword, unsigned int position) ;
+
+/** @brief Converts a 3D float tensor into a 3D binary tensor.
+ *
+ * The value of the ith element in the binary tensor is the sign
+ * of the ith element in the floating tensor.
+ *
+ * @param binary_vol the binary tensor
+ * @param real_vol the real tensor
+ * @param vdim the size of the 3D tensor
+ */
+void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 vdim) ;
+
+/** @brief Converts a 3D binary tensor into a 3D float tensor.
+ *
+ * The ith float element will be '1' if the ith binary element is '1'.
+ * Otherwise, the float element will be '-1'.
+ *
+ * @param real_vol the output real tensor
+ * @param binary_vol the input binary tensor
+ * @param vdim the dimension of both binary_vol and real_vol
+ */
+void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 vdim);
+
+/** @brief Performs a pointwise matrix multication between two 2D tensors
+ * @param output A 2D real-valued plane to store the outputs
+ * @param input A 2D binary-valued plane that holds the inputs
+ * @param N the number of elements between the arrays
+ */
+void ai2_pointwise_mul_mm(float *output, const float *input, int N);
+
+/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors
+ *
+ * Pre-conditions: wx < ix, and wy < iy
+ *
+ * @param output A 2D real-valued plane of size ix, iy
+ * @param alpha A 2D binary-valued plane of size wx, wy
+ * @param ix the output's x dimension
+ * @param iy the output's y dimensions
+ * @param wx the alpha's x dimension
+ * @param wy the alpha's y dimension
+ * @param pad how many cells are padded, adds 2*pad to the borders of the image
+ */
+void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad);
+
+// --------------------------------------
+// SETTER FUNCTIONS
+// --------------------------------------
+/** @brief Safe function to set the float input of a conv_layer
+ */
+void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input);
+
+/** @brief Safe function to set the binary input of a conv_layer
+ */
+void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input);
+
+/** @brief Safe function to set the binary weights of a conv_layer
+ */
+void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights);
+
+/** @brief Safe function to set the binary weights of a conv_layer
+ */
+void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights);
+
+/** @brief Safe function to set the binary outputs of a conv_layer
+ */
+void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output);
+
+/** @brief Safe function to set the binary outputs of a conv_layer
+ */
+void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output);
+
+/** @brief Safe function to set the alpha of a conv_layer
+ */
+void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha);
+
+/** @brief Safe function to set the beta of a conv_layer
+ */
+void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta);
+
+/** @brief Safe function to set the new_beta of a conv_layer
+ */
+void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta);
+
+// --------------------------------------
+// GETTER FUNCTIONS
+// --------------------------------------
+/** @brief Safe function to get the float outputs of a conv_layer
+ */
+float * ai2_getFltOutput(ai2_bin_conv_layer *layer);
+
+/** @brief 3D tranpose from (x,y,z) to (z,y,x)
+ * @return a new pointer with the transposed matrix
+ */
+void ai2_transpose3D(float *data, dim3 d);
+
+/** @brief Checks if a float is a whole number (e.g., an int)
+ */
+int ai2_isFloatWhole(float f);
+
+/* @brief Allocates all memory objects in an ai2_bin_conv_layer
+ * b - batches (number of filter batches)
+ * c - input channels
+ * ix - input width
+ * iy - input height
+ * wx - weight/filter width
+ * wy - weight/filter height
+ * s - stride between sliding windows
+ * pad - the amount of padding
+ */
+ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad);
+
+/* @brief Safe deallocation of all memory objects in an ai2_bin_conv_layer
+ */
+void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer);
+
+/* @brief Given real-valued filter data and a conv layer, performs a forward pass
+ */
+void ai2_bin_forward(ai2_bin_conv_layer *layer);
+
+#endif
diff --git a/src/common.c b/src/common.c
new file mode 100644
index 0000000..9d59ee8
--- /dev/null
+++ b/src/common.c
@@ -0,0 +1,81 @@
+#include "common.h"
+
+// Returns the time in ms
+double getElapsedTime(Timer *timer) {
+ // Calculate time it took in seconds
+ double accum_ms = ( timer->requestEnd.tv_sec - timer->requestStart.tv_sec )
+ + ( timer->requestEnd.tv_nsec - timer->requestStart.tv_nsec )
+ / 1e6;
+ return accum_ms;
+}
+
+void start_timer(Timer *timer) {
+ clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestStart));
+}
+
+void stop_timer(Timer *timer) {
+ clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestEnd));
+}
+
+
+BINARY_WORD * mallocBinaryVolume(dim3 vol) {
+ return (BINARY_WORD *) malloc (vol.x * vol.y * vol.z / BITS_PER_BINARY_WORD * sizeof(BINARY_WORD));
+}
+
+float * mallocFloatVolume(dim3 vol) {
+ return (float *) malloc (vol.x * vol.y * vol.z * sizeof(float));
+}
+
+// Returns the size (in bytes) of a binary array with dimensions stored in conv_args
+double getSizeBytesBinaryArray(dim3 conv_args) {
+ return conv_args.x * conv_args.y * conv_args.z * sizeof(BINARY_WORD) / (BITS_PER_BINARY_WORD);
+}
+
+
+ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz) {
+ ConvolutionArgs conv_args;
+ // Input Volume
+ conv_args.input.x = ix; // x == y for a square face
+ conv_args.input.y = iy;
+ conv_args.input.z = iz;
+ conv_args.weights.x = wx; // x == y for square face
+ conv_args.weights.y = wy;
+ conv_args.weights.z = wz;
+
+ // <!-- DO NOT MODIFY -->
+ // Intermediate Volumes
+ conv_args.alpha_plane.x = conv_args.weights.x;
+ conv_args.alpha_plane.y = conv_args.weights.y;
+ conv_args.alpha_plane.z = 1;
+
+ conv_args.beta_plane.x = 1;
+ conv_args.beta_plane.y = conv_args.input.y;
+ conv_args.beta_plane.z = conv_args.input.z;
+
+ conv_args.gamma_plane.x = conv_args.input.x * conv_args.weights.x;
+ conv_args.gamma_plane.y = conv_args.input.y * conv_args.weights.y;
+ conv_args.gamma_plane.z = 1;
+
+ conv_args.zeta_plane.x = conv_args.gamma_plane.x;
+ conv_args.zeta_plane.y = conv_args.gamma_plane.y;
+ conv_args.zeta_plane.z = 1;
+
+ // Output Volume
+ conv_args.output.x = conv_args.input.x;
+ conv_args.output.y = conv_args.input.y;
+ conv_args.output.z = 1; // Output should be a 2D plane
+
+ // Verify dimensions
+ //assert(conv_args.weights.x % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
+// assert(conv_args.weights.y % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
+ assert(conv_args.weights.z % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
+ //assert(conv_args.input.x % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
+// assert(conv_args.input.y % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
+ assert(conv_args.input.z % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints
+ assert(conv_args.weights.x <= conv_args.input.x);
+ assert(conv_args.weights.y <= conv_args.input.y);
+ assert(conv_args.weights.z <= conv_args.input.z);
+ // <!-- DO NOT MODIFY -->
+
+ return conv_args;
+}
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..bad428d
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,50 @@
+#ifndef AI2_COMMON_H
+#define AI2_COMMON_H
+
+#include <time.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <limits.h>
+#include <tgmath.h>
+#include <unistd.h>
+#include <stdint.h>
+//#include <gperftools/profiler.h>
+#include <sys/time.h>
+
+typedef uint32_t BINARY_WORD;
+#define BITS_PER_BINARY_WORD (sizeof(BINARY_WORD) * CHAR_BIT)
+
+typedef struct{
+ struct timespec requestStart;
+ struct timespec requestEnd;
+} Timer;
+
+typedef struct {
+ size_t x;
+ size_t y;
+ size_t z;
+} dim3;
+
+typedef struct {
+ dim3 weights;
+ dim3 input;
+ dim3 output;
+ dim3 alpha_plane;
+ dim3 beta_plane;
+ dim3 gamma_plane;
+ dim3 zeta_plane;
+} ConvolutionArgs;
+
+// Timer stuff
+double getElapsedTime(Timer *timer); // Returns the time in ms
+void start_timer(Timer *timer);
+void stop_timer(Timer *timer);
+
+BINARY_WORD * mallocBinaryVolume(dim3 vol);
+float * mallocFloatVolume(dim3 vol);
+ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz);
+double getSizeBytesBinaryArray(dim3 conv_args);
+
+#endif
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index c377802..f0c312c 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -8,6 +8,10 @@
#include <stdio.h>
#include <time.h>
+#ifndef AI2
+#define AI2 0
+#endif
+
void swap_binary(convolutional_layer *l)
{
float *swap = l->filters;
@@ -21,24 +25,6 @@
#endif
}
-void binarize_filters2(float *filters, int n, int size, char *binary, float *scales)
-{
- int i, k, f;
- for(f = 0; f < n; ++f){
- float mean = 0;
- for(i = 0; i < size; ++i){
- mean += fabs(filters[f*size + i]);
- }
- mean = mean / size;
- scales[f] = mean;
- for(i = 0; i < size/8; ++i){
- binary[f*size + i] = (filters[f*size + i] > 0) ? 1 : 0;
- for(k = 0; k < 8; ++k){
- }
- }
- }
-}
-
void binarize_filters(float *filters, int n, int size, float *binary)
{
int i, f;
@@ -54,6 +40,21 @@
}
}
+void binarize_input(float *input, int n, int size, float *binary)
+{
+ int i, s;
+ for(s = 0; s < size; ++s){
+ float mean = 0;
+ for(i = 0; i < n; ++i){
+ mean += fabs(input[i*size + s]);
+ }
+ mean = mean / n;
+ for(i = 0; i < n; ++i){
+ binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
+ }
+ }
+}
+
int convolutional_out_height(convolutional_layer l)
{
int h = l.h;
@@ -89,7 +90,7 @@
}
size_t get_workspace_size(layer l){
- #ifdef CUDNN
+#ifdef CUDNN
size_t most = 0;
size_t s = 0;
cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
@@ -117,9 +118,9 @@
&s);
if (s > most) most = s;
return most;
- #else
+#else
return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
- #endif
+#endif
}
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor)
@@ -133,6 +134,7 @@
l.c = c;
l.n = n;
l.binary = binary;
+ l.xnor = xnor;
l.batch = batch;
l.stride = stride;
l.size = size;
@@ -164,6 +166,10 @@
l.cfilters = calloc(c*n*size*size, sizeof(char));
l.scales = calloc(n, sizeof(float));
}
+ if(xnor){
+ l.binary_filters = calloc(c*n*size*size, sizeof(float));
+ l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
+ }
if(batch_normalize){
l.scales = calloc(n, sizeof(float));
@@ -199,7 +205,6 @@
l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size);
l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
}
- l.xnor = xnor;
if(batch_normalize){
l.mean_gpu = cuda_make_array(l.mean, n);
@@ -325,7 +330,7 @@
l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n);
l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
- #ifdef CUDNN
+#ifdef CUDNN
cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
@@ -359,7 +364,7 @@
CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
0,
&l->bf_algo);
- #endif
+#endif
#endif
l->workspace_size = get_workspace_size(*l);
}
@@ -404,7 +409,9 @@
int out_w = convolutional_out_width(l);
int i;
+
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
+
/*
if(l.binary){
binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
@@ -413,44 +420,59 @@
}
*/
-/*
- if(l.binary){
- int m = l.n;
- int k = l.size*l.size*l.c;
- int n = out_h*out_w;
+ /*
+ if(l.binary){
+ int m = l.n;
+ int k = l.size*l.size*l.c;
+ int n = out_h*out_w;
- char *a = l.cfilters;
+ char *a = l.cfilters;
+ float *b = state.workspace;
+ float *c = l.output;
+
+ for(i = 0; i < l.batch; ++i){
+ im2col_cpu(state.input, l.c, l.h, l.w,
+ l.size, l.stride, l.pad, b);
+ gemm_bin(m,n,k,1,a,k,b,n,c,n);
+ c += n*m;
+ state.input += l.c*l.h*l.w;
+ }
+ scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
+ add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
+ activate_array(l.output, m*n*l.batch, l.activation);
+ return;
+ }
+ */
+
+ if(l.xnor && (l.c%32 != 0 || !AI2)){
+ binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
+ swap_binary(&l);
+ for(i = 0; i < l.batch; ++i){
+ binarize_input(state.input + i*l.inputs, l.c, l.h*l.w, l.binary_input + i*l.inputs);
+ }
+ state.input = l.binary_input;
+ }
+
+ int m = l.n;
+ int k = l.size*l.size*l.c;
+ int n = out_h*out_w;
+
+ if (l.xnor && l.c%32 == 0 && AI2) {
+ forward_xnor_layer(l, state);
+ printf("xnor\n");
+ } else {
+
+ float *a = l.filters;
float *b = state.workspace;
float *c = l.output;
for(i = 0; i < l.batch; ++i){
im2col_cpu(state.input, l.c, l.h, l.w,
l.size, l.stride, l.pad, b);
- gemm_bin(m,n,k,1,a,k,b,n,c,n);
+ gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
c += n*m;
state.input += l.c*l.h*l.w;
}
- scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
- add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
- activate_array(l.output, m*n*l.batch, l.activation);
- return;
- }
- */
-
- int m = l.n;
- int k = l.size*l.size*l.c;
- int n = out_h*out_w;
-
- float *a = l.filters;
- float *b = state.workspace;
- float *c = l.output;
-
- for(i = 0; i < l.batch; ++i){
- im2col_cpu(state.input, l.c, l.h, l.w,
- l.size, l.stride, l.pad, b);
- gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
- c += n*m;
- state.input += l.c*l.h*l.w;
}
if(l.batch_normalize){
@@ -459,6 +481,7 @@
add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
activate_array(l.output, m*n*l.batch, l.activation);
+ if(l.binary || l.xnor) swap_binary(&l);
}
void backward_convolutional_layer(convolutional_layer l, network_state state)
diff --git a/src/layer.h b/src/layer.h
index d2250a6..0c457f7 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -167,6 +167,8 @@
float *r_cpu;
float *h_cpu;
+ float *binary_input;
+
size_t workspace_size;
#ifdef GPU
diff --git a/src/parser.c b/src/parser.c
index d12b5c1..71f54cc 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -1021,7 +1021,6 @@
}
}
}
- binarize_filters2(l.filters, l.n, l.c*l.size*l.size, l.cfilters, l.scales);
#ifdef GPU
if(gpu_index >= 0){
push_convolutional_layer(l);
@@ -1046,7 +1045,7 @@
if (l.flipped) {
transpose_matrix(l.filters, l.c*l.size*l.size, l.n);
}
- if (l.binary) binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.filters);
+ //if (l.binary) binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.filters);
#ifdef GPU
if(gpu_index >= 0){
push_convolutional_layer(l);
diff --git a/src/xnor_layer.c b/src/xnor_layer.c
new file mode 100644
index 0000000..6c5e9b8
--- /dev/null
+++ b/src/xnor_layer.c
@@ -0,0 +1,86 @@
+#include "xnor_layer.h"
+#include "binary_convolution.h"
+#include "convolutional_layer.h"
+
+layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize)
+{
+ int i;
+ layer l = {0};
+ l.type = XNOR;
+
+ l.h = h;
+ l.w = w;
+ l.c = c;
+ l.n = n;
+ l.batch = batch;
+ l.stride = stride;
+ l.size = size;
+ l.pad = pad;
+ l.batch_normalize = batch_normalize;
+
+ l.filters = calloc(c*n*size*size, sizeof(float));
+ l.biases = calloc(n, sizeof(float));
+
+ int out_h = convolutional_out_height(l);
+ int out_w = convolutional_out_width(l);
+ l.out_h = out_h;
+ l.out_w = out_w;
+ l.out_c = n;
+ l.outputs = l.out_h * l.out_w * l.out_c;
+ l.inputs = l.w * l.h * l.c;
+
+ l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
+
+ if(batch_normalize){
+ l.scales = calloc(n, sizeof(float));
+ for(i = 0; i < n; ++i){
+ l.scales[i] = 1;
+ }
+
+ l.mean = calloc(n, sizeof(float));
+ l.variance = calloc(n, sizeof(float));
+
+ l.rolling_mean = calloc(n, sizeof(float));
+ l.rolling_variance = calloc(n, sizeof(float));
+ }
+
+ l.activation = activation;
+
+ fprintf(stderr, "XNOR Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
+
+ return l;
+}
+
+void forward_xnor_layer(const layer l, network_state state)
+{
+ int b = l.n;
+ int c = l.c;
+ int ix = l.w;
+ int iy = l.h;
+ int wx = l.size;
+ int wy = l.size;
+ int s = l.stride;
+ int pad = l.pad * (l.size/2);
+
+ // MANDATORY: Make the binary layer
+ ai2_bin_conv_layer al = ai2_make_bin_conv_layer(b, c, ix, iy, wx, wy, s, pad);
+
+ // OPTIONAL: You need to set the real-valued input like:
+ ai2_setFltInput(&al, state.input);
+ // The above function will automatically binarize the input for the layer (channel wise).
+ // If commented: using the default 0-valued input.
+
+ ai2_setFltWeights(&al, l.filters);
+ // The above function will automatically binarize the input for the layer (channel wise).
+ // If commented: using the default 0-valued weights.
+
+ // MANDATORY: Call forward
+ ai2_bin_forward(&al);
+
+ // OPTIONAL: Inspect outputs
+ float *output = ai2_getFltOutput(&al); // output is of size l.px * l.py where px and py are the padded outputs
+
+ memcpy(l.output, output, l.outputs*sizeof(float));
+ // MANDATORY: Free layer
+ ai2_free_bin_conv_layer(&al);
+}
diff --git a/src/xnor_layer.h b/src/xnor_layer.h
new file mode 100644
index 0000000..f1c5b68
--- /dev/null
+++ b/src/xnor_layer.h
@@ -0,0 +1,11 @@
+#ifndef XNOR_LAYER_H
+#define XNOR_LAYER_H
+
+#include "layer.h"
+#include "network.h"
+
+layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalization);
+void forward_xnor_layer(const layer l, network_state state);
+
+#endif
+
diff --git a/src/yolo.c b/src/yolo.c
index 9c3999e..2b99935 100644
--- a/src/yolo.c
+++ b/src/yolo.c
@@ -346,8 +346,8 @@
if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms);
//draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, voc_labels, 20);
draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, voc_labels, 20);
- show_image(im, "predictions");
save_image(im, "predictions");
+ show_image(im, "predictions");
show_image(sized, "resized");
free_image(im);
--
Gitblit v1.10.0