4 files modified
8 files added
| New file |
| | |
| | | GPU=0 |
| | | CUDNN=0 |
| | | OPENCV=0 |
| | | DEBUG=0 |
| | | AI2=1 |
| | | |
| | | ARCH= --gpu-architecture=compute_52 --gpu-code=compute_52 |
| | | |
| | | VPATH=./src/ |
| | | EXEC=darknet |
| | | OBJDIR=./obj/ |
| | | |
| | | CC=gcc -std=gnu11 |
| | | NVCC=nvcc |
| | | OPTS=-Ofast |
| | | LDFLAGS= -lm -pthread |
| | | COMMON= |
| | | CFLAGS=-Wall -Wfatal-errors |
| | | |
| | | ifeq ($(DEBUG), 1) |
| | | OPTS=-O0 -g |
| | | endif |
| | | |
| | | CFLAGS+=$(OPTS) |
| | | |
| | | ifeq ($(OPENCV), 1) |
| | | COMMON+= -DOPENCV |
| | | CFLAGS+= -DOPENCV |
| | | LDFLAGS+= `pkg-config --libs opencv` |
| | | COMMON+= `pkg-config --cflags opencv` |
| | | endif |
| | | |
| | | ifeq ($(AI2), 1) |
| | | COMMON+= -DAI2 |
| | | CFLAGS+= -DAI2 |
| | | endif |
| | | |
| | | ifeq ($(GPU), 1) |
| | | COMMON+= -DGPU -I/usr/local/cuda/include/ |
| | | CFLAGS+= -DGPU |
| | | LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand |
| | | endif |
| | | |
| | | ifeq ($(CUDNN), 1) |
| | | COMMON+= -DCUDNN |
| | | CFLAGS+= -DCUDNN |
| | | LDFLAGS+= -lcudnn |
| | | endif |
| | | |
| | | OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o coco_demo.o tag.o cifar.o yolo_demo.o go.o batchnorm_layer.o art.o xnor_layer.o common.o binary_convolution.o |
| | | ifeq ($(GPU), 1) |
| | | LDFLAGS+= -lstdc++ |
| | | OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o |
| | | endif |
| | | |
| | | OBJS = $(addprefix $(OBJDIR), $(OBJ)) |
| | | DEPS = $(wildcard src/*.h) Makefile |
| | | |
| | | all: obj results $(EXEC) |
| | | |
| | | $(EXEC): $(OBJS) |
| | | $(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) |
| | | |
| | | $(OBJDIR)%.o: %.c $(DEPS) |
| | | $(CC) $(COMMON) $(CFLAGS) -c $< -o $@ |
| | | |
| | | $(OBJDIR)%.o: %.cu $(DEPS) |
| | | $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@ |
| | | |
| | | obj: |
| | | mkdir -p obj |
| | | results: |
| | | mkdir -p results |
| | | |
| | | .PHONY: clean |
| | | |
| | | clean: |
| | | rm -rf $(OBJS) $(EXEC) |
| | | |
| New file |
| | |
| | | [net] |
| | | batch=1 |
| | | subdivisions=1 |
| | | height=448 |
| | | width=448 |
| | | channels=3 |
| | | momentum=0.9 |
| | | decay=0.0005 |
| | | |
| | | learning_rate=0.0001 |
| | | policy=steps |
| | | steps=20,40,60,80,20000,30000 |
| | | scales=5,5,2,2,.1,.1 |
| | | max_batches = 40000 |
| | | |
| | | [crop] |
| | | crop_width=448 |
| | | crop_height=448 |
| | | flip=0 |
| | | angle=0 |
| | | saturation = 1.5 |
| | | exposure = 1.5 |
| | | noadjust=1 |
| | | |
| | | [convolutional] |
| | | batch_normalize=1 |
| | | filters=16 |
| | | size=3 |
| | | stride=1 |
| | | pad=1 |
| | | activation=leaky |
| | | |
| | | [maxpool] |
| | | size=2 |
| | | stride=2 |
| | | |
| | | [batchnorm] |
| | | |
| | | [convolutional] |
| | | xnor = 1 |
| | | batch_normalize=1 |
| | | filters=32 |
| | | size=3 |
| | | stride=1 |
| | | pad=1 |
| | | activation=leaky |
| | | |
| | | [maxpool] |
| | | size=2 |
| | | stride=2 |
| | | |
| | | [batchnorm] |
| | | |
| | | [convolutional] |
| | | xnor = 1 |
| | | batch_normalize=1 |
| | | filters=64 |
| | | size=3 |
| | | stride=1 |
| | | pad=1 |
| | | activation=leaky |
| | | |
| | | [maxpool] |
| | | size=2 |
| | | stride=2 |
| | | |
| | | [batchnorm] |
| | | |
| | | [convolutional] |
| | | xnor = 1 |
| | | batch_normalize=1 |
| | | filters=128 |
| | | size=3 |
| | | stride=1 |
| | | pad=1 |
| | | activation=leaky |
| | | |
| | | [maxpool] |
| | | size=2 |
| | | stride=2 |
| | | |
| | | [batchnorm] |
| | | |
| | | [convolutional] |
| | | xnor = 1 |
| | | batch_normalize=1 |
| | | filters=256 |
| | | size=3 |
| | | stride=1 |
| | | pad=1 |
| | | activation=leaky |
| | | |
| | | [maxpool] |
| | | size=2 |
| | | stride=2 |
| | | |
| | | [batchnorm] |
| | | |
| | | [convolutional] |
| | | xnor = 1 |
| | | batch_normalize=1 |
| | | filters=512 |
| | | size=3 |
| | | stride=1 |
| | | pad=1 |
| | | activation=leaky |
| | | |
| | | [maxpool] |
| | | size=2 |
| | | stride=2 |
| | | |
| | | [batchnorm] |
| | | |
| | | [convolutional] |
| | | batch_normalize=1 |
| | | filters=1024 |
| | | size=3 |
| | | stride=1 |
| | | pad=1 |
| | | activation=leaky |
| | | |
| | | [convolutional] |
| | | batch_normalize=1 |
| | | size=3 |
| | | stride=1 |
| | | pad=1 |
| | | filters=128 |
| | | activation=leaky |
| | | |
| | | [connected] |
| | | output= 1470 |
| | | activation=linear |
| | | |
| | | [detection] |
| | | classes=20 |
| | | coords=4 |
| | | rescore=1 |
| | | side=7 |
| | | num=2 |
| | | softmax=0 |
| | | sqrt=1 |
| | | jitter=.2 |
| | | |
| | | object_scale=1 |
| | | noobject_scale=.5 |
| | | class_scale=1 |
| | | coord_scale=5 |
| | | |
| New file |
| | |
| | | #include "binary_convolution.h" |
| | | |
| | | int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim) { // TODO unroll |
| | | int accumulator = 0; |
| | | for (int z = 0; z < vdim.z / BITS_PER_BINARY_WORD; z++) { |
| | | for (int y = 0; y < vdim.y; y++) { |
| | | for (int x = 0; x < vdim.x; x++) { |
| | | int idx = z*vdim.y*vdim.x + y*vdim.x + x; |
| | | accumulator += __builtin_popcount(~(a[idx] ^ b[idx])); // count the XNOR of the two bit vectors |
| | | } |
| | | } |
| | | } |
| | | |
| | | return accumulator; |
| | | } |
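ai2_bin_dp counts matching bits via XNOR + popcount. For sign vectors packed one bit per element (bit set for +1, clear for -1), that count E relates to the ordinary +/-1 dot product by dot = 2E - N over N bits. A minimal sketch verifying the identity on a single 32-bit word, assuming a GCC/Clang-style __builtin_popcount as used above:

```c
#include <assert.h>
#include <stdint.h>

/* Reference: expand one packed word back to +/-1 elements and take the dot product. */
static int signed_dot_one_word(uint32_t a, uint32_t b) {
    int dot = 0;
    for (int i = 0; i < 32; ++i) {
        int ai = ((a >> i) & 1) ? 1 : -1;
        int bi = ((b >> i) & 1) ? 1 : -1;
        dot += ai * bi;
    }
    return dot;
}

int main(void) {
    uint32_t a = 0xA5A5A5A5u, b = 0x0F0F0F0Fu;     /* arbitrary test patterns */
    int xnor_pop = __builtin_popcount(~(a ^ b));   /* number of matching bits */
    assert(signed_dot_one_word(a, b) == 2 * xnor_pop - 32);
    return 0;
}
```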
| | | |
| | | /** |
| | | * Pre-conditions: |
| | | * alpha_volume is an array of size x*y*z. |
| | | * alpha_plane is an array of size x*y. |
| | | * alpha_volume (x,y,z) is transposed to (z,x,y). |
| | | */ |
| | | void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim) { |
| | | for (int y = 0; y < vdim.y; ++y) { |
| | | for (int x = 0; x < vdim.x; ++x) { |
| | | int out = y * vdim.x + x; |
| | | double accum = 0.0; |
| | | for (int z = 0; z < vdim.z; ++z) { |
| | | accum += alpha_volume[out * vdim.z + z]; |
| | | } |
| | | |
| | | alpha_plane[out] = accum / vdim.z; |
| | | } |
| | | } |
| | | } |
| | | |
| | | /** @brief Wrapper function for generating the beta scaling factor */ |
| | | void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim) { |
| | | ai2_calc_alpha(beta_plane, beta_volume, vdim); |
| | | } |
| | | |
| | | /** @brief Set the bit in a binary word */ |
| | | void ai2_bitset(BINARY_WORD *bword, unsigned int position) { |
| | | BINARY_WORD mask = (1 << position); |
| | | *bword = *bword | mask; |
| | | } |
| | | |
| | | /** @brief Checks whether a bit is set in a binary word */
| | | int ai2_is_set(BINARY_WORD bword, unsigned int position) { |
| | | unsigned int position_complement = (BITS_PER_BINARY_WORD - 1) - position; // number of leading bits before the bit position of interest |
| | | bword = (bword << position_complement); // zero out leading bits |
| | | bword = (bword >> (BITS_PER_BINARY_WORD - 1)); // shift bit position of interest to the 0th position |
| | | return (bword & 0x1); // test if bit position of interest is set |
| | | } |
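A quick illustration of the two bit helpers above, assuming the 32-bit BINARY_WORD from common.h:

```c
#include <assert.h>
#include "binary_convolution.h"   /* BINARY_WORD, ai2_bitset, ai2_is_set */

int main(void) {
    BINARY_WORD w = 0;
    ai2_bitset(&w, 5);              /* set bit 5 */
    assert(ai2_is_set(w, 5) == 1);  /* the bit reads back as set */
    assert(ai2_is_set(w, 6) == 0);  /* other positions stay clear */
    return 0;
}
```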
| | | |
| | | void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 dim) { |
| | | ai2_transpose3D(real_vol, dim); // (x,y,z) -> (z,x,y) |
| | | |
| | | int sz = dim.x * dim.y * dim.z; |
| | | for (int i = 0; i < sz; i += BITS_PER_BINARY_WORD) { |
| | | BINARY_WORD tmp = 0x00000000; |
| | | for (int x = 0; x < BITS_PER_BINARY_WORD; ++x) { |
| | | int waddr = x + i; |
| | | if (signbit(real_vol[waddr]) == 0) |
| | | ai2_bitset(&tmp, (BITS_PER_BINARY_WORD - 1) - x); |
| | | } |
| | | binary_vol[i / BITS_PER_BINARY_WORD] = tmp; |
| | | } |
| | | } |
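ai2_flt_to_bin packs each group of 32 floats into one word, mapping element i to bit (31 - i) and encoding non-negative values as 1; note that it also transposes real_vol in place. A minimal sketch of the packing convention, using a 32x1x1 volume so the transpose is a no-op (illustrative, not part of this change):

```c
#include <assert.h>
#include "binary_convolution.h"   /* BINARY_WORD, dim3, ai2_flt_to_bin, ai2_is_set */

int main(void) {
    /* 32 floats -> one BINARY_WORD; element i lands in bit (31 - i),
       non-negative values encode as 1, negative values as 0. */
    float real[32];
    for (int i = 0; i < 32; ++i)
        real[i] = (i % 2 == 0) ? 1.0f : -1.0f;   /* even indices positive */

    BINARY_WORD packed = 0;
    dim3 dim = { 32, 1, 1 };                     /* transpose is a no-op for this shape */
    ai2_flt_to_bin(&packed, real, dim);

    assert(ai2_is_set(packed, 31 - 0) == 1);     /* real[0] >= 0 */
    assert(ai2_is_set(packed, 31 - 1) == 0);     /* real[1] <  0 */
    return 0;
}
```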
| | | |
| | | void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 dim) { // TODO unit tests |
| | | for (int z = 0; z < dim.z; z++) { |
| | | for (int y = 0; y < dim.y; y++) { |
| | | for (int x = 0; x < dim.x / BITS_PER_BINARY_WORD; x++) { // TODO boundary checks, for uneven input |
| | | BINARY_WORD word = binary_vol[z*dim.y*dim.x + y*dim.x + x]; |
| | | for (int t = 0; t < BITS_PER_BINARY_WORD; ++t) { |
| | | int oidx = z*dim.y*dim.x + y*dim.x + x * BITS_PER_BINARY_WORD + t; |
| | | if (ai2_is_set(word, t)) |
| | | real_vol[oidx] = 1.f; |
| | | else |
| | | real_vol[oidx] = -1.f; |
| | | } |
| | | } |
| | | } |
| | | } |
| | | |
| | | // Transpose channels back to output |
| | | ai2_transpose3D(real_vol, dim); // (z,y,x) -> (x,y,z) |
| | | } |
| | | |
| | | /** @brief Performs a 2D binary (XNOR) convolution; the input is assumed to be already padded.
| | | */
| | | void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride) { |
| | | |
| | | int r, rd, c, cd; |
| | | int wx_2 = wx / 2; |
| | | int wy_2 = wy / 2; |
| | | |
| | | // Indexing for output pixels. x = [wx_2, ix + wx_2 - 1], y = [wy_2, iy + wy_2 - 1] |
| | | int sx = pad; // start x |
| | | int ex = ix + pad - 1; // end x |
| | | int sy = pad; // start y |
| | | int ey = iy + pad - 1; // end y |
| | | |
| | | // Indexing for weights |
| | | int wsx, wex, wsy, wey; |
| | | if (wx % 2 == 1) { // odd weights |
| | | wsx = -wx_2; wex = wx_2 + 1; |
| | | wsy = -wy_2; wey = wy_2 + 1; |
| | | } |
| | | else { |
| | | wsx = -wx_2; wex = wx_2; |
| | | wsy = -wy_2; wey = wy_2; |
| | | } |
| | | |
| | | int px = ix + 2*pad; |
| | | //int py = iy + 2*pad; |
| | | |
| | | for (r = sy; r <= ey; ++r) { |
| | | for (c = sx; c <= ex; ++c) { |
| | | int accumulator = 0; |
| | | for (rd = wsy; rd < wey; ++rd) { |
| | | for (cd = wsx; cd < wex; ++cd) { |
| | | int iidx = (r+rd)*px + (c+cd); |
| | | BINARY_WORD pixel = input[iidx]; |
| | | //BINARY_WORD pixel = 0xFFFFFFFF; |
| | | //BINARY_WORD weight = 0xFFFFFFFF; |
| | | int widx = (rd + wy_2)*wx + (cd+wx_2); |
| | | BINARY_WORD weight = weights[widx]; |
| | | accumulator += __builtin_popcount(~(pixel ^ weight)); |
| | | } |
| | | } |
| | | |
| | | // Padded space |
| | | int oidx = r*px + c; |
| | | output[oidx] += (float) accumulator; |
| | | } |
| | | } |
| | | |
| | | //for (r = sy; r <= ey; ++r) { |
| | | // for (c = sx; c <= ex; ++c) { |
| | | // int accumulator = 0; |
| | | // for (rd = -wy_2; rd < wy_2; ++rd) { |
| | | // for (cd = -wx_2; cd < wx_2; ++cd) { |
| | | // int iidx = (r+rd)*px + (c+cd); |
| | | // BINARY_WORD pixel = input[iidx]; |
| | | // //BINARY_WORD pixel = 0xFFFFFFFF; |
| | | // //BINARY_WORD weight = 0xFFFFFFFF; |
| | | // int widx = (rd + wy_2)*wx + (cd+wx_2); |
| | | // BINARY_WORD weight = weights[widx]; |
| | | // accumulator += __builtin_popcount(~(pixel ^ weight)); |
| | | // } |
| | | // } |
| | | |
| | | // // Padded space |
| | | // int oidx = r*px + c; |
| | | // output[oidx] += (float) accumulator; |
| | | // } |
| | | //} |
| | | |
| | | //ai2_bin_conv_within_boundary(output, input, weights, ix, iy, wx, wy, stride); |
| | | //ai2_bin_conv_borders(output, input, weights, ix, iy, wx, wy, stride); |
| | | } |
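With every input and weight word set to all ones, each window contributes wx*wy words of 32 matching bits, so a 3x3 kernel accumulates 288 per output pixel. A small standalone check along those lines (sizes are illustrative, not part of this change):

```c
#include <assert.h>
#include "binary_convolution.h"   /* BINARY_WORD, ai2_bin_conv2D */

int main(void) {
    /* 4x4 input, 3x3 kernel, pad = 1, stride = 1; the padded plane is 6x6. */
    int ix = 4, iy = 4, wx = 3, wy = 3, pad = 1, stride = 1;
    int px = ix + 2 * pad;

    BINARY_WORD input[36], weights[9];   /* 6*6 padded input words, 3*3 weight words */
    float output[36] = { 0 };
    for (int i = 0; i < 36; ++i) input[i] = 0xFFFFFFFF;
    for (int i = 0; i < 9;  ++i) weights[i] = 0xFFFFFFFF;

    ai2_bin_conv2D(output, input, weights, ix, iy, wx, wy, pad, stride);

    /* Every valid output pixel sees wx*wy fully-matching words: 3 * 3 * 32 = 288. */
    assert(output[pad * px + pad] == 288.0f);
    return 0;
}
```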
| | | |
| | | void ai2_pointwise_mul_mm(float *output, const float *input, int N) { |
| | | int i = 0; |
| | | |
| | | while (i + 8 <= N) { |
| | | output[i+0] *= input[i+0]; |
| | | output[i+1] *= input[i+1]; |
| | | output[i+2] *= input[i+2]; |
| | | output[i+3] *= input[i+3]; |
| | | output[i+4] *= input[i+4]; |
| | | output[i+5] *= input[i+5]; |
| | | output[i+6] *= input[i+6]; |
| | | output[i+7] *= input[i+7]; |
| | | |
| | | i += 8; |
| | | } |
| | | |
| | | for (; i < N; ++i) // handle the leftover tail when N is not a multiple of 8
| | | output[i] *= input[i];
| | | } |
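A quick check of the element-wise multiply with a length that is not a multiple of 8, so both the unrolled body and the tail loop above are exercised (illustrative):

```c
#include <assert.h>
#include "binary_convolution.h"   /* ai2_pointwise_mul_mm */

int main(void) {
    /* N = 10: one unrolled pass of 8 plus a 2-element tail. */
    float out[10], in[10];
    for (int i = 0; i < 10; ++i) { out[i] = 2.0f; in[i] = (float)(i + 1); }

    ai2_pointwise_mul_mm(out, in, 10);

    for (int i = 0; i < 10; ++i)
        assert(out[i] == 2.0f * (i + 1));   /* every element, including the tail, is scaled */
    return 0;
}
```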
| | | |
| | | /** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors |
| | | * Pre-conditions: wx < ix, and wy < iy |
| | | */ |
| | | void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad) { |
| | | // Slower version |
| | | // for (int y = 0; y < iy; ++y) |
| | | // for (int x = 0; x < ix; x++) |
| | | // output[y*ix+x] *= input[(y % wy)*wx + (x % wx)]; |
| | | |
| | | // Stride prefetch optimized |
| | | for (int s = 0; s < wy; ++s) { // for each strip |
| | | const float *strip_ptr = &alpha[s*wx]; |
| | | for (int y = pad; y < pad + (iy / wy); ++y) { // for each tile row
| | | int stride = y*((ix+2*pad)*wy) + s*(ix+2*pad); |
| | | float *output_ptr = &output[stride]; |
| | | |
| | | for (int x = 0; x < ix; ++x) { |
| | | output_ptr[x] *= strip_ptr[x % wx]; |
| | | } |
| | | } |
| | | } |
| | | } |
| | | |
| | | void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input) { |
| | | if (new_input != NULL) { |
| | | if (layer->input != NULL) |
| | | free(layer->input); |
| | | layer->input = new_input; |
| | | |
| | | dim3 dim; |
| | | dim.x = layer->px; |
| | | dim.y = layer->py; |
| | | dim.z = layer->c; |
| | | |
| | | // Binarize input |
| | | ai2_flt_to_bin(layer->binary_input, layer->input, dim); |
| | | |
| | | float *new_beta = (float *) calloc (dim.x * dim.y, sizeof(float)); |
| | | ai2_setFltBeta(layer, new_beta); |
| | | |
| | | // layer->input is transposed to (z,x,y) already |
| | | ai2_calc_beta(layer->beta, layer->input, dim); |
| | | } |
| | | } |
| | | |
| | | void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input) { |
| | | if (new_input != NULL) { |
| | | if (layer->binary_input != NULL) |
| | | free(layer->binary_input); |
| | | layer->binary_input = new_input; |
| | | } |
| | | } |
| | | |
| | | void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights) { |
| | | if (new_weights != NULL) { |
| | | if (layer->weights != NULL) |
| | | free(layer->weights); |
| | | layer->weights = new_weights; |
| | | |
| | | dim3 dim; |
| | | dim.x = layer->wx; |
| | | dim.y = layer->wy; |
| | | dim.z = layer->c; |
| | | |
| | | ai2_flt_to_bin(layer->binary_weights, layer->weights, dim); |
| | | |
| | | // Calculate alpha |
| | | if (layer->alpha != NULL) |
| | | free(layer->alpha); |
| | | |
| | | layer->alpha = (float *) calloc (dim.x * dim.y, sizeof(float)); |
| | | // layer->weights is already transposed to (z,x,y) from ai2_flt_to_bin() |
| | | ai2_calc_alpha(layer->alpha, layer->weights, dim); |
| | | } |
| | | } |
| | | |
| | | void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights) { |
| | | if (new_weights != NULL) { |
| | | if (layer->binary_weights != NULL) |
| | | free(layer->binary_weights); |
| | | layer->binary_weights = new_weights; |
| | | } |
| | | } |
| | | |
| | | void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output) { |
| | | if (new_output != NULL) { |
| | | if (layer->output != NULL) |
| | | free(layer->output); |
| | | layer->output = new_output; |
| | | } |
| | | } |
| | | |
| | | void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output) { |
| | | if (new_output != NULL) { |
| | | if (layer->binary_output != NULL) |
| | | free(layer->binary_output); |
| | | layer->binary_output = new_output; |
| | | } |
| | | } |
| | | |
| | | void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha) { |
| | | if (new_alpha != NULL) { |
| | | if (layer->alpha != NULL) |
| | | free(layer->alpha); |
| | | layer->alpha = new_alpha; |
| | | } |
| | | } |
| | | |
| | | void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta) { |
| | | if (new_beta != NULL) { |
| | | if (layer->beta != NULL) |
| | | free(layer->beta); |
| | | layer->beta = new_beta; |
| | | } |
| | | } |
| | | |
| | | void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta) { |
| | | if (new_new_beta != NULL) { |
| | | if (layer->new_beta != NULL) |
| | | free(layer->new_beta); |
| | | layer->new_beta = new_new_beta; |
| | | } |
| | | } |
| | | |
| | | float* ai2_getFltOutput(ai2_bin_conv_layer *layer) { |
| | | //if (layer->output != NULL && layer->binary_output != NULL) { |
| | | if (layer->output != NULL) { |
| | | |
| | | // The idea here was that all intermediate states are stored in the binary output. |
| | | // Whenever the user needs the real-valued output, the conversion happens at this function call. |
| | | //dim3 dim; |
| | | //dim.x = layer->px; |
| | | //dim.y = layer->py; |
| | | //dim.z = layer->batch; |
| | | //ai2_bin_to_flt(layer->output, layer->binary_output, dim); |
| | | |
| | | return layer->output; |
| | | } |
| | | else |
| | | return NULL; |
| | | } |
| | | |
| | | void ai2_transpose3D(float *data, dim3 d) { |
| | | // Slow transpose for correctness |
| | | |
| | | // (x,y,z) becomes (z,x,y). Requires two transposes: |
| | | // (x,y,z) -> (x,z,y). |
| | | // (x,z,y) -> (z,x,y). |
| | | |
| | | // Intermediate buffer |
| | | float *new_data = (float *) calloc (d.x * d.y * d.z, sizeof(float)); |
| | | |
| | | // Transpose y and z axis. |
| | | // (x,y,z) -> (x,z,y); |
| | | for (int y = 0; y < d.y; ++y) { |
| | | for (int z = 0; z < d.z; ++z) { |
| | | for (int x = 0; x < d.x; ++x) { |
| | | new_data[y*d.x*d.z + z*d.x + x] = data[z*d.x*d.y + y*d.x + x]; |
| | | //new_data[z*d.y*d.x + y*d.x + x] = data[y*d.x*d.z + z*d.x + x]; |
| | | } |
| | | } |
| | | } |
| | | |
| | | // Transpose x and z axis. |
| | | // (x,z,y) -> (z,x,y) |
| | | for (int y = 0; y < d.y; ++y) { |
| | | for (int x = 0; x < d.x; ++x) { |
| | | for (int z = 0; z < d.z; ++z) { |
| | | data[y*d.z*d.x + x*d.z + z] = new_data[y*d.x*d.z + x + z*d.x]; |
| | | } |
| | | } |
| | | } |
| | | |
| | | free(new_data); |
| | | } |
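The transpose maps the (x,y,z) layout (x fastest) to (z,x,y) (z fastest), i.e. the element at index z*X*Y + y*X + x moves to index y*X*Z + x*Z + z. A small sketch confirming that mapping on a 2x2x2 volume (illustrative):

```c
#include <assert.h>
#include "binary_convolution.h"   /* dim3, ai2_transpose3D */

int main(void) {
    /* 2x2x2 volume stored (x,y,z): index = z*X*Y + y*X + x. */
    enum { X = 2, Y = 2, Z = 2 };
    float v[X * Y * Z];
    for (int z = 0; z < Z; ++z)
        for (int y = 0; y < Y; ++y)
            for (int x = 0; x < X; ++x)
                v[z*X*Y + y*X + x] = (float)(100*x + 10*y + z);

    dim3 d = { X, Y, Z };
    ai2_transpose3D(v, d);

    /* After the transpose the layout is (z,x,y): index = y*X*Z + x*Z + z. */
    for (int z = 0; z < Z; ++z)
        for (int y = 0; y < Y; ++y)
            for (int x = 0; x < X; ++x)
                assert(v[y*X*Z + x*Z + z] == (float)(100*x + 10*y + z));
    return 0;
}
```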
| | | |
| | | int ai2_isFloatWhole(float f) { // TODO unit test |
| | | return (ceilf(f) == f) ? 1 : 0; |
| | | } |
| | | |
| | | /* @brief Initialize and create all memory arrays for this layer |
| | | * b - batches (number of filter batches) |
| | | * c - input channels |
| | | * ix - input width |
| | | * iy - input height |
| | | * wx - weight/filter width |
| | | * wy - weight/filter height |
| | | * s - stride between sliding windows |
| | | * pad - the amount of padding |
| | | */ |
| | | ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad) { |
| | | // http://cs231n.github.io/convolutional-networks/ |
| | | // See: spatial arrangement section for determining what the output size will be |
| | | float output_size = ((float)(ix - wx + 2 * pad) / s) + 1; // float division so the whole-number check below is meaningful
| | | if (ai2_isFloatWhole(output_size) == 0) { |
| | | fprintf(stderr, "ERROR! conv layer of (b,c,ix,iy,s,pad) = (%d, %d, %d, %d, %d, %d) will give " |
| | | " invalid output dimension: %fx%f\n", b, c, ix, iy, s, pad, output_size, output_size); |
| | | exit(1); |
| | | } |
| | | |
| | | // TODO: Support strided output |
| | | if (s != 1) { |
| | | fprintf(stderr, "ERROR! Only stride values of 1 is supported\n"); |
| | | exit(1); |
| | | } |
| | | |
| | | // padded input size |
| | | int px = (int) ix + 2*pad; |
| | | int py = (int) iy + 2*pad; |
| | | |
| | | ai2_bin_conv_layer l = {0}; // initialize all to 0 |
| | | l.input = (float *) calloc (c * px * py, sizeof(float)); // is padded |
| | | l.binary_input = (BINARY_WORD *) calloc (c * px * py / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD)); // is padded |
| | | |
| | | dim3 dim; |
| | | dim.x = px; |
| | | dim.y = py; |
| | | dim.z = c; |
| | | ai2_flt_to_bin(l.binary_input, l.input, dim); |
| | | |
| | | l.weights = (float *) calloc (b * c * wx * wy, sizeof(float)); |
| | | l.binary_weights = (BINARY_WORD *) calloc (b * c * wx * wy / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD)); |
| | | |
| | | l.output = (float *) calloc (c * px * py, sizeof(float)); // is padded |
| | | l.new_beta = (float *) calloc(px * py, sizeof(float)); // is padded |
| | | |
| | | l.batch = b; |
| | | l.c = c; |
| | | l.h = iy; |
| | | l.w = ix; |
| | | l.stride = s; |
| | | l.pad = pad; |
| | | l.px = px; |
| | | l.py = py; |
| | | l.wx = wx; |
| | | l.wy = wy; |
| | | |
| | | // The following parameters are uninitialized and should be set elsewhere: |
| | | // l.beta - padded |
| | | // l.alpha - not padded |
| | | |
| | | return l; |
| | | } |
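The output-size check above uses the standard spatial-arrangement formula (ix - wx + 2*pad)/s + 1. A worked example with illustrative values, showing one geometry that passes and one that the layer would reject:

```c
#include <stdio.h>

int main(void) {
    /* Spatial-arrangement formula used by ai2_make_bin_conv_layer
       (values here are illustrative, not taken from the change). */
    int ix = 8, wx = 3, pad = 1, s = 1;
    float out = ((float)(ix - wx + 2 * pad) / s) + 1;
    printf("%f\n", out);   /* 8.000000 -> whole, so the geometry is accepted */

    s = 2;
    out = ((float)(ix - wx + 2 * pad) / s) + 1;
    printf("%f\n", out);   /* 4.500000 -> not whole, the layer would be rejected */
    return 0;
}
```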
| | | |
| | | void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer) { |
| | | if (layer->input) free (layer->input); |
| | | if (layer->binary_input) free(layer->binary_input); |
| | | if (layer->weights) free (layer->weights); |
| | | if (layer->binary_weights) free(layer->binary_weights); |
| | | if (layer->output) free(layer->output); |
| | | if (layer->binary_output) free (layer->binary_output); |
| | | if (layer->alpha) free(layer->alpha); |
| | | if (layer->beta) free(layer->beta); |
| | | if (layer->new_beta) free(layer->new_beta); |
| | | } |
| | | |
| | | void ai2_throw_error(char *str) { |
| | | fprintf(stderr, "ERROR: %s\n", str); |
| | | exit(1); |
| | | } |
| | | |
| | | void ai2_bin_forward(ai2_bin_conv_layer *l) { |
| | | if (l->input == NULL) ai2_throw_error("Input was not allocated and set in this layer"); |
| | | if (l->weights == NULL) ai2_throw_error("Weights was not allocated and set in this layer"); |
| | | if (l->output == NULL) ai2_throw_error("Output was not allocated and set in this layer"); |
| | | if (l->alpha == NULL) ai2_throw_error("Alpha was not allocated and set in this layer"); |
| | | if (l->beta == NULL) ai2_throw_error("Beta was not allocated and set in this layer"); |
| | | |
| | | if (l->c % 32 != 0) ai2_throw_error("Channel is not divisible by 32. Need to implement mask " |
| | | "before supporting arbitrary channel size. For now, " |
| | | "set the channel size to the nearest multiple of 32 " |
| | | "and ignore any ''extra'' channels unused."); |
| | | |
| | | l->c /= BITS_PER_BINARY_WORD; // each binary word packs BITS_PER_BINARY_WORD channels, so fewer channel iterations are needed
| | | |
| | | float *output = l->output; |
| | | float *alpha = l->alpha; |
| | | float *beta = l->beta; |
| | | int px = l->px; |
| | | int py = l->py; |
| | | BINARY_WORD *binary_weights = l->binary_weights; |
| | | |
| | | for (int z = 0; z < l->batch; ++z) { // for each filter map |
| | | BINARY_WORD *binary_input = l->binary_input; |
| | | for (int c = 0; c < l->c; ++c) { // for each input channel |
| | | ai2_bin_conv2D(output, binary_input, binary_weights, l->w, l->h, l->wx, l->wy, l->pad, l->stride); |
| | | binary_input += px*py; // increment with next 2D plane |
| | | binary_weights += l->wx*l->wy; // increment with next 2D plane |
| | | |
| | | ai2_pointwise_mul_mm(output, beta, px*py); |
| | | ai2_pointwise_mul_mm_2d(output, alpha, l->w, l->h, l->wx, l->wy, l->pad); |
| | | } |
| | | } |
| | | } |
| | | |
| | | // Deprecated |
| | | //double ai2_bin_conv_benchmark(ConvolutionArgs conv_args) { |
| | | // printf("Running Binary Convolution test!\n"); |
| | | // |
| | | // size_t ix, iy, iz, wx, wy, wz, L, stride; |
| | | // ix = conv_args.input.x; |
| | | // iy = conv_args.input.y; |
| | | // iz = conv_args.input.z; |
| | | // wx = conv_args.weights.x; |
| | | // wy = conv_args.weights.y; |
| | | // wz = conv_args.weights.z; |
| | | // L = BITS_PER_BINARY_WORD; |
| | | // stride = 1; |
| | | // |
| | | // printf("Input size (num elements, xyz): %zu %zu %zu\n", ix, iy, iz); |
| | | // printf("Weights size (num elements. xyz): %zu %zu %zu\n", wx, wy, wz); |
| | | // |
| | | // double sz_input_elements = ix * iy * iz; |
| | | // double sz_input_bytes = getSizeBytesBinaryArray(conv_args.input); |
| | | // double sz_weight_bytes = getSizeBytesBinaryArray(conv_args.weights); |
| | | // |
| | | // printf("Input Size (MB): %f\n", sz_input_bytes / (1 << 20)); |
| | | // printf("Weight Size (MB): %f\n", sz_weight_bytes / (1 << 20)); |
| | | // |
| | | // BINARY_WORD *binary_input = mallocBinaryVolume(conv_args.input); |
| | | // BINARY_WORD *binary_weights = mallocBinaryVolume(conv_args.weights); |
| | | // BINARY_WORD *b_input = binary_input; // alias |
| | | // BINARY_WORD *b_weight = binary_weights; // alias |
| | | // float *output = mallocFloatVolume(conv_args.output); |
| | | // float *output_ptr = output; |
| | | // float *beta = (float *) malloc(sizeof(float) * ix * iy); // we assume beta is given to us |
| | | // float *alpha = (float *) malloc(sizeof(float) * wx * wy); // we assume alpha is given to us |
| | | // float *new_output = mallocFloatVolume(conv_args.output); |
| | | // //float *new_output_ptr = new_output; |
| | | // float *new_beta = (float *) malloc(sizeof(float) * ix * iy); |
| | | // //float *new_beta_ptr = new_beta; |
| | | // |
| | | // // Scale number of computations because we're packing. |
| | | // // After this point, you should not have to reason about input dimensions for input and weights. |
| | | // iz /= BITS_PER_BINARY_WORD; |
| | | // wz /= BITS_PER_BINARY_WORD; |
| | | // |
| | | // // Calculate time taken by a request |
| | | // struct timeval start_time; |
| | | // gettimeofday(&start_time, NULL); |
| | | // |
| | | // // Preprocessing |
| | | // int pad = wx/2; |
| | | // |
| | | // for (int z = 0; z < iz; ++z) { // number of channels |
| | | // ai2_bin_conv2D(output_ptr, b_input, b_weight, ix, iy, wx, wy, pad, stride); |
| | | // b_input += ix*iy; // increment with next 2D plane |
| | | // b_weight += wx*wy; // increment with next 2D plane |
| | | // |
| | | // ai2_pointwise_mul_mm(output_ptr, beta, ix*iy); |
| | | // ai2_pointwise_mul_mm_2d(output_ptr, alpha, ix, iy, wx, wy, pad); |
| | | // } |
| | | // |
| | | // // copy to new array (need to wrap this around); TODO. |
| | | // struct timeval end_time; |
| | | // gettimeofday(&end_time, NULL); |
| | | // |
| | | // struct timeval diff_time; |
| | | // timersub(&end_time, &start_time, &diff_time); |
| | | // double time_conv_s = diff_time.tv_sec + diff_time.tv_usec * 1e-6; |
| | | // double time_conv_ms = time_conv_s * 1000.0; |
| | | // |
| | | // double model_ops = (3*ix*iy*wx*wy*wz/L) + 2*ix*iy + ix*iy*iz; |
| | | // double conv_ops_s = 1e-9 * model_ops / time_conv_s; |
| | | // double conv_bandwidth_gb_s = 1e-9 * sz_input_bytes / (time_conv_ms / 1000.0); |
| | | // double conv_bandwidth_gelement_s = 1e-9 * sz_input_elements / (time_conv_ms / 1000.0); |
| | | // |
| | | // printf("Execution Time (ms): %f\n", time_conv_ms); |
| | | // printf("Binary Convolution OPS/s (GOPS/s): %f\n", conv_ops_s); |
| | | // printf("Binary Convolution Bandwidth (GB/s): %f\n", conv_bandwidth_gb_s); |
| | | // printf("Binary Convolution Bandwidth (GElements/s): %f\n\n", conv_bandwidth_gelement_s); |
| | | // |
| | | // free(binary_input); |
| | | // free(binary_weights); |
| | | // free(output); |
| | | // free(beta); |
| | | // free(alpha); |
| | | // free(new_output); |
| | | // free(new_beta); |
| | | // |
| | | // return time_conv_ms; |
| | | //} |
| | | |
| | | // double ai2_bin_conv_benchmark(ConvolutionArgs conv_args); |
| | | |
| | | //void benchmark() { |
| | | // int ix, iy, iz, wx, wy, wz; |
| | | // iz = (1 << 9) * BITS_PER_BINARY_WORD; |
| | | // ix = 227; // x == y for square face |
| | | // iy = 227; |
| | | // wx = 3; // x == y for a square face |
| | | // wy = 3; |
| | | // wz = iz; |
| | | // |
| | | // int runs = 1; |
| | | // double accum_binary = 0; |
| | | // double accum_real = 0; |
| | | // ConvolutionArgs conv_args = initArgs(ix, iy, iz, wx, wy, wz); |
| | | // for (int i = 0; i < runs; ++i) { |
| | | // double t_binary_convolve = ai2_bin_conv_benchmark(conv_args); |
| | | // double t_real_convolve = run_convolve2D_real(conv_args); |
| | | // printf("t binary = %lf\n", t_binary_convolve); |
| | | // printf("t real = %lf\n", t_real_convolve); |
| | | // accum_binary += t_binary_convolve; |
| | | // accum_real += t_real_convolve; |
| | | // } |
| | | // |
| | | // accum_binary /= runs; |
| | | // accum_real /= runs; |
| | | // printf("Average convolution pass binary (ms): %lf\n", accum_binary); |
| | | // printf("Average convolution pass flt (ms): %lf\n", accum_real); |
| | | // printf("Speedup (Binary over Real): %lfx\n", accum_real / accum_binary); |
| | | // exit(1); |
| | | //} |
| New file |
| | |
| | | #ifndef AI2_BINARY_CONVOLUTION_H |
| | | #define AI2_BINARY_CONVOLUTION_H |
| | | |
| | | /** @file binary_convolution.h |
| | | * @brief Routines for approximating convolutions using binary operations
| | | * |
| | | * @author Carlo C. del Mundo (carlom) |
| | | * @date 05/23/2016 |
| | | */ |
| | | |
| | | #include <stdio.h> |
| | | #include <stdlib.h> |
| | | #include <inttypes.h> |
| | | #include <assert.h> |
| | | #include <limits.h> |
| | | #include <tgmath.h> |
| | | #include <unistd.h> |
| | | #include <stdint.h> |
| | | #include <string.h> |
| | | #include "common.h" |
| | | |
| | | typedef struct { |
| | | int batch; // number of filter batches |
| | | int c; // channels, z |
| | | int h; // height, y |
| | | int w; // width, x |
| | | int stride; |
| | | int pad; |
| | | |
| | | int px; // padded x (use this for striding in padded input and output arrays) |
| | | int py; // padded y (use this for striding in padded input and output arrays) |
| | | int wx; |
| | | int wy; |
| | | |
| | | float *input; // input values |
| | | BINARY_WORD *binary_input; |
| | | |
| | | float *weights; // weight or filter values |
| | | BINARY_WORD *binary_weights; |
| | | |
| | | float *output; // output values |
| | | BINARY_WORD *binary_output; |
| | | |
| | | float *alpha; // we assume alpha is calculated at the beginning of initialization |
| | | float *beta; // we assume beta is given to us |
| | | float *new_beta; // we calculate the new beta for the next layer |
| | | |
| | | struct ai2_bin_conv_layer *next; |
| | | } ai2_bin_conv_layer; |
| | | |
| | | /** @brief Performs a binary convolution using XNOR and POPCOUNT between input and weights |
| | | * |
| | | * @param output A 2D real-valued plane to store the outputs |
| | | * @param input A 2D binary-valued plane that holds the inputs |
| | | * @param weights A 2D binary-valued plane that holds the weights |
| | | * @param ix the input's x dimension |
| | | * @param iy the input's y dimensions |
| | | * @param wx the weight's x dimension |
| | | * @param wy the weight's y dimension |
| | | * @param pad the amount of padding applied to the input (ix + 2*pad is the x dimension of the padded input)
| | | * @param stride the stride between sliding windows (currently unused; TODO: implement stride)
| | | *
| | | * The XNOR popcount for each window is accumulated into the corresponding position of output.
| | | */ |
| | | void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride); |
| | | |
| | | /** @brief Performs a binary dot product (XNOR and POPCOUNT) for two equal sized volumes. |
| | | * |
| | | * @param a A 3D binary tensor |
| | | * @param b A 3D binary tensor |
| | | * @param vdim the dimensionality of the data; note that BITS_PER_BINARY_WORD (32) elements are packed per word along the z dimension.
| | | * @return the count of all overlapping set bits between the two volumes. |
| | | */ |
| | | int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim); |
| | | |
| | | /** @brief Calculates the alpha plane given an alpha volume. |
| | | * |
| | | * Each point in the xy alpha plane
| | | * is the average sum of the absolute value of all elements in the z-direction. |
| | | * |
| | | * Pre-conditions: |
| | | * alpha_volume is an array of size x*y*z. |
| | | * alpha_plane is an array of size x*y. |
| | | * alpha_volume (x,y,z) is transposed to (z,x,y). |
| | | * |
| | | * @param alpha_plane The 2D real-valued output plane |
| | | * @param alpha_volume The 3D real-valued input volume
| | | * @param vdim the dimensionality of alpha_volume. |
| | | */ |
| | | void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim); |
| | | |
| | | /** @brief Wrapper function for generating the beta scaling factor */ |
| | | void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim); |
| | | |
| | | /** @brief Set the bit in a binary word */ |
| | | void ai2_bitset(BINARY_WORD *bword, unsigned int position); |
| | | |
| | | /** @brief Checks whether a bit is set in a binary word */
| | | int ai2_is_set(BINARY_WORD bword, unsigned int position);
| | | |
| | | /** @brief Converts a 3D float tensor into a 3D binary tensor. |
| | | * |
| | | * The value of the ith element in the binary tensor is the sign |
| | | * of the ith element in the floating tensor. |
| | | * |
| | | * @param binary_vol the binary tensor |
| | | * @param real_vol the real tensor |
| | | * @param vdim the size of the 3D tensor |
| | | */ |
| | | void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 vdim) ; |
| | | |
| | | /** @brief Converts a 3D binary tensor into a 3D float tensor. |
| | | * |
| | | * The ith float element will be '1' if the ith binary element is '1'. |
| | | * Otherwise, the float element will be '-1'. |
| | | * |
| | | * @param real_vol the output real tensor |
| | | * @param binary_vol the input binary tensor |
| | | * @param vdim the dimension of both binary_vol and real_vol |
| | | */ |
| | | void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 vdim); |
| | | |
| | | /** @brief Performs a pointwise matrix multiplication between two 2D tensors
| | | * @param output A 2D real-valued plane to store the outputs
| | | * @param input A 2D real-valued plane that holds the inputs
| | | * @param N the number of elements in each array
| | | */ |
| | | void ai2_pointwise_mul_mm(float *output, const float *input, int N); |
| | | |
| | | /** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors |
| | | * |
| | | * Pre-conditions: wx < ix, and wy < iy |
| | | * |
| | | * @param output A 2D real-valued plane of size ix, iy |
| | | * @param alpha A 2D real-valued plane of size wx, wy
| | | * @param ix the output's x dimension |
| | | * @param iy the output's y dimensions |
| | | * @param wx the alpha's x dimension |
| | | * @param wy the alpha's y dimension |
| | | * @param pad how many cells are padded, adds 2*pad to the borders of the image |
| | | */ |
| | | void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad); |
| | | |
| | | // -------------------------------------- |
| | | // SETTER FUNCTIONS |
| | | // -------------------------------------- |
| | | /** @brief Safe function to set the float input of a conv_layer |
| | | */ |
| | | void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input); |
| | | |
| | | /** @brief Safe function to set the binary input of a conv_layer |
| | | */ |
| | | void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input); |
| | | |
| | | /** @brief Safe function to set the float weights of a conv_layer
| | | */ |
| | | void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights); |
| | | |
| | | /** @brief Safe function to set the binary weights of a conv_layer |
| | | */ |
| | | void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights); |
| | | |
| | | /** @brief Safe function to set the float outputs of a conv_layer
| | | */ |
| | | void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output); |
| | | |
| | | /** @brief Safe function to set the binary outputs of a conv_layer |
| | | */ |
| | | void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output); |
| | | |
| | | /** @brief Safe function to set the alpha of a conv_layer |
| | | */ |
| | | void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha); |
| | | |
| | | /** @brief Safe function to set the beta of a conv_layer |
| | | */ |
| | | void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta); |
| | | |
| | | /** @brief Safe function to set the new_beta of a conv_layer |
| | | */ |
| | | void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta); |
| | | |
| | | // -------------------------------------- |
| | | // GETTER FUNCTIONS |
| | | // -------------------------------------- |
| | | /** @brief Safe function to get the float outputs of a conv_layer |
| | | */ |
| | | float * ai2_getFltOutput(ai2_bin_conv_layer *layer); |
| | | |
| | | /** @brief In-place 3D transpose from (x,y,z) to (z,x,y)
| | | * @param data the volume to transpose; it is overwritten with the transposed layout
| | | */ |
| | | void ai2_transpose3D(float *data, dim3 d); |
| | | |
| | | /** @brief Checks if a float is a whole number (e.g., an int) |
| | | */ |
| | | int ai2_isFloatWhole(float f); |
| | | |
| | | /* @brief Allocates all memory objects in an ai2_bin_conv_layer |
| | | * b - batches (number of filter batches) |
| | | * c - input channels |
| | | * ix - input width |
| | | * iy - input height |
| | | * wx - weight/filter width |
| | | * wy - weight/filter height |
| | | * s - stride between sliding windows |
| | | * pad - the amount of padding |
| | | */ |
| | | ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad); |
| | | |
| | | /* @brief Safe deallocation of all memory objects in an ai2_bin_conv_layer |
| | | */ |
| | | void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer); |
| | | |
| | | /* @brief Given real-valued filter data and a conv layer, performs a forward pass |
| | | */ |
| | | void ai2_bin_forward(ai2_bin_conv_layer *layer); |
| | | |
| | | #endif |
| New file |
| | |
| | | #include "common.h" |
| | | |
| | | // Returns the time in ms |
| | | double getElapsedTime(Timer *timer) { |
| | | // Convert the elapsed time to milliseconds
| | | double accum_ms = ( timer->requestEnd.tv_sec - timer->requestStart.tv_sec ) * 1000.0
| | | + ( timer->requestEnd.tv_nsec - timer->requestStart.tv_nsec ) / 1e6;
| | | return accum_ms; |
| | | } |
| | | |
| | | void start_timer(Timer *timer) { |
| | | clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestStart)); |
| | | } |
| | | |
| | | void stop_timer(Timer *timer) { |
| | | clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestEnd)); |
| | | } |
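Typical use of the Timer helpers, as a minimal sketch (the measured work is a placeholder):

```c
#include <stdio.h>
#include "common.h"   /* Timer, start_timer, stop_timer, getElapsedTime */

int main(void) {
    Timer t;
    start_timer(&t);
    /* ... work being measured ... */
    stop_timer(&t);
    printf("elapsed: %f ms\n", getElapsedTime(&t));
    return 0;
}
```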
| | | |
| | | |
| | | BINARY_WORD * mallocBinaryVolume(dim3 vol) { |
| | | return (BINARY_WORD *) malloc (vol.x * vol.y * vol.z / BITS_PER_BINARY_WORD * sizeof(BINARY_WORD)); |
| | | } |
| | | |
| | | float * mallocFloatVolume(dim3 vol) { |
| | | return (float *) malloc (vol.x * vol.y * vol.z * sizeof(float)); |
| | | } |
| | | |
| | | // Returns the size (in bytes) of a binary array with dimensions stored in conv_args |
| | | double getSizeBytesBinaryArray(dim3 conv_args) { |
| | | return conv_args.x * conv_args.y * conv_args.z * sizeof(BINARY_WORD) / (BITS_PER_BINARY_WORD); |
| | | } |
| | | |
| | | |
| | | ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz) { |
| | | ConvolutionArgs conv_args; |
| | | // Input Volume |
| | | conv_args.input.x = ix; // x == y for a square face |
| | | conv_args.input.y = iy; |
| | | conv_args.input.z = iz; |
| | | conv_args.weights.x = wx; // x == y for square face |
| | | conv_args.weights.y = wy; |
| | | conv_args.weights.z = wz; |
| | | |
| | | // <!-- DO NOT MODIFY --> |
| | | // Intermediate Volumes |
| | | conv_args.alpha_plane.x = conv_args.weights.x; |
| | | conv_args.alpha_plane.y = conv_args.weights.y; |
| | | conv_args.alpha_plane.z = 1; |
| | | |
| | | conv_args.beta_plane.x = 1; |
| | | conv_args.beta_plane.y = conv_args.input.y; |
| | | conv_args.beta_plane.z = conv_args.input.z; |
| | | |
| | | conv_args.gamma_plane.x = conv_args.input.x * conv_args.weights.x; |
| | | conv_args.gamma_plane.y = conv_args.input.y * conv_args.weights.y; |
| | | conv_args.gamma_plane.z = 1; |
| | | |
| | | conv_args.zeta_plane.x = conv_args.gamma_plane.x; |
| | | conv_args.zeta_plane.y = conv_args.gamma_plane.y; |
| | | conv_args.zeta_plane.z = 1; |
| | | |
| | | // Output Volume |
| | | conv_args.output.x = conv_args.input.x; |
| | | conv_args.output.y = conv_args.input.y; |
| | | conv_args.output.z = 1; // Output should be a 2D plane |
| | | |
| | | // Verify dimensions |
| | | //assert(conv_args.weights.x % 32 == 0); // must be divisible by 32 for efficient alignment to unsigned 32-bit ints
| | | // assert(conv_args.weights.y % 32 == 0); // must be divisible by 32 for efficient alignment to unsigned 32-bit ints
| | | assert(conv_args.weights.z % 32 == 0); // must be divisible by 32 for efficient alignment to unsigned 32-bit ints
| | | //assert(conv_args.input.x % 32 == 0); // must be divisible by 32 for efficient alignment to unsigned 32-bit ints
| | | // assert(conv_args.input.y % 32 == 0); // must be divisible by 32 for efficient alignment to unsigned 32-bit ints
| | | assert(conv_args.input.z % 32 == 0); // must be divisible by 32 for efficient alignment to unsigned 32-bit ints
| | | assert(conv_args.weights.x <= conv_args.input.x); |
| | | assert(conv_args.weights.y <= conv_args.input.y); |
| | | assert(conv_args.weights.z <= conv_args.input.z); |
| | | // <!-- DO NOT MODIFY --> |
| | | |
| | | return conv_args; |
| | | } |
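initArgs asserts that the input and weight channel counts are multiples of BITS_PER_BINARY_WORD (32). A minimal setup sketch with illustrative sizes that satisfy those asserts:

```c
#include <stdlib.h>
#include "common.h"   /* ConvolutionArgs, initArgs, mallocBinaryVolume, mallocFloatVolume */

int main(void) {
    /* Channel counts (the z dimensions) must be multiples of 32 to pass the
       asserts in initArgs. Sizes here are illustrative. */
    ConvolutionArgs args = initArgs(64, 64, 64, 3, 3, 64);

    BINARY_WORD *bin_input   = mallocBinaryVolume(args.input);
    BINARY_WORD *bin_weights = mallocBinaryVolume(args.weights);
    float *output            = mallocFloatVolume(args.output);

    /* ... binarize, convolve, read back ... */

    free(bin_input);
    free(bin_weights);
    free(output);
    return 0;
}
```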
| New file |
| | |
| | | #ifndef AI2_COMMON_H |
| | | #define AI2_COMMON_H |
| | | |
| | | #include <time.h> |
| | | #include <stdlib.h> |
| | | #include <stdio.h> |
| | | #include <inttypes.h> |
| | | #include <assert.h> |
| | | #include <limits.h> |
| | | #include <tgmath.h> |
| | | #include <unistd.h> |
| | | #include <stdint.h> |
| | | //#include <gperftools/profiler.h> |
| | | #include <sys/time.h> |
| | | |
| | | typedef uint32_t BINARY_WORD; |
| | | #define BITS_PER_BINARY_WORD (sizeof(BINARY_WORD) * CHAR_BIT) |
| | | |
| | | typedef struct{ |
| | | struct timespec requestStart; |
| | | struct timespec requestEnd; |
| | | } Timer; |
| | | |
| | | typedef struct { |
| | | size_t x; |
| | | size_t y; |
| | | size_t z; |
| | | } dim3; |
| | | |
| | | typedef struct { |
| | | dim3 weights; |
| | | dim3 input; |
| | | dim3 output; |
| | | dim3 alpha_plane; |
| | | dim3 beta_plane; |
| | | dim3 gamma_plane; |
| | | dim3 zeta_plane; |
| | | } ConvolutionArgs; |
| | | |
| | | // Timer stuff |
| | | double getElapsedTime(Timer *timer); // Returns the time in ms |
| | | void start_timer(Timer *timer); |
| | | void stop_timer(Timer *timer); |
| | | |
| | | BINARY_WORD * mallocBinaryVolume(dim3 vol); |
| | | float * mallocFloatVolume(dim3 vol); |
| | | ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz); |
| | | double getSizeBytesBinaryArray(dim3 conv_args); |
| | | |
| | | #endif |
| | |
| | | #include <stdio.h> |
| | | #include <time.h> |
| | | |
| | | #ifndef AI2 |
| | | #define AI2 0 |
| | | #endif |
| | | |
| | | void swap_binary(convolutional_layer *l) |
| | | { |
| | | float *swap = l->filters; |
| | |
| | | #endif |
| | | } |
| | | |
| | | void binarize_filters2(float *filters, int n, int size, char *binary, float *scales) |
| | | { |
| | | int i, k, f; |
| | | for(f = 0; f < n; ++f){ |
| | | float mean = 0; |
| | | for(i = 0; i < size; ++i){ |
| | | mean += fabs(filters[f*size + i]); |
| | | } |
| | | mean = mean / size; |
| | | scales[f] = mean; |
| | | for(i = 0; i < size/8; ++i){ |
| | | binary[f*size + i] = (filters[f*size + i] > 0) ? 1 : 0; |
| | | for(k = 0; k < 8; ++k){ |
| | | } |
| | | } |
| | | } |
| | | } |
| | | |
| | | void binarize_filters(float *filters, int n, int size, float *binary) |
| | | { |
| | | int i, f; |
| | |
| | | } |
| | | } |
| | | |
| | | void binarize_input(float *input, int n, int size, float *binary) |
| | | { |
| | | int i, s; |
| | | for(s = 0; s < size; ++s){ |
| | | float mean = 0; |
| | | for(i = 0; i < n; ++i){ |
| | | mean += fabs(input[i*size + s]); |
| | | } |
| | | mean = mean / n; |
| | | for(i = 0; i < n; ++i){ |
| | | binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean; |
| | | } |
| | | } |
| | | } |
| | | |
| | | int convolutional_out_height(convolutional_layer l) |
| | | { |
| | | int h = l.h; |
| | |
| | | } |
| | | |
| | | size_t get_workspace_size(layer l){ |
| | | #ifdef CUDNN |
| | | #ifdef CUDNN |
| | | size_t most = 0; |
| | | size_t s = 0; |
| | | cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(), |
| | |
| | | &s); |
| | | if (s > most) most = s; |
| | | return most; |
| | | #else |
| | | #else |
| | | return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float); |
| | | #endif |
| | | #endif |
| | | } |
| | | |
| | | convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor) |
| | |
| | | l.c = c; |
| | | l.n = n; |
| | | l.binary = binary; |
| | | l.xnor = xnor; |
| | | l.batch = batch; |
| | | l.stride = stride; |
| | | l.size = size; |
| | |
| | | l.cfilters = calloc(c*n*size*size, sizeof(char)); |
| | | l.scales = calloc(n, sizeof(float)); |
| | | } |
| | | if(xnor){ |
| | | l.binary_filters = calloc(c*n*size*size, sizeof(float)); |
| | | l.binary_input = calloc(l.inputs*l.batch, sizeof(float)); |
| | | } |
| | | |
| | | if(batch_normalize){ |
| | | l.scales = calloc(n, sizeof(float)); |
| | |
| | | l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size); |
| | | l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch); |
| | | } |
| | | l.xnor = xnor; |
| | | |
| | | if(batch_normalize){ |
| | | l.mean_gpu = cuda_make_array(l.mean, n); |
| | |
| | | |
| | | l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n); |
| | | l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n); |
| | | #ifdef CUDNN |
| | | #ifdef CUDNN |
| | | cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); |
| | | cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); |
| | | cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); |
| | |
| | | CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, |
| | | 0, |
| | | &l->bf_algo); |
| | | #endif |
| | | #endif |
| | | #endif |
| | | l->workspace_size = get_workspace_size(*l); |
| | | } |
| | |
| | | int out_w = convolutional_out_width(l); |
| | | int i; |
| | | |
| | | |
| | | fill_cpu(l.outputs*l.batch, 0, l.output, 1); |
| | | |
| | | /* |
| | | if(l.binary){ |
| | | binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters); |
| | |
| | | } |
| | | */ |
| | | |
| | | /* |
| | | if(l.binary){ |
| | | int m = l.n; |
| | | int k = l.size*l.size*l.c; |
| | | int n = out_h*out_w; |
| | | /* |
| | | if(l.binary){ |
| | | int m = l.n; |
| | | int k = l.size*l.size*l.c; |
| | | int n = out_h*out_w; |
| | | |
| | | char *a = l.cfilters; |
| | | char *a = l.cfilters; |
| | | float *b = state.workspace; |
| | | float *c = l.output; |
| | | |
| | | for(i = 0; i < l.batch; ++i){ |
| | | im2col_cpu(state.input, l.c, l.h, l.w, |
| | | l.size, l.stride, l.pad, b); |
| | | gemm_bin(m,n,k,1,a,k,b,n,c,n); |
| | | c += n*m; |
| | | state.input += l.c*l.h*l.w; |
| | | } |
| | | scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w); |
| | | add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); |
| | | activate_array(l.output, m*n*l.batch, l.activation); |
| | | return; |
| | | } |
| | | */ |
| | | |
| | | if(l.xnor && (l.c%32 != 0 || !AI2)){ |
| | | binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters); |
| | | swap_binary(&l); |
| | | for(i = 0; i < l.batch; ++i){ |
| | | binarize_input(state.input + i*l.inputs, l.c, l.h*l.w, l.binary_input + i*l.inputs); |
| | | } |
| | | state.input = l.binary_input; |
| | | } |
| | | |
| | | int m = l.n; |
| | | int k = l.size*l.size*l.c; |
| | | int n = out_h*out_w; |
| | | |
| | | if (l.xnor && l.c%32 == 0 && AI2) { |
| | | forward_xnor_layer(l, state); |
| | | printf("xnor\n"); |
| | | } else { |
| | | |
| | | float *a = l.filters; |
| | | float *b = state.workspace; |
| | | float *c = l.output; |
| | | |
| | | for(i = 0; i < l.batch; ++i){ |
| | | im2col_cpu(state.input, l.c, l.h, l.w, |
| | | l.size, l.stride, l.pad, b); |
| | | gemm_bin(m,n,k,1,a,k,b,n,c,n); |
| | | gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); |
| | | c += n*m; |
| | | state.input += l.c*l.h*l.w; |
| | | } |
| | | scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w); |
| | | add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); |
| | | activate_array(l.output, m*n*l.batch, l.activation); |
| | | return; |
| | | } |
| | | */ |
| | | |
| | | int m = l.n; |
| | | int k = l.size*l.size*l.c; |
| | | int n = out_h*out_w; |
| | | |
| | | float *a = l.filters; |
| | | float *b = state.workspace; |
| | | float *c = l.output; |
| | | |
| | | for(i = 0; i < l.batch; ++i){ |
| | | im2col_cpu(state.input, l.c, l.h, l.w, |
| | | l.size, l.stride, l.pad, b); |
| | | gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); |
| | | c += n*m; |
| | | state.input += l.c*l.h*l.w; |
| | | } |
| | | |
| | | if(l.batch_normalize){ |
| | |
| | | add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); |
| | | |
| | | activate_array(l.output, m*n*l.batch, l.activation); |
| | | if(l.binary || l.xnor) swap_binary(&l); |
| | | } |
| | | |
| | | void backward_convolutional_layer(convolutional_layer l, network_state state) |
| | |
| | | float *r_cpu; |
| | | float *h_cpu; |
| | | |
| | | float *binary_input; |
| | | |
| | | size_t workspace_size; |
| | | |
| | | #ifdef GPU |
| | |
| | | } |
| | | } |
| | | } |
| | | binarize_filters2(l.filters, l.n, l.c*l.size*l.size, l.cfilters, l.scales); |
| | | #ifdef GPU |
| | | if(gpu_index >= 0){ |
| | | push_convolutional_layer(l); |
| | |
| | | if (l.flipped) { |
| | | transpose_matrix(l.filters, l.c*l.size*l.size, l.n); |
| | | } |
| | | if (l.binary) binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.filters); |
| | | //if (l.binary) binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.filters); |
| | | #ifdef GPU |
| | | if(gpu_index >= 0){ |
| | | push_convolutional_layer(l); |
| New file |
| | |
| | | #include "xnor_layer.h" |
| | | #include "binary_convolution.h" |
| | | #include "convolutional_layer.h" |
| | | |
| | | layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize) |
| | | { |
| | | int i; |
| | | layer l = {0}; |
| | | l.type = XNOR; |
| | | |
| | | l.h = h; |
| | | l.w = w; |
| | | l.c = c; |
| | | l.n = n; |
| | | l.batch = batch; |
| | | l.stride = stride; |
| | | l.size = size; |
| | | l.pad = pad; |
| | | l.batch_normalize = batch_normalize; |
| | | |
| | | l.filters = calloc(c*n*size*size, sizeof(float)); |
| | | l.biases = calloc(n, sizeof(float)); |
| | | |
| | | int out_h = convolutional_out_height(l); |
| | | int out_w = convolutional_out_width(l); |
| | | l.out_h = out_h; |
| | | l.out_w = out_w; |
| | | l.out_c = n; |
| | | l.outputs = l.out_h * l.out_w * l.out_c; |
| | | l.inputs = l.w * l.h * l.c; |
| | | |
| | | l.output = calloc(l.batch*out_h * out_w * n, sizeof(float)); |
| | | |
| | | if(batch_normalize){ |
| | | l.scales = calloc(n, sizeof(float)); |
| | | for(i = 0; i < n; ++i){ |
| | | l.scales[i] = 1; |
| | | } |
| | | |
| | | l.mean = calloc(n, sizeof(float)); |
| | | l.variance = calloc(n, sizeof(float)); |
| | | |
| | | l.rolling_mean = calloc(n, sizeof(float)); |
| | | l.rolling_variance = calloc(n, sizeof(float)); |
| | | } |
| | | |
| | | l.activation = activation; |
| | | |
| | | fprintf(stderr, "XNOR Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n); |
| | | |
| | | return l; |
| | | } |
| | | |
| | | void forward_xnor_layer(const layer l, network_state state) |
| | | { |
| | | int b = l.n; |
| | | int c = l.c; |
| | | int ix = l.w; |
| | | int iy = l.h; |
| | | int wx = l.size; |
| | | int wy = l.size; |
| | | int s = l.stride; |
| | | int pad = l.pad * (l.size/2); |
| | | |
| | | // MANDATORY: Make the binary layer |
| | | ai2_bin_conv_layer al = ai2_make_bin_conv_layer(b, c, ix, iy, wx, wy, s, pad); |
| | | |
| | | // OPTIONAL: Set the real-valued input:
| | | ai2_setFltInput(&al, state.input); |
| | | // The above function will automatically binarize the input for the layer (channel wise). |
| | | // If commented: using the default 0-valued input. |
| | | |
| | | ai2_setFltWeights(&al, l.filters); |
| | | // The above function will automatically binarize the weights for the layer (channel-wise).
| | | // If commented: using the default 0-valued weights. |
| | | |
| | | // MANDATORY: Call forward |
| | | ai2_bin_forward(&al); |
| | | |
| | | // OPTIONAL: Inspect outputs |
| | | float *output = ai2_getFltOutput(&al); // output is of size l.px * l.py where px and py are the padded outputs |
| | | |
| | | memcpy(l.output, output, l.outputs*sizeof(float)); |
| | | // MANDATORY: Free layer |
| | | ai2_free_bin_conv_layer(&al); |
| | | } |
| New file |
| | |
| | | #ifndef XNOR_LAYER_H |
| | | #define XNOR_LAYER_H |
| | | |
| | | #include "layer.h" |
| | | #include "network.h" |
| | | |
| | | layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize);
| | | void forward_xnor_layer(const layer l, network_state state); |
| | | |
| | | #endif |
| | | |
| | |
| | | if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms); |
| | | //draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, voc_labels, 20); |
| | | draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, voc_labels, 20); |
| | | show_image(im, "predictions"); |
| | | save_image(im, "predictions"); |
| | | show_image(im, "predictions"); |
| | | |
| | | show_image(sized, "resized"); |
| | | free_image(im); |