#include "binary_convolution.h"

int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim) {     // TODO unroll
    int accumulator = 0;
    for (int z = 0; z < vdim.z / BITS_PER_BINARY_WORD; z++) {
        for (int y = 0; y < vdim.y; y++) {
            for (int x = 0; x < vdim.x; x++) {
                int idx = z*vdim.y*vdim.x + y*vdim.x + x;
                accumulator += __builtin_popcount(~(a[idx] ^ b[idx]));   // count the XNOR of the two bit vectors
            }
        }
    }

    return accumulator;
}

/** 
 * Pre-conditions: 
 *                  alpha_volume is an array of size x*y*z.
 *                  alpha_plane is an array of size x*y.
 *                  alpha_volume (x,y,z) is transposed to (z,x,y).
 */
void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim) {
    for (int y = 0; y < vdim.y; ++y) {
        for (int x = 0; x < vdim.x; ++x) {
            int out = y * vdim.x + x;
            double accum = 0.0;
            for (int z = 0; z < vdim.z; ++z) {
                accum += alpha_volume[out * vdim.z + z];
            }

            alpha_plane[out] = accum / vdim.z;
        }
    }
}

/** @brief Wrapper function for generating the beta scaling factor */
void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim) {
    ai2_calc_alpha(beta_plane, beta_volume, vdim);
}

/** @brief Set the bit in a binary word */
void ai2_bitset(BINARY_WORD *bword, unsigned int position) {
    BINARY_WORD mask = (1 << position);
    *bword = *bword | mask;
}

/** @brief Checks that the bit is set in a binary word */
int ai2_is_set(BINARY_WORD bword, unsigned int position) {
    unsigned int position_complement = (BITS_PER_BINARY_WORD - 1) - position;   // number of leading bits before the bit position of interest
    bword = (bword << position_complement);                                     // zero out leading bits
    bword = (bword >> (BITS_PER_BINARY_WORD - 1));                              // shift bit position of interest to the 0th position
    return (bword & 0x1);                                                       // test if bit position of interest is set
}

void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 dim) {
    ai2_transpose3D(real_vol, dim); // (x,y,z) -> (z,x,y)

    int sz = dim.x * dim.y * dim.z;
    for (int i = 0; i < sz; i += BITS_PER_BINARY_WORD) {
        BINARY_WORD tmp = 0x00000000;
        for (int x = 0; x < BITS_PER_BINARY_WORD; ++x) {
            int waddr = x + i;
            if (signbit(real_vol[waddr]) == 0)
                ai2_bitset(&tmp, (BITS_PER_BINARY_WORD - 1) - x);
        }
        binary_vol[i / BITS_PER_BINARY_WORD] = tmp;
    }
}

void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 dim) {   // TODO unit tests
    for (int z = 0; z < dim.z; z++) {
        for (int y = 0; y < dim.y; y++) {
            for (int x = 0; x < dim.x / BITS_PER_BINARY_WORD; x++) {    // TODO boundary checks, for uneven input
                BINARY_WORD word = binary_vol[z*dim.y*dim.x + y*dim.x + x];
                for (int t = 0; t < BITS_PER_BINARY_WORD; ++t) {
                    int oidx = z*dim.y*dim.x + y*dim.x + x * BITS_PER_BINARY_WORD + t;
                    if (ai2_is_set(word, t))
                        real_vol[oidx] = 1.f;
                    else
                        real_vol[oidx] = -1.f;
                }
            }
        }
    }

    // Transpose channels back to output
    ai2_transpose3D(real_vol, dim); // (z,y,x) -> (x,y,z)
}

/* @brief: input is padded.
 */
void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride) {

    int r, rd, c, cd;
    int wx_2 = wx / 2;
    int wy_2 = wy / 2;

    // Indexing for output pixels. x = [wx_2, ix + wx_2 - 1], y = [wy_2, iy + wy_2 - 1]
    int sx = pad;               // start x
    int ex = ix + pad - 1;      // end x
    int sy = pad;               // start y
    int ey = iy + pad - 1;      // end y

    // Indexing for weights
    int wsx, wex, wsy, wey;
    if (wx % 2 == 1) {  // odd weights
        wsx = -wx_2; wex = wx_2 + 1;
        wsy = -wy_2; wey = wy_2 + 1;    
    }
    else {
        wsx = -wx_2; wex = wx_2;
        wsy = -wy_2; wey = wy_2;    
    }

    int px = ix + 2*pad;
    //int py = iy + 2*pad;

    for (r = sy; r <= ey; ++r) {
        for (c = sx; c <= ex; ++c) {
            int accumulator = 0;
            for (rd = wsy; rd < wey; ++rd) {
                for (cd = wsx; cd < wex; ++cd) {
                    int iidx = (r+rd)*px + (c+cd);
                    BINARY_WORD pixel = input[iidx];
                    //BINARY_WORD pixel = 0xFFFFFFFF;
                    //BINARY_WORD weight = 0xFFFFFFFF;
                    int widx = (rd + wy_2)*wx + (cd+wx_2);
                    BINARY_WORD weight = weights[widx];
                    accumulator += __builtin_popcount(~(pixel ^ weight));
                }
            }

            // Padded space
            int oidx = r*px + c;
            output[oidx] += (float) accumulator;
        }
    }

    //for (r = sy; r <= ey; ++r) {
    //  for (c = sx; c <= ex; ++c) {
    //      int accumulator = 0;
    //      for (rd = -wy_2; rd < wy_2; ++rd) {
    //          for (cd = -wx_2; cd < wx_2; ++cd) {
    //              int iidx = (r+rd)*px + (c+cd);
    //              BINARY_WORD pixel = input[iidx];
    //              //BINARY_WORD pixel = 0xFFFFFFFF;
    //              //BINARY_WORD weight = 0xFFFFFFFF;
    //              int widx = (rd + wy_2)*wx + (cd+wx_2);
    //              BINARY_WORD weight = weights[widx];
    //              accumulator += __builtin_popcount(~(pixel ^ weight));
    //          }
    //      }

    //      // Padded space
    //      int oidx = r*px + c;
    //      output[oidx] += (float) accumulator;
    //  }
    //}
    
    //ai2_bin_conv_within_boundary(output, input, weights, ix, iy, wx, wy, stride);
    //ai2_bin_conv_borders(output, input, weights, ix, iy, wx, wy, stride);
}

void ai2_pointwise_mul_mm(float *output, const float *input, int N) {
    int i = 0;

    while (i + 8 <= N) {
        output[i+0] *= input[i+0];
        output[i+1] *= input[i+1];
        output[i+2] *= input[i+2];
        output[i+3] *= input[i+3];
        output[i+4] *= input[i+4];
        output[i+5] *= input[i+5];
        output[i+6] *= input[i+6];
        output[i+7] *= input[i+7];

        i += 8;
    }

    while (++i < N) // Finish iteration that's leftover (e.g., last batch not divisible by 8 exactly)
         output[i] *= input[i];
}

/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors
 *  Pre-conditions: wx < ix, and wy < iy
 */
void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad) {
    // Slower version
//      for (int y = 0; y < iy; ++y) 
//          for (int x = 0; x < ix; x++)
//              output[y*ix+x] *= input[(y % wy)*wx + (x % wx)];

    // Stride prefetch optimized
    for (int s = 0; s < wy; ++s) {  // for each strip
        const float *strip_ptr = &alpha[s*wx];
        for (int y = pad; y < pad + (iy / wy); ++y) {   //
            int stride = y*((ix+2*pad)*wy) + s*(ix+2*pad);
            float *output_ptr = &output[stride];

            for (int x = 0; x < ix; ++x) {
                output_ptr[x] *= strip_ptr[x % wx];
            }
        }
    }
}

void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input) {
    if (new_input != NULL) {
        if (layer->input != NULL)
            free(layer->input);
        layer->input = new_input;

        dim3 dim;
        dim.x = layer->px;
        dim.y = layer->py;
        dim.z = layer->c;

        // Binarize input
        ai2_flt_to_bin(layer->binary_input, layer->input, dim);

        float *new_beta = (float *) calloc (dim.x * dim.y, sizeof(float));
        ai2_setFltBeta(layer, new_beta);

        // layer->input is transposed to (z,x,y) already
        ai2_calc_beta(layer->beta, layer->input, dim);
    }
}

void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input) {
    if (new_input != NULL) {
        if (layer->binary_input != NULL)
            free(layer->binary_input);
        layer->binary_input = new_input;
    }
}

void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights) {
    if (new_weights != NULL) {
        if (layer->weights != NULL)
            free(layer->weights);
        layer->weights = new_weights;

        dim3 dim;
        dim.x = layer->wx;
        dim.y = layer->wy;
        dim.z = layer->c;

        ai2_flt_to_bin(layer->binary_weights, layer->weights, dim);

        // Calculate alpha
        if (layer->alpha != NULL)
            free(layer->alpha);

        layer->alpha = (float *) calloc (dim.x * dim.y, sizeof(float));
        // layer->weights is already transposed to (z,x,y) from ai2_flt_to_bin()
        ai2_calc_alpha(layer->alpha, layer->weights, dim);
    }
}

void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights) {
    if (new_weights != NULL) {
        if (layer->binary_weights != NULL)
            free(layer->binary_weights);
        layer->binary_weights = new_weights;
    }
}

void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output) {
    if (new_output != NULL) {
        if (layer->output != NULL)
            free(layer->output);
        layer->output = new_output;
    }
}

void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output) {
    if (new_output != NULL) {
        if (layer->binary_output != NULL)
            free(layer->binary_output);
        layer->binary_output = new_output;
    }
}

void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha) {
    if (new_alpha != NULL) {
        if (layer->alpha != NULL)
            free(layer->alpha);
        layer->alpha = new_alpha;
    }
}

void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta) {
    if (new_beta != NULL) {
        if (layer->beta != NULL)
            free(layer->beta);
        layer->beta = new_beta;
    }
}

void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta) {
    if (new_new_beta != NULL) {
        if (layer->new_beta != NULL)
            free(layer->new_beta);
        layer->new_beta = new_new_beta;
    }
}

float* ai2_getFltOutput(ai2_bin_conv_layer *layer) {
    //if (layer->output != NULL && layer->binary_output != NULL) {
    if (layer->output != NULL) {

        // The idea here was that all intermediate states are stored in the binary output. 
        // Whenever the user needs the real-valued output, the conversion happens at this function call.
        //dim3 dim;
        //dim.x = layer->px;
        //dim.y = layer->py;
        //dim.z = layer->batch;
        //ai2_bin_to_flt(layer->output, layer->binary_output, dim);

        return layer->output;
    }
    else
        return NULL;
}

void ai2_transpose3D(float *data, dim3 d) {
    // Slow transpose for correctness

    // (x,y,z) becomes (z,x,y). Requires two transposes:
    //  (x,y,z) -> (x,z,y).
    //  (x,z,y) -> (z,x,y).

    // Intermediate buffer
    float *new_data = (float *) calloc (d.x * d.y * d.z, sizeof(float));

    // Transpose y and z axis.
    // (x,y,z) -> (x,z,y);
    for (int y = 0; y < d.y; ++y) {
        for (int z = 0; z < d.z; ++z) {
            for (int x = 0; x < d.x; ++x) {
                new_data[y*d.x*d.z + z*d.x + x] = data[z*d.x*d.y + y*d.x + x];
                //new_data[z*d.y*d.x + y*d.x + x] = data[y*d.x*d.z + z*d.x + x];
            }
        }
    }

    // Transpose x and z axis.
    //  (x,z,y) -> (z,x,y)
    for (int y = 0; y < d.y; ++y) {
        for (int x = 0; x < d.x; ++x) {
            for (int z = 0; z < d.z; ++z) {
                data[y*d.z*d.x + x*d.z + z] = new_data[y*d.x*d.z + x + z*d.x];
            }
        }
    }

    free(new_data);
}

int ai2_isFloatWhole(float f) { // TODO unit test
    return (ceilf(f) == f) ? 1 : 0;
}

/* @brief Initialize and create all memory arrays for this layer
 * b - batches (number of filter batches)
 * c - input channels
 * ix - input width
 * iy - input height
 * wx - weight/filter width
 * wy - weight/filter height
 * s - stride between sliding windows
 * pad - the amount of padding
 */
ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad) {
    // http://cs231n.github.io/convolutional-networks/
    //  See: spatial arrangement section for determining what the output size will be
    float output_size = ((ix - wx + 2 * pad) / s) + 1;
    if (ai2_isFloatWhole(output_size) == 0) {
        fprintf(stderr, "ERROR! conv layer of (b,c,ix,iy,s,pad) = (%d, %d, %d, %d, %d, %d) will give "
            " invalid output dimension: %fx%f\n", b, c, ix, iy, s, pad, output_size, output_size);
        exit(1);
    }

    // TODO: Support strided output
    if (s != 1) {
        fprintf(stderr, "ERROR! Only stride values of 1 is supported\n");
        exit(1);
    }

    // padded input size
    int px = (int) ix + 2*pad; 
    int py = (int) iy + 2*pad;

    ai2_bin_conv_layer l = {0}; // initialize all to 0
    l.input = (float *) calloc (c * px * py, sizeof(float));        // is padded
    l.binary_input =   (BINARY_WORD *) calloc (c * px * py / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD));     // is padded

    dim3 dim;
    dim.x = px;
    dim.y = py;
    dim.z = c;
    ai2_flt_to_bin(l.binary_input, l.input, dim);

    l.weights = (float *) calloc (b * c * wx * wy, sizeof(float));  
    l.binary_weights = (BINARY_WORD *) calloc (b * c * wx * wy / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD));

    l.output = (float *) calloc (c * px * py, sizeof(float));   // is padded
    l.new_beta = (float *) calloc(px * py, sizeof(float));      // is padded

    l.batch = b;
    l.c = c;
    l.h = iy;
    l.w = ix;
    l.stride = s;
    l.pad = pad;
    l.px = px;
    l.py = py;
    l.wx = wx;
    l.wy = wy;

    // The following parameters are uninitialized and should be set elsewhere:
    //  l.beta  - padded
    //  l.alpha - not padded

    return l;
}

void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer) {
    if (layer->input) free (layer->input);
    if (layer->binary_input) free(layer->binary_input);
    if (layer->weights) free (layer->weights);
    if (layer->binary_weights) free(layer->binary_weights);
    if (layer->output) free(layer->output);
    if (layer->binary_output) free (layer->binary_output);
    if (layer->alpha) free(layer->alpha);
    if (layer->beta) free(layer->beta);
    if (layer->new_beta) free(layer->new_beta);
}

void ai2_throw_error(char *str) {
    fprintf(stderr, "ERROR: %s\n", str);
    exit(1);
}

void ai2_bin_forward(ai2_bin_conv_layer *l) {
    if (l->input == NULL) ai2_throw_error("Input was not allocated and set in this layer");
    if (l->weights == NULL) ai2_throw_error("Weights was not allocated and set in this layer");
    if (l->output == NULL) ai2_throw_error("Output was not allocated and set in this layer");
    if (l->alpha == NULL) ai2_throw_error("Alpha was not allocated and set in this layer");
    if (l->beta == NULL) ai2_throw_error("Beta was not allocated and set in this layer");

    if (l->c % 32 != 0) ai2_throw_error("Channel is not divisible by 32. Need to implement mask "
                                        "before supporting arbitrary channel size. For now, "
                                        "set the channel size to the nearest multiple of 32 "
                                        "and ignore any ''extra'' channels unused.");

    l->c /= BITS_PER_BINARY_WORD;   // For compensating with doing more work per word

    float *output = l->output;
    float *alpha = l->alpha;
    float *beta = l->beta;
    int px = l->px;
    int py = l->py;
    BINARY_WORD *binary_weights = l->binary_weights;

    for (int z = 0; z < l->batch; ++z) {    // for each filter map
        BINARY_WORD *binary_input = l->binary_input;
        for (int c = 0; c < l->c; ++c) {    // for each input channel
            ai2_bin_conv2D(output, binary_input, binary_weights, l->w, l->h, l->wx, l->wy, l->pad, l->stride);
            binary_input += px*py;   // increment with next 2D plane
            binary_weights += l->wx*l->wy;       // increment with next 2D plane

            ai2_pointwise_mul_mm(output, beta, px*py);  
            ai2_pointwise_mul_mm_2d(output, alpha, l->w, l->h, l->wx, l->wy, l->pad);
        }
    }
}

// Deprecated
//double ai2_bin_conv_benchmark(ConvolutionArgs conv_args) {
//    printf("Running Binary Convolution test!\n");
//
//    size_t ix, iy, iz, wx, wy, wz, L, stride;
//    ix = conv_args.input.x;
//    iy = conv_args.input.y;
//    iz = conv_args.input.z;
//    wx = conv_args.weights.x;
//    wy = conv_args.weights.y;
//    wz = conv_args.weights.z;
//  L = BITS_PER_BINARY_WORD;
//  stride = 1;
//
//    printf("Input size (num elements, xyz): %zu %zu %zu\n", ix, iy, iz);
//    printf("Weights size (num elements. xyz): %zu %zu %zu\n", wx, wy, wz);
//
//    double sz_input_elements = ix * iy * iz;
//    double sz_input_bytes = getSizeBytesBinaryArray(conv_args.input);
//    double sz_weight_bytes = getSizeBytesBinaryArray(conv_args.weights);
//
//    printf("Input Size (MB): %f\n", sz_input_bytes / (1 << 20));
//    printf("Weight Size (MB): %f\n", sz_weight_bytes / (1 << 20));
//
//    BINARY_WORD *binary_input = mallocBinaryVolume(conv_args.input);
//    BINARY_WORD *binary_weights = mallocBinaryVolume(conv_args.weights);
//    BINARY_WORD *b_input = binary_input;    // alias
//    BINARY_WORD *b_weight = binary_weights; // alias
//    float *output = mallocFloatVolume(conv_args.output);
//  float *output_ptr = output;
//  float *beta =  (float *) malloc(sizeof(float) * ix * iy);   // we assume beta is given to us
//  float *alpha = (float *) malloc(sizeof(float) * wx * wy);   // we assume alpha is given to us
//  float *new_output = mallocFloatVolume(conv_args.output);
//  //float *new_output_ptr = new_output;
//  float *new_beta = (float *) malloc(sizeof(float) * ix * iy);
//  //float *new_beta_ptr = new_beta;
//
//    // Scale number of computations because we're packing.
//    // After this point, you should not have to reason about input dimensions for input and weights.
//    iz /= BITS_PER_BINARY_WORD;
//    wz /= BITS_PER_BINARY_WORD;
//
//    // Calculate time taken by a request
//    struct timeval start_time;
//    gettimeofday(&start_time, NULL);
//
//  // Preprocessing
//  int pad = wx/2;
//
//    for (int z = 0; z < iz; ++z) {    // number of channels
//        ai2_bin_conv2D(output_ptr, b_input, b_weight, ix, iy, wx, wy, pad, stride);
//        b_input += ix*iy;   // increment with next 2D plane
//        b_weight += wx*wy;  // increment with next 2D plane
//
//      ai2_pointwise_mul_mm(output_ptr, beta, ix*iy);
//      ai2_pointwise_mul_mm_2d(output_ptr, alpha, ix, iy, wx, wy, pad);
//    }
//
//  // copy to new array (need to wrap this around); TODO.
//    struct timeval end_time;
//    gettimeofday(&end_time, NULL);
//
//    struct timeval diff_time;
//    timersub(&end_time, &start_time, &diff_time);
//    double time_conv_s = diff_time.tv_sec + diff_time.tv_usec * 1e-6;
//    double time_conv_ms = time_conv_s * 1000.0;
//
//  double model_ops = (3*ix*iy*wx*wy*wz/L) + 2*ix*iy + ix*iy*iz;
//  double conv_ops_s = 1e-9 * model_ops / time_conv_s;
//    double conv_bandwidth_gb_s = 1e-9 * sz_input_bytes / (time_conv_ms / 1000.0);
//    double conv_bandwidth_gelement_s = 1e-9 * sz_input_elements / (time_conv_ms / 1000.0);
//
//    printf("Execution Time (ms): %f\n", time_conv_ms);
//    printf("Binary Convolution OPS/s (GOPS/s): %f\n", conv_ops_s);
//    printf("Binary Convolution Bandwidth (GB/s): %f\n", conv_bandwidth_gb_s);
//    printf("Binary Convolution Bandwidth (GElements/s): %f\n\n", conv_bandwidth_gelement_s);
//
//    free(binary_input);
//    free(binary_weights);
//    free(output);
//  free(beta);
//  free(alpha);
//  free(new_output);
//  free(new_beta);
//
//    return time_conv_ms;
//}

// double ai2_bin_conv_benchmark(ConvolutionArgs conv_args);

//void benchmark() {
//    int ix, iy, iz, wx, wy, wz;
//    iz = (1 << 9) * BITS_PER_BINARY_WORD;
//    ix = 227; // x == y for square face
//    iy = 227;
//    wx = 3;    // x == y for a square face
//    wy = 3;
//    wz = iz;
//
//  int runs = 1;
//  double accum_binary = 0;
//  double accum_real = 0;
//    ConvolutionArgs conv_args = initArgs(ix, iy, iz, wx, wy, wz);
//  for (int i = 0; i < runs; ++i) {
//      double t_binary_convolve = ai2_bin_conv_benchmark(conv_args);
//      double t_real_convolve = run_convolve2D_real(conv_args);
//      printf("t binary = %lf\n", t_binary_convolve);
//      printf("t real = %lf\n", t_real_convolve);
//      accum_binary += t_binary_convolve;
//      accum_real += t_real_convolve;
//  }
//
//  accum_binary /= runs;
//  accum_real /= runs;
//  printf("Average convolution pass binary (ms): %lf\n", accum_binary);
//  printf("Average convolution pass flt (ms): %lf\n", accum_real);
//  printf("Speedup (Binary over Real): %lfx\n", accum_real / accum_binary);    
//  exit(1);
//}