col2im maybe a little faster
| | |
| | | |
| | | void train_imagenet() |
| | | { |
| | | network net = parse_network_cfg("cfg/imagenet_backup_slowest_2340.cfg"); |
| | | network net = parse_network_cfg("cfg/imagenet_small_830.cfg"); |
| | | printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); |
| | | int imgs = 1000/net.batch+1; |
| | | srand(6472345); |
| | |
| | | |
| | | int main(int argc, char *argv[]) |
| | | { |
| | | int i; |
| | | int ksize = 3; |
| | | int stride = 4; |
| | | int width_col = 20; |
| | | for(i = 0; i < 10; ++i){ |
| | | int start = (i<ksize)?0:(i-ksize)/stride + 1; |
| | | int start2 = (i-ksize+stride)/stride; |
| | | int end = i/stride + 1; |
| | | end = (width_col < end) ? width_col : end; |
| | | printf("%d: %d vs %d, %d\n", i, start,start2, end); |
| | | } |
| | | if(argc != 2){ |
| | | fprintf(stderr, "usage: %s <function>\n", argv[0]); |
| | | return 0; |
| | |
| | | id /= channels; |
| | | int b = id%batch; |
| | | |
| | | int w_start = (w<ksize)?0:(w-ksize)/stride + 1; |
| | | //int w_start = (w<ksize)?0:(w-ksize)/stride + 1; |
| | | int w_start = (w-ksize+stride)/stride; |
| | | int w_end = w/stride + 1; |
| | | w_end = (width_col < w_end) ? width_col : w_end; |
| | | //w_end = (width_col < w_end) ? width_col : w_end; |
| | | |
| | | int h_start = (h<ksize)?0:(h-ksize)/stride+1; |
| | | int h_start = (h-ksize+stride)/stride; |
| | | //int h_start = (h-ksize)/stride+1; |
| | | int h_end = h/stride + 1; |
| | | h_end = (height_col < h_end) ? height_col : h_end; |
| | | //h_end = (height_col < h_end) ? height_col : h_end; |
| | | |
| | | int rows = channels * ksize * ksize; |
| | | int cols = height_col*width_col; |
| | |
| | | int h_col, w_col; |
| | | for(h_col = h_start; h_col < h_end; ++h_col){ |
| | | for(w_col = w_start; w_col < w_end; ++w_col){ |
| | | val += data_col[offset +h_col*h_coeff + w_col*w_coeff]; |
| | | int col_index = offset +h_col*h_coeff + w_col*w_coeff; |
| | | float part = (w_col < 0 || h_col < 0 || h_col >= height_col || w_col >= width_col) ? 0 : data_col[col_index]; |
| | | val += part; |
| | | } |
| | | } |
| | | data_im[index] = val; |
| | |
| | | cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); |
| | | check_error(cl); |
| | | |
| | | const size_t global_size[] = {layer.batch, layer.n*size}; |
| | | const size_t global_size[] = {layer.n*size, layer.batch}; |
| | | |
| | | clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0); |
| | | check_error(cl); |
| | |
| | | |
/*
 * Initialise every output element with the bias of the filter it belongs to.
 *
 * n       number of filters; used only to compute the per-batch stride n*size
 * size    number of consecutive output elements that share one filter's bias
 * biases  one bias value per filter, indexed by filter
 * output  destination buffer; each work-item overwrites (does not accumulate
 *         into) exactly one element
 *
 * Work-item layout: dimension 0 indexes the element inside one batch item,
 * dimension 1 indexes the batch item.
 * NOTE(review): there is no bounds check, so this assumes the host enqueues a
 * 2-D range of exactly {n*size, batch count} -- confirm against the launch code.
 */
__kernel void bias(int n, int size, __global float *biases, __global float *output)
{
    int id = get_global_id(0);
    int batch = get_global_id(1);

    /* Elements [filter*size, (filter+1)*size) all receive the same bias. */
    int filter = id / size;

    output[batch*n*size + id] = biases[filter];
}