| | |
| | | return layer; |
| | | } |
| | | |
| | | void update_connected_layer(connected_layer layer, float learning_rate, float momentum, float decay) |
| | | void update_connected_layer(connected_layer layer, int batch, float learning_rate, float momentum, float decay) |
| | | { |
| | | axpy_cpu(layer.outputs, learning_rate, layer.bias_updates, 1, layer.biases, 1); |
| | | axpy_cpu(layer.outputs, learning_rate/batch, layer.bias_updates, 1, layer.biases, 1); |
| | | scal_cpu(layer.outputs, momentum, layer.bias_updates, 1); |
| | | |
| | | axpy_cpu(layer.inputs*layer.outputs, -decay, layer.weights, 1, layer.weight_updates, 1); |
| | | axpy_cpu(layer.inputs*layer.outputs, learning_rate, layer.weight_updates, 1, layer.weights, 1); |
| | | axpy_cpu(layer.inputs*layer.outputs, -decay*batch, layer.weights, 1, layer.weight_updates, 1); |
| | | axpy_cpu(layer.inputs*layer.outputs, learning_rate/batch, layer.weight_updates, 1, layer.weights, 1); |
| | | scal_cpu(layer.inputs*layer.outputs, momentum, layer.weight_updates, 1); |
| | | } |
| | | |
| | |
| | | void backward_connected_layer(connected_layer layer, network_state state) |
| | | { |
| | | int i; |
| | | float alpha = 1./layer.batch; |
| | | gradient_array(layer.output, layer.outputs*layer.batch, layer.activation, layer.delta); |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | axpy_cpu(layer.outputs, alpha, layer.delta + i*layer.outputs, 1, layer.bias_updates, 1); |
| | | axpy_cpu(layer.outputs, 1, layer.delta + i*layer.outputs, 1, layer.bias_updates, 1); |
| | | } |
| | | int m = layer.inputs; |
| | | int k = layer.batch; |
| | |
| | | float *a = state.input; |
| | | float *b = layer.delta; |
| | | float *c = layer.weight_updates; |
| | | gemm(1,0,m,n,k,alpha,a,m,b,n,1,c,n); |
| | | gemm(1,0,m,n,k,1,a,m,b,n,1,c,n); |
| | | |
| | | m = layer.batch; |
| | | k = layer.outputs; |
| | |
| | | cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.outputs); |
| | | } |
| | | |
| | | void update_connected_layer_gpu(connected_layer layer, float learning_rate, float momentum, float decay) |
| | | void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay) |
| | | { |
| | | axpy_ongpu(layer.outputs, learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); |
| | | axpy_ongpu(layer.outputs, learning_rate/batch, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); |
| | | scal_ongpu(layer.outputs, momentum, layer.bias_updates_gpu, 1); |
| | | |
| | | axpy_ongpu(layer.inputs*layer.outputs, -decay, layer.weights_gpu, 1, layer.weight_updates_gpu, 1); |
| | | axpy_ongpu(layer.inputs*layer.outputs, learning_rate, layer.weight_updates_gpu, 1, layer.weights_gpu, 1); |
| | | axpy_ongpu(layer.inputs*layer.outputs, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1); |
| | | axpy_ongpu(layer.inputs*layer.outputs, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1); |
| | | scal_ongpu(layer.inputs*layer.outputs, momentum, layer.weight_updates_gpu, 1); |
| | | } |
| | | |
| | |
| | | |
| | | void backward_connected_layer_gpu(connected_layer layer, network_state state) |
| | | { |
| | | float alpha = 1./layer.batch; |
| | | int i; |
| | | gradient_array_ongpu(layer.output_gpu, layer.outputs*layer.batch, layer.activation, layer.delta_gpu); |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | axpy_ongpu_offset(layer.outputs, alpha, layer.delta_gpu, i*layer.outputs, 1, layer.bias_updates_gpu, 0, 1); |
| | | axpy_ongpu_offset(layer.outputs, 1, layer.delta_gpu, i*layer.outputs, 1, layer.bias_updates_gpu, 0, 1); |
| | | } |
| | | int m = layer.inputs; |
| | | int k = layer.batch; |
| | |
| | | float * a = state.input; |
| | | float * b = layer.delta_gpu; |
| | | float * c = layer.weight_updates_gpu; |
| | | gemm_ongpu(1,0,m,n,k,alpha,a,m,b,n,1,c,n); |
| | | gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n); |
| | | |
| | | m = layer.batch; |
| | | k = layer.outputs; |
| | |
| | | |
| | | void forward_connected_layer(connected_layer layer, network_state state); |
| | | void backward_connected_layer(connected_layer layer, network_state state); |
| | | void update_connected_layer(connected_layer layer, float learning_rate, float momentum, float decay); |
| | | void update_connected_layer(connected_layer layer, int batch, float learning_rate, float momentum, float decay); |
| | | |
| | | #ifdef GPU |
| | | void forward_connected_layer_gpu(connected_layer layer, network_state state); |
| | | void backward_connected_layer_gpu(connected_layer layer, network_state state); |
| | | void update_connected_layer_gpu(connected_layer layer, float learning_rate, float momentum, float decay); |
| | | void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay); |
| | | void push_connected_layer(connected_layer layer); |
| | | void pull_connected_layer(connected_layer layer); |
| | | #endif |
| | |
| | | |
| | | extern "C" void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size) |
| | | { |
| | | float alpha = 1./batch; |
| | | |
| | | backward_bias_kernel<<<n, BLOCK>>>(bias_updates, delta, batch, n, size, alpha); |
| | | backward_bias_kernel<<<n, BLOCK>>>(bias_updates, delta, batch, n, size, 1); |
| | | check_error(cudaPeekAtLastError()); |
| | | } |
| | | |
| | | extern "C" void forward_convolutional_layer_gpu(convolutional_layer layer, network_state state) |
| | | { |
| | | //clock_t time = clock(); |
| | | int i; |
| | | int m = layer.n; |
| | | int k = layer.size*layer.size*layer.c; |
| | |
| | | convolutional_out_width(layer); |
| | | |
| | | bias_output_gpu(layer.output_gpu, layer.biases_gpu, layer.batch, layer.n, n); |
| | | //cudaDeviceSynchronize(); |
| | | //printf("bias %f\n", sec(clock() - time)); |
| | | //time = clock(); |
| | | |
| | | //float imt=0; |
| | | //float gemt = 0; |
| | | for(i = 0; i < layer.batch; ++i){ |
| | | //time = clock(); |
| | | im2col_ongpu(state.input + i*layer.c*layer.h*layer.w, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_gpu); |
| | | //cudaDeviceSynchronize(); |
| | | //imt += sec(clock()-time); |
| | | //time = clock(); |
| | | float * a = layer.filters_gpu; |
| | | float * b = layer.col_image_gpu; |
| | | float * c = layer.output_gpu; |
| | | gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); |
| | | //cudaDeviceSynchronize(); |
| | | //gemt += sec(clock()-time); |
| | | //time = clock(); |
| | | } |
| | | activate_array_ongpu(layer.output_gpu, m*n*layer.batch, layer.activation); |
| | | //cudaDeviceSynchronize(); |
| | | //printf("activate %f\n", sec(clock() - time)); |
| | | //printf("im2col %f\n", imt); |
| | | //printf("gemm %f\n", gemt); |
| | | } |
| | | |
| | | extern "C" void backward_convolutional_layer_gpu(convolutional_layer layer, network_state state) |
| | | { |
| | | float alpha = 1./layer.batch; |
| | | int i; |
| | | int m = layer.n; |
| | | int n = layer.size*layer.size*layer.c; |
| | |
| | | float * c = layer.filter_updates_gpu; |
| | | |
| | | im2col_ongpu(state.input + i*layer.c*layer.h*layer.w, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_gpu); |
| | | gemm_ongpu(0,1,m,n,k,alpha,a + i*m*k,k,b,k,1,c,n); |
| | | gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n); |
| | | |
| | | if(state.delta){ |
| | | |
| | |
| | | cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n); |
| | | } |
| | | |
| | | extern "C" void update_convolutional_layer_gpu(convolutional_layer layer, float learning_rate, float momentum, float decay) |
| | | extern "C" void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay) |
| | | { |
| | | int size = layer.size*layer.size*layer.c*layer.n; |
| | | |
| | | axpy_ongpu(layer.n, learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); |
| | | axpy_ongpu(layer.n, learning_rate/batch, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); |
| | | scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1); |
| | | |
| | | axpy_ongpu(size, -decay, layer.filters_gpu, 1, layer.filter_updates_gpu, 1); |
| | | axpy_ongpu(size, learning_rate, layer.filter_updates_gpu, 1, layer.filters_gpu, 1); |
| | | axpy_ongpu(size, -decay*batch, layer.filters_gpu, 1, layer.filter_updates_gpu, 1); |
| | | axpy_ongpu(size, learning_rate/batch, layer.filter_updates_gpu, 1, layer.filters_gpu, 1); |
| | | scal_ongpu(size, momentum, layer.filter_updates_gpu, 1); |
| | | } |
| | | |
| | |
| | | |
| | | void backward_bias(float *bias_updates, float *delta, int batch, int n, int size) |
| | | { |
| | | float alpha = 1./batch; |
| | | int i,b; |
| | | for(b = 0; b < batch; ++b){ |
| | | for(i = 0; i < n; ++i){ |
| | | bias_updates[i] += alpha * sum_array(delta+size*(i+b*n), size); |
| | | bias_updates[i] += sum_array(delta+size*(i+b*n), size); |
| | | } |
| | | } |
| | | } |
| | |
| | | |
| | | void backward_convolutional_layer(convolutional_layer layer, network_state state) |
| | | { |
| | | float alpha = 1./layer.batch; |
| | | int i; |
| | | int m = layer.n; |
| | | int n = layer.size*layer.size*layer.c; |
| | |
| | | |
| | | im2col_cpu(im, layer.c, layer.h, layer.w, |
| | | layer.size, layer.stride, layer.pad, b); |
| | | gemm(0,1,m,n,k,alpha,a,k,b,k,1,c,n); |
| | | gemm(0,1,m,n,k,1,a,k,b,k,1,c,n); |
| | | |
| | | if(state.delta){ |
| | | a = layer.filters; |
| | |
| | | } |
| | | } |
| | | |
| | | void update_convolutional_layer(convolutional_layer layer, float learning_rate, float momentum, float decay) |
| | | void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay) |
| | | { |
| | | int size = layer.size*layer.size*layer.c*layer.n; |
| | | axpy_cpu(layer.n, learning_rate, layer.bias_updates, 1, layer.biases, 1); |
| | | axpy_cpu(layer.n, learning_rate/batch, layer.bias_updates, 1, layer.biases, 1); |
| | | scal_cpu(layer.n, momentum, layer.bias_updates, 1); |
| | | |
| | | axpy_cpu(size, -decay, layer.filters, 1, layer.filter_updates, 1); |
| | | axpy_cpu(size, learning_rate, layer.filter_updates, 1, layer.filters, 1); |
| | | axpy_cpu(size, -decay*batch, layer.filters, 1, layer.filter_updates, 1); |
| | | axpy_cpu(size, learning_rate/batch, layer.filter_updates, 1, layer.filters, 1); |
| | | scal_cpu(size, momentum, layer.filter_updates, 1); |
| | | } |
| | | |
| | |
| | | #ifdef GPU |
| | | void forward_convolutional_layer_gpu(convolutional_layer layer, network_state state); |
| | | void backward_convolutional_layer_gpu(convolutional_layer layer, network_state state); |
| | | void update_convolutional_layer_gpu(convolutional_layer layer, float learning_rate, float momentum, float decay); |
| | | void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); |
| | | |
| | | void push_convolutional_layer(convolutional_layer layer); |
| | | void pull_convolutional_layer(convolutional_layer layer); |
| | |
| | | convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation); |
| | | void resize_convolutional_layer(convolutional_layer *layer, int h, int w); |
| | | void forward_convolutional_layer(const convolutional_layer layer, network_state state); |
| | | void update_convolutional_layer(convolutional_layer layer, float learning_rate, float momentum, float decay); |
| | | void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); |
| | | image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_filters); |
| | | |
| | | void backward_convolutional_layer(convolutional_layer layer, network_state state); |
| | |
| | | if(!init){ |
| | | curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); |
| | | curandSetPseudoRandomGeneratorSeed(gen, 0ULL); |
| | | init = 1; |
| | | } |
| | | curandGenerateUniform(gen, x_gpu, n); |
| | | check_error(cudaPeekAtLastError()); |
| | |
| | | void update_network(network net) |
| | | { |
| | | int i; |
| | | int update_batch = net.batch*net.subdivisions; |
| | | for(i = 0; i < net.n; ++i){ |
| | | if(net.types[i] == CONVOLUTIONAL){ |
| | | convolutional_layer layer = *(convolutional_layer *)net.layers[i]; |
| | | update_convolutional_layer(layer, net.learning_rate, net.momentum, net.decay); |
| | | update_convolutional_layer(layer, update_batch, net.learning_rate, net.momentum, net.decay); |
| | | } |
| | | else if(net.types[i] == DECONVOLUTIONAL){ |
| | | deconvolutional_layer layer = *(deconvolutional_layer *)net.layers[i]; |
| | |
| | | } |
| | | else if(net.types[i] == CONNECTED){ |
| | | connected_layer layer = *(connected_layer *)net.layers[i]; |
| | | update_connected_layer(layer, net.learning_rate, net.momentum, net.decay); |
| | | update_connected_layer(layer, update_batch, net.learning_rate, net.momentum, net.decay); |
| | | } |
| | | } |
| | | } |
| | |
| | | forward_network(net, state); |
| | | backward_network(net, state); |
| | | float error = get_network_cost(net); |
| | | update_network(net); |
| | | if((net.seen/net.batch)%net.subdivisions == 0) update_network(net); |
| | | return error; |
| | | } |
| | | |
| | |
| | | int i; |
| | | float sum = 0; |
| | | for(i = 0; i < n; ++i){ |
| | | net.seen += batch; |
| | | get_random_batch(d, batch, X, y); |
| | | float err = train_network_datum(net, X, y); |
| | | sum += err; |
| | |
| | | float sum = 0; |
| | | for(i = 0; i < n; ++i){ |
| | | get_next_batch(d, batch, i*batch, X, y); |
| | | net.seen += batch; |
| | | float err = train_network_datum(net, X, y); |
| | | sum += err; |
| | | } |
| | |
| | | int n; |
| | | int batch; |
| | | int seen; |
| | | int subdivisions; |
| | | float learning_rate; |
| | | float momentum; |
| | | float decay; |
| | |
| | | { |
| | | int i; |
| | | for(i = 0; i < net.n; ++i){ |
| | | //clock_t time = clock(); |
| | | if(net.types[i] == CONVOLUTIONAL){ |
| | | forward_convolutional_layer_gpu(*(convolutional_layer *)net.layers[i], state); |
| | | } |
| | |
| | | forward_crop_layer_gpu(*(crop_layer *)net.layers[i], state); |
| | | } |
| | | state.input = get_network_output_gpu_layer(net, i); |
| | | //cudaDeviceSynchronize(); |
| | | //printf("forw %d: %s %f\n", i, get_layer_string(net.types[i]), sec(clock() - time)); |
| | | //time = clock(); |
| | | } |
| | | } |
| | | |
| | |
| | | int i; |
| | | float * original_input = state.input; |
| | | for(i = net.n-1; i >= 0; --i){ |
| | | //clock_t time = clock(); |
| | | if(i == 0){ |
| | | state.input = original_input; |
| | | state.delta = 0; |
| | |
| | | else if(net.types[i] == SOFTMAX){ |
| | | backward_softmax_layer_gpu(*(softmax_layer *)net.layers[i], state); |
| | | } |
| | | //cudaDeviceSynchronize(); |
| | | //printf("back %d: %s %f\n", i, get_layer_string(net.types[i]), sec(clock() - time)); |
| | | //time = clock(); |
| | | } |
| | | } |
| | | |
| | | void update_network_gpu(network net) |
| | | { |
| | | int i; |
| | | int update_batch = net.batch*net.subdivisions; |
| | | for(i = 0; i < net.n; ++i){ |
| | | if(net.types[i] == CONVOLUTIONAL){ |
| | | convolutional_layer layer = *(convolutional_layer *)net.layers[i]; |
| | | update_convolutional_layer_gpu(layer, net.learning_rate, net.momentum, net.decay); |
| | | update_convolutional_layer_gpu(layer, update_batch, net.learning_rate, net.momentum, net.decay); |
| | | } |
| | | else if(net.types[i] == DECONVOLUTIONAL){ |
| | | deconvolutional_layer layer = *(deconvolutional_layer *)net.layers[i]; |
| | |
| | | } |
| | | else if(net.types[i] == CONNECTED){ |
| | | connected_layer layer = *(connected_layer *)net.layers[i]; |
| | | update_connected_layer_gpu(layer, net.learning_rate, net.momentum, net.decay); |
| | | update_connected_layer_gpu(layer, update_batch, net.learning_rate, net.momentum, net.decay); |
| | | } |
| | | } |
| | | } |
| | |
| | | |
| | | float train_network_datum_gpu(network net, float *x, float *y) |
| | | { |
| | | // clock_t time = clock(); |
| | | network_state state; |
| | | int x_size = get_network_input_size(net)*net.batch; |
| | | int y_size = get_network_output_size(net)*net.batch; |
| | |
| | | state.input = *net.input_gpu; |
| | | state.truth = *net.truth_gpu; |
| | | state.train = 1; |
| | | //cudaDeviceSynchronize(); |
| | | //printf("trans %f\n", sec(clock() - time)); |
| | | //time = clock(); |
| | | forward_network_gpu(net, state); |
| | | //cudaDeviceSynchronize(); |
| | | //printf("forw %f\n", sec(clock() - time)); |
| | | //time = clock(); |
| | | backward_network_gpu(net, state); |
| | | //cudaDeviceSynchronize(); |
| | | //printf("back %f\n", sec(clock() - time)); |
| | | //time = clock(); |
| | | update_network_gpu(net); |
| | | if ((net.seen / net.batch) % net.subdivisions == 0) update_network_gpu(net); |
| | | float error = get_network_cost(net); |
| | | |
| | | //print_letters(y, 50); |
| | | //float *out = get_network_output_gpu(net); |
| | | //print_letters(out, 50); |
| | | //cudaDeviceSynchronize(); |
| | | //printf("updt %f\n", sec(clock() - time)); |
| | | //time = clock(); |
| | | return error; |
| | | } |
| | | |
| | |
| | | net->momentum = option_find_float(options, "momentum", .9); |
| | | net->decay = option_find_float(options, "decay", .0001); |
| | | net->seen = option_find_int(options, "seen",0); |
| | | int subdivs = option_find_int(options, "subdivisions",1); |
| | | net->batch /= subdivs; |
| | | net->subdivisions = subdivs; |
| | | |
| | | net->h = option_find_int_quiet(options, "height",0); |
| | | net->w = option_find_int_quiet(options, "width",0); |
| | | net->c = option_find_int_quiet(options, "channels",0); |
| | | net->inputs = option_find_int_quiet(options, "inputs", net->h * net->w * net->c); |
| | | if(!net->inputs && !(net->h && net->w && net->c)) error("No input parameters supplied"); |
| | | option_unused(options); |
| | | } |
| | | |
| | | network parse_network_cfg(char *filename) |