int iters = 10000/net.batch;
while(++count <= 2000){
    clock_t start = clock(), end;
    float loss = train_network_sgd_gpu(net, train, iters);
    end = clock();
    float test_acc = network_accuracy(net, test);
    //float test_acc = 0;
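    /* A plausible way to consume the timing and accuracy above -- an
     * illustrative sketch, not the original code; the format string is
     * an assumption. */
    printf("%d: loss %f, test acc %f, %lf sec\n",
            count, loss, test_acc, (double)(end-start)/CLOCKS_PER_SEC);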

int main(int argc, char *argv[])
{
    //test_gpu_blas();
    train_imagenet();
    //train_nist();
    fprintf(stderr, "Success!\n");
    return 0;
}

// output columns whose ksize-wide, stride-spaced windows cover input column w
int w_start = (w < ksize) ? 0 : (w - ksize)/stride + 1;
int w_end = w/stride + 1;
w_end = (width_col < w_end) ? width_col : w_end;

// and likewise for rows
int h_start = (h < ksize) ? 0 : (h - ksize)/stride + 1;
int h_end = h/stride + 1;
h_end = (height_col < h_end) ? height_col : h_end;

int rows = channels * ksize * ksize;
int cols = height_col*width_col;

check_error(cl);
}
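/* A minimal CPU-side check (not from the original source) of the bound math
 * above: for an input column w, the output columns whose ksize-wide,
 * stride-spaced windows cover w are exactly [w_start, w_end). */
#include <assert.h>

int main()
{
    int ksize = 3, stride = 2, width = 9;
    int width_col = (width - ksize)/stride + 1;
    int w;
    for(w = 0; w < width; ++w){
        int w_start = (w < ksize) ? 0 : (w - ksize)/stride + 1;
        int w_end = w/stride + 1;
        w_end = (width_col < w_end) ? width_col : w_end;
        int wc;
        for(wc = w_start; wc < w_end; ++wc){
            // column wc reads input columns [wc*stride, wc*stride + ksize)
            assert(wc*stride <= w && w < wc*stride + ksize);
        }
    }
    return 0;
}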

//#define TIMEIT

void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in)
{

void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
        cl_mem A_gpu, int lda,
        cl_mem B_gpu, int ldb,
        float BETA,
        cl_mem C_gpu, int ldc)
{
    /*
    cl_setup();
    cl_command_queue queue = cl.queue;
    cl_event event;
    cl.error = clblasSgemm(clblasRowMajor,
            TA ? clblasTrans : clblasNoTrans,
            TB ? clblasTrans : clblasNoTrans,
            M, N, K,
            ALPHA, A_gpu, 0, lda,
            B_gpu, 0, ldb,
            BETA, C_gpu, 0, ldc,
            1, &queue, 0, NULL, &event);
    */
    gemm_ongpu_new(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
}
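/* A minimal CPU reference (not part of the original source) for the
 * row-major C = ALPHA*op(A)*op(B) + BETA*C that gemm_ongpu dispatches;
 * handy for validating GPU results on small matrices. */
void gemm_cpu_ref(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda, float *B, int ldb,
        float BETA, float *C, int ldc)
{
    int i, j, k;
    for(i = 0; i < M; ++i){
        for(j = 0; j < N; ++j){
            float sum = 0;
            for(k = 0; k < K; ++k){
                float a = TA ? A[k*lda + i] : A[i*lda + k];
                float b = TB ? B[j*ldb + k] : B[k*ldb + j];
                sum += a*b;
            }
            C[i*ldc + j] = ALPHA*sum + BETA*C[i*ldc + j];
        }
    }
}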

void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA,
        cl_mem A_gpu, int lda,
        cl_mem B_gpu, int ldb,
        float BETA,
        cl_mem C_gpu, int ldc)

    width_col = 1 + (width-1) / stride;
}

    // one work-item per output element, flattened over (b, c, h, w)
    size_t global_size = batch*channels_col*height_col*width_col;

    clEnqueueNDRangeKernel(queue, im2col_kernel, 1, 0,
            &global_size, 0, 0, 0, 0);
    check_error(cl);
}
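/* Host-side sanity sketch (an illustration, not original code): the 1-D
 * global size flattens (b, c, h, w) in row-major order, and the kernel
 * below recovers them by repeated modulo/divide in the same order. */
#include <assert.h>

int main()
{
    int batch = 2, channels_col = 27, height_col = 4, width_col = 5;
    int b, c, h, w;
    for(b = 0; b < batch; ++b)
    for(c = 0; c < channels_col; ++c)
    for(h = 0; h < height_col; ++h)
    for(w = 0; w < width_col; ++w){
        int id = ((b*channels_col + c)*height_col + h)*width_col + w;
        assert(id % width_col == w);    id /= width_col;
        assert(id % height_col == h);   id /= height_col;
        assert(id % channels_col == c); id /= channels_col;
        assert(id % batch == b);
    }
    return 0;
}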

int c, h, w, b;
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
if(pad){
    // "same" padding: output keeps the input size, pad by half the kernel
    height_col = 1 + (height-1) / stride;
    width_col = 1 + (width-1) / stride;
    pad = ksize/2;
}
// unflatten the 1-D global id back into (b, c, h, w)
int id = get_global_id(0);
w = id % width_col;
id /= width_col;
h = id % height_col;
id /= height_col;
c = id % channels_col;
id /= channels_col;
b = id % batch;
id /= batch;

int col_size = height_col*width_col*channels_col;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
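/* Sketch (an illustration, not original code): each column-channel c packs
 * (c_im, h_offset, w_offset) -- the source image channel plus the position
 * inside the ksize x ksize window -- and the packing round-trips as below. */
#include <assert.h>

void check_channel_packing(int channels, int ksize)
{
    int c;
    for(c = 0; c < channels*ksize*ksize; ++c){
        int w_offset = c % ksize;
        int h_offset = (c / ksize) % ksize;
        int c_im = c / ksize / ksize;
        assert((c_im*ksize + h_offset)*ksize + w_offset == c);
    }
}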

//printf("start\n");
int i;
for(i = 0; i < net.n; ++i){
    //clock_t time = clock();
    if(net.types[i] == CONVOLUTIONAL){
        convolutional_layer layer = *(convolutional_layer *)net.layers[i];
        forward_convolutional_layer_gpu(layer, input);
        input = layer.output_cl;
    }
    else if(net.types[i] == SOFTMAX){
        softmax_layer layer = *(softmax_layer *)net.layers[i];
        forward_softmax_layer_gpu(layer, input);
        input = layer.output_cl;
    }
    //printf("%d %f\n", i, sec(clock()-time));
    /*
    else if(net.types[i] == CROP){
        crop_layer layer = *(crop_layer *)net.layers[i];

float train_network_datum_gpu(network net, float *x, float *y)
{
    int x_size = get_network_input_size(net)*net.batch;
    int y_size = get_network_output_size(net)*net.batch;
    clock_t time = clock();
    // first call allocates the GPU buffers; later calls just refill them
    if(!*net.input_cl){
        *net.input_cl = cl_make_array(x, x_size);
        *net.truth_cl = cl_make_array(y, y_size);
    }else{
        cl_write_array(*net.input_cl, x, x_size);
        cl_write_array(*net.truth_cl, y, y_size);
    }
    //printf("trans %f\n", sec(clock()-time));
    time = clock();
    forward_network_gpu(net, *net.input_cl, *net.truth_cl, 1);
    //printf("forw %f\n", sec(clock()-time));
    time = clock();
    backward_network_gpu(net, *net.input_cl);
    //printf("back %f\n", sec(clock()-time));
    time = clock();
    float error = get_network_cost(net);
    update_network_gpu(net);
    //printf("updt %f\n", sec(clock()-time));
    return error;
}
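/* Hedged usage sketch (not original code): average the cost over a number of
 * SGD steps; a real caller would refill x and y with a fresh batch each
 * iteration (batch loading is elided here). */
float train_steps_gpu(network net, float *x, float *y, int iters)
{
    float avg = 0;
    int i;
    for(i = 0; i < iters; ++i){
        // x, y: net.batch contiguous input/truth vectors for this step
        avg += train_network_datum_gpu(net, x, y);
    }
    return avg/iters;
}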