~speedprog/mtg/mtg_card_detector.git

			@@ -210,7 +210,7 @@
			//network net = parse_network_cfg("/home/pjreddie/imagenet_backup/alexnet_1270.cfg");
			srand(time(0));
			network net = parse_network_cfg(cfgfile);
			set_learning_network(&net, net.learning_rate*10., net.momentum, net.decay);
			set_learning_network(&net, net.learning_rate*100., net.momentum, net.decay);
			printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
			int imgs = 1024;
			int i = 6600;

			@@ -164,8 +164,7 @@

			#define TILE 64
			#define TILE_K 16
			#define WPT 8
			#define THREADS (TILE*TILE)/(WPT*WPT)
			#define THREADS 64

			cl_kernel get_gemm_nn_fast_kernel()
			{
			@@ -175,7 +174,6 @@
			gemm_kernel = get_kernel("src/gemm_fast.cl", "gemm_nn_fast", "-D TILE=" STR(TILE)
			" -cl-nv-verbose "
			" -D TILE_K=" STR(TILE_K)
			" -D WPT=" STR(WPT)
			" -D THREADS=" STR(THREADS));
			init = 1;
			}
			@@ -464,7 +462,6 @@

			test_gpu_accuracy(0,0,128,128,128);

			/*
			time_ongpu(0,0,64,2916,363);
			time_ongpu_fast(0,0,64,2916,363);
			time_ongpu(0,0,64,2916,363);
			@@ -483,7 +480,6 @@
			time_ongpu_fast(0,0,128,4096,12544);
			time_ongpu(0,0,128,4096,4096);
			time_ongpu_fast(0,0,128,4096,4096);
			*/
			// time_ongpu(1,0,2304,196,256);
			// time_ongpu_fast(1,0,2304,196,256);
			// time_ongpu(0,1,256,2304,196);

			@@ -16,16 +16,15 @@
			int ctile = get_group_id(0);
			int rtile = get_group_id(1);

			float Breg;
			float Areg[WPT];
			float acc[WPT][WPT];
			float Areg[TILE];
			float acc[TILE][TILE/THREADS];

			A += rtile*TILE*lda;
			B += ctile*TILE;
			C += rtile*TILE*ldc + ctile*TILE;

			for(i = 0; i < WPT; ++i){
			for(j = 0; j < WPT; ++j){
			for(i = 0; i < TILE; ++i){
			for(j = 0; j < TILE/THREADS; ++j){
			acc[i][j] = 0;
			}
			}
			@@ -51,28 +50,26 @@
			barrier(CLK_LOCAL_MEM_FENCE);

			for(k = 0; k < TILE_K; ++k){
			for(y = 0; y < WPT; ++y){
			int row = (offset + (y*WPT)*THREADS)/TILE;
			//Areg[y] = Asub[y*WPT][k];
			#pragma unroll
			for(y = 0; y < TILE; ++y){
			Areg[y] = Asub[y][k];
			}
			for(y = 0; y < WPT; ++y){
			for(x = 0; x < WPT; ++x){
			int index = offset + (y*WPT + x)*THREADS;
			int row = index / TILE;
			int col = index % TILE;
			acc[y][x] += Asub[row][k]*Bsub[k][col];
			for(x = 0; x < TILE; x += THREADS){
			float Breg = Bsub[k][x+offset];
			#pragma unroll
			for(y = 0; y < TILE; ++y){
			acc[y][x/THREADS] += Breg * Areg[y];
			}
			}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			}

			for(y = 0; y < WPT; ++y){
			for(x = 0; x < WPT; ++x){
			int index = offset + (y*WPT + x)*THREADS;
			int row = index / TILE;
			int col = index % TILE;
			C[row*ldc+col] = ALPHA*acc[y][x] + BETA*C[row*ldc+col];
			for(i = 0; i < TILE; ++i){
			for(j = 0; j < TILE/THREADS; ++j){
			int col = j*THREADS + offset;
			int row = i;
			C[row*ldc+col] = ALPHA*acc[i][j] + BETA*C[row*ldc+col];
			}
			}
			}