From 4ac78c89269138b4623993f9f1d81829d8e88131 Mon Sep 17 00:00:00 2001
From: Joseph Redmon <pjreddie@gmail.com>
Date: Tue, 20 Jan 2015 21:26:46 +0000
Subject: [PATCH] I am so done with opencl, switching to cuda

---
 src/gemm_fast.cl |   37 +++++++++++++++++--------------------
 1 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/gemm_fast.cl b/src/gemm_fast.cl
index 9a98208..2a76396 100644
--- a/src/gemm_fast.cl
+++ b/src/gemm_fast.cl
@@ -16,16 +16,15 @@
     int ctile = get_group_id(0);
     int rtile = get_group_id(1);
 
-    float Breg;
-    float Areg[WPT];
-    float acc[WPT][WPT];
+    float Areg[TILE];
+    float acc[TILE][TILE/THREADS];
 
     A += rtile*TILE*lda;
     B += ctile*TILE;
     C += rtile*TILE*ldc + ctile*TILE;
 
-    for(i = 0; i < WPT; ++i){
-        for(j = 0; j < WPT; ++j){
+    for(i = 0; i < TILE; ++i){
+        for(j = 0; j < TILE/THREADS; ++j){
             acc[i][j] = 0;
         }
     }
@@ -51,28 +50,26 @@
         barrier(CLK_LOCAL_MEM_FENCE);
 
         for(k = 0; k < TILE_K; ++k){
-            for(y = 0; y < WPT; ++y){
-                int row = (offset + (y*WPT)*THREADS)/TILE;
-                //Areg[y] = Asub[y*WPT][k];
+            #pragma unroll
+            for(y = 0; y < TILE; ++y){
+                Areg[y] = Asub[y][k];
             }
-            for(y = 0; y < WPT; ++y){
-                for(x = 0; x < WPT; ++x){
-                    int index = offset + (y*WPT + x)*THREADS;
-                    int row = index / TILE;
-                    int col = index % TILE;
-                    acc[y][x] += Asub[row][k]*Bsub[k][col];
+            for(x = 0; x < TILE; x += THREADS){
+                float Breg = Bsub[k][x+offset];
+                #pragma unroll
+                for(y = 0; y < TILE; ++y){
+                    acc[y][x/THREADS] += Breg * Areg[y];
                 }
             }
         }
         barrier(CLK_LOCAL_MEM_FENCE);
     }
 
-    for(y = 0; y < WPT; ++y){
-        for(x = 0; x < WPT; ++x){
-            int index = offset + (y*WPT + x)*THREADS;
-            int row = index / TILE;
-            int col = index % TILE;
-            C[row*ldc+col] = ALPHA*acc[y][x] + BETA*C[row*ldc+col];
+    for(i = 0; i < TILE; ++i){
+        for(j = 0; j < TILE/THREADS; ++j){
+            int col = j*THREADS + offset;
+            int row = i;
+            C[row*ldc+col] = ALPHA*acc[i][j] + BETA*C[row*ldc+col];
         }
     }
 }

--
Gitblit v1.10.0