From 1b5afb45838e603fa6780762eb8cc59246dc2d81 Mon Sep 17 00:00:00 2001
From: IlyaOvodov <b@ovdv.ru>
Date: Tue, 08 May 2018 11:09:35 +0000
Subject: [PATCH] Output improvements for detector results: When printing detector results, output was done in random order, making the results hard to interpret. Now: 1. Text output includes coordinates of rects (left, right, top, bottom in pixels) along with label and score 2. Text output is sorted by rect lefts to simplify finding the appropriate rects on the image 3. If several class probs are > thresh for some detection, the most probable is written first and coordinates for the others are not repeated 4. Rects are drawn on the image in order of their best class prob, so the most probable rects are always on top and not overlaid by less probable ones 5. The most probable label for a rect is always written first Also: 6. The message about low GPU memory includes the required amount

---
 src/im2col_kernels.cu |  130 ++++++++++++++++---------------------------
 1 files changed, 49 insertions(+), 81 deletions(-)

diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu
index a82c2dc..8a15e50 100644
--- a/src/im2col_kernels.cu
+++ b/src/im2col_kernels.cu
@@ -1,93 +1,61 @@
+#include "cuda_runtime.h"
+#include "curand.h"
+#include "cublas_v2.h"
+
 extern "C" {
 #include "im2col.h"
 #include "cuda.h"
 }
 
-__global__ void im2col_pad_kernel(float *im,
-     int channels,  int height,  int width,
-     int ksize,  int stride, float *data_col)
-{
-    int c,h,w;
-    int height_col = 1 + (height-1) / stride;
-    int width_col = 1 + (width-1) / stride;
-    int channels_col = channels * ksize * ksize;
+// src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
+// You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE
 
-    int pad = ksize/2;
+__global__ void im2col_gpu_kernel(const int n, const float* data_im,  // n: number of work items; data_im: input, read-only here
+        const int height, const int width, const int ksize,  // input plane size and side of the square window
+        const int pad,  // subtracted from the window origin, so windows may start outside the image
+        const int stride,  // step between consecutive window origins
+        const int height_col, const int width_col,  // number of window positions per column / per row
+        float *data_col) {  // output: one element is written per (work item, window tap)
+    int index = blockIdx.x*blockDim.x+threadIdx.x;  // flat global thread index (1D grid, 1D block)
+    for(; index < n; index += blockDim.x*gridDim.x){  // grid-stride loop: step by the total number of threads
+        int w_out = index % width_col;  // output column of this work item
+        int h_index = index / width_col;
+        int h_out = h_index % height_col;  // output row of this work item
+        int channel_in = h_index / height_col;  // input channel of this work item
+        int channel_out = channel_in * ksize * ksize;  // first of the ksize*ksize output planes fed by channel_in
+        int h_in = h_out * stride - pad;  // window origin row in the input; negative when it lies in the padding
+        int w_in = w_out * stride - pad;  // window origin column in the input; negative when it lies in the padding
+        float* data_col_ptr = data_col;
+        data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;  // -> data_col[channel_out][h_out][w_out]
+        const float* data_im_ptr = data_im;
+        data_im_ptr += (channel_in * height + h_in) * width + w_in;  // may point outside the image; only read after the bounds test below
+        for (int i = 0; i < ksize; ++i) {  // window rows
+            for (int j = 0; j < ksize; ++j) {  // window columns
+                int h = h_in + i;  // input row of this tap
+                int w = w_in + j;  // input column of this tap
 
-    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    int col_size = height_col*width_col*channels_col;
-    if (id >= col_size) return;
+                *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
+                    data_im_ptr[i * width + j] : 0;  // taps outside the image are stored as 0
 
-    int col_index = id;
-    w = id % width_col;
-    id /= width_col;
-    h = id % height_col;
-    id /= height_col;
-    c = id % channels_col;
-    id /= channels_col;
+                //*data_col_ptr = data_im_ptr[ii * width + jj];
 
-    int w_offset = c % ksize;
-    int h_offset = (c / ksize) % ksize;
-    int im_channel = c / ksize / ksize;
-    int im_row = h_offset + h * stride - pad;
-    int im_col = w_offset + w * stride - pad;
-
-    int im_index = im_col + width*(im_row + height*im_channel);
-    float val = (im_row < 0 || im_col < 0 || im_row >= height || im_col >= width) ? 0 : im[im_index];
-
-    data_col[col_index] = val;
-}
-
-__global__ void im2col_nopad_kernel(float *im,
-        int channels,  int height,  int width,
-        int ksize,  int stride, float *data_col)
-{
-    int c,h,w;
-    int height_col = (height - ksize) / stride + 1;
-    int width_col = (width - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
-
-    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    int col_size = height_col*width_col*channels_col;
-    if (id >= col_size) return;
-
-    int col_index = id;
-    w = id % width_col;
-    id /= width_col;
-    h = id % height_col;
-    id /= height_col;
-    c = id % channels_col;
-    id /= channels_col;
-
-    int w_offset = c % ksize;
-    int h_offset = (c / ksize) % ksize;
-    int im_channel = c / ksize / ksize;
-    int im_row = h_offset + h * stride;
-    int im_col = w_offset + w * stride;
-
-    int im_index = im_col + width*(im_row + height*im_channel);
-    float val = (im_row < 0 || im_col < 0 || im_row >= height || im_col >= width) ? 0 : im[im_index];
-
-    data_col[col_index] = val;
-}
-
-extern "C" void im2col_ongpu(float *im,
-        int channels,  int height,  int width,
-        int ksize,  int stride,  int pad, float *data_col)
-{
-
-    int height_col = (height - ksize) / stride + 1;
-    int width_col = (width - ksize) / stride + 1;
-    int channels_col = channels * ksize * ksize;
-
-    if (pad){
-        height_col = 1 + (height-1) / stride;
-        width_col = 1 + (width-1) / stride;
+                data_col_ptr += height_col * width_col;  // next tap goes to the next output plane, same (h_out, w_out)
+            }
+        }
     }
+}
 
-    size_t n = channels_col*height_col*width_col;
-
-    if(pad)im2col_pad_kernel<<<cuda_gridsize(n),BLOCK>>>(im,  channels, height, width, ksize, stride, data_col);
-    else im2col_nopad_kernel<<<cuda_gridsize(n),BLOCK>>>(im,  channels, height, width, ksize, stride, data_col);
-    check_error(cudaPeekAtLastError());
+void im2col_ongpu(float *im,  // host-side launcher for im2col_gpu_kernel; im is passed to the kernel as its input
+         int channels, int height, int width,
+         int ksize, int stride, int pad, float *data_col){  // data_col is passed to the kernel as its output
+    // One kernel launch covers channels * height_col * width_col work items, one per
+    // (channel, output row, output column); each item copies a ksize x ksize single-channel patch.
+    int height_col = (height + 2 * pad - ksize) / stride + 1;  // window positions vertically (integer division)
+    int width_col = (width + 2 * pad - ksize) / stride + 1;  // window positions horizontally (integer division)
+    int num_kernels = channels * height_col * width_col;  // work-item count, passed to the kernel as n
+    im2col_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,  // grid size: num_kernels / BLOCK rounded up
+        BLOCK, 0, get_cuda_stream()>>>(  // BLOCK threads per block, 0 bytes of dynamic shared memory, stream from get_cuda_stream()
+                num_kernels, im, height, width, ksize, pad,
+                stride, height_col,
+                width_col, data_col);  // NOTE(review): no launch error check follows; the removed version called check_error(cudaPeekAtLastError())
 }

--
Gitblit v1.10.0