extern "C" {
#include "blas.h"
#include "cuda.h"
#include "utils.h"
}

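// Every kernel below maps one thread to one element. The launch grid from
// cuda_gridsize() may be 2-D when N exceeds the x-dimension limit, so the
// global index folds blockIdx.y back in.
// axpy_kernel: Y[OFFY + i*INCY] += ALPHA * X[OFFX + i*INCX] (SAXPY with offsets).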
__global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[OFFY+i*INCY] += ALPHA*X[OFFX+i*INCX];
}

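// pow_kernel: Y[i*INCY] = X[i*INCX] raised to the power ALPHA.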
__global__ void pow_kernel(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[i*INCY] = pow(X[i*INCX], ALPHA);
}

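// const_kernel: fill every strided element of X with the constant ALPHA.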
__global__ void const_kernel(int N, float ALPHA, float *X, int INCX)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) X[i*INCX] = ALPHA;
}

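// scal_kernel: scale X in place by ALPHA (SSCAL).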
__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) X[i*INCX] *= ALPHA;
}

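// mask_kernel: where mask[i] equals mask_num, overwrite x[i] with mask_num.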
__global__ void mask_kernel(int n, float *x, float mask_num, float *mask)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < n && mask[i] == mask_num) x[i] = mask_num;
}

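// copy_kernel: Y[OFFY + i*INCY] = X[OFFX + i*INCX] (SCOPY with offsets).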
__global__ void copy_kernel(int N, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[i*INCY + OFFY] = X[i*INCX + OFFX];
}

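// mul_kernel: elementwise product, Y[i*INCY] *= X[i*INCX].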
__global__ void mul_kernel(int N, float *X, int INCX, float *Y, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < N) Y[i*INCY] *= X[i*INCX];
}

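// Host-side wrappers: each launches cuda_gridsize(N) blocks of BLOCK threads
// and checks for a launch error.
// axpy_ongpu: Y += ALPHA * X over the whole arrays (zero offsets).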
extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
{
    axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
}

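// pow_ongpu: elementwise power on the GPU.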
extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
{
    pow_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX, Y, INCY);
    check_error(cudaPeekAtLastError());
}

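// axpy_ongpu_offset: SAXPY on subranges starting at OFFX and OFFY.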
extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
    axpy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY);
    check_error(cudaPeekAtLastError());
}

// copy_ongpu: full-array copy via copy_ongpu_offset with zero offsets.
extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY)
{
    copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY);
}

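// mul_ongpu: elementwise multiply on the GPU.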
extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY)
{
    mul_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, INCX, Y, INCY);
    check_error(cudaPeekAtLastError());
}

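// copy_ongpu_offset: strided copy between subranges of X and Y.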
extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
    copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
    check_error(cudaPeekAtLastError());
}

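// mask_ongpu: propagate mask_num into X wherever mask matches it.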
extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask)
{
    mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask);
    check_error(cudaPeekAtLastError());
}

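// const_ongpu: set every strided element of X to ALPHA.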
extern "C" void const_ongpu(int N, float ALPHA, float * X, int INCX)
{
    const_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
    check_error(cudaPeekAtLastError());
}

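/*
 * Usage sketch: zero a device buffer, then accumulate 2*x into it. This
 * assumes cuda_make_array()/cuda_free() helpers from the project's cuda.h
 * for device allocation; those names are an assumption, not defined here.
 *
 *     float *x = cuda_make_array(host_x, N); // device copy of host_x
 *     float *y = cuda_make_array(0, N);      // uninitialized device buffer
 *     const_ongpu(N, 0, y, 1);               // y[i] = 0
 *     axpy_ongpu(N, 2, x, 1, y, 1);          // y[i] += 2*x[i]
 *     cuda_free(x);
 *     cuda_free(y);
 */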