1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
| extern "C" {
| #include "blas.h"
| #include "cuda.h"
| }
|
/*
 * Strided AXPY kernel: for every element index i in [0, N) performs
 *     Y[OFFY + i*INCY] += ALPHA * X[OFFX + i*INCX]
 *
 * Each thread handles at most one element. The flat element index is
 * derived from blockIdx.x, blockIdx.y and threadIdx.x, so the kernel
 * accepts a grid that extends in both x and y with blocks that extend in x.
 * Threads whose index is >= N return without touching memory.
 *
 *   N          number of elements to update
 *   ALPHA      scalar multiplied into each X element
 *   X          source array
 *   OFFX/INCX  starting offset and stride (in elements) into X
 *   Y          array updated in place
 *   OFFY/INCY  starting offset and stride (in elements) into Y
 */
__global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    /* Linearise the (x, y) block coordinates, then add the lane within the block. */
    const unsigned int block_id = blockIdx.x + blockIdx.y*gridDim.x;
    const int idx = block_id * blockDim.x + threadIdx.x;

    /* The grid may be larger than N; surplus threads have nothing to do. */
    if(idx >= N) return;

    const int src = OFFX + idx*INCX;
    const int dst = OFFY + idx*INCY;
    Y[dst] += ALPHA*X[src];
}
|
/*
 * Strided in-place scaling kernel: for every element index i in [0, N)
 * performs
 *     X[i*INCX] *= ALPHA
 *
 * Each thread handles at most one element. The flat element index is
 * derived from blockIdx.x, blockIdx.y and threadIdx.x; threads whose index
 * is >= N return without touching memory.
 *
 *   N      number of elements to scale
 *   ALPHA  scale factor
 *   X      array scaled in place
 *   INCX   stride (in elements) between consecutive scaled entries
 */
__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX)
{
    /* Linearise the (x, y) block coordinates, then add the lane within the block. */
    const unsigned int block_id = blockIdx.x + blockIdx.y*gridDim.x;
    const int idx = block_id * blockDim.x + threadIdx.x;

    /* Surplus threads past the end of the data do nothing. */
    if(idx >= N) return;

    const int pos = idx*INCX;
    X[pos] *= ALPHA;
}
|
/*
 * Group-wise masking kernel. Element indices are treated as consecutive
 * groups of `mod` entries. For every index i in [0, n):
 *   - if i is the first index of its group (i % mod == 0), x[i] is kept;
 *   - otherwise x[i] is set to 0 when the mask value at the first index of
 *     the group, mask[(i/mod)*mod], is zero, and kept when it is non-zero.
 *
 * Each thread handles at most one element; the flat index is derived from
 * blockIdx.x, blockIdx.y and threadIdx.x. Threads with index >= n return.
 *
 *   n     number of elements in x to examine
 *   x     array modified in place
 *   mask  array read at the first index of each group
 *   mod   group length; it is used as a divisor, so it must not be 0
 */
__global__ void mask_kernel(int n, float *x, float *mask, int mod)
{
    /* Linearise the (x, y) block coordinates, then add the lane within the block. */
    const unsigned int block_id = blockIdx.x + blockIdx.y*gridDim.x;
    const int idx = block_id * blockDim.x + threadIdx.x;

    if(idx >= n) return;

    /*
     * The mask is only consulted for non-leading group members (the &&
     * short-circuits), and the element is only rewritten when it has to
     * become zero; in every other case its value stays as it was.
     */
    if(idx % mod != 0 && mask[(idx/mod)*mod] == 0) x[idx] = 0;
}
|
/*
 * Strided copy kernel: for every element index i in [0, N) performs
 *     Y[OFFY + i*INCY] = X[OFFX + i*INCX]
 *
 * Each thread handles at most one element. The flat element index is
 * derived from blockIdx.x, blockIdx.y and threadIdx.x; threads whose index
 * is >= N return without touching memory.
 *
 *   N          number of elements to copy
 *   X          source array
 *   OFFX/INCX  starting offset and stride (in elements) into X
 *   Y          destination array
 *   OFFY/INCY  starting offset and stride (in elements) into Y
 */
__global__ void copy_kernel(int N, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    /* Linearise the (x, y) block coordinates, then add the lane within the block. */
    const unsigned int block_id = blockIdx.x + blockIdx.y*gridDim.x;
    const int idx = block_id * blockDim.x + threadIdx.x;

    /* Surplus threads past the end of the data do nothing. */
    if(idx >= N) return;

    const int src = idx*INCX + OFFX;
    const int dst = idx*INCY + OFFY;
    Y[dst] = X[src];
}
|
/*
 * Host entry point for a strided AXPY that starts at the beginning of both
 * arrays. Forwards to axpy_ongpu_offset with both offsets set to 0.
 *
 *   N      number of elements to update
 *   ALPHA  scalar multiplied into each X element
 *   X      source array, stepped by INCX elements
 *   Y      array updated in place, stepped by INCY elements
 */
extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
{
    const int x_start = 0;
    const int y_start = 0;
    axpy_ongpu_offset(N, ALPHA, X, x_start, INCX, Y, y_start, INCY);
}
|
/*
 * Host launcher for axpy_kernel: requests
 *     Y[OFFY + i*INCY] += ALPHA * X[OFFX + i*INCX]   for i in [0, N)
 *
 * The launch uses cuda_gridsize(N) as the grid and BLOCK as the block size
 * (both defined outside this file section) and is not followed by a
 * synchronisation here; only the launch status is checked.
 *
 *   N          number of elements to update; N <= 0 is a no-op
 *   ALPHA      scalar multiplied into each X element
 *   X          source array
 *   OFFX/INCX  starting offset and stride (in elements) into X
 *   Y          array updated in place
 *   OFFY/INCY  starting offset and stride (in elements) into Y
 */
extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
    /*
     * Nothing to compute for an empty or negative length. Returning here
     * follows the usual BLAS convention and avoids handing a non-positive
     * count to cuda_gridsize and to the kernel launch.
     */
    if(N <= 0) return;

    axpy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY);
    /* Kernel launches return no status; pick up any launch error explicitly. */
    check_error(cudaPeekAtLastError());
}
|
/*
 * Host entry point for a strided copy that starts at the beginning of both
 * arrays. Forwards to copy_ongpu_offset with both offsets set to 0.
 *
 *   N  number of elements to copy
 *   X  source array, stepped by INCX elements
 *   Y  destination array, stepped by INCY elements
 */
extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY)
{
    const int x_start = 0;
    const int y_start = 0;
    copy_ongpu_offset(N, X, x_start, INCX, Y, y_start, INCY);
}
|
/*
 * Host launcher for copy_kernel: requests
 *     Y[OFFY + i*INCY] = X[OFFX + i*INCX]   for i in [0, N)
 *
 * The launch uses cuda_gridsize(N) as the grid and BLOCK as the block size
 * (both defined outside this file section) and is not followed by a
 * synchronisation here; only the launch status is checked.
 *
 *   N          number of elements to copy; N <= 0 is a no-op
 *   X          source array
 *   OFFX/INCX  starting offset and stride (in elements) into X
 *   Y          destination array
 *   OFFY/INCY  starting offset and stride (in elements) into Y
 */
extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
    /*
     * Nothing to copy for an empty or negative length. Returning here
     * follows the usual BLAS convention and avoids handing a non-positive
     * count to cuda_gridsize and to the kernel launch.
     */
    if(N <= 0) return;

    copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
    /* Kernel launches return no status; pick up any launch error explicitly. */
    check_error(cudaPeekAtLastError());
}
|
/*
 * Host launcher for mask_kernel, which treats X as consecutive groups of
 * `mod` elements and zeroes the non-leading elements of every group whose
 * leading mask entry is zero.
 *
 * The launch uses cuda_gridsize(N) as the grid and BLOCK as the block size
 * (both defined outside this file section) and is not followed by a
 * synchronisation here; only the launch status is checked.
 *
 *   N     number of elements of X to examine; N <= 0 is a no-op
 *   X     array modified in place
 *   mask  array read at the first index of each group
 *   mod   group length. It is received as a float but the kernel works
 *         with an int, so the value is truncated toward zero. A value
 *         that truncates to 0 makes the call a no-op.
 */
extern "C" void mask_ongpu(int N, float * X, float * mask, float mod)
{
    /*
     * Make the float -> int narrowing explicit instead of relying on the
     * implicit conversion at the kernel argument.
     */
    const int period = (int)mod;

    /*
     * The kernel evaluates i % mod and i / mod, so a zero period would be a
     * division by zero on the device. Skip the launch in that case, and
     * also when there are no elements to process.
     */
    if(N <= 0 || period == 0) return;

    mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask, period);
    /* Kernel launches return no status; pick up any launch error explicitly. */
    check_error(cudaPeekAtLastError());
}
|
/*
 * Host launcher for scal_kernel: requests
 *     X[i*INCX] *= ALPHA   for i in [0, N)
 *
 * The launch uses cuda_gridsize(N) as the grid and BLOCK as the block size
 * (both defined outside this file section) and is not followed by a
 * synchronisation here; only the launch status is checked.
 *
 *   N      number of elements to scale; N <= 0 is a no-op
 *   ALPHA  scale factor
 *   X      array scaled in place
 *   INCX   stride (in elements) between consecutive scaled entries
 */
extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX)
{
    /*
     * Nothing to scale for an empty or negative length. Returning here
     * follows the usual BLAS convention and avoids handing a non-positive
     * count to cuda_gridsize and to the kernel launch.
     */
    if(N <= 0) return;

    scal_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
    /* Kernel launches return no status; pick up any launch error explicitly. */
    check_error(cudaPeekAtLastError());
}
|
|