1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
| extern "C" {
| #include "blas.h"
| #include "cuda.h"
| #include "utils.h"
| }
|
// Y[OFFY + i*INCY] += ALPHA * X[OFFX + i*INCX] for i in [0, N).
// Expects a 1-D or 2-D grid of 1-D blocks; the 2-D grid is linearized below.
__global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    int block = blockIdx.y * gridDim.x + blockIdx.x;   // linearize the 2-D grid
    int i = block * blockDim.x + threadIdx.x;
    if (i >= N) return;                                 // tail guard: grid may overshoot N
    Y[OFFY + i*INCY] += ALPHA * X[OFFX + i*INCX];
}
|
// Y[i*INCY] = X[i*INCX] ** ALPHA for i in [0, N).
// Expects a 1-D or 2-D grid of 1-D blocks (as produced by cuda_gridsize).
__global__ void pow_kernel(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    // powf keeps the computation in single precision; the previous pow()
    // promoted the float operands to double, which is needlessly slow on
    // GPUs with weak FP64 throughput and inconsistent with the rest of
    // this file's all-float arithmetic.
    if(i < N) Y[i*INCY] = powf(X[i*INCX], ALPHA);
}
|
// Fill: X[i*INCX] = ALPHA for i in [0, N).
// Expects a 1-D or 2-D grid of 1-D blocks; the 2-D grid is linearized below.
__global__ void const_kernel(int N, float ALPHA, float *X, int INCX)
{
    int block = blockIdx.y * gridDim.x + blockIdx.x;   // linearize the 2-D grid
    int i = block * blockDim.x + threadIdx.x;
    if (i >= N) return;                                 // tail guard
    X[i*INCX] = ALPHA;
}
|
// Scale in place: X[i*INCX] *= ALPHA for i in [0, N).
// Expects a 1-D or 2-D grid of 1-D blocks; the 2-D grid is linearized below.
__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX)
{
    int block = blockIdx.y * gridDim.x + blockIdx.x;   // linearize the 2-D grid
    int i = block * blockDim.x + threadIdx.x;
    if (i >= N) return;                                 // tail guard
    X[i*INCX] *= ALPHA;
}
|
// Where mask[i] equals mask_num exactly, overwrite x[i] with mask_num.
// NOTE(review): exact float == comparison — presumably mask_num is a sentinel
// value written verbatim into mask elsewhere, so bitwise equality is intended.
// Expects a 1-D or 2-D grid of 1-D blocks; the 2-D grid is linearized below.
__global__ void mask_kernel(int n, float *x, float mask_num, float *mask)
{
    int block = blockIdx.y * gridDim.x + blockIdx.x;   // linearize the 2-D grid
    int i = block * blockDim.x + threadIdx.x;
    if (i >= n) return;                                 // tail guard
    if (mask[i] == mask_num) x[i] = mask_num;
}
|
// Strided copy: Y[OFFY + i*INCY] = X[OFFX + i*INCX] for i in [0, N).
// Expects a 1-D or 2-D grid of 1-D blocks; the 2-D grid is linearized below.
__global__ void copy_kernel(int N, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
{
    int block = blockIdx.y * gridDim.x + blockIdx.x;   // linearize the 2-D grid
    int i = block * blockDim.x + threadIdx.x;
    if (i >= N) return;                                 // tail guard
    Y[OFFY + i*INCY] = X[OFFX + i*INCX];
}
|
// Element-wise multiply in place: Y[i*INCY] *= X[i*INCX] for i in [0, N).
// Expects a 1-D or 2-D grid of 1-D blocks; the 2-D grid is linearized below.
__global__ void mul_kernel(int N, float *X, int INCX, float *Y, int INCY)
{
    int block = blockIdx.y * gridDim.x + blockIdx.x;   // linearize the 2-D grid
    int i = block * blockDim.x + threadIdx.x;
    if (i >= N) return;                                 // tail guard
    Y[i*INCY] *= X[i*INCX];
}
|
// Host entry point for Y += ALPHA * X on device memory.
// Delegates to the offset variant with both offsets fixed at zero.
extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
{
    // Zero-offset convenience wrapper; error checking happens in the callee.
    axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
}
|
// Host launcher: Y = X ** ALPHA element-wise on device memory.
// X and Y must be device pointers covering N strided elements.
extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
{
    // cuda_gridsize/BLOCK choose the launch shape for N threads.
    pow_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX, Y, INCY);
    // Peek (not Get) so a sticky error is not cleared here; abort via check_error.
    check_error(cudaPeekAtLastError());
}
|
// Host launcher: Y[OFFY + i*INCY] += ALPHA * X[OFFX + i*INCX] on device memory.
// X and Y must be device pointers; offsets/strides are in elements.
extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
    // cuda_gridsize/BLOCK choose the launch shape for N threads.
    axpy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY);
    // Peek (not Get) so a sticky error is not cleared here; abort via check_error.
    check_error(cudaPeekAtLastError());
}
|
// Host entry point for a strided device-to-device copy of N elements.
// Delegates to the offset variant with both offsets fixed at zero.
extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY)
{
    // Zero-offset convenience wrapper; error checking happens in the callee.
    copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY);
}
|
// Host launcher: Y *= X element-wise (strided) on device memory.
// X and Y must be device pointers covering N strided elements.
extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY)
{
    // cuda_gridsize/BLOCK choose the launch shape for N threads.
    mul_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, INCX, Y, INCY);
    // Peek (not Get) so a sticky error is not cleared here; abort via check_error.
    check_error(cudaPeekAtLastError());
}
|
// Host launcher: strided copy Y[OFFY + i*INCY] = X[OFFX + i*INCX] on device memory.
// X and Y must be device pointers; offsets/strides are in elements.
extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
    // cuda_gridsize/BLOCK choose the launch shape for N threads.
    copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
    // Peek (not Get) so a sticky error is not cleared here; abort via check_error.
    check_error(cudaPeekAtLastError());
}
|
// Host launcher: set X[i] = mask_num wherever mask[i] == mask_num (exact match).
// X and mask must be device pointers with N contiguous elements each.
extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask)
{
    // cuda_gridsize/BLOCK choose the launch shape for N threads.
    mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask);
    // Peek (not Get) so a sticky error is not cleared here; abort via check_error.
    check_error(cudaPeekAtLastError());
}
|
// Host launcher: fill X (strided) with the constant ALPHA on device memory.
// X must be a device pointer covering N strided elements.
extern "C" void const_ongpu(int N, float ALPHA, float * X, int INCX)
{
    // cuda_gridsize/BLOCK choose the launch shape for N threads.
    const_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
    // Peek (not Get) so a sticky error is not cleared here; abort via check_error.
    check_error(cudaPeekAtLastError());
}
|
// Host launcher: scale X (strided) by ALPHA in place on device memory.
// X must be a device pointer covering N strided elements.
extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX)
{
    // cuda_gridsize/BLOCK choose the launch shape for N threads.
    scal_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
    // Peek (not Get) so a sticky error is not cleared here; abort via check_error.
    check_error(cudaPeekAtLastError());
}
|
|