PLSSVM - Parallel Least Squares Support Vector Machine 2.0.0
A Least Squares Support Vector Machine implementation using different backends.
svm_kernel.hip.hpp
#ifndef PLSSVM_BACKENDS_HIP_SVM_KERNEL_HPP_
#define PLSSVM_BACKENDS_HIP_SVM_KERNEL_HPP_
#pragma once

#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"

#include "plssvm/constants.hpp"  // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE (compile-time caching constants; may be changed during the CMake configuration)

namespace plssvm::hip {
/// Calculates the C-SVM kernel using the linear kernel function.
template <typename real_type>
__global__ void device_kernel_linear(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) {
    kernel_index_type i = blockIdx.x * blockDim.x * INTERNAL_BLOCK_SIZE;
    kernel_index_type j = blockIdx.y * blockDim.y * INTERNAL_BLOCK_SIZE;

    __shared__ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    __shared__ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
    real_type data_j[INTERNAL_BLOCK_SIZE];

    if (i >= j) {
        i += threadIdx.x * INTERNAL_BLOCK_SIZE;
        const kernel_index_type ji = j + threadIdx.x * INTERNAL_BLOCK_SIZE;
        j += threadIdx.y * INTERNAL_BLOCK_SIZE;
        // cache data
        for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) {
            __syncthreads();
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
                const kernel_index_type idx = block_id % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx) {
                    data_intern_i[threadIdx.x][block_id] = data_d[block_id + vec_index + i];
                }
                const kernel_index_type idx_2 = block_id + INTERNAL_BLOCK_SIZE % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx_2) {
                    data_intern_j[threadIdx.x][block_id] = data_d[block_id + vec_index + ji];
                }
            }
            __syncthreads();

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
                data_j[data_index] = data_intern_j[threadIdx.y][data_index];
            }

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
                const real_type data_i = data_intern_i[threadIdx.x][l];
                #pragma unroll INTERNAL_BLOCK_SIZE
                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
                    matr[k][l] += data_i * data_j[k];
                }
            }
        }

        #pragma unroll INTERNAL_BLOCK_SIZE
        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
            real_type ret_jx = 0.0;
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
                real_type temp;
                if (id == 0) {
                    temp = (matr[x][y] + QA_cost - q[i + y] - q[j + x]) * add;
                } else {
                    temp = matr[x][y] * add;
                }
                if (i + x > j + y) {
                    // upper triangular matrix; for double precision, PLSSVM provides its own __device__ atomicAdd overload (see atomics.cuh)
                    atomicAdd(&ret[i + y], temp * d[j + x]);
                    ret_jx += temp * d[i + y];
                } else if (i + x == j + y) {
                    // diagonal
                    if (id == 0) {
                        ret_jx += (temp + cost * add) * d[i + y];
                    } else {
                        ret_jx += temp * d[i + y];
                    }
                }
            }
            atomicAdd(&ret[j + x], ret_jx);
        }
    }
}
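For orientation, here is a minimal host-side launch sketch for this kernel. The helper name launch_linear, the buffer names, and the include path are illustrative assumptions, not part of PLSSVM; it only shows how the grid must be sized: each block of THREAD_BLOCK_SIZE x THREAD_BLOCK_SIZE threads computes a square tile of THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE rows and columns of the implicit kernel matrix.

// Hypothetical launch sketch; names and header path are assumptions.
#include <cmath>

#include "hip/hip_runtime.h"

#include "plssvm/backends/HIP/svm_kernel.hip.hpp"  // assumed path of this header
#include "plssvm/constants.hpp"

template <typename real_type>
void launch_linear(const real_type *q_d, real_type *ret_d, const real_type *d_d, const real_type *data_d,
                   const real_type QA_cost, const real_type cost, const plssvm::kernel_index_type num_rows,
                   const plssvm::kernel_index_type feature_range, const real_type add, const plssvm::kernel_index_type device_id) {
    // one block covers THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE rows/columns of the kernel matrix
    const dim3 block(plssvm::THREAD_BLOCK_SIZE, plssvm::THREAD_BLOCK_SIZE);
    const auto tiles = static_cast<unsigned int>(std::ceil(
        static_cast<double>(num_rows) / (plssvm::THREAD_BLOCK_SIZE * plssvm::INTERNAL_BLOCK_SIZE)));
    const dim3 grid(tiles, tiles);
    // hipcc supports the triple-chevron launch syntax; hipLaunchKernelGGL works as well
    plssvm::hip::device_kernel_linear<<<grid, block>>>(q_d, ret_d, d_d, data_d, QA_cost, cost,
                                                       num_rows, feature_range, add, device_id);
}

Note how the id parameter is used in the kernel body: only the device with id == 0 applies the QA_cost and q corrections, while all other devices merely accumulate the partial dot products of their feature_range feature slice.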

/// Calculates the C-SVM kernel using the polynomial kernel function.
template <typename real_type>
__global__ void device_kernel_polynomial(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) {
    kernel_index_type i = blockIdx.x * blockDim.x * INTERNAL_BLOCK_SIZE;
    kernel_index_type j = blockIdx.y * blockDim.y * INTERNAL_BLOCK_SIZE;

    __shared__ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    __shared__ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
    real_type data_j[INTERNAL_BLOCK_SIZE];

    if (i >= j) {
        i += threadIdx.x * INTERNAL_BLOCK_SIZE;
        const kernel_index_type ji = j + threadIdx.x * INTERNAL_BLOCK_SIZE;
        j += threadIdx.y * INTERNAL_BLOCK_SIZE;
        // cache data
        for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
            __syncthreads();
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
                const kernel_index_type idx = block_id % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx) {
                    data_intern_i[threadIdx.x][block_id] = data_d[block_id + vec_index + i];
                }
                const kernel_index_type idx_2 = block_id + INTERNAL_BLOCK_SIZE % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx_2) {
                    data_intern_j[threadIdx.x][block_id] = data_d[block_id + vec_index + ji];
                }
            }
            __syncthreads();

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
                data_j[data_index] = data_intern_j[threadIdx.y][data_index];
            }

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
                const real_type data_i = data_intern_i[threadIdx.x][l];
                #pragma unroll INTERNAL_BLOCK_SIZE
                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
                    matr[k][l] += data_i * data_j[k];
                }
            }
        }

        #pragma unroll INTERNAL_BLOCK_SIZE
        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
            real_type ret_jx = 0.0;
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
                const real_type temp = (pow(gamma * matr[x][y] + coef0, degree) + QA_cost - q[i + y] - q[j + x]) * add;
                if (i + x > j + y) {
                    // upper triangular matrix
                    atomicAdd(&ret[i + y], temp * d[j + x]);
                    ret_jx += temp * d[i + y];
                } else if (i + x == j + y) {
                    // diagonal
                    ret_jx += (temp + cost * add) * d[i + y];
                }
            }
            atomicAdd(&ret[j + x], ret_jx);
        }
    }
}
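In the accumulation loop above, matr[x][y] collects the dot product of two data points over all num_cols features, so the temp expression (up to the QA_cost and q corrections) evaluates the polynomial kernel function:

    k(\vec{x}, \vec{y}) = (\gamma \cdot \langle \vec{x}, \vec{y} \rangle + coef0)^{degree}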

/// Calculates the C-SVM kernel using the radial basis function (RBF) kernel.
template <typename real_type>
__global__ void device_kernel_rbf(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) {
    kernel_index_type i = blockIdx.x * blockDim.x * INTERNAL_BLOCK_SIZE;
    kernel_index_type j = blockIdx.y * blockDim.y * INTERNAL_BLOCK_SIZE;

    __shared__ real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    __shared__ real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
    real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } };
    real_type data_j[INTERNAL_BLOCK_SIZE];

    if (i >= j) {
        i += threadIdx.x * INTERNAL_BLOCK_SIZE;
        const kernel_index_type ji = j + threadIdx.x * INTERNAL_BLOCK_SIZE;
        j += threadIdx.y * INTERNAL_BLOCK_SIZE;
        // cache data
        for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) {
            __syncthreads();
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
                const kernel_index_type idx = block_id % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx) {
                    data_intern_i[threadIdx.x][block_id] = data_d[block_id + vec_index + i];
                }
                const kernel_index_type idx2 = block_id + INTERNAL_BLOCK_SIZE % THREAD_BLOCK_SIZE;
                if (threadIdx.y == idx2) {
                    data_intern_j[threadIdx.x][block_id] = data_d[block_id + vec_index + ji];
                }
            }
            __syncthreads();

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
                data_j[data_index] = data_intern_j[threadIdx.y][data_index];
            }

            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
                const real_type data_i = data_intern_i[threadIdx.x][l];
                #pragma unroll INTERNAL_BLOCK_SIZE
                for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
                    matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]);
                }
            }
        }

        #pragma unroll INTERNAL_BLOCK_SIZE
        for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
            real_type ret_jx = 0.0;
            #pragma unroll INTERNAL_BLOCK_SIZE
            for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
                const real_type temp = (exp(-gamma * matr[x][y]) + QA_cost - q[i + y] - q[j + x]) * add;
                if (i + x > j + y) {
                    // upper triangular matrix
                    atomicAdd(&ret[i + y], temp * d[j + x]);
                    ret_jx += temp * d[i + y];
                } else if (i + x == j + y) {
                    // diagonal
                    ret_jx += (temp + cost * add) * d[i + y];
                }
            }
            atomicAdd(&ret[j + x], ret_jx);
        }
    }
}
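Here matr[x][y] instead accumulates the squared Euclidean distance between two data points, so temp (again up to the QA_cost and q corrections) evaluates the radial basis function kernel:

    k(\vec{x}, \vec{y}) = \exp(-\gamma \cdot \|\vec{x} - \vec{y}\|^2)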

} // namespace plssvm::hip

#endif // PLSSVM_BACKENDS_HIP_SVM_KERNEL_HPP_