PLSSVM - Parallel Least Squares Support Vector Machine  2.0.0
A Least Squares Support Vector Machine implementation using different backends.
svm_kernel_nd_range.hpp
Go to the documentation of this file.
1 
12 #ifndef PLSSVM_BACKENDS_SYCL_SVM_KERNEL_ND_RANGE_HPP_
13 #define PLSSVM_BACKENDS_SYCL_SVM_KERNEL_ND_RANGE_HPP_
14 #pragma once
15 
16 #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op
17 #include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
18 
19 #include "sycl/sycl.hpp" // sycl::nd_item, sycl::local_accessor, sycl::range, sycl::group_barrier, sycl::pow, sycl::exp, sycl::atomic_ref
20 
21 #include <cstddef> // std::size_t (can't use kernel_index_type because of comparisons with unsigned long values)
22 
23 namespace plssvm::sycl::detail {
24 
30 template <typename T>
32  public:
34  using real_type = T;
35 
50  nd_range_device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
51  data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}
52 
58  void operator()(::sycl::nd_item<2> nd_idx) const {
59  kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
60  kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
61 
63  real_type data_j[INTERNAL_BLOCK_SIZE] = { { 0.0 } };
64 
65  if (nd_idx.get_local_range(0) < THREAD_BLOCK_SIZE && nd_idx.get_local_range(1) == 0) {
66  #pragma unroll INTERNAL_BLOCK_SIZE
67  for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
68  data_intern_i_[nd_idx.get_local_range(0)][block_id] = 0.0;
69  data_intern_j_[nd_idx.get_local_range(0)][block_id] = 0.0;
70  }
71  }
72  ::sycl::group_barrier(nd_idx.get_group());
73 
74  if (i >= j) {
75  i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
76  j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
77 
78  // cache data
79  for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) {
80  ::sycl::group_barrier(nd_idx.get_group());
81  #pragma unroll INTERNAL_BLOCK_SIZE
82  for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
83  const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
84  if (nd_idx.get_local_id(1) == idx) {
85  data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
86  }
87  const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
88  if (nd_idx.get_local_id(0) == idx_2) {
89  data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
90  }
91  }
92  ::sycl::group_barrier(nd_idx.get_group());
93 
94  #pragma unroll INTERNAL_BLOCK_SIZE
95  for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
96  data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
97  }
98 
99  #pragma unroll INTERNAL_BLOCK_SIZE
100  for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
101  const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
102  #pragma unroll INTERNAL_BLOCK_SIZE
103  for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
104  matr[k][l] += data_i * data_j[k];
105  }
106  }
107  }
108 
109  #pragma unroll INTERNAL_BLOCK_SIZE
110  for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
111  real_type ret_jx = 0.0;
112  #pragma unroll INTERNAL_BLOCK_SIZE
113  for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
114  real_type temp;
115  if (device_ == 0) {
116  temp = (matr[x][y] + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
117  } else {
118  temp = matr[x][y] * add_;
119  }
120  if (i + x > j + y) {
121  // upper triangular matrix
122  detail::atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
123  ret_jx += temp * d_[i + y];
124  } else if (i + x == j + y) {
125  // diagonal
126  if (device_ == 0) {
127  ret_jx += (temp + cost_ * add_) * d_[i + y];
128  } else {
129  ret_jx += temp * d_[i + y];
130  }
131  }
132  }
133  detail::atomic_op<real_type>{ ret_[j + x] } += ret_jx;
134  }
135  }
136  }
137 
138  private:
140  ::sycl::local_accessor<real_type, 2> data_intern_i_;
142  ::sycl::local_accessor<real_type, 2> data_intern_j_;
143 
145  const real_type *q_;
146  real_type *ret_;
147  const real_type *d_;
148  const real_type *data_d_;
149  const real_type QA_cost_;
150  const real_type cost_;
151  const kernel_index_type num_rows_;
152  const kernel_index_type feature_range_;
153  const real_type add_;
154  const kernel_index_type device_;
156 };
157 
163 template <typename T>
165  public:
167  using real_type = T;
168 
185  nd_range_device_kernel_polynomial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
186  data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}
187 
193  void operator()(::sycl::nd_item<2> nd_idx) const {
194  kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
195  kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
196 
198  real_type data_j[INTERNAL_BLOCK_SIZE] = { { 0.0 } };
199 
200  if (nd_idx.get_local_range(0) < THREAD_BLOCK_SIZE && nd_idx.get_local_range(1) == 0) {
201  #pragma unroll INTERNAL_BLOCK_SIZE
202  for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
203  data_intern_i_[nd_idx.get_local_range(0)][block_id] = 0.0;
204  data_intern_j_[nd_idx.get_local_range(0)][block_id] = 0.0;
205  }
206  }
207  ::sycl::group_barrier(nd_idx.get_group());
208 
209  if (i >= j) {
210  i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
211  j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
212 
213  // cache data
214  for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
215  ::sycl::group_barrier(nd_idx.get_group());
216  #pragma unroll INTERNAL_BLOCK_SIZE
217  for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
218  const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
219  if (nd_idx.get_local_id(1) == idx) {
220  data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
221  }
222  const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
223  if (nd_idx.get_local_id(0) == idx_2) {
224  data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
225  }
226  }
227  ::sycl::group_barrier(nd_idx.get_group());
228 
229  #pragma unroll INTERNAL_BLOCK_SIZE
230  for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
231  data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
232  }
233 
234  #pragma unroll INTERNAL_BLOCK_SIZE
235  for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
236  const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
237  #pragma unroll INTERNAL_BLOCK_SIZE
238  for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
239  matr[k][l] += data_i * data_j[k];
240  }
241  }
242  }
243 
244  #pragma unroll INTERNAL_BLOCK_SIZE
245  for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
246  real_type ret_jx = 0.0;
247  #pragma unroll INTERNAL_BLOCK_SIZE
248  for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
249  const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast<real_type>(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
250  if (i + x > j + y) {
251  // upper triangular matrix
252  detail::atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
253  ret_jx += temp * d_[i + y];
254  } else if (i + x == j + y) {
255  // diagonal
256  ret_jx += (temp + cost_ * add_) * d_[i + y];
257  }
258  }
259  detail::atomic_op<real_type>{ ret_[j + x] } += ret_jx;
260  }
261  }
262  }
263 
264  private:
266  ::sycl::local_accessor<real_type, 2> data_intern_i_;
268  ::sycl::local_accessor<real_type, 2> data_intern_j_;
269 
271  const real_type *q_;
272  real_type *ret_;
273  const real_type *d_;
274  const real_type *data_d_;
275  const real_type QA_cost_;
276  const real_type cost_;
277  const kernel_index_type num_rows_;
278  const kernel_index_type num_cols_;
279  const real_type add_;
280  const int degree_;
281  const real_type gamma_;
282  const real_type coef0_;
284 };
285 
291 template <typename T>
293  public:
295  using real_type = T;
296 
311  nd_range_device_kernel_rbf(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) :
312  data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {}
313 
319  void operator()(::sycl::nd_item<2> nd_idx) const {
320  kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
321  kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
322 
324  real_type data_j[INTERNAL_BLOCK_SIZE] = { { 0.0 } };
325 
326  if (nd_idx.get_local_range(0) < THREAD_BLOCK_SIZE && nd_idx.get_local_range(1) == 0) {
327  #pragma unroll INTERNAL_BLOCK_SIZE
328  for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
329  data_intern_i_[nd_idx.get_local_range(0)][block_id] = 0.0;
330  data_intern_j_[nd_idx.get_local_range(0)][block_id] = 0.0;
331  }
332  }
333  ::sycl::group_barrier(nd_idx.get_group());
334 
335  if (i >= j) {
336  i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
337  j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
338 
339  // cache data
340  for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
341  ::sycl::group_barrier(nd_idx.get_group());
342  #pragma unroll INTERNAL_BLOCK_SIZE
343  for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
344  const std::size_t idx = block_id % THREAD_BLOCK_SIZE;
345  if (nd_idx.get_local_id(1) == idx) {
346  data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i];
347  }
348  const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
349  if (nd_idx.get_local_id(0) == idx_2) {
350  data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j];
351  }
352  }
353  ::sycl::group_barrier(nd_idx.get_group());
354 
355  #pragma unroll INTERNAL_BLOCK_SIZE
356  for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
357  data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index];
358  }
359 
360  #pragma unroll INTERNAL_BLOCK_SIZE
361  for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
362  const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l];
363  #pragma unroll INTERNAL_BLOCK_SIZE
364  for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
365  matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]);
366  }
367  }
368  }
369 
370  #pragma unroll INTERNAL_BLOCK_SIZE
371  for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
372  real_type ret_jx = 0.0;
373  #pragma unroll INTERNAL_BLOCK_SIZE
374  for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
375  const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_;
376  if (i + x > j + y) {
377  // upper triangular matrix
378  detail::atomic_op<real_type>{ ret_[i + y] } += temp * d_[j + x];
379  ret_jx += temp * d_[i + y];
380  } else if (i + x == j + y) {
381  // diagonal
382  ret_jx += (temp + cost_ * add_) * d_[i + y];
383  }
384  }
385  detail::atomic_op<real_type>{ ret_[j + x] } += ret_jx;
386  }
387  }
388  }
389 
390  private:
392  ::sycl::local_accessor<real_type, 2> data_intern_i_;
394  ::sycl::local_accessor<real_type, 2> data_intern_j_;
395 
397  const real_type *q_;
398  real_type *ret_;
399  const real_type *d_;
400  const real_type *data_d_;
401  const real_type QA_cost_;
402  const real_type cost_;
403  const kernel_index_type num_rows_;
404  const kernel_index_type num_cols_;
405  const real_type add_;
406  const real_type gamma_;
408 };
409 
410 } // namespace plssvm::sycl::detail
411 
412 #endif // PLSSVM_BACKENDS_SYCL_SVM_KERNEL_ND_RANGE_HPP_
Defines an atomic_ref wrapper for the SYCL backend.
Calculates the C-SVM kernel using the nd_range formulation and the linear kernel function.
Definition: svm_kernel_nd_range.hpp:31
void operator()(::sycl::nd_item< 2 > nd_idx) const
Function call operator overload performing the actual calculation.
Definition: svm_kernel_nd_range.hpp:58
T real_type
The type of the data.
Definition: svm_kernel_nd_range.hpp:34
::sycl::local_accessor< real_type, 2 > data_intern_j_
Local memory used for internal memory access optimizations.
Definition: svm_kernel_nd_range.hpp:142
nd_range_device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id)
Construct a new device kernel calculating the C-SVM kernel using the linear C-SVM kernel.
Definition: svm_kernel_nd_range.hpp:50
::sycl::local_accessor< real_type, 2 > data_intern_i_
Local memory used for internal memory access optimizations.
Definition: svm_kernel_nd_range.hpp:140
Calculates the C-SVM kernel using the nd_range formulation and the polynomial kernel function.
Definition: svm_kernel_nd_range.hpp:164
::sycl::local_accessor< real_type, 2 > data_intern_j_
Local memory used for internal memory access optimizations.
Definition: svm_kernel_nd_range.hpp:268
void operator()(::sycl::nd_item< 2 > nd_idx) const
Function call operator overload performing the actual calculation.
Definition: svm_kernel_nd_range.hpp:193
T real_type
The type of the data.
Definition: svm_kernel_nd_range.hpp:167
nd_range_device_kernel_polynomial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0)
Construct a new device kernel calculating the C-SVM kernel using the polynomial C-SVM kernel.
Definition: svm_kernel_nd_range.hpp:185
::sycl::local_accessor< real_type, 2 > data_intern_i_
Local memory used for internal memory access optimizations.
Definition: svm_kernel_nd_range.hpp:266
Calculates the C-SVM kernel using the nd_range formulation and the radial basis functions kernel func...
Definition: svm_kernel_nd_range.hpp:292
void operator()(::sycl::nd_item< 2 > nd_idx) const
Function call operator overload performing the actual calculation.
Definition: svm_kernel_nd_range.hpp:319
nd_range_device_kernel_rbf(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma)
Construct a new device kernel calculating the C-SVM kernel using the radial basis functions C-SVM ker...
Definition: svm_kernel_nd_range.hpp:311
T real_type
The type of the data.
Definition: svm_kernel_nd_range.hpp:295
::sycl::local_accessor< real_type, 2 > data_intern_j_
Local memory used for internal memory access optimizations.
Definition: svm_kernel_nd_range.hpp:394
::sycl::local_accessor< real_type, 2 > data_intern_i_
Local memory used for internal memory access optimizations.
Definition: svm_kernel_nd_range.hpp:392
Global type definitions and compile-time constants.
Namespace containing the C-SVM using the SYCL backend with the preferred SYCL implementation....
Definition: atomics.hpp:18
::sycl::atomic_ref< T, ::sycl::memory_order::relaxed, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space > atomic_op
Shortcut alias for a sycl::atomic_ref targeting global memory.
Definition: atomics.hpp:25
constexpr kernel_index_type THREAD_BLOCK_SIZE
Global compile-time constant used for internal caching. May be changed during the CMake configuration...
Definition: constants.hpp:25
int kernel_index_type
Integer type used inside kernels.
Definition: constants.hpp:19
constexpr kernel_index_type INTERNAL_BLOCK_SIZE
Global compile-time constant used for internal caching. May be changed during the CMake configuration...
Definition: constants.hpp:32