PLSSVM - Parallel Least Squares Support Vector Machine  2.0.0
A Least Squares Support Vector Machine implementation using different backends.
svm_kernel_hierarchical.hpp
1 
12 #ifndef PLSSVM_BACKENDS_SYCL_SVM_KERNEL_HIERARCHICAL_HPP_
13 #define PLSSVM_BACKENDS_SYCL_SVM_KERNEL_HIERARCHICAL_HPP_
14 #pragma once
15 
16 #include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::detail::atomic_op
17 #include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE
18 
19 #include "sycl/sycl.hpp" // sycl::queue, sycl::handler, sycl::h_item, sycl::range, sycl::private_memory, sycl::pow, sycl::exp
20 
21 #include <cstddef> // std::size_t (can't use kernel_index_type because of comparisons with unsigned long values)
22 
23 namespace plssvm::sycl::detail {
24 
30 template <typename T>
31 class hierarchical_device_kernel_linear {
32   public:
34  using real_type = T;
35 
49  hierarchical_device_kernel_linear(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) :
50  q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {}
51 
57  void operator()(::sycl::group<2> group) const {
58  // allocate shared memory (variables declared at work-group scope are placed in local memory)
59  real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
60  real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
61 
62  // allocate memory for work-item local variables
63  // -> accessible across different 'parallel_for_work_item' invocations
64  ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
65  ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
66  ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
67  ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
68  ::sycl::private_memory<bool, 2> private_cond{ group };
69 
70  // initialize private variables
71  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
72  // indices and diagonal condition
73  private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
74  private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
75  private_cond(idx) = private_i(idx) >= private_j(idx);
76  if (private_cond(idx)) {
77  private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
78  private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
79  }
80 
81  // matrix
82  #pragma unroll INTERNAL_BLOCK_SIZE
83  for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
84  #pragma unroll INTERNAL_BLOCK_SIZE
85  for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
86  private_matr(idx)[i][j] = real_type{ 0.0 };
87  }
88  }
89  });
90 
91  // implicit group barrier
92 
93  // load data from global into shared memory
94  for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) {
95  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
96  if (private_cond(idx)) {
97  #pragma unroll INTERNAL_BLOCK_SIZE
98  for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
99  const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
100  if (idx.get_local_id(1) == idx_1) {
101  data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)];
102  }
103  const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
104  if (idx.get_local_id(0) == idx_2) {
105  data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)];
106  }
107  }
108  }
109  });
110 
111  // implicit group barrier
112 
113  // load data from shared into private memory and perform the scalar product
114  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
115  if (private_cond(idx)) {
116  #pragma unroll INTERNAL_BLOCK_SIZE
117  for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
118  private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
119  }
120 
121  #pragma unroll INTERNAL_BLOCK_SIZE
122  for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
123  const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
124  #pragma unroll INTERNAL_BLOCK_SIZE
125  for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
126  private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
127  }
128  }
129  }
130  });
131 
132  // implicit group barrier
133  }
134 
135  // kernel function
136  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
137  if (private_cond(idx)) {
138  #pragma unroll INTERNAL_BLOCK_SIZE
139  for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
140  real_type ret_jx = 0.0;
141  #pragma unroll INTERNAL_BLOCK_SIZE
142  for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
143  real_type temp;
144  if (device_ == 0) {
145  temp = (private_matr(idx)[x][y] + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_;
146  } else {
147  temp = private_matr(idx)[x][y] * add_;
148  }
149  if (private_i(idx) + x > private_j(idx) + y) {
150  // upper triangular matrix
151  detail::atomic_op<real_type>{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x];
152  ret_jx += temp * d_[private_i(idx) + y];
153  } else if (private_i(idx) + x == private_j(idx) + y) {
154  // diagonal
155  if (device_ == 0) {
156  ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y];
157  } else {
158  ret_jx += temp * d_[private_i(idx) + y];
159  }
160  }
161  }
162  detail::atomic_op<real_type>{ ret_[private_j(idx) + x] } += ret_jx;
163  }
164  }
165  });
166  }
167 
168  private:
170  const real_type *q_;
171  real_type *ret_;
172  const real_type *d_;
173  const real_type *data_d_;
174  const real_type QA_cost_;
175  const real_type cost_;
176  const kernel_index_type num_rows_;
177  const kernel_index_type feature_range_;
178  const real_type add_;
179  const kernel_index_type device_;
181 };
182 
188 template <typename T>
189 class hierarchical_device_kernel_polynomial {
190   public:
192  using real_type = T;
193 
209  hierarchical_device_kernel_polynomial(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) :
210  q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {}
211 
217  void operator()(::sycl::group<2> group) const {
218  // allocate shared memory (variables declared at work-group scope are placed in local memory)
219  real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
220  real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
221 
222  // allocate memory for work-item local variables
223  // -> accessible across different 'parallel_for_work_item' invocations
224  ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
225  ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
226  ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
227  ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
228  ::sycl::private_memory<bool, 2> private_cond{ group };
229 
230  // initialize private variables
231  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
232  // indices and diagonal condition
233  private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
234  private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
235  private_cond(idx) = private_i(idx) >= private_j(idx);
236  if (private_cond(idx)) {
237  private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
238  private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
239  }
240 
241  // matrix
242  #pragma unroll INTERNAL_BLOCK_SIZE
243  for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
244  #pragma unroll INTERNAL_BLOCK_SIZE
245  for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
246  private_matr(idx)[i][j] = real_type{ 0.0 };
247  }
248  }
249  });
250 
251  // implicit group barrier
252 
253  // load data from global into shared memory
254  for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
255  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
256  if (private_cond(idx)) {
257  #pragma unroll INTERNAL_BLOCK_SIZE
258  for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
259  const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
260  if (idx.get_local_id(1) == idx_1) {
261  data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)];
262  }
263  const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
264  if (idx.get_local_id(0) == idx_2) {
265  data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)];
266  }
267  }
268  }
269  });
270 
271  // implicit group barrier
272 
273  // load data from shared into private memory and perform the scalar product
274  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
275  if (private_cond(idx)) {
276  #pragma unroll INTERNAL_BLOCK_SIZE
277  for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
278  private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
279  }
280 
281  #pragma unroll INTERNAL_BLOCK_SIZE
282  for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
283  const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
284  #pragma unroll INTERNAL_BLOCK_SIZE
285  for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
286  private_matr(idx)[k][l] += data_i * private_data_j(idx)[k];
287  }
288  }
289  }
290  });
291 
292  // implicit group barrier
293  }
294 
295  // kernel function
296  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
297  if (private_cond(idx)) {
298  #pragma unroll INTERNAL_BLOCK_SIZE
299  for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
300  real_type ret_jx = 0.0;
301  #pragma unroll INTERNAL_BLOCK_SIZE
302  for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
303  const real_type temp = (::sycl::pow(gamma_ * private_matr(idx)[x][y] + coef0_, static_cast<real_type>(degree_)) + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_;
304  if (private_i(idx) + x > private_j(idx) + y) {
305  // upper triangular matrix
306  detail::atomic_op<real_type>{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x];
307  ret_jx += temp * d_[private_i(idx) + y];
308  } else if (private_i(idx) + x == private_j(idx) + y) {
309  // diagonal
310  ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y];
311  }
312  }
313  detail::atomic_op<real_type>{ ret_[private_j(idx) + x] } += ret_jx;
314  }
315  }
316  });
317  }
318 
319  private:
321  const real_type *q_;
322  real_type *ret_;
323  const real_type *d_;
324  const real_type *data_d_;
325  const real_type QA_cost_;
326  const real_type cost_;
327  const kernel_index_type num_rows_;
328  const kernel_index_type num_cols_;
329  const real_type add_;
330  const int degree_;
331  const real_type gamma_;
332  const real_type coef0_;
334 };
335 
341 template <typename T>
342 class hierarchical_device_kernel_rbf {
343   public:
345  using real_type = T;
346 
360  hierarchical_device_kernel_rbf(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) :
361  q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {}
362 
368  void operator()(::sycl::group<2> group) const {
369  // allocate shared memory (variables declared at work-group scope are placed in local memory)
370  real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
371  real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE];
372 
373  // allocate memory for work-item local variables
374  // -> accessible across different 'parallel_for_work_item' invocations
375  ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE], 2> private_matr{ group };
376  ::sycl::private_memory<real_type[INTERNAL_BLOCK_SIZE], 2> private_data_j{ group };
377  ::sycl::private_memory<kernel_index_type, 2> private_i{ group };
378  ::sycl::private_memory<kernel_index_type, 2> private_j{ group };
379  ::sycl::private_memory<bool, 2> private_cond{ group };
380 
381  // initialize private variables
382  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
383  // indices and diagonal condition
384  private_i(idx) = group[0] * idx.get_local_range(0) * INTERNAL_BLOCK_SIZE;
385  private_j(idx) = group[1] * idx.get_local_range(1) * INTERNAL_BLOCK_SIZE;
386  private_cond(idx) = private_i(idx) >= private_j(idx);
387  if (private_cond(idx)) {
388  private_i(idx) += idx.get_local_id(0) * INTERNAL_BLOCK_SIZE;
389  private_j(idx) += idx.get_local_id(1) * INTERNAL_BLOCK_SIZE;
390  }
391 
392  // matrix
393  #pragma unroll INTERNAL_BLOCK_SIZE
394  for (kernel_index_type i = 0; i < INTERNAL_BLOCK_SIZE; ++i) {
395  #pragma unroll INTERNAL_BLOCK_SIZE
396  for (kernel_index_type j = 0; j < INTERNAL_BLOCK_SIZE; ++j) {
397  private_matr(idx)[i][j] = real_type{ 0.0 };
398  }
399  }
400  });
401 
402  // implicit group barrier
403 
404  // load data from global into shared memory
405  for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) {
406  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
407  if (private_cond(idx)) {
408  #pragma unroll INTERNAL_BLOCK_SIZE
409  for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) {
410  const std::size_t idx_1 = block_id % THREAD_BLOCK_SIZE;
411  if (idx.get_local_id(1) == idx_1) {
412  data_intern_i[idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + private_i(idx)];
413  }
414  const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE;
415  if (idx.get_local_id(0) == idx_2) {
416  data_intern_j[idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + private_j(idx)];
417  }
418  }
419  }
420  });
421 
422  // implicit group barrier
423 
424  // load data from shared into private memory and accumulate the squared Euclidean distance
425  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
426  if (private_cond(idx)) {
427  #pragma unroll INTERNAL_BLOCK_SIZE
428  for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) {
429  private_data_j(idx)[data_index] = data_intern_j[idx.get_local_id(1)][data_index];
430  }
431 
432  #pragma unroll INTERNAL_BLOCK_SIZE
433  for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) {
434  const real_type data_i = data_intern_i[idx.get_local_id(0)][l];
435  #pragma unroll INTERNAL_BLOCK_SIZE
436  for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) {
437  private_matr(idx)[k][l] += (data_i - private_data_j(idx)[k]) * (data_i - private_data_j(idx)[k]);
438  }
439  }
440  }
441  });
442 
443  // implicit group barrier
444  }
445 
446  // kernel function
447  group.parallel_for_work_item([&](::sycl::h_item<2> idx) {
448  if (private_cond(idx)) {
449  #pragma unroll INTERNAL_BLOCK_SIZE
450  for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) {
451  real_type ret_jx = 0.0;
452  #pragma unroll INTERNAL_BLOCK_SIZE
453  for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) {
454  const real_type temp = (::sycl::exp(-gamma_ * private_matr(idx)[x][y]) + QA_cost_ - q_[private_i(idx) + y] - q_[private_j(idx) + x]) * add_;
455  if (private_i(idx) + x > private_j(idx) + y) {
456  // upper triangular matrix
457  detail::atomic_op<real_type>{ ret_[private_i(idx) + y] } += temp * d_[private_j(idx) + x];
458  ret_jx += temp * d_[private_i(idx) + y];
459  } else if (private_i(idx) + x == private_j(idx) + y) {
460  // diagonal
461  ret_jx += (temp + cost_ * add_) * d_[private_i(idx) + y];
462  }
463  }
464  detail::atomic_op<real_type>{ ret_[private_j(idx) + x] } += ret_jx;
465  }
466  }
467  });
468  }
469 
470  private:
472  const real_type *q_;
473  real_type *ret_;
474  const real_type *d_;
475  const real_type *data_d_;
476  const real_type QA_cost_;
477  const real_type cost_;
478  const kernel_index_type num_rows_;
479  const kernel_index_type num_cols_;
480  const real_type add_;
481  const real_type gamma_;
483 };
484 
485 } // namespace plssvm::sycl::detail
486 
487 #endif // PLSSVM_BACKENDS_SYCL_SVM_KERNEL_HIERARCHICAL_HPP_
Defines an atomic_ref wrapper for the SYCL backend.
Calculates the C-SVM kernel using the hierarchical formulation and the linear kernel function.
Definition: svm_kernel_hierarchical.hpp:31
T real_type
The type of the data.
Definition: svm_kernel_hierarchical.hpp:34
hierarchical_device_kernel_linear(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id)
Construct a new device kernel calculating the C-SVM kernel using the linear C-SVM kernel.
Definition: svm_kernel_hierarchical.hpp:49
void operator()(::sycl::group< 2 > group) const
Function call operator overload performing the actual calculation.
Definition: svm_kernel_hierarchical.hpp:57
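The hierarchical functors above only describe the per-work-group computation; they are meant to be passed to SYCL's hierarchical parallelism API. The following is a minimal launch sketch, not taken from the library: the queue, the grid size, and the device pointer names (q_d, ret_d, d_d, data_d, num_groups, ...) are assumptions for illustration only.

    // Hypothetical launch sketch (not part of this header). All device pointers and
    // sizes are assumed to be allocated and computed elsewhere; only the call pattern is shown.
    ::sycl::queue queue{};
    const ::sycl::range<2> grid{ num_groups, num_groups };                 // number of work-groups per dimension (assumption)
    const ::sycl::range<2> block{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE };  // work-items per work-group (assumption)
    queue.submit([&](::sycl::handler &cgh) {
        cgh.parallel_for_work_group(grid, block,
            plssvm::sycl::detail::hierarchical_device_kernel_linear<float>{
                q_d, ret_d, d_d, data_d, QA_cost, cost, num_rows, feature_range, add, device });
    });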
Calculates the C-SVM kernel using the hierarchical formulation and the polynomial kernel function.
Definition: svm_kernel_hierarchical.hpp:189
hierarchical_device_kernel_polynomial(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0)
Construct a new device kernel calculating the C-SVM kernel using the polynomial C-SVM kernel.
Definition: svm_kernel_hierarchical.hpp:209
T real_type
The type of the data.
Definition: svm_kernel_hierarchical.hpp:192
void operator()(::sycl::group< 2 > group) const
Function call operator overload performing the actual calculation.
Definition: svm_kernel_hierarchical.hpp:217
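In the polynomial variant, private_matr only accumulates the raw scalar product of two data points; the kernel function itself is applied in the final parallel_for_work_item stage (line 303). A hedged host-side reference of the per-entry value, with assumed helper and variable names, could look like this:

    // Hedged host-side reference (assumption, not part of the library): computes the same
    // value as line 303 for a single pair of data points, given their raw dot product.
    #include <cmath>
    #include <cstddef>
    #include <vector>

    template <typename real_type>
    real_type poly_entry(const std::vector<real_type> &x_i, const std::vector<real_type> &x_j,
                         const int degree, const real_type gamma, const real_type coef0,
                         const real_type QA_cost, const real_type q_i, const real_type q_j, const real_type add) {
        real_type dot{ 0.0 };
        for (std::size_t k = 0; k < x_i.size(); ++k) {
            dot += x_i[k] * x_j[k];  // the scalar product accumulated in private_matr
        }
        return (std::pow(gamma * dot + coef0, static_cast<real_type>(degree)) + QA_cost - q_i - q_j) * add;
    }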
Calculates the C-SVM kernel using the hierarchical formulation and the radial basis functions kernel function.
Definition: svm_kernel_hierarchical.hpp:342
void operator()(::sycl::group< 2 > group) const
Function call operator overload performing the actual calculation.
Definition: svm_kernel_hierarchical.hpp:368
T real_type
The type of the data.
Definition: svm_kernel_hierarchical.hpp:345
hierarchical_device_kernel_rbf(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma)
Construct a new device kernel calculating the C-SVM kernel using the radial basis functions kernel function.
Definition: svm_kernel_hierarchical.hpp:360
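In the radial basis function variant, private_matr accumulates the squared Euclidean distance between the two data points, and exp(-gamma * distance) is applied afterwards (line 454). A hedged host-side reference with assumed helper and variable names:

    // Hedged host-side reference (assumption, not part of the library): computes the same
    // value as line 454 for a single pair of data points.
    #include <cmath>
    #include <cstddef>
    #include <vector>

    template <typename real_type>
    real_type rbf_entry(const std::vector<real_type> &x_i, const std::vector<real_type> &x_j,
                        const real_type gamma, const real_type QA_cost,
                        const real_type q_i, const real_type q_j, const real_type add) {
        real_type dist{ 0.0 };
        for (std::size_t k = 0; k < x_i.size(); ++k) {
            const real_type diff = x_i[k] - x_j[k];
            dist += diff * diff;  // the squared distance accumulated in private_matr
        }
        return (std::exp(-gamma * dist) + QA_cost - q_i - q_j) * add;
    }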
Global type definitions and compile-time constants.
Namespace containing the C-SVM using the SYCL backend with the preferred SYCL implementation....
Definition: atomics.hpp:18
::sycl::atomic_ref< T, ::sycl::memory_order::relaxed, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space > atomic_op
Shortcut alias for a sycl::atomic_ref targeting global memory.
Definition: atomics.hpp:25
constexpr kernel_index_type THREAD_BLOCK_SIZE
Global compile-time constant used for internal caching. May be changed during the CMake configuration...
Definition: constants.hpp:25
int kernel_index_type
Integer type used inside kernels.
Definition: constants.hpp:19
constexpr kernel_index_type INTERNAL_BLOCK_SIZE
Global compile-time constant used for internal caching. May be changed during the CMake configuration...
Definition: constants.hpp:32
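The index computation at the start of each kernel (group index times local range times INTERNAL_BLOCK_SIZE) implies that every work-group covers THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE consecutive rows and columns of the kernel matrix. The sketch below shows how a launch grid could be derived from these constants; the ceil division and the variable num_data_points are assumptions for illustration, not library code.

    // Hedged sketch (assumption): derive a 2D grid so that each work-group covers a
    // THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE block of kernel-matrix rows/columns.
    constexpr std::size_t block_dim = static_cast<std::size_t>(THREAD_BLOCK_SIZE) * INTERNAL_BLOCK_SIZE;
    const std::size_t num_groups = (num_data_points + block_dim - 1) / block_dim;  // ceil(num_data_points / block_dim)
    const ::sycl::range<2> grid{ num_groups, num_groups };
    const ::sycl::range<2> block{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE };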