#ifndef PLSSVM_BACKENDS_GPU_CSVM_HPP_
#define PLSSVM_BACKENDS_GPU_CSVM_HPP_

// ...

#include "fmt/chrono.h"  // directly format std::chrono types with {fmt}

// ...
/// A C-SVM implementation for all GPU backends to reduce code duplication.
template <template <typename> typename device_ptr_t, typename queue_t>
class gpu_csvm : public ::plssvm::csvm {
  public:
    /// The type of the device pointer (dependent on the used backend).
    template <typename real_type>
    using device_ptr_type = device_ptr_t<real_type>;
    /// The type of the device queue (dependent on the used backend).
    using queue_type = queue_t;

    // ...

    /// Construct a C-SVM forwarding all parameters args to the plssvm::parameter constructor.
    template <typename... Args>
    explicit gpu_csvm(Args &&...args) :
        ::plssvm::csvm{ std::forward<Args>(args)... } {}
    // ...

  protected:
    /// Solves the system of linear equations using the Conjugated Gradients algorithm.
    [[nodiscard]] std::pair<std::vector<float>, float> solve_system_of_linear_equations(const parameter<float> &params, const std::vector<std::vector<float>> &A, std::vector<float> b, float eps, unsigned long long max_iter) const final {
        return this->solve_system_of_linear_equations_impl(params, A, b, eps, max_iter);
    }
    [[nodiscard]] std::pair<std::vector<double>, double> solve_system_of_linear_equations(const parameter<double> &params, const std::vector<std::vector<double>> &A, std::vector<double> b, double eps, unsigned long long max_iter) const final {
        return this->solve_system_of_linear_equations_impl(params, A, b, eps, max_iter);
    }
    /// Backend independent implementation of the Conjugated Gradients solver.
    template <typename real_type>
    [[nodiscard]] std::pair<std::vector<real_type>, real_type> solve_system_of_linear_equations_impl(const parameter<real_type> &params, const std::vector<std::vector<real_type>> &A, std::vector<real_type> b, real_type eps, unsigned long long max_iter) const;

    /// Uses the already learned model to predict the class of multiple (new) data points.
    [[nodiscard]] std::vector<float> predict_values(const parameter<float> &params, const std::vector<std::vector<float>> &support_vectors, const std::vector<float> &alpha, float rho, std::vector<float> &w, const std::vector<std::vector<float>> &predict_points) const final {
        return this->predict_values_impl(params, support_vectors, alpha, rho, w, predict_points);
    }
    [[nodiscard]] std::vector<double> predict_values(const parameter<double> &params, const std::vector<std::vector<double>> &support_vectors, const std::vector<double> &alpha, double rho, std::vector<double> &w, const std::vector<std::vector<double>> &predict_points) const final {
        return this->predict_values_impl(params, support_vectors, alpha, rho, w, predict_points);
    }
    /// Backend independent implementation of predict_values.
    template <typename real_type>
    [[nodiscard]] std::vector<real_type> predict_values_impl(const parameter<real_type> &params, const std::vector<std::vector<real_type>> &support_vectors, const std::vector<real_type> &alpha, real_type rho, std::vector<real_type> &w, const std::vector<std::vector<real_type>> &predict_points) const;
    /// Performs all necessary steps such that the data is available on the device with the correct layout.
    template <typename real_type>
    [[nodiscard]] std::tuple<std::vector<device_ptr_type<real_type>>, std::vector<device_ptr_type<real_type>>, std::vector<std::size_t>> setup_data_on_device(const std::vector<std::vector<real_type>> &data, std::size_t num_data_points_to_setup, std::size_t num_features_to_setup, std::size_t boundary_size, std::size_t num_used_devices) const;
    /// Calculate the q vector used in the dimensional reduction.
    template <typename real_type>
    [[nodiscard]] std::vector<real_type> generate_q(const parameter<real_type> &params, const std::vector<device_ptr_type<real_type>> &data_d, const std::vector<device_ptr_type<real_type>> &data_last_d, std::size_t num_data_points, const std::vector<std::size_t> &feature_ranges, std::size_t boundary_size) const;

    /// Precalculate the w vector to speed up the prediction when using the linear kernel function.
    template <typename real_type>
    [[nodiscard]] std::vector<real_type> calculate_w(const std::vector<device_ptr_type<real_type>> &data_d, const std::vector<device_ptr_type<real_type>> &data_last_d, const std::vector<device_ptr_type<real_type>> &alpha_d, std::size_t num_data_points, const std::vector<std::size_t> &feature_ranges) const;
    /// Select the correct kernel based on the value of kernel_ and run it on the device denoted by device.
    template <typename real_type>
    void run_device_kernel(std::size_t device, const parameter<real_type> &params, const device_ptr_type<real_type> &q_d, device_ptr_type<real_type> &r_d, const device_ptr_type<real_type> &x_d, const device_ptr_type<real_type> &data_d, const std::vector<std::size_t> &feature_ranges, real_type QA_cost, real_type add, std::size_t dept, std::size_t boundary_size) const;
    /// Combines the data in buffer_d from all devices into buffer and distributes them back to each device.
    template <typename real_type>
    void device_reduction(std::vector<device_ptr_type<real_type>> &buffer_d, std::vector<real_type> &buffer) const;

    // ...
    /// Run the main device kernel used in the CG algorithm.
    virtual void run_svm_kernel(std::size_t device, const detail::execution_range &range, const parameter<float> &params, const device_ptr_type<float> &q_d, device_ptr_type<float> &r_d, const device_ptr_type<float> &x_d, const device_ptr_type<float> &data_d, float QA_cost, float add, std::size_t num_data_points_padded, std::size_t num_features) const = 0;
    virtual void run_svm_kernel(std::size_t device, const detail::execution_range &range, const parameter<double> &params, const device_ptr_type<double> &q_d, device_ptr_type<double> &r_d, const device_ptr_type<double> &x_d, const device_ptr_type<double> &data_d, double QA_cost, double add, std::size_t num_data_points_padded, std::size_t num_features) const = 0;

    // ... (analogous pure virtual hooks: run_q_kernel, run_w_kernel, run_predict_kernel, device_synchronize)

    /// The available/used backend devices.
    std::vector<queue_type> devices_;
};
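The class above only provides the backend independent driver logic: each concrete backend instantiates gpu_csvm with its own device pointer and queue types and implements the pure virtual run_*_kernel hooks. The following miniature is an editorial sketch of that architecture, with all names invented for illustration; it is not PLSSVM code, but it shows how a template-template device pointer parameter plus virtual kernel hooks separate backend independent control flow from backend specific kernels.

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for a backend's device memory handle.
template <typename T>
class fake_device_ptr {
  public:
    explicit fake_device_ptr(std::size_t size = 0) : data_(size) {}
    void copy_to_device(const std::vector<T> &src) { data_ = src; }
    void copy_to_host(std::vector<T> &dst) const { dst = data_; }
  private:
    std::vector<T> data_;
};

// Backend independent base: owns the driver logic, delegates the actual kernel to the backend.
template <template <typename> typename device_ptr_t>
class gpu_solver_base {
  public:
    virtual ~gpu_solver_base() = default;
    double run(const std::vector<double> &values) const {
        device_ptr_t<double> values_d(values.size());
        values_d.copy_to_device(values);               // stage the data on the "device"
        return run_sum_kernel(values_d, values.size());  // backend specific kernel launch
    }
  protected:
    virtual double run_sum_kernel(const device_ptr_t<double> &values_d, std::size_t n) const = 0;
};

// A trivial "backend" implementing the kernel hook.
class cpu_backend final : public gpu_solver_base<fake_device_ptr> {
  protected:
    double run_sum_kernel(const fake_device_ptr<double> &values_d, std::size_t) const override {
        std::vector<double> host;
        values_d.copy_to_host(host);
        double sum = 0.0;
        for (const double v : host) { sum += v; }  // the "kernel"
        return sum;
    }
};

int main() {
    const cpu_backend svm{};
    std::cout << svm.run({ 1.0, 2.0, 3.0 }) << '\n';  // prints 6
}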
template <template <typename> typename device_ptr_t, typename queue_t>
std::size_t gpu_csvm<device_ptr_t, queue_t>::select_num_used_devices(const kernel_function_type kernel, const std::size_t num_features) const noexcept {
    PLSSVM_ASSERT(num_features > 0, "At least one feature must be given!");

    // the polynomial and rbf kernels currently only support single GPU execution
    if (kernel != kernel_function_type::linear && devices_.size() > 1) {
        std::clog << fmt::format("Warning: found {} devices, however only 1 device can be used since the polynomial and rbf kernels currently only support single GPU execution!", devices_.size()) << std::endl;
        return 1;
    }

    // the number of used devices may not exceed the number of features
    const std::size_t num_used_devices = std::min(devices_.size(), num_features);
    if (num_used_devices < devices_.size()) {
        std::clog << fmt::format("Warning: found {} devices, however only {} device(s) can be used since the data set only has {} features!", devices_.size(), num_used_devices, num_features) << std::endl;
    }
    return num_used_devices;
}
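The selection rule therefore caps the device count at the number of features and falls back to a single device for the non-linear kernels. A small standalone restatement of the same rule (simplified, illustrative only; not the PLSSVM implementation):

#include <algorithm>
#include <cstddef>

// Non-linear kernels run on a single device, otherwise at most one device per feature is used.
std::size_t select_devices(const bool linear_kernel, const std::size_t num_devices, const std::size_t num_features) {
    if (!linear_kernel) {
        return 1;  // polynomial and rbf kernels: single GPU execution only
    }
    return std::min(num_devices, num_features);
}

// e.g. select_devices(true, 4, 2) == 2, while select_devices(false, 4, 2) == 1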
template <template <typename> typename device_ptr_t, typename queue_t>
template <typename real_type>
std::tuple<std::vector<device_ptr_t<real_type>>, std::vector<device_ptr_t<real_type>>, std::vector<std::size_t>>
gpu_csvm<device_ptr_t, queue_t>::setup_data_on_device(const std::vector<std::vector<real_type>> &data,
                                                      const std::size_t num_data_points_to_setup,
                                                      const std::size_t num_features_to_setup,
                                                      const std::size_t boundary_size,
                                                      const std::size_t num_used_devices) const {
    PLSSVM_ASSERT(!data.front().empty(), "The data points must contain at least one feature!");
    PLSSVM_ASSERT(std::all_of(data.cbegin(), data.cend(), [&data](const std::vector<real_type> &data_point) { return data_point.size() == data.front().size(); }), "All data points must have the same number of features!");
    PLSSVM_ASSERT(num_data_points_to_setup > 0, "At least one data point must be copied to the device!");
    PLSSVM_ASSERT(num_data_points_to_setup <= data.size(), "Can't copy more data points to the device than are present!: {} <= {}", num_data_points_to_setup, data.size());
    PLSSVM_ASSERT(num_features_to_setup > 0, "At least one feature must be copied to the device!");
    PLSSVM_ASSERT(num_features_to_setup <= data.front().size(), "Can't copy more features to the device than are present!: {} <= {}", num_features_to_setup, data.front().size());
    PLSSVM_ASSERT(num_used_devices <= devices_.size(), "Can't use more devices than are available!: {} <= {}", num_used_devices, devices_.size());

    // split the features evenly across the used devices: device d owns the features [feature_ranges[d], feature_ranges[d + 1])
    std::vector<std::size_t> feature_ranges(num_used_devices + 1);
    for (typename std::vector<queue_type>::size_type device = 0; device <= num_used_devices; ++device) {
        feature_ranges[device] = device * num_features_to_setup / num_used_devices;
    }

    // transform the 2D data into a padded 1D SoA layout
    const std::vector<real_type> transformed_data = detail::transform_to_layout(layout_type::soa, data, boundary_size, num_data_points_to_setup);

    std::vector<device_ptr_type<real_type>> data_last_d(num_used_devices);
    std::vector<device_ptr_type<real_type>> data_d(num_used_devices);

    #pragma omp parallel for default(none) shared(num_used_devices, devices_, feature_ranges, data_last_d, data_d, data, transformed_data) firstprivate(num_data_points_to_setup, boundary_size, num_features_to_setup)
    for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
        const std::size_t num_features_in_range = feature_ranges[device + 1] - feature_ranges[device];

        // copy the feature range of the last data point to the device
        data_last_d[device] = device_ptr_type<real_type>{ num_features_in_range + boundary_size, devices_[device] };
        data_last_d[device].memset(0);
        data_last_d[device].copy_to_device(data.back().data() + feature_ranges[device], 0, num_features_in_range);

        // copy the feature range owned by this device
        const std::size_t device_data_size = num_features_in_range * (num_data_points_to_setup + boundary_size);
        data_d[device] = device_ptr_type<real_type>{ device_data_size, devices_[device] };
        data_d[device].copy_to_device(transformed_data.data() + feature_ranges[device] * (num_data_points_to_setup + boundary_size), 0, device_data_size);
    }

    return std::make_tuple(std::move(data_d), std::move(data_last_d), std::move(feature_ranges));
}
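A concrete worked example of the feature partitioning used above (same integer arithmetic, written as a standalone helper for illustration only):

#include <cstddef>
#include <vector>

// Device d owns the features [ranges[d], ranges[d + 1]).
std::vector<std::size_t> make_feature_ranges(const std::size_t num_features, const std::size_t num_devices) {
    std::vector<std::size_t> ranges(num_devices + 1);
    for (std::size_t device = 0; device <= num_devices; ++device) {
        ranges[device] = device * num_features / num_devices;
    }
    return ranges;
}

// e.g. make_feature_ranges(10, 3) yields { 0, 3, 6, 10 }: device 0 gets 3 features,
// device 1 gets 3 features, and device 2 gets the remaining 4.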
template <template <typename> typename device_ptr_t, typename queue_t>
template <typename real_type>
std::vector<real_type> gpu_csvm<device_ptr_t, queue_t>::generate_q(const parameter<real_type> &params,
                                                                   const std::vector<device_ptr_type<real_type>> &data_d,
                                                                   const std::vector<device_ptr_type<real_type>> &data_last_d,
                                                                   const std::size_t num_data_points,
                                                                   const std::vector<std::size_t> &feature_ranges,
                                                                   const std::size_t boundary_size) const {
    PLSSVM_ASSERT(!data_d.empty(), "The data_d array may not be empty!");
    PLSSVM_ASSERT(std::all_of(data_d.cbegin(), data_d.cend(), [](const device_ptr_type<real_type> &ptr) { return !ptr.empty(); }), "Each device_ptr in data_d must at least contain one data point!");
    PLSSVM_ASSERT(!data_last_d.empty(), "The data_last_d array may not be empty!");
    PLSSVM_ASSERT(std::all_of(data_last_d.cbegin(), data_last_d.cend(), [](const device_ptr_type<real_type> &ptr) { return !ptr.empty(); }), "Each device_ptr in data_last_d must at least contain one data point!");
    PLSSVM_ASSERT(data_d.size() == data_last_d.size(), "The number of used devices in the data_d and data_last_d vectors must be equal!: {} != {}", data_d.size(), data_last_d.size());
    PLSSVM_ASSERT(num_data_points > 0, "At least one data point must be used to calculate q!");
    PLSSVM_ASSERT(feature_ranges.size() == data_d.size() + 1, "The number of values in the feature_ranges vector must be exactly one more than the number of used devices!: {} != {} + 1", feature_ranges.size(), data_d.size());
    PLSSVM_ASSERT(std::adjacent_find(feature_ranges.cbegin(), feature_ranges.cend(), std::greater_equal<>{}) == feature_ranges.cend(), "The feature ranges are not monotonically increasing!");

    const std::size_t num_used_devices = data_d.size();
    std::vector<device_ptr_type<real_type>> q_d(num_used_devices);

    #pragma omp parallel for default(none) shared(num_used_devices, q_d, devices_, data_d, data_last_d, feature_ranges, params) firstprivate(num_data_points, boundary_size, THREAD_BLOCK_SIZE)
    for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
        q_d[device] = device_ptr_type<real_type>{ num_data_points + boundary_size, devices_[device] };
        q_d[device].memset(0);

        // set up the execution range and run the backend specific q kernel
        const detail::execution_range range{ /* ... */ };
        run_q_kernel(device, range, params, q_d[device], data_d[device], data_last_d[device], num_data_points + boundary_size, feature_ranges[device + 1] - feature_ranges[device]);
    }

    std::vector<real_type> q(num_data_points);
    device_reduction(q_d, q);
    return q;
}
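As far as can be inferred from the kernel's arguments (data_d and data_last_d), q[i] holds the kernel value between data point i and the last data point used for the dimensional reduction. A host-side sketch for the linear kernel (editorial illustration, not PLSSVM code; for polynomial or rbf the dot product would be replaced by the respective kernel function):

#include <cstddef>
#include <vector>

template <typename real_type>
std::vector<real_type> generate_q_host(const std::vector<std::vector<real_type>> &data) {
    const std::size_t dept = data.size() - 1;  // the last point is reserved for the reduction
    std::vector<real_type> q(dept, real_type{ 0.0 });
    for (std::size_t i = 0; i < dept; ++i) {
        for (std::size_t d = 0; d < data[i].size(); ++d) {
            q[i] += data[i][d] * data.back()[d];  // linear kernel: dot product with the last point
        }
    }
    return q;
}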
template <template <typename> typename device_ptr_t, typename queue_t>
template <typename real_type>
std::vector<real_type> gpu_csvm<device_ptr_t, queue_t>::calculate_w(const std::vector<device_ptr_type<real_type>> &data_d,
                                                                    const std::vector<device_ptr_type<real_type>> &data_last_d,
                                                                    const std::vector<device_ptr_type<real_type>> &alpha_d,
                                                                    const std::size_t num_data_points,
                                                                    const std::vector<std::size_t> &feature_ranges) const {
    PLSSVM_ASSERT(!data_d.empty(), "The data_d array may not be empty!");
    PLSSVM_ASSERT(std::all_of(data_d.cbegin(), data_d.cend(), [](const device_ptr_type<real_type> &ptr) { return !ptr.empty(); }), "Each device_ptr in data_d must at least contain one data point!");
    PLSSVM_ASSERT(!data_last_d.empty(), "The data_last_d array may not be empty!");
    PLSSVM_ASSERT(std::all_of(data_last_d.cbegin(), data_last_d.cend(), [](const device_ptr_type<real_type> &ptr) { return !ptr.empty(); }), "Each device_ptr in data_last_d must at least contain one data point!");
    PLSSVM_ASSERT(data_d.size() == data_last_d.size(), "The number of used devices in the data_d and data_last_d vectors must be equal!: {} != {}", data_d.size(), data_last_d.size());
    PLSSVM_ASSERT(!alpha_d.empty(), "The alpha_d array may not be empty!");
    PLSSVM_ASSERT(std::all_of(alpha_d.cbegin(), alpha_d.cend(), [](const device_ptr_type<real_type> &ptr) { return !ptr.empty(); }), "Each device_ptr in alpha_d must at least contain one data point!");
    PLSSVM_ASSERT(data_d.size() == alpha_d.size(), "The number of used devices in the data_d and alpha_d vectors must be equal!: {} != {}", data_d.size(), alpha_d.size());
    PLSSVM_ASSERT(num_data_points > 0, "At least one data point must be used to calculate w!");
    PLSSVM_ASSERT(feature_ranges.size() == data_d.size() + 1, "The number of values in the feature_ranges vector must be exactly one more than the number of used devices!: {} != {} + 1", feature_ranges.size(), data_d.size());
    PLSSVM_ASSERT(std::adjacent_find(feature_ranges.cbegin(), feature_ranges.cend(), std::greater_equal<>{}) == feature_ranges.cend(), "The feature ranges are not monotonically increasing!");

    const std::size_t num_used_devices = data_d.size();

    // w covers all features; each device fills the feature range it owns
    std::vector<real_type> w(feature_ranges.back(), real_type{ 0.0 });

    #pragma omp parallel for default(none) shared(num_used_devices, devices_, feature_ranges, alpha_d, data_d, data_last_d, w) firstprivate(num_data_points, THREAD_BLOCK_SIZE)
    for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
        const std::size_t num_features_in_range = feature_ranges[device + 1] - feature_ranges[device];

        // allocate the partial w vector on the device and run the backend specific w kernel
        device_ptr_type<real_type> w_d{ num_features_in_range, devices_[device] };
        const detail::execution_range range{ /* ... */ };
        run_w_kernel(device, range, w_d, alpha_d[device], data_d[device], data_last_d[device], num_data_points, num_features_in_range);

        // copy the partial result back into the feature range owned by this device
        w_d.copy_to_host(w.data() + feature_ranges[device], 0, num_features_in_range);
    }

    return w;
}
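The vector assembled here is the usual linear-SVM weight vector w = sum_i alpha_i * sv_i, so the linear decision function collapses to f(x) = w . x - rho. A host-side reference sketch (editorial illustration, not the device kernel):

#include <cstddef>
#include <vector>

template <typename real_type>
std::vector<real_type> calculate_w_host(const std::vector<std::vector<real_type>> &support_vectors,
                                        const std::vector<real_type> &alpha) {
    std::vector<real_type> w(support_vectors.front().size(), real_type{ 0.0 });
    for (std::size_t i = 0; i < support_vectors.size(); ++i) {
        for (std::size_t d = 0; d < w.size(); ++d) {
            w[d] += alpha[i] * support_vectors[i][d];  // accumulate the weighted support vectors
        }
    }
    return w;
}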
template <template <typename> typename device_ptr_t, typename queue_t>
template <typename real_type>
void gpu_csvm<device_ptr_t, queue_t>::run_device_kernel(const std::size_t device,
                                                        const parameter<real_type> &params,
                                                        const device_ptr_type<real_type> &q_d,
                                                        device_ptr_type<real_type> &r_d,
                                                        const device_ptr_type<real_type> &x_d,
                                                        const device_ptr_type<real_type> &data_d,
                                                        const std::vector<std::size_t> &feature_ranges,
                                                        const real_type QA_cost,
                                                        const real_type add,
                                                        const std::size_t dept,
                                                        const std::size_t boundary_size) const {
    PLSSVM_ASSERT(device < devices_.size(), "Requested device {}, but only {} device(s) are available!", device, devices_.size());
    PLSSVM_ASSERT(!q_d.empty(), "The q_d device_ptr may not be empty!");
    PLSSVM_ASSERT(!r_d.empty(), "The r_d device_ptr may not be empty!");
    PLSSVM_ASSERT(!x_d.empty(), "The x_d device_ptr may not be empty!");
    PLSSVM_ASSERT(!data_d.empty(), "The data_d device_ptr may not be empty!");
    PLSSVM_ASSERT(std::adjacent_find(feature_ranges.cbegin(), feature_ranges.cend(), std::greater_equal<>{}) == feature_ranges.cend(), "The feature ranges are not monotonically increasing!");
    PLSSVM_ASSERT(add == real_type{ -1.0 } || add == real_type{ 1.0 }, "add must either be -1.0 or 1.0, but is {}!", add);
    PLSSVM_ASSERT(dept > 0, "At least one data point must be used to calculate q!");

    // calculate the number of blocks needed to cover all dept data points
    const auto grid = static_cast<std::size_t>(std::ceil(static_cast<real_type>(dept) / static_cast<real_type>(boundary_size)));
    const detail::execution_range range{ /* ... built from grid and THREAD_BLOCK_SIZE ... */ };

    run_svm_kernel(device, range, params, q_d, r_d, x_d, data_d, QA_cost, add, dept + boundary_size, feature_ranges[device + 1] - feature_ranges[device]);
}
template <template <typename> typename device_ptr_t, typename queue_t>
template <typename real_type>
void gpu_csvm<device_ptr_t, queue_t>::device_reduction(std::vector<device_ptr_type<real_type>> &buffer_d, std::vector<real_type> &buffer) const {
    PLSSVM_ASSERT(!buffer_d.empty(), "The buffer_d array may not be empty!");
    PLSSVM_ASSERT(std::all_of(buffer_d.cbegin(), buffer_d.cend(), [](const device_ptr_type<real_type> &ptr) { return !ptr.empty(); }), "Each device_ptr in buffer_d must at least contain one data point!");
    PLSSVM_ASSERT(!buffer.empty(), "The buffer array may not be empty!");

    // start with the partial result of the first device
    device_synchronize(devices_[0]);
    buffer_d[0].copy_to_host(buffer, 0, buffer.size());

    if (buffer_d.size() > 1) {
        // accumulate the partial results of all remaining devices on the host
        std::vector<real_type> ret(buffer.size());
        for (typename std::vector<device_ptr_type<real_type>>::size_type device = 1; device < buffer_d.size(); ++device) {
            device_synchronize(devices_[device]);
            buffer_d[device].copy_to_host(ret, 0, ret.size());
            for (typename std::vector<real_type>::size_type i = 0; i < ret.size(); ++i) {
                buffer[i] += ret[i];
            }
        }

        // distribute the reduced result back to all devices
        #pragma omp parallel for default(none) shared(buffer_d, buffer)
        for (typename std::vector<device_ptr_type<real_type>>::size_type device = 0; device < buffer_d.size(); ++device) {
            buffer_d[device].copy_to_device(buffer, 0, buffer.size());
        }
    }
}
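The reduction therefore behaves like a host-staged all-reduce over the per-device partial results: sum on the host, then broadcast the sum back so every device continues with identical data. A small self-contained illustration of the same semantics on plain vectors (editorial sketch, names invented):

#include <cstddef>
#include <vector>

template <typename real_type>
void all_reduce(std::vector<std::vector<real_type>> &per_device_buffers) {
    std::vector<real_type> &result = per_device_buffers.front();
    for (std::size_t device = 1; device < per_device_buffers.size(); ++device) {
        for (std::size_t i = 0; i < result.size(); ++i) {
            result[i] += per_device_buffers[device][i];  // accumulate the partial results
        }
    }
    for (std::size_t device = 1; device < per_device_buffers.size(); ++device) {
        per_device_buffers[device] = result;  // broadcast the reduced vector back
    }
}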
template <template <typename> typename device_ptr_t, typename queue_t>
template <typename real_type>
std::pair<std::vector<real_type>, real_type> gpu_csvm<device_ptr_t, queue_t>::solve_system_of_linear_equations_impl(const parameter<real_type> &params,
                                                                                                                     const std::vector<std::vector<real_type>> &A,
                                                                                                                     std::vector<real_type> b,
                                                                                                                     const real_type eps,
                                                                                                                     const unsigned long long max_iter) const {
    PLSSVM_ASSERT(!A.front().empty(), "The data points must contain at least one feature!");
    PLSSVM_ASSERT(std::all_of(A.cbegin(), A.cend(), [&A](const std::vector<real_type> &data_point) { return data_point.size() == A.front().size(); }), "All data points must have the same number of features!");
    PLSSVM_ASSERT(A.size() == b.size(), "The number of data points in the matrix A ({}) and the values in the right-hand side vector ({}) must be the same!", A.size(), b.size());
    PLSSVM_ASSERT(eps > real_type{ 0.0 }, "The stopping criterion in the CG algorithm must be greater than 0.0, but is {}!", eps);
    PLSSVM_ASSERT(max_iter > 0, "The number of CG iterations must be greater than 0!");

    // dimensional reduction: solve for all but the last data point
    const std::size_t dept = A.size() - 1;
    const std::size_t boundary_size = static_cast<std::size_t>(THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE);
    const std::size_t num_features = A.front().size();

    const std::size_t num_used_devices = this->select_num_used_devices(params.kernel_type, num_features);

    std::vector<device_ptr_type<real_type>> data_d;
    std::vector<device_ptr_type<real_type>> data_last_d;
    std::vector<std::size_t> feature_ranges;
    std::tie(data_d, data_last_d, feature_ranges) = this->setup_data_on_device(A, dept, num_features, boundary_size, num_used_devices);
    // calculate the q vector and QA_cost used in the dimensional reduction
    const std::vector<real_type> q = this->generate_q(params, data_d, data_last_d, dept, feature_ranges, boundary_size);
    const real_type QA_cost = kernel_function(A.back(), A.back(), params) + real_type{ 1.0 } / params.cost;

    // remember the last right-hand side value
    const real_type b_back_value = b.back();
    // ...

    std::vector<real_type> x(dept, 1.0);
    std::vector<device_ptr_type<real_type>> x_d(num_used_devices);

    std::vector<real_type> r(dept, 0.0);
    std::vector<device_ptr_type<real_type>> r_d(num_used_devices);

    #pragma omp parallel for default(none) shared(num_used_devices, devices_, x, x_d, r_d) firstprivate(dept, boundary_size)
    for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
        x_d[device] = device_ptr_type<real_type>{ dept + boundary_size, devices_[device] };
        x_d[device].memset(0);
        x_d[device].copy_to_device(x, 0, dept);

        r_d[device] = device_ptr_type<real_type>{ dept + boundary_size, devices_[device] };
        r_d[device].memset(0);
    }
    r_d[0].copy_to_device(b, 0, dept);
    std::vector<device_ptr_type<real_type>> q_d(num_used_devices);
    #pragma omp parallel for default(none) shared(num_used_devices, devices_, q, q_d, r_d, x_d, data_d, feature_ranges, params) firstprivate(dept, boundary_size, QA_cost, num_features)
    for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
        q_d[device] = device_ptr_type<real_type>{ dept + boundary_size, devices_[device] };
        q_d[device].memset(0);
        q_d[device].copy_to_device(q, 0, dept);

        // compute the initial residual r = b - A * x
        run_device_kernel(device, params, q_d[device], r_d[device], x_d[device], data_d[device], feature_ranges, QA_cost, real_type{ -1.0 }, dept, boundary_size);
    }
    device_reduction(r_d, r);

    // delta = r^T * r
    real_type delta = transposed{ r } * r;
    const real_type delta0 = delta;
    std::vector<real_type> Ad(dept);

    std::vector<device_ptr_type<real_type>> Ad_d(num_used_devices);
    for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
        Ad_d[device] = device_ptr_type<real_type>{ dept + boundary_size, devices_[device] };
    }

    std::vector<real_type> d(r);
    // helper lambda logging the duration of a single CG iteration
    std::chrono::milliseconds average_iteration_time{};
    std::chrono::steady_clock::time_point iteration_start_time{};
    const auto output_iteration_duration = [&]() {
        const auto iteration_end_time = std::chrono::steady_clock::now();
        const auto iteration_duration = std::chrono::duration_cast<std::chrono::milliseconds>(iteration_end_time - iteration_start_time);
        detail::log(verbosity_level::full,  // verbosity level assumed in this reconstruction
                    "Done in {}.\n", iteration_duration);
        average_iteration_time += iteration_duration;
    };

    unsigned long long iter = 0;
    for (; iter < max_iter; ++iter) {
        detail::log(verbosity_level::full,
                    "Start Iteration {} (max: {}) with current residuum {} (target: {}). ", iter + 1, max_iter, delta, eps * eps * delta0);
        iteration_start_time = std::chrono::steady_clock::now();
        // Ad = A * d (dimensionally reduced matrix-vector product on the devices)
        #pragma omp parallel for default(none) shared(num_used_devices, devices_, Ad_d, r_d, q_d, data_d, feature_ranges, params) firstprivate(dept, QA_cost, boundary_size, num_features)
        for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
            Ad_d[device].memset(0);
            r_d[device].memset(0, dept);

            run_device_kernel(device, params, q_d[device], Ad_d[device], r_d[device], data_d[device], feature_ranges, QA_cost, real_type{ 1.0 }, dept, boundary_size);
        }
        device_reduction(Ad_d, Ad);

        const real_type alpha_cd = delta / (transposed{ d } * Ad);

        // update the solution: x += alpha_cd * d
        // ...

        #pragma omp parallel for default(none) shared(num_used_devices, devices_, x, x_d) firstprivate(dept)
        for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
            x_d[device].copy_to_device(x, 0, dept);
        }

        if (iter % 50 == 49) {
            // every 50th iteration: recompute the residual r = b - A * x explicitly to avoid error accumulation
            #pragma omp parallel for default(none) shared(devices_, r_d, b, q_d, x_d, params, data_d, feature_ranges) firstprivate(QA_cost, dept)
            for (typename std::vector<queue_type>::size_type device = 0; device < devices_.size(); ++device) {
                if (device == 0) {
                    r_d[device].copy_to_device(b, 0, dept);
                } else {
                    r_d[device].memset(0);
                }
                run_device_kernel(device, params, q_d[device], r_d[device], x_d[device], data_d[device], feature_ranges, QA_cost, real_type{ -1.0 }, dept, boundary_size);
            }
            device_reduction(r_d, r);
        } else {
            // otherwise update the residual iteratively: r -= alpha_cd * Ad
            // ...
        }
        // calculate the new residuum norm and check the stopping criterion
        const real_type delta_old = delta;
        delta = transposed{ r } * r;
        if (delta <= eps * eps * delta0) {
            output_iteration_duration();
            break;
        }

        // update the search direction: d = r + beta * d
        const real_type beta = delta / delta_old;
        // ...

        #pragma omp parallel for default(none) shared(num_used_devices, devices_, r_d, d) firstprivate(dept)
        for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
            r_d[device].copy_to_device(d, 0, dept);
        }

        output_iteration_duration();
    }

    detail::log(verbosity_level::full,
                "Finished after {}/{} iterations with a residuum of {} (target: {}) and an average iteration time of {}.\n",
                std::min(iter + 1, max_iter), max_iter, delta, eps * eps * delta0,
                detail::tracking_entry{ "cg", "avg_iteration_time", average_iteration_time / std::min(iter + 1, max_iter) });
    detail::log(verbosity_level::full,
                "optimization finished, #iter = {}\n", std::min(iter + 1, max_iter));
    // calculate the weight vector, the bias, and undo the dimensional reduction
    std::vector<real_type> alpha(x.begin(), x.begin() + dept);
    const real_type bias = b_back_value + QA_cost * sum(alpha) - (transposed{ q } * alpha);
    alpha.push_back(-sum(alpha));

    return std::make_pair(std::move(alpha), -bias);
}
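To separate the CG mathematics from the device plumbing above, the following is a host-side reference of the same iteration (editorial sketch with invented names; the residual restart every 50 iterations and the dimensional reduction are omitted):

#include <cstddef>
#include <vector>

template <typename real_type>
std::vector<real_type> cg_reference(const std::vector<std::vector<real_type>> &A,
                                    const std::vector<real_type> &b,
                                    const real_type eps,
                                    const std::size_t max_iter) {
    const std::size_t n = b.size();
    const auto dot = [](const std::vector<real_type> &u, const std::vector<real_type> &v) {
        real_type s{ 0.0 };
        for (std::size_t i = 0; i < u.size(); ++i) { s += u[i] * v[i]; }
        return s;
    };
    const auto matvec = [&](const std::vector<real_type> &v) {
        std::vector<real_type> out(n, real_type{ 0.0 });
        for (std::size_t i = 0; i < n; ++i) {
            for (std::size_t j = 0; j < n; ++j) { out[i] += A[i][j] * v[j]; }
        }
        return out;
    };

    std::vector<real_type> x(n, real_type{ 1.0 });
    std::vector<real_type> r = b;                 // r = b - A * x
    const std::vector<real_type> Ax = matvec(x);
    for (std::size_t i = 0; i < n; ++i) { r[i] -= Ax[i]; }
    std::vector<real_type> d = r;                 // initial search direction
    real_type delta = dot(r, r);
    const real_type delta0 = delta;

    for (std::size_t iter = 0; iter < max_iter && delta > eps * eps * delta0; ++iter) {
        const std::vector<real_type> Ad = matvec(d);
        const real_type alpha_cd = delta / dot(d, Ad);
        for (std::size_t i = 0; i < n; ++i) { x[i] += alpha_cd * d[i]; }   // update solution
        for (std::size_t i = 0; i < n; ++i) { r[i] -= alpha_cd * Ad[i]; }  // update residual
        const real_type delta_old = delta;
        delta = dot(r, r);
        const real_type beta = delta / delta_old;
        for (std::size_t i = 0; i < n; ++i) { d[i] = r[i] + beta * d[i]; } // new search direction
    }
    return x;
}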
template <template <typename> typename device_ptr_t, typename queue_t>
template <typename real_type>
std::vector<real_type> gpu_csvm<device_ptr_t, queue_t>::predict_values_impl(const parameter<real_type> &params,
                                                                            const std::vector<std::vector<real_type>> &support_vectors,
                                                                            const std::vector<real_type> &alpha,
                                                                            const real_type rho,
                                                                            std::vector<real_type> &w,
                                                                            const std::vector<std::vector<real_type>> &predict_points) const {
    PLSSVM_ASSERT(!support_vectors.empty(), "The support vectors must not be empty!");
    PLSSVM_ASSERT(!support_vectors.front().empty(), "The support vectors must contain at least one feature!");
    PLSSVM_ASSERT(std::all_of(support_vectors.cbegin(), support_vectors.cend(), [&support_vectors](const std::vector<real_type> &data_point) { return data_point.size() == support_vectors.front().size(); }), "All support vectors must have the same number of features!");
    PLSSVM_ASSERT(support_vectors.size() == alpha.size(), "The number of support vectors ({}) and number of weights ({}) must be the same!", support_vectors.size(), alpha.size());
    PLSSVM_ASSERT(w.empty() || support_vectors.front().size() == w.size(), "Either w must be empty or contain exactly the same number of values ({}) as features are present ({})!", w.size(), support_vectors.front().size());
    PLSSVM_ASSERT(!predict_points.empty(), "The data points to predict must not be empty!");
    PLSSVM_ASSERT(!predict_points.front().empty(), "The data points to predict must contain at least one feature!");
    PLSSVM_ASSERT(std::all_of(predict_points.cbegin(), predict_points.cend(), [&predict_points](const std::vector<real_type> &data_point) { return data_point.size() == predict_points.front().size(); }), "All data points to predict must have the same number of features!");
    PLSSVM_ASSERT(support_vectors.front().size() == predict_points.front().size(), "The number of features in the support vectors ({}) must be the same as in the data points to predict ({})!", support_vectors.front().size(), predict_points.front().size());
    const std::size_t num_support_vectors = support_vectors.size();
    const std::size_t num_predict_points = predict_points.size();
    const std::size_t num_features = predict_points.front().size();
    const std::size_t boundary_size = static_cast<std::size_t>(THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE);

    const std::size_t num_used_devices = this->select_num_used_devices(params.kernel_type, num_features);

    auto [data_d, data_last_d, feature_ranges] = this->setup_data_on_device(support_vectors, num_support_vectors - 1, num_features, boundary_size, num_used_devices);

    // copy the weights to each device
    std::vector<device_ptr_type<real_type>> alpha_d(num_used_devices);
    #pragma omp parallel for default(none) shared(num_used_devices, devices_, alpha_d, alpha) firstprivate(num_support_vectors)
    for (typename std::vector<queue_type>::size_type device = 0; device < num_used_devices; ++device) {
        // ... (allocate alpha_d[device] on devices_[device])
        alpha_d[device].memset(0);
        alpha_d[device].copy_to_device(alpha, 0, num_support_vectors);
    }

    std::vector<real_type> out(predict_points.size());

    if (params.kernel_type == kernel_function_type::linear) {
        // fast path: precalculate w once and evaluate the decision function on the host
        if (w.empty()) {
            w = calculate_w(data_d, data_last_d, alpha_d, support_vectors.size(), feature_ranges);
        }

        #pragma omp parallel for default(none) shared(out, predict_points, w) firstprivate(num_predict_points, rho)
        for (typename std::vector<std::vector<real_type>>::size_type i = 0; i < num_predict_points; ++i) {
            out[i] = transposed{ w } * predict_points[i] - rho;
        }
    } else {
        // general path: copy the points to predict to the first device and run the predict kernel there
        // ... (transform predict_points into the device layout, allocate point_d and out_d on devices_[0])
        point_d.copy_to_device(transformed_data, 0, transformed_data.size());

        const detail::execution_range range{ /* ... */ static_cast<std::size_t>(std::ceil(static_cast<real_type>(num_predict_points) / static_cast<real_type>(THREAD_BLOCK_SIZE))) /* ... */ };
        run_predict_kernel(range, params, out_d, alpha_d[0], point_d, data_d[0], data_last_d[0], num_support_vectors, num_predict_points, num_features);

        out_d.copy_to_host(out, 0, num_predict_points);
    }

    return out;
}
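Both branches compute the same decision value per point, f(x) = sum_i alpha_i * k(sv_i, x) - rho; the linear branch merely folds the sum over support vectors into the precalculated w. A host-side sketch of the linear case (editorial illustration, not PLSSVM code):

#include <cstddef>
#include <vector>

template <typename real_type>
real_type predict_one_linear(const std::vector<real_type> &w, const std::vector<real_type> &x, const real_type rho) {
    real_type result{ 0.0 };
    for (std::size_t d = 0; d < w.size(); ++d) {
        result += w[d] * x[d];  // w . x
    }
    return result - rho;  // subtract the bias term
}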
Referenced symbols:

#define PLSSVM_ASSERT(cond, msg, ...)
    Defines the PLSSVM_ASSERT macro if PLSSVM_ASSERT_ENABLED is defined. (Definition: assert.hpp:74)
plssvm::csvm
    Base class for all C-SVM backends. (Definition: csvm.hpp:50)
plssvm::detail::execution_range
    Class specifying a backend independent execution range. (Definition: execution_range.hpp:31)
gpu_csvm
    A C-SVM implementation for all GPU backends to reduce code duplication. (Definition: gpu_csvm.hpp:46)
std::vector< real_type > generate_q(const parameter< real_type > &params, const std::vector< device_ptr_type< real_type >> &data_d, const std::vector< device_ptr_type< real_type >> &data_last_d, std::size_t num_data_points, const std::vector< std::size_t > &feature_ranges, std::size_t boundary_size) const
    Calculate the q vector used in the dimensional reduction. (Definition: gpu_csvm.hpp:351)
gpu_csvm(const gpu_csvm &)=delete
    Delete copy-constructor since a CSVM is a move-only type.
std::vector< real_type > calculate_w(const std::vector< device_ptr_type< real_type >> &data_d, const std::vector< device_ptr_type< real_type >> &data_last_d, const std::vector< device_ptr_type< real_type >> &alpha_d, std::size_t num_data_points, const std::vector< std::size_t > &feature_ranges) const
    Precalculate the w vector to speed up the prediction when using the linear kernel function. (Definition: gpu_csvm.hpp:388)
std::size_t select_num_used_devices(kernel_function_type kernel, std::size_t num_features) const noexcept
    Returns the number of usable devices given the kernel function kernel and the number of features num_features. (Definition: gpu_csvm.hpp:284)
std::pair< std::vector< float >, float > solve_system_of_linear_equations(const parameter< float > &params, const std::vector< std::vector< float >> &A, std::vector< float > b, float eps, unsigned long long max_iter) const final
    Solves the system of linear equations using the Conjugated Gradients algorithm. (Definition: gpu_csvm.hpp:101)
std::vector< double > predict_values(const parameter< double > &params, const std::vector< std::vector< double >> &support_vectors, const std::vector< double > &alpha, double rho, std::vector< double > &w, const std::vector< std::vector< double >> &predict_points) const final
    Uses the already learned model to predict the class of multiple (new) data points. (Definition: gpu_csvm.hpp:119)
std::tuple< std::vector< device_ptr_type< real_type > >, std::vector< device_ptr_type< real_type > >, std::vector< std::size_t > > setup_data_on_device(const std::vector< std::vector< real_type >> &data, std::size_t num_data_points_to_setup, std::size_t num_features_to_setup, std::size_t boundary_size, std::size_t num_used_devices) const
    Performs all necessary steps such that the data is available on the device with the correct layout.
std::vector< queue_type > devices_
    The available/used backend devices. (Definition: gpu_csvm.hpp:280)
virtual void device_synchronize(const queue_type &queue) const = 0
    Synchronize the device denoted by queue.
void run_device_kernel(std::size_t device, const parameter< real_type > &params, const device_ptr_type< real_type > &q_d, device_ptr_type< real_type > &r_d, const device_ptr_type< real_type > &x_d, const device_ptr_type< real_type > &data_d, const std::vector< std::size_t > &feature_ranges, real_type QA_cost, real_type add, std::size_t dept, std::size_t boundary_size) const
    Select the correct kernel based on the value of kernel_ and run it on the device denoted by device. (Definition: gpu_csvm.hpp:433)
virtual void run_svm_kernel(std::size_t device, const detail::execution_range &range, const parameter< float > &params, const device_ptr_type< float > &q_d, device_ptr_type< float > &r_d, const device_ptr_type< float > &x_d, const device_ptr_type< float > &data_d, float QA_cost, float add, std::size_t num_data_points_padded, std::size_t num_features) const = 0
    Run the main device kernel used in the CG algorithm.
queue_t queue_type
    The type of the device queue (dependent on the used backend). (Definition: gpu_csvm.hpp:52)
virtual void run_w_kernel(std::size_t device, const detail::execution_range &range, device_ptr_type< double > &w_d, const device_ptr_type< double > &alpha_d, const device_ptr_type< double > &data_d, const device_ptr_type< double > &data_last_d, std::size_t num_data_points, std::size_t num_features) const = 0
    Run the device kernel to calculate the w vector used to speed up the prediction when using the linear kernel function.
std::pair< std::vector< double >, double > solve_system_of_linear_equations(const parameter< double > &params, const std::vector< std::vector< double >> &A, std::vector< double > b, double eps, unsigned long long max_iter) const final
    Solves the system of linear equations using the Conjugated Gradients algorithm. (Definition: gpu_csvm.hpp:105)
gpu_csvm(Args &&...args)
    Construct a C-SVM forwarding all parameters args to the plssvm::parameter constructor. (Definition: gpu_csvm.hpp:65)
void device_reduction(std::vector< device_ptr_type< real_type >> &buffer_d, std::vector< real_type > &buffer) const
    Combines the data in buffer_d from all devices into buffer and distributes them back to each device. (Definition: gpu_csvm.hpp:451)
gpu_csvm(plssvm::parameter params={})
    Construct a C-SVM using the SVM parameter params. (Definition: gpu_csvm.hpp:57)
virtual void run_predict_kernel(const detail::execution_range &range, const parameter< float > &params, device_ptr_type< float > &out_d, const device_ptr_type< float > &alpha_d, const device_ptr_type< float > &point_d, const device_ptr_type< float > &data_d, const device_ptr_type< float > &data_last_d, std::size_t num_support_vectors, std::size_t num_predict_points, std::size_t num_features) const = 0
    Run the device kernel (only on the first device) to predict the new data points point_d.
std::vector< float > predict_values(const parameter< float > &params, const std::vector< std::vector< float >> &support_vectors, const std::vector< float > &alpha, float rho, std::vector< float > &w, const std::vector< std::vector< float >> &predict_points) const final
    Uses the already learned model to predict the class of multiple (new) data points. (Definition: gpu_csvm.hpp:115)
virtual void run_q_kernel(std::size_t device, const detail::execution_range &range, const parameter< float > &params, device_ptr_type< float > &q_d, const device_ptr_type< float > &data_d, const device_ptr_type< float > &data_last_d, std::size_t num_data_points_padded, std::size_t num_features) const = 0
    Run the device kernel filling the q vector.
gpu_csvm(gpu_csvm &&) noexcept=default
    Default move-constructor since a virtual destructor has been declared.
virtual void run_w_kernel(std::size_t device, const detail::execution_range &range, device_ptr_type< float > &w_d, const device_ptr_type< float > &alpha_d, const device_ptr_type< float > &data_d, const device_ptr_type< float > &data_last_d, std::size_t num_data_points, std::size_t num_features) const = 0
    Run the device kernel to calculate the w vector used to speed up the prediction when using the linear kernel function.
std::size_t num_available_devices() const noexcept
    Return the number of available devices for the current backend. (Definition: gpu_csvm.hpp:93)
virtual void run_q_kernel(std::size_t device, const detail::execution_range &range, const parameter< double > &params, device_ptr_type< double > &q_d, const device_ptr_type< double > &data_d, const device_ptr_type< double > &data_last_d, std::size_t num_data_points_padded, std::size_t num_features) const = 0
    Run the device kernel filling the q vector.
virtual void run_predict_kernel(const detail::execution_range &range, const parameter< double > &params, device_ptr_type< double > &out_d, const device_ptr_type< double > &alpha_d, const device_ptr_type< double > &point_d, const device_ptr_type< double > &data_d, const device_ptr_type< double > &data_last_d, std::size_t num_support_vectors, std::size_t num_predict_points, std::size_t num_features) const = 0
    Run the device kernel (only on the first device) to predict the new data points point_d.
std::pair< std::vector< real_type >, real_type > solve_system_of_linear_equations_impl(const parameter< real_type > &params, const std::vector< std::vector< real_type >> &A, std::vector< real_type > b, real_type eps, unsigned long long max_iter) const
    Solves the system of linear equations using the Conjugated Gradients algorithm. (Definition: gpu_csvm.hpp:479)
device_ptr_t< real_type > device_ptr_type
    The type of the device pointer (dependent on the used backend). (Definition: gpu_csvm.hpp:50)
virtual void run_svm_kernel(std::size_t device, const detail::execution_range &range, const parameter< double > &params, const device_ptr_type< double > &q_d, device_ptr_type< double > &r_d, const device_ptr_type< double > &x_d, const device_ptr_type< double > &data_d, double QA_cost, double add, std::size_t num_data_points_padded, std::size_t num_features) const = 0
    Run the main device kernel used in the CG algorithm.
std::vector< real_type > predict_values_impl(const parameter< real_type > &params, const std::vector< std::vector< real_type >> &support_vectors, const std::vector< real_type > &alpha, real_type rho, std::vector< real_type > &w, const std::vector< std::vector< real_type >> &predict_points) const
    Uses the already learned model to predict the class of multiple (new) data points. (Definition: gpu_csvm.hpp:658)
constants.hpp
    Global type definitions and compile-time constants.
csvm.hpp
    Defines the base class for all C-SVM backends and implements the functionality shared by all of them.
execution_range.hpp
    Implements a backend independent class used to specify the execution range for all kernel invocations.
layout.hpp
    Defines functions to convert 2D vectors to 1D SoA or AoS vectors.
logger.hpp
    Defines a simple logging function.
void device_synchronize(int device)
    Wait for the compute device to finish.
plssvm::detail
    Namespace containing implementation details. Should not directly be used by users. (Definition: csvm.hpp:27)
void log(const verbosity_level verb, const std::string_view msg, Args &&...args)
    (Definition: logger.hpp:109)
std::vector< real_type > transform_to_layout(const layout_type layout, const std::vector< std::vector< real_type >> &matrix, const std::size_t boundary_size, const std::size_t num_points)
    Convert a 2D matrix into a 1D array in the given layout, adding boundary_size values per data point or feature. (Definition: layout.hpp:118)
plssvm::operators
    Namespace containing operator overloads for std::vector and other mathematical functions on vectors. (Definition: core.hpp:49)
T sum(const std::vector< T > &vec)
    Accumulate all elements in the std::vector vec. (Definition: operators.hpp:144)
plssvm
    The main namespace containing all public API functions. (Definition: backend_types.hpp:24)
constexpr kernel_index_type THREAD_BLOCK_SIZE
    Global compile-time constant used for internal caching. May be changed during the CMake configuration. (Definition: constants.hpp:25)
kernel_function_type
    Enum class for all implemented kernel functions. (Definition: kernel_function_types.hpp:31)
constexpr kernel_index_type INTERNAL_BLOCK_SIZE
    Global compile-time constant used for internal caching. May be changed during the CMake configuration. (Definition: constants.hpp:32)
real_type kernel_function(const std::vector< real_type > &xi, const std::vector< real_type > &xj, Args &&...args)
    Computes the value of the two vectors xi and xj using the kernel function determined at compile-time. (Definition: kernel_function_types.hpp:76)
parameter.hpp
    Implements the parameter class encapsulating all important C-SVM parameters.
default_value< real_type > cost
    The cost parameter in the C-SVM. (Definition: parameter.hpp:165)
default_value< kernel_function_type > kernel_type
    The used kernel function: linear, polynomial, or radial basis functions (rbf). (Definition: parameter.hpp:157)
plssvm::detail::tracking_entry
    A single tracking entry containing a specific category, a unique name, and the actual value to be tracked. (Definition: performance_tracker.hpp:40)
transposed
    Wrapper struct for overloading the dot product operator. (Definition: operators.hpp:99)