PLSSVM - Parallel Least Squares Support Vector Machine  2.0.0
A Least Squares Support Vector Machine implementation using different backends.
data_set.hpp
Go to the documentation of this file.
1 
12 #ifndef PLSSVM_DATA_SET_HPP_
13 #define PLSSVM_DATA_SET_HPP_
14 #pragma once
15 
16 #include "plssvm/detail/io/arff_parsing.hpp" // plssvm::detail::io::{parse_arff_data, write_arff_data}
17 #include "plssvm/detail/io/file_reader.hpp" // plssvm::detail::io::file_reader
18 #include "plssvm/detail/io/libsvm_parsing.hpp" // plssvm::detail::io::{parse_libsvm_data, write_libsvm_data}
19 #include "plssvm/detail/io/scaling_factors_parsing.hpp" // plssvm::detail::io::{parse_scaling_factors, write_scaling_factors}
20 #include "plssvm/detail/logger.hpp" // plssvm::detail::log, plssvm::verbosity_level
21 #include "plssvm/detail/performance_tracker.hpp" // plssvm::detail::tracking_entry
22 #include "plssvm/detail/string_utility.hpp" // plssvm::detail::ends_with
23 #include "plssvm/detail/type_list.hpp" // plssvm::detail::{real_type_list, label_type_list, type_list_contains_v}
24 #include "plssvm/detail/utility.hpp" // plssvm::detail::contains
25 #include "plssvm/exceptions/exceptions.hpp" // plssvm::data_set_exception
26 #include "plssvm/file_format_types.hpp" // plssvm::file_format_type
27 
28 #include "fmt/chrono.h" // directly output std::chrono times via fmt
29 #include "fmt/core.h" // fmt::format
30 #include "fmt/ostream.h" // directly output objects with operator<< overload via fmt
31 
32 #include <algorithm> // std::all_of, std::max, std::min, std::sort, std::adjacent_find
33 #include <chrono> // std::chrono::{time_point, steady_clock, duration_cast, milliseconds}
34 #include <cstddef> // std::size_t
35 #include <functional> // std::reference_wrapper, std::cref
36 #include <iostream> // std::cout, std::endl
37 #include <limits> // std::numeric_limits::{max, lowest}
38 #include <map> // std::map
39 #include <memory> // std::shared_ptr, std::make_shared
40 #include <optional> // std::optional, std::make_optional, std::nullopt
41 #include <set> // std::set
42 #include <string> // std::string
43 #include <tuple> // std::tie
44 #include <utility> // std::move, std::pair, std::make_pair
45 #include <vector> // std::vector
46 
47 namespace plssvm {
48 
53 template <typename T>
54 using optional_ref = std::optional<std::reference_wrapper<T>>;
55 
68 template <typename T, typename U = int>
69 class data_set {
70  // make sure only valid template types are used
71  static_assert(detail::type_list_contains_v<T, detail::real_type_list>, "Illegal real type provided! See the 'real_type_list' in the type_list.hpp header for a list of the allowed types.");
72  static_assert(detail::type_list_contains_v<U, detail::label_type_list>, "Illegal label type provided! See the 'label_type_list' in the type_list.hpp header for a list of the allowed types.");
73 
74  // plssvm::model needs the default constructor
75  template <typename, typename>
76  friend class model;
77  // plssvm::csvm needs the label mapping
78  friend class csvm;
79 
80  public:
82  using real_type = T;
84  using label_type = U;
86  using size_type = std::size_t;
87 
88  // forward declare the scaling class
89  class scaling;
90  // forward declare the label_mapper class
91  class label_mapper;
92 
100  explicit data_set(const std::string &filename);
107  data_set(const std::string &filename, file_format_type format);
117  data_set(const std::string &filename, scaling scale_parameter);
127  data_set(const std::string &filename, file_format_type format, scaling scale_parameter);
128 
137  explicit data_set(std::vector<std::vector<real_type>> data_points);
147  data_set(std::vector<std::vector<real_type>> data_points, std::vector<label_type> labels);
157  data_set(std::vector<std::vector<real_type>> data_points, scaling scale_parameter);
169  data_set(std::vector<std::vector<real_type>> data_points, std::vector<label_type> labels, scaling scale_parameter);
170 
176  void save(const std::string &filename, file_format_type format) const;
183  void save(const std::string &filename) const;
184 
189  [[nodiscard]] const std::vector<std::vector<real_type>> &data() const noexcept { return *X_ptr_; }
194  [[nodiscard]] bool has_labels() const noexcept { return labels_ptr_ != nullptr; }
200  [[nodiscard]] optional_ref<const std::vector<label_type>> labels() const noexcept;
207  [[nodiscard]] std::optional<std::vector<label_type>> different_labels() const;
208 
213  [[nodiscard]] size_type num_data_points() const noexcept { return num_data_points_; }
218  [[nodiscard]] size_type num_features() const noexcept { return num_features_; }
225  [[nodiscard]] size_type num_different_labels() const noexcept { return mapping_ != nullptr ? mapping_->num_mappings() : 0; }
226 
232  [[nodiscard]] bool is_scaled() const noexcept { return scale_parameters_ != nullptr; }
239  [[nodiscard]] optional_ref<const scaling> scaling_factors() const noexcept;
240 
241  private:
246  X_ptr_{ std::make_shared<std::vector<std::vector<real_type>>>() } {}
247 
262  void scale();
270  void read_file(const std::string &filename, file_format_type format);
271 
273  std::shared_ptr<std::vector<std::vector<real_type>>> X_ptr_{ nullptr };
275  std::shared_ptr<std::vector<label_type>> labels_ptr_{ nullptr };
277  std::shared_ptr<std::vector<real_type>> y_ptr_{ nullptr };
278 
283 
285  std::shared_ptr<const label_mapper> mapping_{ nullptr };
287  std::shared_ptr<scaling> scale_parameters_{ nullptr };
288 };
289 
290 //*************************************************************************************************************************************//
291 // scaling nested-class //
292 //*************************************************************************************************************************************//
293 
297 template <typename T, typename U>
298 class data_set<T, U>::scaling {
299  public:
303  struct factors {
307  factors() = default;
314  factors(const size_type feature_index, const real_type lower_bound, const real_type upper_bound) :
315  feature{ feature_index }, lower{ lower_bound }, upper{ upper_bound } {}
316 
318  size_type feature{};
320  real_type lower{};
322  real_type upper{};
323  };
324 
331  scaling(real_type lower, real_type upper);
337  scaling(const std::string &filename);
338 
344  void save(const std::string &filename) const;
345 
347  std::pair<real_type, real_type> scaling_interval{};
349  std::vector<factors> scaling_factors{};
350 };
351 
352 template <typename T, typename U>
354  scaling_interval{ std::make_pair(lower, upper) } {
355  if (lower >= upper) {
356  throw data_set_exception{ fmt::format("Inconsistent scaling interval specification: lower ({}) must be less than upper ({})!", lower, upper) };
357  }
358 }
359 
360 template <typename T, typename U>
361 data_set<T, U>::scaling::scaling(const std::string &filename) {
362  // open the file
363  detail::io::file_reader reader{ filename };
364  reader.read_lines('#');
365 
366  // read scaling values from file
367  std::tie(scaling_interval, scaling_factors) = detail::io::parse_scaling_factors<real_type, factors>(reader);
368 }
369 
370 template <typename T, typename U>
371 void data_set<T, U>::scaling::save(const std::string &filename) const {
372  const std::chrono::time_point start_time = std::chrono::steady_clock::now();
373 
374  // write scaling values to file
375  detail::io::write_scaling_factors(filename, scaling_interval, scaling_factors);
376 
377  const std::chrono::time_point end_time = std::chrono::steady_clock::now();
379  "Write {} scaling factors in {} to the file '{}'.\n",
380  detail::tracking_entry{ "scaling_factors_write", "num_scaling_factors", scaling_factors.size() },
381  detail::tracking_entry{ "scaling_factors_write", "time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) },
382  detail::tracking_entry{ "scaling_factors_write", "filename", filename });
383 }
384 
385 //*************************************************************************************************************************************//
386 // label mapper nested-class //
387 //*************************************************************************************************************************************//
388 
393 template <typename T, typename U>
394 class data_set<T, U>::label_mapper {
395  public:
402  explicit label_mapper(const std::vector<label_type> &labels);
403 
410  [[nodiscard]] const real_type &get_mapped_value_by_label(const label_type &label) const;
417  [[nodiscard]] const label_type &get_label_by_mapped_value(const real_type &mapped_value) const;
422  [[nodiscard]] size_type num_mappings() const noexcept;
427  [[nodiscard]] std::vector<label_type> labels() const;
428 
429  private:
431  std::map<label_type, real_type> label_to_mapped_{};
433  std::map<real_type, label_type> mapped_to_label_{};
434 };
435 
437 template <typename T, typename U>
438 data_set<T, U>::data_set::label_mapper::label_mapper(const std::vector<label_type> &labels) {
439  // we are only interested in unique labels
440  std::set<label_type> unique_labels(labels.begin(), labels.end());
441  // currently, only two different labels are supported
442  if (unique_labels.size() != 2) {
443  throw data_set_exception{ fmt::format("Currently only binary classification is supported, but {} different labels were given!", unique_labels.size()) };
444  }
445  // create mapping
446  // first label
447  auto iter = unique_labels.begin();
448  label_to_mapped_[*iter] = -1;
449  mapped_to_label_[-1] = *iter;
450  // second label
451  ++iter;
452  label_to_mapped_[*iter] = +1;
453  mapped_to_label_[+1] = *iter;
454 }
456 
457 template <typename T, typename U>
459  if (!detail::contains(label_to_mapped_, label)) {
460  throw data_set_exception{ fmt::format("Label \"{}\" unknown in this label mapping!", label) };
461  }
462  return label_to_mapped_.at(label);
463 }
464 
465 template <typename T, typename U>
467  if (!detail::contains(mapped_to_label_, mapped_value)) {
468  throw data_set_exception{ fmt::format("Mapped value \"{}\" unknown in this label mapping!", mapped_value) };
469  }
470  return mapped_to_label_.at(mapped_value);
471 }
472 
473 template <typename T, typename U>
475  PLSSVM_ASSERT(label_to_mapped_.size() == mapped_to_label_.size(), "Both maps must contain the same number of values, but {} and {} were given!", label_to_mapped_.size(), mapped_to_label_.size());
476  return label_to_mapped_.size();
477 }
478 
479 template <typename T, typename U>
480 auto data_set<T, U>::label_mapper::labels() const -> std::vector<label_type> {
481  std::vector<label_type> available_labels;
482  available_labels.reserve(this->num_mappings());
483  for (const auto &[key, value] : label_to_mapped_) {
484  available_labels.push_back(key);
485  }
486  return available_labels;
487 }
488 
489 //*************************************************************************************************************************************//
490 // data set class //
491 //*************************************************************************************************************************************//
492 
493 template <typename T, typename U>
494 data_set<T, U>::data_set(const std::string &filename) {
495  // read data set from file
496  // if the file doesn't end with .arff, assume a LIBSVM file
497  this->read_file(filename, detail::ends_with(filename, ".arff") ? file_format_type::arff : file_format_type::libsvm);
498 }
499 
500 template <typename T, typename U>
501 data_set<T, U>::data_set(const std::string &filename, const file_format_type format) {
502  // read data set from file
503  this->read_file(filename, format);
504 }
505 
506 template <typename T, typename U>
507 data_set<T, U>::data_set(const std::string &filename, scaling scale_parameter) :
508  data_set{ filename } {
509  // initialize scaling
510  scale_parameters_ = std::make_shared<scaling>(std::move(scale_parameter));
511  // scale data set
512  this->scale();
513 }
514 
515 template <typename T, typename U>
516 data_set<T, U>::data_set(const std::string &filename, file_format_type format, scaling scale_parameter) :
517  data_set{ filename, format } {
518  // initialize scaling
519  scale_parameters_ = std::make_shared<scaling>(std::move(scale_parameter));
520  // scale data set
521  this->scale();
522 }
523 
524 template <typename T, typename U>
525 data_set<T, U>::data_set(std::vector<std::vector<real_type>> data_points) :
526  X_ptr_{ std::make_shared<std::vector<std::vector<real_type>>>(std::move(data_points)) } {
527  // the provided data points vector may not be empty
528  if (X_ptr_->empty()) {
529  throw data_set_exception{ "Data vector is empty!" };
530  }
531  // check that all data points have the same number of features
532  if (!std::all_of(X_ptr_->cbegin(), X_ptr_->cend(), [this](const std::vector<real_type> &point) { return point.size() == X_ptr_->front().size(); })) {
533  throw data_set_exception{ "All points in the data vector must have the same number of features!" };
534  }
535  // check that the data points have at least one feature
536  if (X_ptr_->front().size() == 0) {
537  throw data_set_exception{ "No features provided for the data points!" };
538  }
539 
540  num_data_points_ = X_ptr_->size();
541  num_features_ = X_ptr_->front().size();
542 }
543 
544 template <typename T, typename U>
545 data_set<T, U>::data_set(std::vector<std::vector<real_type>> data_points, std::vector<label_type> labels) :
546  data_set{ std::move(data_points) } {
547  // initialize labels
548  labels_ptr_ = std::make_shared<std::vector<label_type>>(std::move(labels));
549  // the number of labels must be equal to the number of data points!
550  if (X_ptr_->size() != labels_ptr_->size()) {
551  throw data_set_exception{ fmt::format("Number of labels ({}) must match the number of data points ({})!", labels_ptr_->size(), X_ptr_->size()) };
552  }
553 
554  // create mapping from labels
555  this->create_mapping();
556 }
557 
558 template <typename T, typename U>
559 data_set<T, U>::data_set(std::vector<std::vector<real_type>> data_points, scaling scale_parameter) :
560  data_set{ std::move(data_points) } {
561  // initialize scaling
562  scale_parameters_ = std::make_shared<scaling>(std::move(scale_parameter));
563  // scale data set
564  this->scale();
565 }
566 
567 template <typename T, typename U>
568 data_set<T, U>::data_set(std::vector<std::vector<real_type>> data_points, std::vector<label_type> labels, scaling scale_parameter) :
569  data_set{ std::move(data_points), std::move(labels) } {
570  // initialize scaling
571  scale_parameters_ = std::make_shared<scaling>(std::move(scale_parameter));
572  // scale data set
573  this->scale();
574 }
575 
576 template <typename T, typename U>
577 void data_set<T, U>::save(const std::string &filename, const file_format_type format) const {
578  const std::chrono::time_point start_time = std::chrono::steady_clock::now();
579 
580  // save the data set
581  if (this->has_labels()) {
582  // save data with labels
583  switch (format) {
585  detail::io::write_libsvm_data(filename, *X_ptr_, *labels_ptr_);
586  break;
588  detail::io::write_arff_data(filename, *X_ptr_, *labels_ptr_);
589  break;
590  }
591  } else {
592  // save data without labels
593  switch (format) {
595  detail::io::write_libsvm_data(filename, *X_ptr_);
596  break;
598  detail::io::write_arff_data(filename, *X_ptr_);
599  break;
600  }
601  }
602 
603  const std::chrono::time_point end_time = std::chrono::steady_clock::now();
605  "Write {} data points with {} features in {} to the {} file '{}'.\n",
606  detail::tracking_entry{ "data_set_write", "num_data_points", num_data_points_ },
607  detail::tracking_entry{ "data_set_write", "num_features", num_features_ },
608  detail::tracking_entry{ "data_set_write", "time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) },
609  detail::tracking_entry{ "data_set_write", "format", format },
610  detail::tracking_entry{ "data_set_write", "filename", filename });
611 }
612 
613 template <typename T, typename U>
614 void data_set<T, U>::save(const std::string &filename) const {
615  if (detail::ends_with(filename, ".libsvm")) {
616  this->save(filename, file_format_type::libsvm);
617  } else if (detail::ends_with(filename, ".arff")) {
618  this->save(filename, file_format_type::arff);
619  } else {
620  throw data_set_exception(fmt::format("Unrecognized file extension for file \"{}\" (must be one of: .libsvm or .arff)!", filename));
621  }
622 }
623 
624 template <typename T, typename U>
625 auto data_set<T, U>::labels() const noexcept -> optional_ref<const std::vector<label_type>> {
626  if (this->has_labels()) {
627  return std::make_optional(std::cref(*labels_ptr_));
628  }
629  return std::nullopt;
630 }
631 
632 template <typename T, typename U>
633 auto data_set<T, U>::different_labels() const -> std::optional<std::vector<label_type>> {
634  if (this->has_labels()) {
635  return std::make_optional(mapping_->labels());
636  }
637  return std::nullopt;
638 }
639 
640 template <typename T, typename U>
641 auto data_set<T, U>::scaling_factors() const noexcept -> optional_ref<const scaling> {
642  if (this->is_scaled()) {
643  return std::make_optional(std::cref(*scale_parameters_));
644  }
645  return std::nullopt;
646 }
647 
648 //*************************************************************************************************************************************//
649 // PRIVATE MEMBER FUNCTIONS //
650 //*************************************************************************************************************************************//
651 
652 template <typename T, typename U>
654  PLSSVM_ASSERT(labels_ptr_ != nullptr, "Can't create mapping if no labels are provided!");
655 
656  // create label mapping
657  label_mapper mapper{ *labels_ptr_ };
658 
659  // convert input labels to now mapped values
660  std::vector<real_type> tmp(labels_ptr_->size());
661  #pragma omp parallel for default(shared) shared(tmp, mapper)
662  for (typename std::vector<real_type>::size_type i = 0; i < tmp.size(); ++i) {
663  tmp[i] = mapper.get_mapped_value_by_label((*labels_ptr_)[i]);
664  }
665  y_ptr_ = std::make_shared<std::vector<real_type>>(std::move(tmp));
666  mapping_ = std::make_shared<const label_mapper>(std::move(mapper));
667 }
668 
669 template <typename T, typename U>
671  PLSSVM_ASSERT(this->is_scaled(), "No scaling parameters given for scaling!");
672 
673  const std::chrono::time_point start_time = std::chrono::steady_clock::now();
674 
675  // unpack scaling interval pair
676  const real_type lower = scale_parameters_->scaling_interval.first;
677  const real_type upper = scale_parameters_->scaling_interval.second;
678 
679  // calculate scaling factors if necessary, use provided ones otherwise
680  if (scale_parameters_->scaling_factors.empty()) {
681  // calculate feature-wise min/max values for scaling
682  for (size_type feature = 0; feature < num_features_; ++feature) {
683  real_type min_value = std::numeric_limits<real_type>::max();
684  real_type max_value = std::numeric_limits<real_type>::lowest();
685 
686  // calculate min/max values of all data points at the specific feature
687  #pragma omp parallel for default(shared) firstprivate(feature) reduction(min : min_value) reduction(max : max_value)
688  for (size_type data_point = 0; data_point < num_data_points_; ++data_point) {
689  min_value = std::min(min_value, (*X_ptr_)[data_point][feature]);
690  max_value = std::max(max_value, (*X_ptr_)[data_point][feature]);
691  }
692 
693  // add scaling factor unless both min_value AND max_value are 0.0 (i.e., skip features that are 0.0 in all data points)
694  if (!(min_value == real_type{ 0.0 } && max_value == real_type{ 0.0 })) {
695  scale_parameters_->scaling_factors.emplace_back(feature, min_value, max_value);
696  }
697  }
698  } else {
699  // the number of scaling factors may not exceed the number of features
700  if (scale_parameters_->scaling_factors.size() > num_features_) {
701  throw data_set_exception{ fmt::format("Need at most as much scaling factors as features in the data set are present ({}), but {} were given!", num_features_, scale_parameters_->scaling_factors.size()) };
702  }
703  // sort vector
704  const auto scaling_factors_comp_less = [](const typename scaling::factors &lhs, const typename scaling::factors &rhs) { return lhs.feature < rhs.feature; };
705  std::sort(scale_parameters_->scaling_factors.begin(), scale_parameters_->scaling_factors.end(), scaling_factors_comp_less);
706  // check whether the biggest feature index is smaller than the number of features
707  if (scale_parameters_->scaling_factors.back().feature >= num_features_) {
708  throw data_set_exception{ fmt::format("The maximum scaling feature index most not be greater than {}, but is {}!", num_features_ - 1, scale_parameters_->scaling_factors.back().feature) };
709  }
710  // check that there are no duplicate entries
711  const auto scaling_factors_comp_eq = [](const typename scaling::factors &lhs, const typename scaling::factors &rhs) { return lhs.feature == rhs.feature; };
712  const auto iter = std::adjacent_find(scale_parameters_->scaling_factors.begin(), scale_parameters_->scaling_factors.end(), scaling_factors_comp_eq);
713  if (iter != scale_parameters_->scaling_factors.end()) {
714  throw data_set_exception{ fmt::format("Found more than one scaling factor for the feature index {}!", iter->feature) };
715  }
716  }
717 
718  // scale values
719  #pragma omp parallel for default(shared) firstprivate(lower, upper)
720  for (size_type i = 0; i < scale_parameters_->scaling_factors.size(); ++i) {
721  // extract feature-wise min and max values
722  const typename scaling::factors factor = scale_parameters_->scaling_factors[i];
723  // scale data values
724  for (size_type data_point = 0; data_point < num_data_points_; ++data_point) {
725  (*X_ptr_)[data_point][factor.feature] = lower + (upper - lower) * ((*X_ptr_)[data_point][factor.feature] - factor.lower) / (factor.upper - factor.lower);
726  }
727  }
728 
729  const std::chrono::time_point end_time = std::chrono::steady_clock::now();
731  "Scaled the data set to the range [{}, {}] in {}.\n",
732  detail::tracking_entry{ "data_set_scale", "lower", lower },
733  detail::tracking_entry{ "data_set_scale", "upper", upper },
734  detail::tracking_entry{ "data_set_scale", "time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) });
735 }
736 
737 template <typename T, typename U>
738 void data_set<T, U>::read_file(const std::string &filename, file_format_type format) {
739  const std::chrono::time_point start_time = std::chrono::steady_clock::now();
740 
741  // get the comment character based on the file_format_type
742  char comment{ ' ' };
743  switch (format) {
745  comment = '#';
746  break;
748  comment = '%';
749  break;
750  }
751 
752  // open the file
753  detail::io::file_reader reader{ filename };
754  reader.read_lines(comment);
755 
756  // create the empty placeholders
757  std::vector<std::vector<real_type>> data{};
758  std::vector<label_type> label{};
759 
760  // parse the given file
761  switch (format) {
763  std::tie(num_data_points_, num_features_, data, label) = detail::io::parse_libsvm_data<real_type, label_type>(reader);
764  break;
766  std::tie(num_data_points_, num_features_, data, label) = detail::io::parse_arff_data<real_type, label_type>(reader);
767  break;
768  }
769 
770  // update shared pointer
771  X_ptr_ = std::make_shared<decltype(data)>(std::move(data));
772  if (label.empty()) {
773  labels_ptr_ = nullptr;
774  } else {
775  labels_ptr_ = std::make_shared<decltype(label)>(std::move(label));
776  }
777 
778  // create label mapping
779  if (this->has_labels()) {
780  this->create_mapping();
781  }
782 
783  const std::chrono::time_point end_time = std::chrono::steady_clock::now();
785  "Read {} data points with {} features in {} using the {} parser from file '{}'.\n",
786  detail::tracking_entry{ "data_set_read", "num_data_points", num_data_points_ },
787  detail::tracking_entry{ "data_set_read", "num_features", num_features_ },
788  detail::tracking_entry{ "data_set_read", "time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) },
789  detail::tracking_entry{ "data_set_read", "format", format },
790  detail::tracking_entry{ "data_set_read", "filename", filename });
791 }
792 
793 } // namespace plssvm
794 
795 #endif // PLSSVM_DATA_SET_HPP_
Implements parsing functions for the ARFF file format.
#define PLSSVM_ASSERT(cond, msg,...)
Defines the PLSSVM_ASSERT macro if PLSSVM_ASSERT_ENABLED is defined.
Definition: assert.hpp:74
Base class for all C-SVM backends.
Definition: csvm.hpp:50
Implements all necessary functionality to map arbitrary labels to labels usable by the C-SVMs.
Definition: data_set.hpp:394
label_mapper(const std::vector< label_type > &labels)
Create a mapping from all labels to { -1 , 1 } and vice versa.
const label_type & get_label_by_mapped_value(const real_type &mapped_value) const
Given the mapped label value, return the original label value.
Definition: data_set.hpp:466
const real_type & get_mapped_value_by_label(const label_type &label) const
Given the original label value, return the mapped label value.
Definition: data_set.hpp:458
size_type num_mappings() const noexcept
Returns the number of valid mappings. This is equivalent to the number of different labels.
Definition: data_set.hpp:474
std::vector< label_type > labels() const
Return a vector containing the different, original labels of the current data set.
Definition: data_set.hpp:480
Implements all necessary data and functions needed for scaling a plssvm::data_set to an user-defined ...
Definition: data_set.hpp:298
void save(const std::string &filename) const
Save the scaling factors to the file filename.
Definition: data_set.hpp:371
scaling(real_type lower, real_type upper)
Create a new scaling class that can be used to scale all features of a data set to the interval [lowe...
Definition: data_set.hpp:353
Exception type thrown if a data_set is used inappropriately.
Definition: exceptions.hpp:88
Encapsulate all necessary data that is needed for training or predicting using an SVM.
Definition: data_set.hpp:69
data_set()
Default construct an empty data set.
Definition: data_set.hpp:245
std::shared_ptr< std::vector< label_type > > labels_ptr_
A pointer to the original labels of this data set; may be nullptr if no labels have been provided.
Definition: data_set.hpp:275
bool has_labels() const noexcept
Returns whether this data set contains labels or not.
Definition: data_set.hpp:194
std::shared_ptr< const label_mapper > mapping_
The mapping used to convert the original label to its mapped value and vice versa; may be nullptr if ...
Definition: data_set.hpp:285
data_set(std::vector< std::vector< real_type >> data_points, std::vector< label_type > labels, scaling scale_parameter)
Create a new data set using the provided data_points and labels and scale the data_points using t...
Definition: data_set.hpp:568
void read_file(const std::string &filename, file_format_type format)
Read the data points and potential labels from the file filename assuming the plssvm::file_format_typ...
Definition: data_set.hpp:738
size_type num_different_labels() const noexcept
Returns the number of different labels in this data set.
Definition: data_set.hpp:225
const std::vector< std::vector< real_type > > & data() const noexcept
Return the data points in this data set.
Definition: data_set.hpp:189
std::size_t size_type
An unsigned integer type.
Definition: data_set.hpp:86
std::shared_ptr< std::vector< real_type > > y_ptr_
A pointer to the mapped values of the labels of this data set; may be nullptr if no labels have been ...
Definition: data_set.hpp:277
size_type num_features() const noexcept
Returns the number of features in this data set.
Definition: data_set.hpp:218
optional_ref< const scaling > scaling_factors() const noexcept
Returns the scaling factors as an optional reference used to scale the data points in this data set.
Definition: data_set.hpp:641
std::optional< std::vector< label_type > > different_labels() const
Returns an optional to the different labels in this data set.
Definition: data_set.hpp:633
T real_type
The type of the data points: either float or double.
Definition: data_set.hpp:82
size_type num_data_points_
The number of data points in this data set.
Definition: data_set.hpp:280
U label_type
The type of the labels: any arithmetic type or std::string.
Definition: data_set.hpp:84
size_type num_features_
The number of features in this data set.
Definition: data_set.hpp:282
data_set(const std::string &filename, scaling scale_parameter)
Read the data points from the file filename and scale it using the provided scale_parameter....
Definition: data_set.hpp:507
data_set(std::vector< std::vector< real_type >> data_points, scaling scale_parameter)
Create a new data set using the provided data_points and scale them using the provided scale_para...
Definition: data_set.hpp:559
data_set(std::vector< std::vector< real_type >> data_points)
Create a new data set using the provided data_points.
Definition: data_set.hpp:525
void scale()
Scale the feature values of the data set to the provided range.
Definition: data_set.hpp:670
optional_ref< const std::vector< label_type > > labels() const noexcept
Returns an optional reference to the labels in this data set.
Definition: data_set.hpp:625
std::shared_ptr< std::vector< std::vector< real_type > > > X_ptr_
A pointer to the two-dimensional data points.
Definition: data_set.hpp:273
data_set(const std::string &filename, file_format_type format, scaling scale_parameter)
Read the data points from the file filename assuming that the file is given in the plssvm::file_forma...
Definition: data_set.hpp:516
data_set(std::vector< std::vector< real_type >> data_points, std::vector< label_type > labels)
Create a new data set using the provided data_points and labels.
Definition: data_set.hpp:545
size_type num_data_points() const noexcept
Returns the number of data points in this data set.
Definition: data_set.hpp:213
void save(const std::string &filename, file_format_type format) const
Save the data points and potential labels of this data set to the file filename using the file format...
Definition: data_set.hpp:577
std::shared_ptr< scaling > scale_parameters_
The scaling parameters used to scale the data points in this data set; may be nullptr if no data poin...
Definition: data_set.hpp:287
data_set(const std::string &filename, file_format_type format)
Read the data points from the file filename assuming that the file is given in the plssvm::file_forma...
Definition: data_set.hpp:501
bool is_scaled() const noexcept
Returns whether this data set has been scaled or not.
Definition: data_set.hpp:232
void save(const std::string &filename) const
Save the data points and potential labels of this data set to the file filename. Automatically determ...
Definition: data_set.hpp:614
void create_mapping()
Create the mapping between the provided labels and the internally used mapped values,...
Definition: data_set.hpp:653
data_set(const std::string &filename)
Read the data points from the file filename. Automatically determines the plssvm::file_format_type ba...
Definition: data_set.hpp:494
The plssvm::detail::file_reader class is responsible for reading a file and splitting it into its lin...
Definition: file_reader.hpp:42
const std::vector< std::string_view > & read_lines(std::string_view comment={ "\n" })
Read the content of the associated file and split it into lines, ignoring empty lines and lines start...
Implements a class encapsulating the result of a call to the SVM fit function. A model is used to pre...
Definition: model.hpp:50
Defines universal utility functions.
Implements custom exception classes derived from std::runtime_error including source location informa...
Defines an enumeration holding all supported file formats.
Implements a file reader class responsible for reading the input file and parsing it into lines.
Implements parsing functions for the LIBSVM file format.
Defines a simple logging function.
void write_arff_data(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the ARFF file filename.
Definition: arff_parsing.hpp:489
void write_scaling_factors(const std::string &filename, const std::pair< real_type, real_type > &scaling_interval, const std::vector< factors_type > &scaling_factors)
Write the scaling_interval and scaling_factors to a file for later usage in scaling another data set ...
Definition: scaling_factors_parsing.hpp:139
void write_libsvm_data(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the LIBSVM file filename.
Definition: libsvm_parsing.hpp:322
void log(const verbosity_level verb, const std::string_view msg, Args &&...args)
Definition: logger.hpp:109
bool ends_with(std::string_view str, std::string_view sv) noexcept
Checks if the string str ends with the suffix sv.
bool contains(std::string_view str, std::string_view sv) noexcept
Checks if the string str contains the string sv.
The main namespace containing all public API functions.
Definition: backend_types.hpp:24
std::optional< std::reference_wrapper< T > > optional_ref
Type alias for an optional reference (since std::optional<T&> is not allowed).
Definition: data_set.hpp:54
file_format_type
Enum class for all supported file types.
Definition: file_format_types.hpp:23
Defines a performance tracker which can dump performance information in a YAML file.
Implements parsing functions for the scaling factor file parsing.
Implements utility functions for string manipulation and querying.
The calculated or read feature-wise scaling factors.
Definition: data_set.hpp:303
factors(const size_type feature_index, const real_type lower_bound, const real_type upper_bound)
Construct new scaling factors struct with the provided values.
Definition: data_set.hpp:314
real_type upper
The maximum value of the feature for all data points.
Definition: data_set.hpp:322
real_type lower
The lowest value of the feature for all data points.
Definition: data_set.hpp:320
factors()=default
Default construct new scaling factors.
size_type feature
The feature index for which the scaling factors are valid.
Definition: data_set.hpp:318
A single tracking entry containing a specific category, a unique name, and the actual value to be tra...
Definition: performance_tracker.hpp:40
All possible real_type and label_type combinations for a plssvm::model and plssvm::data_set.