12 #ifndef PLSSVM_DATA_SET_HPP_
13 #define PLSSVM_DATA_SET_HPP_
28 #include "fmt/chrono.h"
30 #include "fmt/ostream.h"
68 template <
typename T,
typename U =
int>
71 static_assert(detail::type_list_contains_v<T, detail::real_type_list>,
"Illegal real type provided! See the 'real_type_list' in the type_list.hpp header for a list of the allowed types.");
72 static_assert(detail::type_list_contains_v<U, detail::label_type_list>,
"Illegal label type provided! See the 'label_type_list' in the type_list.hpp header for a list of the allowed types.");
75 template <
typename,
typename>
117 data_set(
const std::string &filename, scaling scale_parameter);
137 explicit data_set(std::vector<std::vector<real_type>> data_points);
147 data_set(std::vector<std::vector<real_type>> data_points, std::vector<label_type>
labels);
157 data_set(std::vector<std::vector<real_type>> data_points, scaling scale_parameter);
169 data_set(std::vector<std::vector<real_type>> data_points, std::vector<label_type>
labels, scaling scale_parameter);
183 void save(
const std::string &filename)
const;
189 [[nodiscard]]
const std::vector<std::vector<real_type>> &
data() const noexcept {
// Hand out a read-only reference to the data points owned through X_ptr_; no copy is made.
// NOTE(review): X_ptr_ is dereferenced without a null check - presumably every constructor sets it; confirm.
return *
X_ptr_; }
246 X_ptr_{ std::make_shared<std::vector<std::vector<real_type>>>() } {}
273 std::shared_ptr<std::vector<std::vector<real_type>>>
X_ptr_{
nullptr };
277 std::shared_ptr<std::vector<real_type>>
y_ptr_{
nullptr };
285 std::shared_ptr<const label_mapper>
mapping_{
nullptr };
297 template <
typename T,
typename U>
315 feature{ feature_index }, lower{ lower_bound }, upper{ upper_bound } {}
337 scaling(
const std::string &filename);
344 void save(
const std::string &filename)
const;
347 std::pair<real_type, real_type> scaling_interval{};
352 template <
typename T,
typename U>
354 scaling_interval{ std::make_pair(lower, upper) } {
355 if (lower >= upper) {
356 throw data_set_exception{ fmt::format(
"Inconsistent scaling interval specification: lower ({}) must be less than upper ({})!", lower, upper) };
360 template <
typename T,
typename U>
367 std::tie(scaling_interval,
scaling_factors) = detail::io::parse_scaling_factors<real_type, factors>(reader);
370 template <
typename T,
typename U>
372 const std::chrono::time_point start_time = std::chrono::steady_clock::now();
377 const std::chrono::time_point end_time = std::chrono::steady_clock::now();
379 "Write {} scaling factors in {} to the file '{}'.\n",
381 detail::tracking_entry{
"scaling_factors_write",
"time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) },
393 template <
typename T,
typename U>
433 std::map<real_type, label_type> mapped_to_label_{};
437 template <
typename T,
typename U>
440 std::set<label_type> unique_labels(
labels.begin(),
labels.end());
442 if (unique_labels.size() != 2) {
443 throw data_set_exception{ fmt::format(
"Currently only binary classification is supported, but {} different labels were given!", unique_labels.size()) };
447 auto iter = unique_labels.begin();
448 label_to_mapped_[*iter] = -1;
449 mapped_to_label_[-1] = *iter;
452 label_to_mapped_[*iter] = +1;
453 mapped_to_label_[+1] = *iter;
457 template <
typename T,
typename U>
460 throw data_set_exception{ fmt::format(
"Label \"{}\" unknown in this label mapping!", label) };
462 return label_to_mapped_.at(label);
465 template <
typename T,
typename U>
468 throw data_set_exception{ fmt::format(
"Mapped value \"{}\" unknown in this label mapping!", mapped_value) };
470 return mapped_to_label_.at(mapped_value);
473 template <
typename T,
typename U>
475 PLSSVM_ASSERT(label_to_mapped_.size() == mapped_to_label_.size(),
"Both maps must contain the same number of values, but {} and {} were given!", label_to_mapped_.size(), mapped_to_label_.size());
476 return label_to_mapped_.size();
479 template <
typename T,
typename U>
481 std::vector<label_type> available_labels;
482 available_labels.reserve(this->num_mappings());
483 for (
const auto &[key, value] : label_to_mapped_) {
484 available_labels.push_back(key);
486 return available_labels;
493 template <
typename T,
typename U>
500 template <
typename T,
typename U>
506 template <
typename T,
typename U>
515 template <
typename T,
typename U>
524 template <
typename T,
typename U>
526 X_ptr_{ std::make_shared<std::vector<std::vector<
real_type>>>(std::move(data_points)) } {
532 if (!std::all_of(
X_ptr_->cbegin(),
X_ptr_->cend(), [
this](
const std::vector<real_type> &point) { return point.size() == X_ptr_->front().size(); })) {
533 throw data_set_exception{
"All points in the data vector must have the same number of features!" };
536 if (
X_ptr_->front().size() == 0) {
544 template <
typename T,
typename U>
546 data_set{ std::move(data_points) } {
558 template <
typename T,
typename U>
560 data_set{ std::move(data_points) } {
567 template <
typename T,
typename U>
569 data_set{ std::move(data_points), std::move(labels) } {
576 template <
typename T,
typename U>
578 const std::chrono::time_point start_time = std::chrono::steady_clock::now();
581 if (this->has_labels()) {
603 const std::chrono::time_point end_time = std::chrono::steady_clock::now();
605 "Write {} data points with {} features in {} to the {} file '{}'.\n",
608 detail::tracking_entry{
"data_set_write",
"time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) },
613 template <
typename T,
typename U>
620 throw data_set_exception(fmt::format(
"Unrecognized file extension for file \"{}\" (must be one of: .libsvm or .arff)!", filename));
624 template <
typename T,
typename U>
626 if (this->has_labels()) {
627 return std::make_optional(std::cref(*labels_ptr_));
632 template <
typename T,
typename U>
634 if (this->has_labels()) {
635 return std::make_optional(mapping_->labels());
640 template <
typename T,
typename U>
642 if (this->is_scaled()) {
643 return std::make_optional(std::cref(*scale_parameters_));
652 template <
typename T,
typename U>
654 PLSSVM_ASSERT(labels_ptr_ !=
nullptr,
"Can't create mapping if no labels are provided!");
660 std::vector<real_type> tmp(labels_ptr_->size());
661 #pragma omp parallel for default(shared) shared(tmp, mapper)
662 for (
typename std::vector<real_type>::size_type i = 0; i < tmp.size(); ++i) {
663 tmp[i] = mapper.get_mapped_value_by_label((*labels_ptr_)[i]);
665 y_ptr_ = std::make_shared<std::vector<real_type>>(std::move(tmp));
666 mapping_ = std::make_shared<const label_mapper>(std::move(mapper));
669 template <
typename T,
typename U>
671 PLSSVM_ASSERT(this->is_scaled(),
"No scaling parameters given for scaling!");
673 const std::chrono::time_point start_time = std::chrono::steady_clock::now();
676 const real_type lower = scale_parameters_->scaling_interval.first;
677 const real_type upper = scale_parameters_->scaling_interval.second;
680 if (scale_parameters_->scaling_factors.empty()) {
682 for (
size_type feature = 0; feature < num_features_; ++feature) {
683 real_type min_value = std::numeric_limits<real_type>::max();
684 real_type max_value = std::numeric_limits<real_type>::lowest();
687 #pragma omp parallel for default(shared) firstprivate(feature) reduction(min : min_value) reduction(max : max_value)
688 for (
size_type data_point = 0; data_point < num_data_points_; ++data_point) {
689 min_value = std::min(min_value, (*X_ptr_)[data_point][feature]);
690 max_value = std::max(max_value, (*X_ptr_)[data_point][feature]);
695 scale_parameters_->scaling_factors.emplace_back(feature, min_value, max_value);
700 if (scale_parameters_->scaling_factors.size() > num_features_) {
701 throw data_set_exception{ fmt::format(
"Need at most as much scaling factors as features in the data set are present ({}), but {} were given!", num_features_, scale_parameters_->scaling_factors.size()) };
705 std::sort(scale_parameters_->scaling_factors.begin(), scale_parameters_->scaling_factors.end(), scaling_factors_comp_less);
707 if (scale_parameters_->scaling_factors.back().feature >= num_features_) {
708 throw data_set_exception{ fmt::format(
"The maximum scaling feature index most not be greater than {}, but is {}!", num_features_ - 1, scale_parameters_->scaling_factors.back().feature) };
712 const auto iter = std::adjacent_find(scale_parameters_->scaling_factors.begin(), scale_parameters_->scaling_factors.end(), scaling_factors_comp_eq);
713 if (iter != scale_parameters_->scaling_factors.end()) {
714 throw data_set_exception{ fmt::format(
"Found more than one scaling factor for the feature index {}!", iter->feature) };
719 #pragma omp parallel for default(shared) firstprivate(lower, upper)
720 for (
size_type i = 0; i < scale_parameters_->scaling_factors.size(); ++i) {
722 const typename scaling::factors factor = scale_parameters_->scaling_factors[i];
724 for (
size_type data_point = 0; data_point < num_data_points_; ++data_point) {
725 (*X_ptr_)[data_point][factor.
feature] = lower + (upper - lower) * ((*X_ptr_)[data_point][factor.
feature] - factor.
lower) / (factor.
upper - factor.
lower);
729 const std::chrono::time_point end_time = std::chrono::steady_clock::now();
731 "Scaled the data set to the range [{}, {}] in {}.\n",
734 detail::tracking_entry{
"data_set_scale",
"time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) });
737 template <
typename T,
typename U>
739 const std::chrono::time_point start_time = std::chrono::steady_clock::now();
757 std::vector<std::vector<real_type>> data{};
758 std::vector<label_type> label{};
763 std::tie(num_data_points_, num_features_, data, label) = detail::io::parse_libsvm_data<real_type, label_type>(reader);
766 std::tie(num_data_points_, num_features_, data, label) = detail::io::parse_arff_data<real_type, label_type>(reader);
771 X_ptr_ = std::make_shared<decltype(data)>(std::move(data));
773 labels_ptr_ =
nullptr;
775 labels_ptr_ = std::make_shared<decltype(label)>(std::move(label));
779 if (this->has_labels()) {
780 this->create_mapping();
783 const std::chrono::time_point end_time = std::chrono::steady_clock::now();
785 "Read {} data points with {} features in {} using the {} parser from file '{}'.\n",
788 detail::tracking_entry{
"data_set_read",
"time", std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time) },
Implements parsing functions for the ARFF file format.
#define PLSSVM_ASSERT(cond, msg,...)
Defines the PLSSVM_ASSERT macro if PLSSVM_ASSERT_ENABLED is defined.
Definition: assert.hpp:74
Base class for all C-SVM backends.
Definition: csvm.hpp:50
Implements all necessary functionality to map arbitrary labels to labels usable by the C-SVMs.
Definition: data_set.hpp:394
label_mapper(const std::vector< label_type > &labels)
Create a mapping from all labels to { -1, 1 } and vice versa.
const label_type & get_label_by_mapped_value(const real_type &mapped_value) const
Given the mapped label value, return the original label value.
Definition: data_set.hpp:466
const real_type & get_mapped_value_by_label(const label_type &label) const
Given the original label value, return the mapped label value.
Definition: data_set.hpp:458
size_type num_mappings() const noexcept
Returns the number of valid mappings. This is equivalent to the number of different labels.
Definition: data_set.hpp:474
std::vector< label_type > labels() const
Return a vector containing the different, original labels of the current data set.
Definition: data_set.hpp:480
Implements all necessary data and functions needed for scaling a plssvm::data_set to a user-defined ...
Definition: data_set.hpp:298
void save(const std::string &filename) const
Save the scaling factors to the file filename.
Definition: data_set.hpp:371
scaling(real_type lower, real_type upper)
Create a new scaling class that can be used to scale all features of a data set to the interval [lowe...
Definition: data_set.hpp:353
Exception type thrown if a data_set is used inappropriately.
Definition: exceptions.hpp:88
Encapsulate all necessary data that is needed for training or predicting using an SVM.
Definition: data_set.hpp:69
data_set()
Default construct an empty data set.
Definition: data_set.hpp:245
std::shared_ptr< std::vector< label_type > > labels_ptr_
A pointer to the original labels of this data set; may be nullptr if no labels have been provided.
Definition: data_set.hpp:275
bool has_labels() const noexcept
Returns whether this data set contains labels or not.
Definition: data_set.hpp:194
std::shared_ptr< const label_mapper > mapping_
The mapping used to convert the original label to its mapped value and vice versa; may be nullptr if ...
Definition: data_set.hpp:285
data_set(std::vector< std::vector< real_type >> data_points, std::vector< label_type > labels, scaling scale_parameter)
Create a new data set using the provided data_points and labels and scale the data_points using t...
Definition: data_set.hpp:568
void read_file(const std::string &filename, file_format_type format)
Read the data points and potential labels from the file filename assuming the plssvm::file_format_typ...
Definition: data_set.hpp:738
size_type num_different_labels() const noexcept
Returns the number of different labels in this data set.
Definition: data_set.hpp:225
const std::vector< std::vector< real_type > > & data() const noexcept
Return the data points in this data set.
Definition: data_set.hpp:189
std::size_t size_type
An unsigned integer type.
Definition: data_set.hpp:86
std::shared_ptr< std::vector< real_type > > y_ptr_
A pointer to the mapped values of the labels of this data set; may be nullptr if no labels have been ...
Definition: data_set.hpp:277
size_type num_features() const noexcept
Returns the number of features in this data set.
Definition: data_set.hpp:218
optional_ref< const scaling > scaling_factors() const noexcept
Returns the scaling factors as an optional reference used to scale the data points in this data set.
Definition: data_set.hpp:641
std::optional< std::vector< label_type > > different_labels() const
Returns an optional to the different labels in this data set.
Definition: data_set.hpp:633
T real_type
The type of the data points: either float or double.
Definition: data_set.hpp:82
size_type num_data_points_
The number of data points in this data set.
Definition: data_set.hpp:280
U label_type
The type of the labels: any arithmetic type or std::string.
Definition: data_set.hpp:84
size_type num_features_
The number of features in this data set.
Definition: data_set.hpp:282
data_set(const std::string &filename, scaling scale_parameter)
Read the data points from the file filename and scale them using the provided scale_parameter....
Definition: data_set.hpp:507
data_set(std::vector< std::vector< real_type >> data_points, scaling scale_parameter)
Create a new data set using the provided data_points and scale them using the provided scale_para...
Definition: data_set.hpp:559
data_set(std::vector< std::vector< real_type >> data_points)
Create a new data set using the provided data_points.
Definition: data_set.hpp:525
void scale()
Scale the feature values of the data set to the provided range.
Definition: data_set.hpp:670
optional_ref< const std::vector< label_type > > labels() const noexcept
Returns an optional reference to the labels in this data set.
Definition: data_set.hpp:625
std::shared_ptr< std::vector< std::vector< real_type > > > X_ptr_
A pointer to the two-dimensional data points.
Definition: data_set.hpp:273
data_set(const std::string &filename, file_format_type format, scaling scale_parameter)
Read the data points from the file filename assuming that the file is given in the plssvm::file_forma...
Definition: data_set.hpp:516
data_set(std::vector< std::vector< real_type >> data_points, std::vector< label_type > labels)
Create a new data set using the provided data_points and labels.
Definition: data_set.hpp:545
size_type num_data_points() const noexcept
Returns the number of data points in this data set.
Definition: data_set.hpp:213
void save(const std::string &filename, file_format_type format) const
Save the data points and potential labels of this data set to the file filename using the file format...
Definition: data_set.hpp:577
std::shared_ptr< scaling > scale_parameters_
The scaling parameters used to scale the data points in this data set; may be nullptr if no data poin...
Definition: data_set.hpp:287
data_set(const std::string &filename, file_format_type format)
Read the data points from the file filename assuming that the file is given in the plssvm::file_forma...
Definition: data_set.hpp:501
bool is_scaled() const noexcept
Returns whether this data set has been scaled or not.
Definition: data_set.hpp:232
void save(const std::string &filename) const
Save the data points and potential labels of this data set to the file filename. Automatically determ...
Definition: data_set.hpp:614
void create_mapping()
Create the mapping between the provided labels and the internally used mapped values,...
Definition: data_set.hpp:653
data_set(const std::string &filename)
Read the data points from the file filename. Automatically determines the plssvm::file_format_type ba...
Definition: data_set.hpp:494
The plssvm::detail::file_reader class is responsible for reading a file and splitting it into its lin...
Definition: file_reader.hpp:42
const std::vector< std::string_view > & read_lines(std::string_view comment={ "\n" })
Read the content of the associated file and split it into lines, ignoring empty lines and lines start...
Implements a class encapsulating the result of a call to the SVM fit function. A model is used to pre...
Definition: model.hpp:50
Defines universal utility functions.
Implements custom exception classes derived from std::runtime_error including source location informa...
Implements a file reader class responsible for reading the input file and parsing it into lines.
Implements parsing functions for the LIBSVM file format.
Defines a simple logging function.
void write_arff_data(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the ARFF file filename.
Definition: arff_parsing.hpp:489
void write_scaling_factors(const std::string &filename, const std::pair< real_type, real_type > &scaling_interval, const std::vector< factors_type > &scaling_factors)
Write the scaling_interval and scaling_factors to a file for later usage in scaling another data set ...
Definition: scaling_factors_parsing.hpp:139
void write_libsvm_data(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the LIBSVM file filename.
Definition: libsvm_parsing.hpp:322
void log(const verbosity_level verb, const std::string_view msg, Args &&...args)
Definition: logger.hpp:109
bool ends_with(std::string_view str, std::string_view sv) noexcept
Checks if the string str ends with the suffix sv.
bool contains(std::string_view str, std::string_view sv) noexcept
Checks if the string str contains the string sv.
The main namespace containing all public API functions.
Definition: backend_types.hpp:24
std::optional< std::reference_wrapper< T > > optional_ref
Type alias for an optional reference (since std::optional<T&> is not allowed).
Definition: data_set.hpp:54
file_format_type
Enum class for all supported file types.
Definition: file_format_types.hpp:23
Implements parsing functions for the scaling factor file parsing.
Implements utility functions for string manipulation and querying.
The calculated or read feature-wise scaling factors.
Definition: data_set.hpp:303
factors(const size_type feature_index, const real_type lower_bound, const real_type upper_bound)
Construct new scaling factors struct with the provided values.
Definition: data_set.hpp:314
real_type upper
The maximum value of the feature for all data points.
Definition: data_set.hpp:322
real_type lower
The lowest value of the feature for all data points.
Definition: data_set.hpp:320
factors()=default
Default construct new scaling factors.
size_type feature
The feature index for which the scaling factors are valid.
Definition: data_set.hpp:318
A single tracking entry containing a specific category, a unique name, and the actual value to be tra...
Definition: performance_tracker.hpp:40
All possible real_type and label_type combinations for a plssvm::model and plssvm::data_set.