12 #ifndef PLSSVM_DETAIL_IO_ARFF_PARSING_HPP_ 
   13 #define PLSSVM_DETAIL_IO_ARFF_PARSING_HPP_ 
   23 #include "fmt/format.h"                          
   30 #include <string_view>                           
   59 template <
typename label_type>
 
   60 [[nodiscard]] 
inline std::tuple<std::size_t, std::size_t, std::set<label_type>, std::size_t> 
parse_arff_header(
const std::vector<std::string_view> &lines) {
 
   61     std::size_t num_features = 0;
 
   62     std::size_t label_idx = 0;
 
   63     bool has_label = 
false;
 
   64     std::set<label_type> labels{};
 
   66     const auto check_for_name = [](std::string_view line, 
const std::size_t prefix, 
const std::size_t suffix) {
 
   67         std::string_view sv{ line };
 
   68         sv.remove_prefix(prefix);
 
   69         sv.remove_suffix(suffix);
 
   86     std::size_t header_line = 0;
 
   87     for (; header_line < lines.size(); ++header_line) {
 
   89         const std::string_view line = lines[header_line];
 
   94             if (header_line != 0) {
 
   98             check_for_name(line, 9, 0);  
 
  105             if (upper_case_line.find(
"NUMERIC") != std::string::npos) {
 
  107                 const std::string_view 
name = check_for_name(line, 10, 7);  
 
  125             std::string_view sv{ line };
 
  126             sv.remove_prefix(std::string_view{ 
"@ATTRIBUTE" }.size());
 
  136                 sv.remove_prefix(std::string_view{ 
"CLASS" }.size());
 
  147                 sv = sv.substr(1, sv.size() - 2);
 
  149                 const std::vector<std::string_view> labels_split = 
detail::split(sv, 
',');
 
  150                 if (labels_split.size() == 1) {
 
  154                 for (
const std::string_view label : labels_split) {
 
  155                     labels.insert(detail::convert_to<label_type, invalid_file_format_exception>(
detail::trim(label)));
 
  157                 if (labels_split.size() != labels.size()) {
 
  158                     throw invalid_file_format_exception{ fmt::format(
"Provided {} labels but only {} of them was/where unique!", labels_split.size(), labels.size()) };
 
  161                 if constexpr (std::is_same_v<label_type, std::string>) {
 
  162                     for (
const std::string_view label : labels_split) {
 
  186     if (num_features == 0) {
 
  190     if (header_line + 1 >= lines.size()) {
 
  195     return std::make_tuple(num_features, header_line + 1, labels, has_label ? label_idx : 0);
 
  235 template <
typename real_type, 
typename label_type>
 
  236 [[nodiscard]] 
inline std::tuple<std::size_t, std::size_t, std::vector<std::vector<real_type>>, std::vector<label_type>> 
parse_arff_data(
const file_reader &reader) {
 
  240     std::size_t num_header_lines = 0;
 
  241     std::size_t num_features = 0;
 
  242     std::set<label_type> unique_label{};
 
  243     std::size_t label_idx = 0;
 
  244     std::tie(num_features, num_header_lines, unique_label, label_idx) = detail::io::parse_arff_header<label_type>(reader.
lines());
 
  245     const bool has_label = !unique_label.empty();
 
  248     const std::size_t num_data_points = reader.
num_lines() - num_header_lines;
 
  249     const std::size_t num_attributes = num_features + 
static_cast<std::size_t
>(has_label);
 
  252     std::vector<std::vector<real_type>> data(num_data_points, std::vector<real_type>(num_features));
 
  253     std::vector<label_type> label(num_data_points);
 
  255     std::exception_ptr parallel_exception;
 
  257     #pragma omp parallel default(none) shared(reader, data, label, unique_label, parallel_exception) firstprivate(num_header_lines, num_features, num_attributes, has_label, label_idx) 
  260         for (std::size_t i = 0; i < data.size(); ++i) {
 
  262                 std::string_view line = reader.
line(i + num_header_lines);
 
  276                     bool is_class_set = 
false;
 
  277                     std::string_view::size_type pos = 1;
 
  279                         std::string_view::size_type next_pos = line.find_first_of(
' ', pos);
 
  281                         if (next_pos == std::string_view::npos) {
 
  286                         auto index = detail::convert_to<unsigned long, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
 
  287                         if (index >= num_attributes) {
 
  289                             throw invalid_file_format_exception{ fmt::format(
"Trying to add feature/label at index {} but the maximum index is {}!", index, num_attributes - 1) };
 
  294                         next_pos = line.find_first_of(
",}", pos);
 
  297                         if (has_label && index == label_idx) {
 
  300                             if constexpr (std::is_same_v<label_type, bool>) {
 
  303                                 label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
 
  305                                 label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
 
  310                             if (has_label && index > label_idx) {
 
  313                             data[i][index] = detail::convert_to<real_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
 
  317                         line.remove_prefix(next_pos + 1);
 
  322                     if (has_label && !is_class_set) {
 
  332                     const std::vector<std::string_view> line_split = 
detail::split(line, 
',');
 
  333                     if (line_split.size() != num_attributes) {
 
  334                         throw invalid_file_format_exception{ fmt::format(
"Invalid number of features and labels! Found {} but should be {}!", line_split.size(), num_attributes) };
 
  336                     for (std::size_t j = 0; j < num_attributes; ++j) {
 
  337                         if (has_label && label_idx == j) {
 
  340                             if constexpr (std::is_same_v<label_type, bool>) {
 
  343                                 label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line_split[j]);
 
  345                                 label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line_split[j]);
 
  349                             data[i][j] = detail::convert_to<real_type, invalid_file_format_exception>(line_split[j]);
 
  355                 if (has_label && !
detail::contains(unique_label, 
static_cast<label_type
>(label[i]))) {
 
  356                     throw invalid_file_format_exception{ fmt::format(
"Found the label \"{}\" which was not specified in the header ({{{}}})!", label[i], fmt::join(unique_label, 
",")) };
 
  358             } 
catch (
const std::exception &) {
 
  362                     if (!parallel_exception) {
 
  363                         parallel_exception = std::current_exception();
 
  371     if (parallel_exception) {
 
  372         std::rethrow_exception(parallel_exception);
 
  375     return std::make_tuple(num_data_points, num_features, std::move(data), has_label ? std::move(label) : std::vector<label_type>{});
 
  407 template <
typename real_type, 
typename label_type, 
bool has_label>
 
  408 inline void write_arff_data_impl(
const std::string &filename, 
const std::vector<std::vector<real_type>> &data, 
const std::vector<label_type> &label) {
 
  409     if constexpr (has_label) {
 
  410         PLSSVM_ASSERT(data.empty() || !label.empty(), 
"has_label is 'true' but no labels were provided!");
 
  411         PLSSVM_ASSERT(data.size() == label.size(), 
"Number of data points ({}) and number of labels ({}) mismatch!", data.size(), label.size());
 
  413         PLSSVM_ASSERT(label.empty(), 
"has_label is 'false' but labels were provided!");
 
  417     fmt::ostream out = fmt::output_file(filename);
 
  421     const std::size_t num_data_points = data.size();
 
  422     if (num_data_points == 0) {
 
  426     const std::size_t num_features = data.front().size();
 
  427     out.print(
"% {}x{}\n", num_data_points, num_features);
 
  429     out.print(
"@RELATION data_set\n");
 
  431     for (std::size_t i = 0; i < num_features; ++i) {
 
  432         out.print(
"@ATTRIBUTE feature_{} NUMERIC\n", i);
 
  435     if constexpr (has_label) {
 
  436         const std::set<label_type> available_labels{ label.begin(), label.end() };
 
  437         out.print(
"@ATTRIBUTE class {{{}}}\n", fmt::join(available_labels, 
","));
 
  439     out.print(
"@DATA\n");
 
  442     #pragma omp parallel default(none) shared(out, data, label) firstprivate(num_data_points) 
  445         std::string out_string;
 
  446         #pragma omp for schedule(dynamic) nowait 
  447         for (std::size_t i = 0; i < num_data_points; ++i) {
 
  448             if constexpr (has_label) {
 
  449                 out_string.append(fmt::format(
"{:.10e},{}\n", fmt::join(data[i], 
","), label[i]));
 
  451                 out_string.append(fmt::format(
"{:.10e}\n", fmt::join(data[i], 
",")));
 
  456         out.print(
"{}", out_string);
 
  488 template <
typename real_type, 
typename label_type>
 
  489 inline void write_arff_data(
const std::string &filename, 
const std::vector<std::vector<real_type>> &data, 
const std::vector<label_type> &label) {
 
  490     write_arff_data_impl<real_type, label_type, true>(filename, data, label);
 
  518 template <
typename real_type>
 
  519 inline void write_arff_data(
const std::string &filename, 
const std::vector<std::vector<real_type>> &data) {
 
  520     write_arff_data_impl<real_type, real_type, false>(filename, data, {});
 
#define PLSSVM_ASSERT(cond, msg,...)
Defines the PLSSVM_ASSERT macro if PLSSVM_ASSERT_ENABLED is defined.
Definition: assert.hpp:74
 
The plssvm::detail::file_reader class is responsible for reading a file and splitting it into its lines.
Definition: file_reader.hpp:42
 
std::string_view line(typename std::vector< std::string_view >::size_type pos) const
Return the line at position pos of the parsed file.
 
bool is_open() const noexcept
Checks whether this file_reader is currently associated with a file.
 
const std::vector< std::string_view > & lines() const noexcept
Return all lines present after the preprocessing.
 
std::vector< std::string_view >::size_type num_lines() const noexcept
Return the number of parsed lines (where all empty lines or lines starting with a comment are ignored).
 
Defines universal utility functions.
 
Implements custom exception classes derived from std::runtime_error including source location information.
 
Implements a file reader class responsible for reading the input file and parsing it into lines.
 
Namespace containing implementation details for the IO-related functions. Should not be used directly by users.
Definition: core.hpp:44
 
void write_arff_data(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the ARFF file filename.
Definition: arff_parsing.hpp:489
 
void write_arff_data_impl(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the ARFF file filename.
Definition: arff_parsing.hpp:408
 
std::tuple< std::size_t, std::size_t, std::set< label_type >, std::size_t > parse_arff_header(const std::vector< std::string_view > &lines)
Parse the ARFF file header, i.e., determine the number of features, the length of the ARFF header,...
Definition: arff_parsing.hpp:60
 
std::tuple< std::size_t, std::size_t, std::vector< std::vector< real_type > >, std::vector< label_type > > parse_arff_data(const file_reader &reader)
Parse all data points and potential labels using the file reader, ignoring all empty lines and lines starting with a comment.
Definition: arff_parsing.hpp:236
 
bool starts_with(std::string_view str, std::string_view sv) noexcept
Checks if the string str starts with the prefix sv.
 
std::string as_upper_case(std::string_view str)
Return a new string with the same content as str but all upper case.
 
std::string_view trim(std::string_view str) noexcept
Returns a new std::string_view equal to str where all leading and trailing whitespaces are removed.
 
std::string_view trim_left(std::string_view str) noexcept
Returns a new std::string_view equal to str where all leading whitespaces are removed.
 
bool ends_with(std::string_view str, std::string_view sv) noexcept
Checks if the string str ends with the suffix sv.
 
std::string current_date_time()
Return the current date time in the format "YYYY-MM-DD hh:mm:ss".
 
std::vector< std::string_view > split(std::string_view str, char delim=' ')
Split the string str at the positions with delimiter delim and return the sub-strings.
 
bool contains(std::string_view str, std::string_view sv) noexcept
Checks if the string str contains the string sv.
 
constexpr std::string_view name
The name of the library.
Definition: version.hpp:26
 
Defines (arithmetic) functions on std::vector and scalars.
 
Implements a conversion function from a string to an arithmetic type.
 
Implements utility functions for string manipulation and querying.