12 #ifndef PLSSVM_DETAIL_IO_ARFF_PARSING_HPP_
13 #define PLSSVM_DETAIL_IO_ARFF_PARSING_HPP_
23 #include "fmt/format.h"
30 #include <string_view>
// Parse the header section of an ARFF file that has already been split into lines.
//
// Template parameter:
//   label_type - type the textual class labels are converted to.
// Parameter:
//   lines - all lines of the file as string views.
// Returns a tuple of
//   (num_features, header_line + 1, set of labels found in the header,
//    label_idx if a label attribute was recorded, otherwise 0).
// Throws invalid_file_format_exception if the listed class labels are not unique;
// label conversion is also parameterized with that exception type.
//
// NOTE(review): this listing omits several lines of the original definition
// (closing braces, else-branches, additional checks). The comments below only
// describe the statements that are visible here.
59 template <
typename label_type>
60 [[nodiscard]]
inline std::tuple<std::size_t, std::size_t, std::set<label_type>, std::size_t>
parse_arff_header(
const std::vector<std::string_view> &lines) {
// Running results that end up in the returned tuple.
61 std::size_t num_features = 0;
62 std::size_t label_idx = 0;
63 bool has_label =
false;
64 std::set<label_type> labels{};
// Helper: strip `prefix` characters from the front and `suffix` characters from the
// back of a line. What is validated on the remainder is not visible in this listing.
66 const auto check_for_name = [](std::string_view line,
const std::size_t prefix,
const std::size_t suffix) {
67 std::string_view sv{ line };
68 sv.remove_prefix(prefix);
69 sv.remove_suffix(suffix);
// Walk over the lines; header_line is declared outside the loop because it is used
// again after the loop (see the checks and the return statement below).
86 std::size_t header_line = 0;
87 for (; header_line < lines.size(); ++header_line) {
89 const std::string_view line = lines[header_line];
94 if (header_line != 0) {
// presumably 9 == length of "@RELATION" - TODO confirm against the omitted condition
98 check_for_name(line, 9, 0);
// Attribute line whose upper-cased text contains "NUMERIC".
105 if (upper_case_line.find(
"NUMERIC") != std::string::npos) {
// 10 and 7 match the lengths of "@ATTRIBUTE" and "NUMERIC" - presumably that is what is stripped
107 const std::string_view
name = check_for_name(line, 10, 7);
// Class attribute: drop the "@ATTRIBUTE" and "CLASS" keywords by their lengths.
125 std::string_view sv{ line };
126 sv.remove_prefix(std::string_view{
"@ATTRIBUTE" }.size());
136 sv.remove_prefix(std::string_view{
"CLASS" }.size());
// Drop the first and the last character (presumably the enclosing braces - TODO confirm).
147 sv = sv.substr(1, sv.size() - 2);
// Split the remaining text at ',' into the individual label strings.
149 const std::vector<std::string_view> labels_split =
detail::split(sv,
',');
// Special handling of a single listed label; the consequence is not visible here.
150 if (labels_split.size() == 1) {
// Trim and convert every label string, collecting the results in the set.
154 for (
const std::string_view label : labels_split) {
155 labels.insert(detail::convert_to<label_type, invalid_file_format_exception>(
detail::trim(label)));
// A size mismatch means at least one label was listed more than once.
// NOTE(review): the message below reads "was/where"; "were" looks intended - confirm before changing the runtime string.
157 if (labels_split.size() != labels.size()) {
158 throw invalid_file_format_exception{ fmt::format(
"Provided {} labels but only {} of them was/where unique!", labels_split.size(), labels.size()) };
// Extra per-label pass that is only compiled for std::string labels; its body is not visible here.
161 if constexpr (std::is_same_v<label_type, std::string>) {
162 for (
const std::string_view label : labels_split) {
// Post-loop checks; the actions taken are not visible (presumably errors - TODO confirm).
186 if (num_features == 0) {
190 if (header_line + 1 >= lines.size()) {
// The label index is only reported if a label attribute was recorded, otherwise 0.
195 return std::make_tuple(num_features, header_line + 1, labels, has_label ? label_idx : 0);
// Parse all data points (and labels, if present) of an ARFF file.
//
// Template parameters:
//   real_type  - type of the stored feature values.
//   label_type - type of the stored labels.
// Parameter:
//   reader - file reader providing the lines of the file.
// Returns a tuple of
//   (num_data_points, num_features, data, label), where the label vector is empty
//   if the header declared no labels.
// Throws invalid_file_format_exception for out-of-range sparse indices, a wrong number
// of values in a line, or a label that was not announced in the header; an exception
// captured inside the parallel region is rethrown after it.
//
// NOTE(review): this listing omits several lines of the original definition (the `try`,
// else-branches, closing braces). The comments below only describe the visible statements.
235 template <
typename real_type,
typename label_type>
236 [[nodiscard]]
inline std::tuple<std::size_t, std::size_t, std::vector<std::vector<real_type>>, std::vector<label_type>>
parse_arff_data(
const file_reader &reader) {
// Header information, filled via std::tie from parse_arff_header.
240 std::size_t num_header_lines = 0;
241 std::size_t num_features = 0;
242 std::set<label_type> unique_label{};
243 std::size_t label_idx = 0;
244 std::tie(num_features, num_header_lines, unique_label, label_idx) = detail::io::parse_arff_header<label_type>(reader.
lines());
// Labels are considered present iff the header yielded at least one label.
245 const bool has_label = !unique_label.empty();
// Every line after the header is treated as one data point.
248 const std::size_t num_data_points = reader.
num_lines() - num_header_lines;
// Number of values per line: all features plus one if a label is present.
249 const std::size_t num_attributes = num_features +
static_cast<std::size_t
>(has_label);
// Allocate the full result storage up front; entries are written by index in the loop.
252 std::vector<std::vector<real_type>> data(num_data_points, std::vector<real_type>(num_features));
253 std::vector<label_type> label(num_data_points);
// Holds an exception caught inside the parallel region so that it can be rethrown afterwards.
255 std::exception_ptr parallel_exception;
257 #pragma omp parallel default(none) shared(reader, data, label, unique_label, parallel_exception) firstprivate(num_header_lines, num_features, num_attributes, has_label, label_idx)
260 for (std::size_t i = 0; i < data.size(); ++i) {
// Data line i sits num_header_lines lines into the file.
262 std::string_view line = reader.
line(i + num_header_lines);
// Sparse-format parsing state (the condition selecting this path is not visible here).
276 bool is_class_set =
false;
// presumably starts at 1 to skip an opening '{' - TODO confirm
277 std::string_view::size_type pos = 1;
// The attribute index ends at the next space.
279 std::string_view::size_type next_pos = line.find_first_of(
' ', pos);
281 if (next_pos == std::string_view::npos) {
// Convert the index text and reject indices beyond the declared attributes.
286 auto index = detail::convert_to<unsigned long, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
287 if (index >= num_attributes) {
289 throw invalid_file_format_exception{ fmt::format(
"Trying to add feature/label at index {} but the maximum index is {}!", index, num_attributes - 1) };
// The value ends at the next ',' or at the closing '}'.
294 next_pos = line.find_first_of(
",}", pos);
// The entry at the label index is stored as label, every other entry as feature.
297 if (has_label && index == label_idx) {
// bool labels get a dedicated compile-time branch; both visible branches perform the
// same conversion, any differing statements are not visible in this listing.
300 if constexpr (std::is_same_v<label_type, bool>) {
303 label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
305 label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
// NOTE(review): body not visible; presumably shifts `index` past the label column - TODO confirm
310 if (has_label && index > label_idx) {
313 data[i][index] = detail::convert_to<real_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
// Consume the processed entry including its delimiter.
317 line.remove_prefix(next_pos + 1);
// Labels expected but none was seen in this line; the action taken is not visible here.
322 if (has_label && !is_class_set) {
// Dense format: one comma-separated value per attribute.
332 const std::vector<std::string_view> line_split =
detail::split(line,
',');
333 if (line_split.size() != num_attributes) {
334 throw invalid_file_format_exception{ fmt::format(
"Invalid number of features and labels! Found {} but should be {}!", line_split.size(), num_attributes) };
336 for (std::size_t j = 0; j < num_attributes; ++j) {
// The column at label_idx holds the label, all other visible writes go to data.
337 if (has_label && label_idx == j) {
340 if constexpr (std::is_same_v<label_type, bool>) {
343 label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line_split[j]);
345 label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line_split[j]);
349 data[i][j] = detail::convert_to<real_type, invalid_file_format_exception>(line_split[j]);
// Every parsed label must be one of the labels announced in the header.
355 if (has_label && !
detail::contains(unique_label,
static_cast<label_type
>(label[i]))) {
356 throw invalid_file_format_exception{ fmt::format(
"Found the label \"{}\" which was not specified in the header ({{{}}})!", label[i], fmt::join(unique_label,
",")) };
// Exceptions are captured here instead of propagating out of the loop body.
358 }
catch (
const std::exception &) {
// Keep an exception only if none has been stored yet.
// NOTE(review): whether this check-and-store is synchronized is not visible in this listing.
362 if (!parallel_exception) {
363 parallel_exception = std::current_exception();
// Rethrow a captured exception once the parallel region is finished.
371 if (parallel_exception) {
372 std::rethrow_exception(parallel_exception);
// Without labels an empty label vector is returned instead of the pre-allocated one.
375 return std::make_tuple(num_data_points, num_features, std::move(data), has_label ? std::move(label) : std::vector<label_type>{});
// Write the data points (and optionally their labels) to the ARFF file `filename`.
//
// Template parameters:
//   real_type  - type of the feature values.
//   label_type - type of the labels.
//   has_label  - compile-time switch: if true, a class attribute and one label per
//                data point are written.
// Parameters:
//   filename - path of the file to write.
//   data     - the data points, one inner vector per data point.
//   label    - the labels; asserted to match data.size() if has_label is true and
//              asserted to be empty otherwise.
//
// NOTE(review): this listing omits several lines of the original definition (else-branches,
// closing braces, the body of the empty-data check). The comments below only describe the
// visible statements.
407 template <
typename real_type,
typename label_type,
bool has_label>
408 inline void write_arff_data_impl(
const std::string &filename,
const std::vector<std::vector<real_type>> &data,
const std::vector<label_type> &label) {
// Sanity checks on the label vector, depending on has_label.
409 if constexpr (has_label) {
410 PLSSVM_ASSERT(data.empty() || !label.empty(),
"has_label is 'true' but no labels were provided!");
411 PLSSVM_ASSERT(data.size() == label.size(),
"Number of data points ({}) and number of labels ({}) mismatch!", data.size(), label.size());
413 PLSSVM_ASSERT(label.empty(),
"has_label is 'false' but labels were provided!");
// Open the output file through fmt.
417 fmt::ostream out = fmt::output_file(filename);
// Empty data set: the action taken is not visible here (presumably an early return - TODO confirm).
421 const std::size_t num_data_points = data.size();
422 if (num_data_points == 0) {
// The feature count is taken from the first data point.
426 const std::size_t num_features = data.front().size();
// Header: a '%' line with the dimensions, the relation name, one NUMERIC attribute per feature.
427 out.print(
"% {}x{}\n", num_data_points, num_features);
429 out.print(
"@RELATION data_set\n");
431 for (std::size_t i = 0; i < num_features; ++i) {
432 out.print(
"@ATTRIBUTE feature_{} NUMERIC\n", i);
// With labels: list the distinct labels (collected in a std::set) as class attribute.
435 if constexpr (has_label) {
436 const std::set<label_type> available_labels{ label.begin(), label.end() };
437 out.print(
"@ATTRIBUTE class {{{}}}\n", fmt::join(available_labels,
","));
439 out.print(
"@DATA\n");
// Data section: lines are formatted into out_string and then printed.
442 #pragma omp parallel default(none) shared(out, data, label) firstprivate(num_data_points)
// presumably declared inside the parallel region, i.e. one buffer per thread - TODO confirm
445 std::string out_string;
446 #pragma omp for schedule(dynamic) nowait
447 for (std::size_t i = 0; i < num_data_points; ++i) {
// One line per data point: comma-separated features in "{:.10e}" format, followed by the label if present.
448 if constexpr (has_label) {
449 out_string.append(fmt::format(
"{:.10e},{}\n", fmt::join(data[i],
","), label[i]));
451 out_string.append(fmt::format(
"{:.10e}\n", fmt::join(data[i],
",")));
// Write the accumulated text to the file.
// NOTE(review): whether this print is protected by a critical section is not visible in this listing.
456 out.print(
"{}", out_string);
// Write the data points together with their labels to the ARFF file `filename`.
//
// Parameters:
//   filename - path of the file to write.
//   data     - the data points.
//   label    - the labels belonging to the data points.
//
// Forwards to write_arff_data_impl with has_label fixed to true.
488 template <
typename real_type,
typename label_type>
489 inline void write_arff_data(
const std::string &filename,
const std::vector<std::vector<real_type>> &data,
const std::vector<label_type> &label) {
490 write_arff_data_impl<real_type, label_type, true>(filename, data, label);
// Write the data points without labels to the ARFF file `filename`.
//
// Parameters:
//   filename - path of the file to write.
//   data     - the data points.
//
// Forwards to write_arff_data_impl with has_label fixed to false and an empty label
// vector; real_type is passed as label_type only to fill the template parameter.
518 template <
typename real_type>
519 inline void write_arff_data(
const std::string &filename,
const std::vector<std::vector<real_type>> &data) {
520 write_arff_data_impl<real_type, real_type, false>(filename, data, {});
#define PLSSVM_ASSERT(cond, msg,...)
Defines the PLSSVM_ASSERT macro if PLSSVM_ASSERT_ENABLED is defined.
Definition: assert.hpp:74
The plssvm::detail::file_reader class is responsible for reading a file and splitting it into its lin...
Definition: file_reader.hpp:42
std::string_view line(typename std::vector< std::string_view >::size_type pos) const
Return the line at position pos of the parsed file.
bool is_open() const noexcept
Checks whether this file_reader is currently associated with a file.
const std::vector< std::string_view > & lines() const noexcept
Return all lines present after the preprocessing.
std::vector< std::string_view >::size_type num_lines() const noexcept
Return the number of parsed lines (where all empty lines or lines starting with a comment are ignored...
Defines universal utility functions.
Implements custom exception classes derived from std::runtime_error including source location informa...
Implements a file reader class responsible for reading the input file and parsing it into lines.
Namespace containing implementation details for the IO related functions. Should not directly be used...
Definition: core.hpp:44
void write_arff_data(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the ARFF file filename.
Definition: arff_parsing.hpp:489
void write_arff_data_impl(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the ARFF file filename.
Definition: arff_parsing.hpp:408
std::tuple< std::size_t, std::size_t, std::set< label_type >, std::size_t > parse_arff_header(const std::vector< std::string_view > &lines)
Parse the ARFF file header, i.e., determine the number of features, the length of the ARFF header,...
Definition: arff_parsing.hpp:60
std::tuple< std::size_t, std::size_t, std::vector< std::vector< real_type > >, std::vector< label_type > > parse_arff_data(const file_reader &reader)
Parse all data points and potential label using the file reader, ignoring all empty lines and lines s...
Definition: arff_parsing.hpp:236
bool starts_with(std::string_view str, std::string_view sv) noexcept
Checks if the string str starts with the prefix sv.
std::string as_upper_case(std::string_view str)
Return a new string with the same content as str but all upper case.
std::string_view trim(std::string_view str) noexcept
Returns a new std::string_view equal to str where all leading and trailing whitespaces are removed.
std::string_view trim_left(std::string_view str) noexcept
Returns a new std::string_view equal to str where all leading whitespaces are removed.
bool ends_with(std::string_view str, std::string_view sv) noexcept
Checks if the string str ends with the suffix sv.
std::string current_date_time()
Return the current date time in the format "YYYY-MM-DD hh:mm:ss".
std::vector< std::string_view > split(std::string_view str, char delim=' ')
Split the string str at the positions with delimiter delim and return the sub-strings.
bool contains(std::string_view str, std::string_view sv) noexcept
Checks if the string str contains the string sv.
constexpr std::string_view name
The name of the library.
Definition: version.hpp:26
Defines (arithmetic) functions on std::vector and scalars.
Implements a conversion function from a string to an arithmetic type.
Implements utility functions for string manipulation and querying.