PLSSVM - Parallel Least Squares Support Vector Machine  2.0.0
A Least Squares Support Vector Machine implementation using different backends.
arff_parsing.hpp
Go to the documentation of this file.
1 
12 #ifndef PLSSVM_DETAIL_IO_ARFF_PARSING_HPP_
13 #define PLSSVM_DETAIL_IO_ARFF_PARSING_HPP_
14 #pragma once
15 
16 #include "plssvm/detail/io/file_reader.hpp" // plssvm::detail::io::file_reader
17 #include "plssvm/detail/operators.hpp" // plssvm::operator::sign
18 #include "plssvm/detail/string_conversion.hpp" // plssvm::detail::convert_to
19 #include "plssvm/detail/string_utility.hpp" // plssvm::detail::{to_upper_case, as_upper_case, starts_with, ends_with}
20 #include "plssvm/detail/utility.hpp" // plssvm::detail::current_date_time
21 #include "plssvm/exceptions/exceptions.hpp" // plssvm::exception::invalid_file_format_exception
22 
23 #include "fmt/format.h" // fmt::format, fmt::join
24 #include "fmt/os.h" // fmt::ostream, fmt::output_file
25 
26 #include <cstddef> // std::size_t
27 #include <exception> // std::exception, std::exception_ptr, std::current_exception, std::rethrow_exception
28 #include <set> // std::set
29 #include <string> // std::string
30 #include <string_view> // std::string_view
31 #include <tuple> // std::tuple, std::make_tuple
32 #include <utility> // std::move
33 #include <vector> // std::vector
34 
35 namespace plssvm::detail::io {
36 
59 template <typename label_type>
60 [[nodiscard]] inline std::tuple<std::size_t, std::size_t, std::set<label_type>, std::size_t> parse_arff_header(const std::vector<std::string_view> &lines) {
61  std::size_t num_features = 0;
62  std::size_t label_idx = 0;
63  bool has_label = false;
64  std::set<label_type> labels{};
65 
66  const auto check_for_name = [](std::string_view line, const std::size_t prefix, const std::size_t suffix) {
67  std::string_view sv{ line };
68  sv.remove_prefix(prefix);
69  sv.remove_suffix(suffix);
70  sv = detail::trim(sv);
71 
72  // remaining string may not be empty
73  if (sv.empty()) {
74  throw invalid_file_format_exception{ fmt::format("The \"{}\" field must contain a name!", line) };
75  }
76  // check if string contains whitespaces -> must be quoted
77  if (detail::contains(sv, ' ') && !detail::starts_with(sv, '"') && !detail::ends_with(sv, '"')) {
78  throw invalid_file_format_exception{ fmt::format("A \"{}\" name that contains a whitespace must be quoted!", line) };
79  }
80 
81  // return name part of line
82  return sv;
83  };
84 
85  // parse arff header
86  std::size_t header_line = 0;
87  for (; header_line < lines.size(); ++header_line) {
88  // get next line and convert content to all upper case
89  const std::string_view line = lines[header_line];
90  const std::string upper_case_line = detail::as_upper_case(line);
91  // relation fields are ignored
92  if (detail::starts_with(upper_case_line, "@RELATION")) {
93  // if a relation is given, it must be given in the first line
94  if (header_line != 0) {
95  throw invalid_file_format_exception{ "The @RELATION attribute must be set before any other @ATTRIBUTE!" };
96  }
97  // the relation field must contain a name
98  check_for_name(line, 9, 0); // @RELATION is 9 chars long
99  // parse next line
100  continue;
101  }
102  // check for attribute fields
103  if (detail::starts_with(upper_case_line, "@ATTRIBUTE")) {
104  // check if a "normal" numeric feature has been found
105  if (upper_case_line.find("NUMERIC") != std::string::npos) {
106  // a numeric field must also contain a name
107  const std::string_view name = check_for_name(line, 10, 7); // @ATTRIBUTE is 10 chars long, NUMERIC 7 chars
108  // the attribute name "CLASS" is reserved!
109  if (detail::as_upper_case(name) == "CLASS") {
110  throw invalid_file_format_exception{ "May not use the combination of the reserved name \"class\" and attribute type NUMERIC!" };
111  }
112  // add a feature to the running count
113  ++num_features;
114  // increment class index as long as no class labels have been read
115  if (!has_label) {
116  ++label_idx;
117  }
118  continue;
119  }
120 
121  // only other valid line may be (nominal attribute with the name CLASS)
122  // @ATTRIBUTE CLASS {cat,dog}
123 
124  // remove attribute from string
125  std::string_view sv{ line };
126  sv.remove_prefix(std::string_view{ "@ATTRIBUTE" }.size());
127  sv = trim_left(sv);
128 
129  // if the line is valid, it must now start with CLASS
130  if (detail::starts_with(detail::as_upper_case(sv), "CLASS")) {
131  // only one class attribute is allowed
132  if (has_label) {
133  throw invalid_file_format_exception{ "A nominal attribute with the name CLASS may only be provided once!" };
134  }
135  // check if the nominal attribute ist enclosed in curly braces
136  sv.remove_prefix(std::string_view{ "CLASS" }.size());
137  sv = detail::trim(sv);
138  // the class labels must be given
139  if (sv.empty()) {
140  throw invalid_file_format_exception{ fmt::format("The \"{}\" field must contain class labels!", line) };
141  }
142  // check if string contains whitespaces -> must be quoted
143  if (!detail::starts_with(sv, '{') && !detail::ends_with(sv, '}')) {
144  throw invalid_file_format_exception{ fmt::format("The \"{}\" nominal attribute must be enclosed with {{}}!", line) };
145  }
146  // remove curly braces
147  sv = sv.substr(1, sv.size() - 2);
148  // split string with delimiter ',' to check the number of provided classes
149  const std::vector<std::string_view> labels_split = detail::split(sv, ',');
150  if (labels_split.size() == 1) {
151  throw invalid_file_format_exception{ "Only a single label has been provided!" };
152  }
153  // check whether only unique labels have been provided
154  for (const std::string_view label : labels_split) {
155  labels.insert(detail::convert_to<label_type, invalid_file_format_exception>(detail::trim(label)));
156  }
157  if (labels_split.size() != labels.size()) {
158  throw invalid_file_format_exception{ fmt::format("Provided {} labels but only {} of them was/where unique!", labels_split.size(), labels.size()) };
159  }
160  // check whether a string label contains a whitespace
161  if constexpr (std::is_same_v<label_type, std::string>) {
162  for (const std::string_view label : labels_split) {
163  if (detail::contains(detail::trim(label), ' ')) {
164  throw invalid_file_format_exception{ fmt::format("String labels may not contain whitespaces, but \"{}\" has at least one!", detail::trim(label)) };
165  }
166  }
167  }
168  // found a class
169  has_label = true;
170  continue; // don't increment num_features
171  }
172  }
173  // check for the data field
174  if (detail::starts_with(upper_case_line, "@DATA")) {
175  // finished reading header -> start parsing data
176  break;
177  }
178  // check if the line starts with an @ but is not a valid attribute
179  if (detail::starts_with(upper_case_line, "@")) {
180  // an invalid or unsupported header entry has been read!
181  throw invalid_file_format_exception{ fmt::format("Read an invalid header entry: \"{}\"!", line) };
182  }
183  }
184 
185  // perform some additional checks
186  if (num_features == 0) {
187  // no @ATTRIBUTE fields
188  throw invalid_file_format_exception{ "Can't parse file: no feature ATTRIBUTES are defined!" };
189  }
190  if (header_line + 1 >= lines.size()) {
191  // no data points provided
192  throw invalid_file_format_exception{ "Can't parse file: @DATA is missing!" };
193  }
194 
195  return std::make_tuple(num_features, header_line + 1, labels, has_label ? label_idx : 0);
196 }
197 
235 template <typename real_type, typename label_type>
236 [[nodiscard]] inline std::tuple<std::size_t, std::size_t, std::vector<std::vector<real_type>>, std::vector<label_type>> parse_arff_data(const file_reader &reader) {
237  PLSSVM_ASSERT(reader.is_open(), "The file_reader is currently not associated with a file!");
238 
239  // parse arff header, structured bindings can't be used because of the OpenMP parallel section
240  std::size_t num_header_lines = 0;
241  std::size_t num_features = 0;
242  std::set<label_type> unique_label{};
243  std::size_t label_idx = 0;
244  std::tie(num_features, num_header_lines, unique_label, label_idx) = detail::io::parse_arff_header<label_type>(reader.lines());
245  const bool has_label = !unique_label.empty();
246 
247  // calculate data set sizes
248  const std::size_t num_data_points = reader.num_lines() - num_header_lines;
249  const std::size_t num_attributes = num_features + static_cast<std::size_t>(has_label);
250 
251  // create data and label vectors
252  std::vector<std::vector<real_type>> data(num_data_points, std::vector<real_type>(num_features));
253  std::vector<label_type> label(num_data_points);
254 
255  std::exception_ptr parallel_exception;
256 
257  #pragma omp parallel default(none) shared(reader, data, label, unique_label, parallel_exception) firstprivate(num_header_lines, num_features, num_attributes, has_label, label_idx)
258  {
259  #pragma omp for
260  for (std::size_t i = 0; i < data.size(); ++i) {
261  try {
262  std::string_view line = reader.line(i + num_header_lines);
263  // there must not be any @ inside the data section
264  if (detail::starts_with(line, '@')) {
265  throw invalid_file_format_exception{ fmt::format("Read @ inside data section!: \"{}\"!", line) };
266  }
267 
268  // parse sparse or dense data point definition
269  // a sparse data point must start with a opening curly brace
270  if (detail::starts_with(line, '{')) {
271  // -> sparse data point given, but the closing brace is missing
272  if (!detail::ends_with(line, '}')) {
273  throw invalid_file_format_exception{ fmt::format("Missing closing '}}' for sparse data point \"{}\" description!", line) };
274  }
275  // parse the sparse line
276  bool is_class_set = false;
277  std::string_view::size_type pos = 1;
278  while (true) {
279  std::string_view::size_type next_pos = line.find_first_of(' ', pos);
280  // no further data points
281  if (next_pos == std::string_view::npos) {
282  break;
283  }
284 
285  // get index
286  auto index = detail::convert_to<unsigned long, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
287  if (index >= num_attributes) {
288  // index too big for specified number of features
289  throw invalid_file_format_exception{ fmt::format("Trying to add feature/label at index {} but the maximum index is {}!", index, num_attributes - 1) };
290  }
291  pos = next_pos + 1;
292 
293  // get position of next value
294  next_pos = line.find_first_of(",}", pos);
295 
296  // write parsed value depending on the index
297  if (has_label && index == label_idx) {
298  // write label value
299  is_class_set = true;
300  if constexpr (std::is_same_v<label_type, bool>) {
301  // the std::vector<bool> template specialization is per C++ standard NOT thread safe
302  #pragma omp critical
303  label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
304  } else {
305  label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
306  }
307  } else {
308  // write feature valuehas a whitespace!
309  // if the feature index is larger than the label index, the index must be reduced in order to write the feature to the correct data index
310  if (has_label && index > label_idx) {
311  --index;
312  }
313  data[i][index] = detail::convert_to<real_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
314  }
315 
316  // remove already processes part of the line
317  line.remove_prefix(next_pos + 1);
318  line = detail::trim_left(line);
319  pos = 0;
320  }
321  // there should be a class label but none has been found
322  if (has_label && !is_class_set) {
323  throw invalid_file_format_exception{ fmt::format("Missing label for data point \"{}\"!", reader.line(i + num_header_lines)) };
324  }
325  } else {
326  // check if the last character is a closing brace
327  if (detail::ends_with(line, '}')) {
328  // no dense line given but a sparse line with a missing opening brace
329  throw invalid_file_format_exception{ fmt::format("Missing opening '{{' for sparse data point \"{}\" description!", line) };
330  }
331  // dense line given
332  const std::vector<std::string_view> line_split = detail::split(line, ',');
333  if (line_split.size() != num_attributes) {
334  throw invalid_file_format_exception{ fmt::format("Invalid number of features and labels! Found {} but should be {}!", line_split.size(), num_attributes) };
335  }
336  for (std::size_t j = 0; j < num_attributes; ++j) {
337  if (has_label && label_idx == j) {
338  // found a label
339 
340  if constexpr (std::is_same_v<label_type, bool>) {
341  // the std::vector<bool> template specialization is per C++ standard NOT thread safe
342  #pragma omp critical
343  label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line_split[j]);
344  } else {
345  label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line_split[j]);
346  }
347  } else {
348  // found data point
349  data[i][j] = detail::convert_to<real_type, invalid_file_format_exception>(line_split[j]);
350  }
351  }
352  }
353 
354  // check if the parsed label is one of the labels specified in the ARFF file header
355  if (has_label && !detail::contains(unique_label, static_cast<label_type>(label[i]))) {
356  throw invalid_file_format_exception{ fmt::format("Found the label \"{}\" which was not specified in the header ({{{}}})!", label[i], fmt::join(unique_label, ",")) };
357  }
358  } catch (const std::exception &) {
359  // catch first exception and store it
360  #pragma omp critical
361  {
362  if (!parallel_exception) {
363  parallel_exception = std::current_exception();
364  }
365  }
366  }
367  }
368  }
369 
370  // rethrow if an exception occurred inside the parallel region
371  if (parallel_exception) {
372  std::rethrow_exception(parallel_exception);
373  }
374 
375  return std::make_tuple(num_data_points, num_features, std::move(data), has_label ? std::move(label) : std::vector<label_type>{});
376 }
377 
407 template <typename real_type, typename label_type, bool has_label>
408 inline void write_arff_data_impl(const std::string &filename, const std::vector<std::vector<real_type>> &data, const std::vector<label_type> &label) {
409  if constexpr (has_label) {
410  PLSSVM_ASSERT(data.empty() || !label.empty(), "has_label is 'true' but no labels were provided!");
411  PLSSVM_ASSERT(data.size() == label.size(), "Number of data points ({}) and number of labels ({}) mismatch!", data.size(), label.size());
412  } else {
413  PLSSVM_ASSERT(label.empty(), "has_label is 'false' but labels were provided!");
414  }
415 
416  // create file
417  fmt::ostream out = fmt::output_file(filename);
418  // write arff header with current time stamp
419  out.print("% This data set has been created at {}\n", detail::current_date_time());
420 
421  const std::size_t num_data_points = data.size();
422  if (num_data_points == 0) {
423  // nothing to output
424  return;
425  }
426  const std::size_t num_features = data.front().size();
427  out.print("% {}x{}\n", num_data_points, num_features);
428 
429  out.print("@RELATION data_set\n");
430  // write arff header for features
431  for (std::size_t i = 0; i < num_features; ++i) {
432  out.print("@ATTRIBUTE feature_{} NUMERIC\n", i);
433  }
434  // write arff header for the label if existing
435  if constexpr (has_label) {
436  const std::set<label_type> available_labels{ label.begin(), label.end() };
437  out.print("@ATTRIBUTE class {{{}}}\n", fmt::join(available_labels, ","));
438  }
439  out.print("@DATA\n");
440 
441  // write arff data
442  #pragma omp parallel default(none) shared(out, data, label) firstprivate(num_data_points)
443  {
444  // all support vectors
445  std::string out_string;
446  #pragma omp for schedule(dynamic) nowait
447  for (std::size_t i = 0; i < num_data_points; ++i) {
448  if constexpr (has_label) {
449  out_string.append(fmt::format("{:.10e},{}\n", fmt::join(data[i], ","), label[i]));
450  } else {
451  out_string.append(fmt::format("{:.10e}\n", fmt::join(data[i], ",")));
452  }
453  }
454 
455  #pragma omp critical
456  out.print("{}", out_string);
457  }
458 }
459 
488 template <typename real_type, typename label_type>
489 inline void write_arff_data(const std::string &filename, const std::vector<std::vector<real_type>> &data, const std::vector<label_type> &label) {
490  write_arff_data_impl<real_type, label_type, true>(filename, data, label);
491 }
492 
518 template <typename real_type>
519 inline void write_arff_data(const std::string &filename, const std::vector<std::vector<real_type>> &data) {
520  write_arff_data_impl<real_type, real_type, false>(filename, data, {});
521 }
522 
523 } // namespace plssvm::detail::io
524 
525 #endif // PLSSVM_DETAIL_IO_ARFF_PARSING_HPP_
#define PLSSVM_ASSERT(cond, msg,...)
Defines the PLSSVM_ASSERT macro if PLSSVM_ASSERT_ENABLED is defined.
Definition: assert.hpp:74
The plssvm::detail::file_reader class is responsible for reading a file and splitting it into its lin...
Definition: file_reader.hpp:42
std::string_view line(typename std::vector< std::string_view >::size_type pos) const
Return the line at position pos of the parsed file.
bool is_open() const noexcept
Checks whether this file_reader is currently associated with a file.
const std::vector< std::string_view > & lines() const noexcept
Return all lines present after the preprocessing.
std::vector< std::string_view >::size_type num_lines() const noexcept
Return the number of parsed lines (where all empty lines or lines starting with a comment are ignored...
Exception type thrown if the provided file has an invalid format for the selected parser (e....
Definition: exceptions.hpp:114
Defines universal utility functions.
Implements custom exception classes derived from std::runtime_error including source location informa...
Implements a file reader class responsible for reading the input file and parsing it into lines.
Namespace containing implementation details for the IO related functions. Should not directly be used...
Definition: core.hpp:44
void write_arff_data(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the ARFF file filename.
Definition: arff_parsing.hpp:489
void write_arff_data_impl(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the ARFF file filename.
Definition: arff_parsing.hpp:408
std::tuple< std::size_t, std::size_t, std::set< label_type >, std::size_t > parse_arff_header(const std::vector< std::string_view > &lines)
Parse the ARFF file header, i.e., determine the number of features, the length of the ARFF header,...
Definition: arff_parsing.hpp:60
std::tuple< std::size_t, std::size_t, std::vector< std::vector< real_type > >, std::vector< label_type > > parse_arff_data(const file_reader &reader)
Parse all data points and potential label using the file reader, ignoring all empty lines and lines s...
Definition: arff_parsing.hpp:236
bool starts_with(std::string_view str, std::string_view sv) noexcept
Checks if the string str starts with the prefix sv.
std::string as_upper_case(std::string_view str)
Return a new string with the same content as str but all upper case.
std::string_view trim(std::string_view str) noexcept
Returns a new std::string_view equal to str where all leading and trailing whitespaces are removed.
std::string_view trim_left(std::string_view str) noexcept
Returns a new std::string_view equal to str where all leading whitespaces are removed.
bool ends_with(std::string_view str, std::string_view sv) noexcept
Checks if the string str ends with the suffix sv.
std::string current_date_time()
Return the current date time in the format "YYYY-MM-DD hh:mm:ss".
std::vector< std::string_view > split(std::string_view str, char delim=' ')
Split the string str at the positions with delimiter delim and return the sub-strings.
bool contains(std::string_view str, std::string_view sv) noexcept
Checks if the string str contains the string sv.
constexpr std::string_view name
The name of the library.
Definition: version.hpp:26
Defines (arithmetic) functions on std::vector and scalars.
Implements a conversion function from a string to an arithmetic type.
Implements utility functions for string manipulation and querying.