PLSSVM - Parallel Least Squares Support Vector Machine  2.0.0
A Least Squares Support Vector Machine implementation using different backends.
libsvm_parsing.hpp
Go to the documentation of this file.
1 
12 #ifndef PLSSVM_DETAIL_IO_LIBSVM_PARSING_HPP_
13 #define PLSSVM_DETAIL_IO_LIBSVM_PARSING_HPP_
14 #pragma once
15 
16 #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT
17 #include "plssvm/detail/io/file_reader.hpp" // plssvm::detail::io::file_reader
18 #include "plssvm/detail/string_conversion.hpp" // plssvm::detail::convert_to
19 #include "plssvm/detail/utility.hpp" // plssvm::detail::current_date_time
20 #include "plssvm/exceptions/exceptions.hpp" // plssvm::invalid_file_format_exception
21 
22 #include "fmt/compile.h" // FMT_COMPILE
23 #include "fmt/format.h" // fmt::format, fmt::format_to
24 #include "fmt/os.h" // fmt::ostream, fmt::output_file
25 
26 #include <algorithm> // std::max, std::min
27 #include <cstddef> // std::size_t
28 #include <exception> // std::exception, std::exception_ptr, std::current_exception, std::rethrow_exception
29 #include <string> // std::string
30 #include <string_view> // std::string_view
31 #include <tuple> // std::tuple, std::make_tuple
32 #include <utility> // std::move
33 #include <vector> // std::vector
34 
35 namespace plssvm::detail::io {
36 
47 [[nodiscard]] inline std::size_t parse_libsvm_num_features(const std::vector<std::string_view> &lines, const std::size_t skipped_lines = 0) {
48  std::size_t num_features = 0;
49  std::exception_ptr parallel_exception;
50 
51  #pragma omp parallel default(none) shared(lines, parallel_exception, num_features) firstprivate(skipped_lines)
52  {
53  #pragma omp for reduction(max : num_features)
54  for (std::size_t i = skipped_lines; i < lines.size(); ++i) {
55  try {
56  const std::string_view line = lines[i];
57 
58  // check index of last feature entry
59  const std::string_view::size_type pos_colon = line.find_last_of(':');
60  if (pos_colon == std::string_view::npos) {
61  // no features could be found -> can't contribute to the number of feature calculation
62  continue;
63  }
64  std::string_view::size_type pos_whitespace = line.find_last_of(' ', pos_colon);
65  if (pos_whitespace == std::string_view::npos) {
66  // no whitespace BEFORE the last colon could be found
67  // this may only happen if NO labels are given
68  pos_whitespace = 0;
69  }
70  const auto index = detail::convert_to<unsigned long, invalid_file_format_exception>(line.substr(pos_whitespace, pos_colon - pos_whitespace));
71  num_features = std::max<std::size_t>(num_features, index);
72  } catch (const std::exception &) {
73  // catch first exception and store it
74  #pragma omp critical
75  {
76  if (!parallel_exception) {
77  parallel_exception = std::current_exception();
78  }
79  }
80  }
81  }
82  }
83 
84  // rethrow if an exception occurred inside the parallel region
85  if (parallel_exception) {
86  std::rethrow_exception(parallel_exception);
87  }
88 
89  return num_features;
90 }
91 
117 template <typename real_type, typename label_type>
118 [[nodiscard]] inline std::tuple<std::size_t, std::size_t, std::vector<std::vector<real_type>>, std::vector<label_type>> parse_libsvm_data(const file_reader &reader, const std::size_t skipped_lines = 0) {
119  PLSSVM_ASSERT(reader.is_open(), "The file_reader is currently not associated with a file!");
120  // sanity check: can't skip more lines than are present
121  PLSSVM_ASSERT(skipped_lines <= reader.num_lines(), "Tried to skipp {} lines, but only {} are present!", skipped_lines, reader.num_lines());
122 
123  // parse sizes
124  const std::size_t num_data_points = reader.num_lines() - skipped_lines;
125  const std::size_t num_features = parse_libsvm_num_features(reader.lines(), skipped_lines);
126 
127  // no features were parsed -> invalid file
128  if (num_features == 0) {
129  throw invalid_file_format_exception{ fmt::format("Can't parse file: no data points are given!") };
130  }
131 
132  // create vector containing the data and label
133  std::vector<std::vector<real_type>> data(num_data_points);
134  std::vector<label_type> label(num_data_points);
135 
136  std::exception_ptr parallel_exception;
137  bool has_label = false;
138  bool has_no_label = false;
139 
140  #pragma omp parallel default(none) shared(reader, skipped_lines, data, label, parallel_exception, has_label, has_no_label) firstprivate(num_features)
141  {
142  #pragma omp for reduction(|| : has_label) reduction(|| : has_no_label)
143  for (typename std::vector<std::vector<real_type>>::size_type i = 0; i < data.size(); ++i) {
144  try {
145  std::string_view line = reader.line(skipped_lines + i);
146  unsigned long last_index = 0;
147 
148  // check if class labels are present (not necessarily the case for test files)
149  std::string_view::size_type pos = line.find_first_of(" \n");
150  const std::string_view::size_type first_colon = line.find_first_of(":\n");
151  if (first_colon >= pos) {
152  // get class or alpha
153  has_label = true;
154  if constexpr (std::is_same_v<label_type, bool>) {
155  // the std::vector<bool> template specialization is per C++ standard NOT thread safe
156  #pragma omp critical
157  label[i] = detail::convert_to<bool, invalid_file_format_exception>(line.substr(0, pos));
158  } else {
159  label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line.substr(0, pos));
160  }
161  } else {
162  has_no_label = true;
163  pos = 0;
164  }
165 
166  // get data
167  std::vector<real_type> vline(num_features);
168  while (true) {
169  std::string_view::size_type next_pos = line.find_first_of(':', pos);
170  // no further data points
171  if (next_pos == std::string_view::npos) {
172  break;
173  }
174 
175  // get index
176  auto index = detail::convert_to<unsigned long, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
177 
178  // LIBSVM assumes a 1-based indexing -> if the parsed index is 0 this condition is violated
179  if (index == 0) {
180  throw invalid_file_format_exception{ "LIBSVM assumes a 1-based feature indexing scheme, but 0 was given!" };
181  }
182  // the indices must be strictly increasing!
183  if (last_index >= index) {
184  throw invalid_file_format_exception{ fmt::format("The features indices must be strictly increasing, but {} is smaller or equal than {}!", index, last_index) };
185  }
186  last_index = index;
187 
188  // since arrays start at 0, reduce 1 based index by one
189  --index;
190  pos = next_pos + 1;
191 
192  // get value
193  next_pos = line.find_first_of(' ', pos);
194  vline[index] = detail::convert_to<real_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
195  pos = next_pos;
196  }
197  // move filled line to overall matrix
198  data[i] = std::move(vline);
199  } catch (const std::exception &) {
200  // catch first exception and store it
201  #pragma omp critical
202  {
203  if (!parallel_exception) {
204  parallel_exception = std::current_exception();
205  }
206  }
207  }
208  }
209  }
210 
211  // rethrow if an exception occurred inside the parallel region
212  if (parallel_exception) {
213  std::rethrow_exception(parallel_exception);
214  }
215  if (has_label && has_no_label) {
216  // some data points where given with labels, BUT some data pints where given without labels
217  throw invalid_file_format_exception{ "Inconsistent label specification found (some data points are labeled, others are not)!" };
218  }
219 
220  return std::make_tuple(num_data_points, num_features, std::move(data), !has_no_label ? std::move(label) : std::vector<label_type>{});
221 }
222 
243 template <typename real_type, typename label_type, bool has_label>
244 inline void write_libsvm_data_impl(const std::string &filename, const std::vector<std::vector<real_type>> &data, const std::vector<label_type> &label) {
245  if constexpr (has_label) {
246  PLSSVM_ASSERT(data.empty() || !label.empty(), "has_label is 'true' but no labels were provided!");
247  PLSSVM_ASSERT(data.size() == label.size(), "Number of data points ({}) and number of labels ({}) mismatch!", data.size(), label.size());
248  } else {
249  PLSSVM_ASSERT(label.empty(), "has_label is 'false' but labels were provided!");
250  }
251 
252  // create output file
253  fmt::ostream out = fmt::output_file(filename);
254  // write timestamp as current date time
255  out.print("# This data set has been created at {}\n", detail::current_date_time());
256 
257  const std::size_t num_data_points = data.size();
258  if (num_data_points == 0) {
259  // nothing to output
260  return;
261  }
262  const std::size_t num_features = data.front().size();
263  out.print("# {}x{}\n", num_data_points, num_features);
264 
265  // format one output-line
266  auto format_libsvm_line = [](std::string &output, const std::vector<real_type> &data_point) {
267  static constexpr std::size_t BLOCK_SIZE = 64;
268  static constexpr std::size_t CHARS_PER_BLOCK = 128;
269  static constexpr std::size_t BUFFER_SIZE = BLOCK_SIZE * CHARS_PER_BLOCK;
270  static std::array<char, BUFFER_SIZE> buffer;
271  #pragma omp threadprivate(buffer)
272 
273  for (typename std::vector<real_type>::size_type j = 0; j < data_point.size(); j += BLOCK_SIZE) {
274  char *ptr = buffer.data();
275  for (std::size_t i = 0; i < std::min<std::size_t>(BLOCK_SIZE, data_point.size() - j); ++i) {
276  if (data_point[j + i] != real_type{ 0.0 }) {
277  ptr = fmt::format_to(ptr, FMT_COMPILE("{}:{:.10e} "), j + i + 1, data_point[j + i]);
278  }
279  }
280  output.append(buffer.data(), ptr - buffer.data());
281  }
282  output.push_back('\n');
283  };
284 
285  #pragma omp parallel default(none) shared(out, data, label, format_libsvm_line)
286  {
287  // all support vectors
288  std::string out_string;
289  #pragma omp for schedule(dynamic) nowait
290  for (typename std::vector<real_type>::size_type i = 0; i < data.size(); ++i) {
291  if constexpr (has_label) {
292  out_string.append(fmt::format(FMT_COMPILE("{} "), label[i]));
293  }
294  format_libsvm_line(out_string, data[i]);
295  }
296 
297  #pragma omp critical
298  out.print("{}", out_string);
299  }
300 }
301 
321 template <typename real_type, typename label_type>
322 inline void write_libsvm_data(const std::string &filename, const std::vector<std::vector<real_type>> &data, const std::vector<label_type> &label) {
323  write_libsvm_data_impl<real_type, label_type, true>(filename, data, label);
324 }
325 
343 template <typename real_type>
344 inline void write_libsvm_data(const std::string &filename, const std::vector<std::vector<real_type>> &data) {
345  write_libsvm_data_impl<real_type, real_type, false>(filename, data, {});
346 }
347 
348 } // namespace plssvm::detail::io
349 
350 #endif // PLSSVM_DETAIL_IO_LIBSVM_PARSING_HPP_
Implements a custom assert macro PLSSVM_ASSERT.
#define PLSSVM_ASSERT(cond, msg,...)
Defines the PLSSVM_ASSERT macro if PLSSVM_ASSERT_ENABLED is defined.
Definition: assert.hpp:74
The plssvm::detail::file_reader class is responsible for reading a file and splitting it into its lines.
Definition: file_reader.hpp:42
std::string_view line(typename std::vector< std::string_view >::size_type pos) const
Return the pos line of the parsed file.
bool is_open() const noexcept
Checks whether this file_reader is currently associated with a file.
const std::vector< std::string_view > & lines() const noexcept
Return all lines present after the preprocessing.
std::vector< std::string_view >::size_type num_lines() const noexcept
Return the number of parsed lines (where all empty lines or lines starting with a comment are ignored).
Exception type thrown if the provided file has an invalid format for the selected parser (e....
Definition: exceptions.hpp:114
Defines universal utility functions.
Implements custom exception classes derived from std::runtime_error including source location information.
Implements a file reader class responsible for reading the input file and parsing it into lines.
Namespace containing implementation details for the IO related functions. Should not directly be used...
Definition: core.hpp:44
std::tuple< std::size_t, std::size_t, std::vector< std::vector< real_type > >, std::vector< label_type > > parse_libsvm_data(const file_reader &reader, const std::size_t skipped_lines=0)
Parse all data points and potential label using the file reader, ignoring all empty lines and lines starting with a comment.
Definition: libsvm_parsing.hpp:118
void write_libsvm_data_impl(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the LIBSVM file filename.
Definition: libsvm_parsing.hpp:244
void write_libsvm_data(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the LIBSVM file filename.
Definition: libsvm_parsing.hpp:322
std::size_t parse_libsvm_num_features(const std::vector< std::string_view > &lines, const std::size_t skipped_lines=0)
Parse the maximum number of features per data point given in lines, where the first skipped_lines are skipped.
Definition: libsvm_parsing.hpp:47
std::string current_date_time()
Return the current date time in the format "YYYY-MM-DD hh:mm:ss".
Implements a conversion function from a string to an arithmetic type.