PLSSVM - Parallel Least Squares Support Vector Machine  2.0.0
A Least Squares Support Vector Machine implementation using different backends.
libsvm_model_parsing.hpp
Go to the documentation of this file.
1 
12 #ifndef PLSSVM_DETAIL_IO_LIBSVM_MODEL_PARSING_HPP_
13 #define PLSSVM_DETAIL_IO_LIBSVM_MODEL_PARSING_HPP_
14 #pragma once
15 
16 #include "plssvm/data_set.hpp" // plssvm::data_set
17 #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT
18 #include "plssvm/detail/logger.hpp" // plssvm::detail::log, plssvm::verbosity_level
19 #include "plssvm/detail/utility.hpp" // plssvm::detail::current_date_time
20 #include "plssvm/parameter.hpp" // plssvm::parameter
21 
22 #include "fmt/compile.h" // FMT_COMPILE
23 #include "fmt/format.h" // fmt::format, fmt::format_to
24 #include "fmt/os.h" // fmt::ostream, fmt::output_file
25 #ifdef _OPENMP
26  #include <omp.h> // omp_get_num_threads
27 #endif
28 
29 #include <algorithm> // std::min, std::fill
30 #include <cstddef> // std::size_t
31 #include <map> // std::map
32 #include <memory> // std::unique_ptr
33 #include <numeric> // std::accumulate
34 #include <set> // std::set
35 #include <sstream> // std::stringstream
36 #include <string> // std::string
37 #include <string_view> // std::string_view
38 #include <tuple> // std::tuple, std::make_tuple
39 #include <utility> // std::move, std::pair
40 #include <vector> // std::vector
41 
42 namespace plssvm::detail::io {
43 
82 template <typename real_type, typename label_type, typename size_type>
83 [[nodiscard]] inline std::tuple<plssvm::parameter, real_type, std::vector<label_type>, std::size_t> parse_libsvm_model_header(const std::vector<std::string_view> &lines) {
84  // data to read
85  plssvm::parameter params{};
86  real_type rho{};
87  size_type num_support_vectors{};
88 
89  // helper variables
90  bool svm_type_set{ false };
91  bool kernel_type_set{ false };
92  bool nr_class_set{ false };
93  bool total_sv_set{ false };
94  bool rho_set{ false };
95  bool label_set{ false };
96  bool nr_sv_set{ false };
97  size_type nr_class{};
98  std::vector<label_type> labels{};
99  std::vector<size_type> num_support_vectors_per_class{};
100 
101  // parse libsvm model file header
102  std::size_t header_line = 0;
103  {
104  for (; header_line < lines.size(); ++header_line) {
105  // get the current line and convert it to lower case
106  std::string line{ detail::trim(lines[header_line]) };
107  detail::to_lower_case(line);
108 
109  // separate value from model header entry
110  std::string_view value{ line };
111  value.remove_prefix(std::min(value.find_first_of(' ') + 1, value.size()));
112  value = detail::trim_left(value);
113 
114  if (detail::starts_with(line, "svm_type")) {
115  // svm_type must be c_svc
116  if (value != "c_svc") {
117  throw invalid_file_format_exception{ fmt::format("Can only use c_svc as svm_type, but '{}' was given!", value) };
118  }
119  // read the svm_type
120  svm_type_set = true;
121  } else if (detail::starts_with(line, "kernel_type")) {
122  // parse kernel_type, must be linear, polynomial or rbf
123  std::istringstream iss{ std::string{ value } };
124  iss >> params.kernel_type;
125  if (iss.fail()) {
126  throw invalid_file_format_exception{ fmt::format("Unrecognized kernel type '{}'!", value) };
127  }
128  // read the kernel_type
129  kernel_type_set = true;
130  } else if (detail::starts_with(line, "gamma")) {
131  // parse gamma
132  params.gamma = detail::convert_to<typename decltype(params.gamma)::value_type>(value);
133  } else if (detail::starts_with(line, "degree")) {
134  // parse degree
135  params.degree = detail::convert_to<typename decltype(params.degree)::value_type>(value);
136  } else if (detail::starts_with(line, "coef0")) {
137  // parse coef0
138  params.coef0 = detail::convert_to<typename decltype(params.coef0)::value_type>(value);
139  } else if (detail::starts_with(line, "nr_class")) {
140  // number of classes must be 2
141  nr_class = detail::convert_to<unsigned long long>(value);
142  // read the number of classes (number of different labels)
143  nr_class_set = true;
144  } else if (detail::starts_with(line, "total_sv")) {
145  // the total number of support vectors must be greater than 0
146  num_support_vectors = detail::convert_to<size_type>(value);
147  if (num_support_vectors == 0) {
148  throw invalid_file_format_exception{ "The number of support vectors must be greater than 0!" };
149  }
150  // read the number of support vectors
151  total_sv_set = true;
152  } else if (detail::starts_with(line, "rho")) {
153  // parse rho, required
154  rho = detail::convert_to<real_type>(value);
155  // read the rho value
156  rho_set = true;
157  } else if (detail::starts_with(line, "label")) {
158  // parse available label, note: we can't use value here since we want to preserve the case of the labels
159  std::string_view original_line = detail::trim(lines[header_line]);
160  original_line.remove_prefix(std::min(original_line.find_first_of(' ') + 1, original_line.size()));
161  original_line = detail::trim_left(original_line);
162  labels = detail::split_as<label_type>(original_line, ' ');
163  if (labels.size() < 2) {
164  throw invalid_file_format_exception{ fmt::format("At least two labels must be set, but only {} label ([{}]) was given!", labels.size(), fmt::join(labels, ", ")) };
165  }
166  // check if all labels are unique
167  std::set<label_type> unique_labels{};
168  for (const label_type &label : labels) {
169  unique_labels.insert(label);
170  }
171  if (labels.size() != unique_labels.size()) {
172  throw invalid_file_format_exception{ fmt::format("Provided {} labels but only {} of them was/where unique!", labels.size(), unique_labels.size()) };
173  }
174  // read the labels
175  label_set = true;
176  } else if (detail::starts_with(line, "nr_sv")) {
177  // parse number of support vectors per class
178  num_support_vectors_per_class = detail::split_as<size_type>(value, ' ');
179  if (num_support_vectors_per_class.size() < 2) {
180  throw invalid_file_format_exception{ fmt::format("At least two nr_sv must be set, but only {} ([{}]) was given!", num_support_vectors_per_class.size(), fmt::join(num_support_vectors_per_class, ", ")) };
181  }
182  // read the number of support vectors per class
183  nr_sv_set = true;
184  } else if (line == "sv") {
185  // start parsing support vectors, required
186  break;
187  } else {
188  throw invalid_file_format_exception{ fmt::format("Unrecognized header entry '{}'! Maybe SV is missing?", lines[header_line]) };
189  }
190  }
191  }
192 
193  // additional sanity checks
194  if (!svm_type_set) {
195  throw invalid_file_format_exception{ "Missing svm_type!" };
196  }
197  if (!kernel_type_set) {
198  throw invalid_file_format_exception{ "Missing kernel_type!" };
199  }
200  // check provided values based on kernel_type
201  switch (params.kernel_type) {
203  if (!params.degree.is_default()) {
204  throw invalid_file_format_exception{ "Explicitly provided a value for the degree parameter which is not used in the linear kernel!" };
205  }
206  if (!params.gamma.is_default()) {
207  throw invalid_file_format_exception{ "Explicitly provided a value for the gamma parameter which is not used in the linear kernel!" };
208  }
209  if (!params.coef0.is_default()) {
210  throw invalid_file_format_exception{ "Explicitly provided a value for the coef0 parameter which is not used in the linear kernel!" };
211  }
212  break;
214  break;
216 
217  if (!params.degree.is_default()) {
218  throw invalid_file_format_exception{ "Explicitly provided a value for the degree parameter which is not used in the radial basis function kernel!" };
219  }
220  if (!params.coef0.is_default()) {
221  throw invalid_file_format_exception{ "Explicitly provided a value for the coef0 parameter which is not used in the radial basis function kernel!" };
222  }
223  break;
224  }
225  if (!nr_class_set) {
226  throw invalid_file_format_exception{ "Missing number of different classes nr_class!" };
227  }
228  if (!total_sv_set) {
229  throw invalid_file_format_exception{ "Missing total number of support vectors total_sv!" };
230  }
231  if (!rho_set) {
232  throw invalid_file_format_exception{ "Missing rho value!" };
233  }
234  if (!label_set) {
235  throw invalid_file_format_exception{ "Missing class label specification!" };
236  }
237  // number of different labels must match the number of classes
238  if (nr_class != labels.size()) {
239  throw invalid_file_format_exception{ fmt::format("The number of classes (nr_class) is {}, but the provided number of different labels is {} (label)!", nr_class, labels.size()) };
240  }
241  if (!nr_sv_set) {
242  throw invalid_file_format_exception{ "Missing number of support vectors per class nr_sv!" };
243  }
244  // number of different label numbers must match the number of classes
245  if (nr_class != num_support_vectors_per_class.size()) {
246  throw invalid_file_format_exception{ fmt::format("The number of classes (nr_class) is {}, but the provided number of different labels is {} (nr_sv)!", nr_class, num_support_vectors_per_class.size()) };
247  }
248  // calculate the number of support as sum of the support vectors per class
249  const auto nr_sv_sum = std::accumulate(num_support_vectors_per_class.begin(), num_support_vectors_per_class.end(), size_type{ 0 });
250  if (nr_sv_sum != num_support_vectors) {
251  throw invalid_file_format_exception{ fmt::format("The total number of support vectors is {}, but the sum of nr_sv is {}!", num_support_vectors, nr_sv_sum) };
252  }
253  // check if no support vectors are given
254  if (header_line + 1 >= lines.size()) {
255  throw invalid_file_format_exception{ "Can't parse file: no support vectors are given or SV is missing!" };
256  }
257 
258  // set label according to model file definition
259  std::vector<label_type> data_labels(num_support_vectors);
260  std::size_t pos = 0;
261  for (size_type i = 0; i < labels.size(); ++i) {
262  std::fill(data_labels.begin() + pos, data_labels.begin() + pos + num_support_vectors_per_class[i], labels[i]);
263  pos += num_support_vectors_per_class[i];
264  }
265 
266  // current limitation
267  if (nr_class != 2) {
268  throw invalid_file_format_exception{ fmt::format("Currently only binary classification is supported, but {} different label where given!", nr_class) };
269  }
270 
271  return std::make_tuple(params, rho, std::move(data_labels), header_line + 1);
272 }
273 
295 template <typename real_type, typename label_type>
296 [[nodiscard]] inline std::vector<label_type> write_libsvm_model_header(fmt::ostream &out, const plssvm::parameter &params, const real_type rho, const data_set<real_type, label_type> &data) {
297  PLSSVM_ASSERT(data.has_labels(), "Cannot write a model file that does not include labels!");
298 
299  // save model file header
300  std::string out_string = fmt::format("svm_type c_svc\nkernel_type {}\n", params.kernel_type);
301  // save the SVM parameter information based on the used kernel_type
302  switch (params.kernel_type) {
304  break;
306  out_string += fmt::format("degree {}\ngamma {}\ncoef0 {}\n", params.degree, params.gamma, params.coef0);
307  break;
309  out_string += fmt::format("gamma {}\n", params.gamma);
310  break;
311  }
312 
313  // get the original labels (not the mapped once)
314  const std::vector<label_type> label_values = data.different_labels().value();
315 
316  // count the occurrence of each label
317  std::map<label_type, std::size_t> label_counts_map;
318  const std::vector<label_type> labels = data.labels().value();
319  for (const label_type &l : labels) {
320  ++label_counts_map[l];
321  }
322  // fill vector with number of occurrences in correct order
323  std::vector<std::size_t> label_counts(data.num_different_labels());
324  for (typename data_set<real_type, label_type>::size_type i = 0; i < data.num_different_labels(); ++i) {
325  label_counts[i] = label_counts_map[label_values[i]];
326  }
327 
328  out_string += fmt::format("nr_class {}\nlabel {}\ntotal_sv {}\nnr_sv {}\nrho {}\nSV\n",
329  data.num_different_labels(),
330  fmt::join(label_values, " "),
331  data.num_data_points(),
332  fmt::join(label_counts, " "),
333  rho);
334 
335  // print model header
337  "\n{}\n", out_string);
338  // write model header to file
339  out.print("{}", out_string);
340 
341  return label_values;
342 }
343 
370 template <typename real_type, typename label_type>
371 inline void write_libsvm_model_data(const std::string &filename, const plssvm::parameter &params, const real_type rho, const std::vector<real_type> &alpha, const data_set<real_type, label_type> &data) {
372  PLSSVM_ASSERT(data.has_labels(), "Cannot write a model file that does not include labels!");
373  PLSSVM_ASSERT(alpha.size() == data.num_data_points(), "The number of weights ({}) doesn't match the number of data points ({})!", alpha.size(), data.num_data_points());
374 
375  const std::vector<std::vector<real_type>> &support_vectors = data.data();
376  const std::vector<label_type> &labels = data.labels().value();
377  const std::size_t num_features = data.num_features();
378 
379  // create file
380  fmt::ostream out = fmt::output_file(filename);
381  // write timestamp for current date time
382  out.print("# This model file has been created at {}\n", detail::current_date_time());
383 
384  // write header information
385  const std::vector<label_type> label_order = write_libsvm_model_header(out, params, rho, data);
386 
387  // the maximum size of one formatted LIBSVM entry, e.g., 1234:1.365363e+10
388  // biggest number representable as std::size_t: 18446744073709551615 -> 20 chars
389  // scientific notation: 3 chars (number in front of decimal separator including a sign + decimal separator) + 10 chars (part after the decimal separator, specified during formatting) +
390  // 5 chars exponent (e + sign + maximum potential exponent (308 -> 3 digits)
391  // separators: 2 chars (: between index and feature + whitespace after feature value)
392  // -> 40 chars in total
393  // -> increased to 48 chars to be on the safe side
394  static constexpr std::size_t CHARS_PER_BLOCK = 48;
395  // results in 48 B * 128 B = 6 KiB stack buffer per thread
396  static constexpr std::size_t BLOCK_SIZE = 128;
397  // use 1 MiB as buffer per thread
398  constexpr std::size_t STRING_BUFFER_SIZE = 1024 * 1024;
399 
400  // format one output-line
401  auto format_libsvm_line = [](std::string &output, const real_type a, const std::vector<real_type> &d) {
402  static constexpr std::size_t STACK_BUFFER_SIZE = BLOCK_SIZE * CHARS_PER_BLOCK;
403  static char buffer[STACK_BUFFER_SIZE];
404  #pragma omp threadprivate(buffer)
405 
406  output.append(fmt::format(FMT_COMPILE("{:.10e} "), a));
407  for (typename std::vector<real_type>::size_type j = 0; j < d.size(); j += BLOCK_SIZE) {
408  char *ptr = buffer;
409  for (std::size_t i = 0; i < std::min<std::size_t>(BLOCK_SIZE, d.size() - j); ++i) {
410  if (d[j + i] != real_type{ 0.0 }) {
411  // add 1 to the index since LIBSVM assumes 1-based feature indexing
412  ptr = fmt::format_to(ptr, FMT_COMPILE("{}:{:.10e} "), j + i + 1, d[j + i]);
413  }
414  }
415  output.append(buffer, ptr - buffer);
416  }
417  output.push_back('\n');
418  };
419 
420  // initialize volatile array
421  auto counts = std::make_unique<volatile int[]>(label_order.size());
422  #pragma omp parallel default(none) shared(counts, alpha, format_libsvm_line, label_order, labels, support_vectors, out) firstprivate(BLOCK_SIZE, CHARS_PER_BLOCK, num_features)
423  {
424  // preallocate string buffer, only ONE allocation
425  std::string out_string;
426  out_string.reserve(STRING_BUFFER_SIZE + (num_features + 1) * CHARS_PER_BLOCK);
427 
428  // support vectors with the first class
429  #pragma omp for nowait
430  for (typename std::vector<real_type>::size_type i = 0; i < alpha.size(); ++i) {
431  if (labels[i] == label_order[0]) {
432  format_libsvm_line(out_string, alpha[i], support_vectors[i]);
433 
434  // if the buffer is full, write it to the file
435  if (out_string.size() > STRING_BUFFER_SIZE) {
436  #pragma omp critical
437  {
438  out.print("{}", out_string);
439  #pragma omp flush(out)
440  }
441  // clear buffer
442  out_string.clear();
443  }
444  }
445  }
446 
447  #pragma omp critical
448  {
449  if (!out_string.empty()) {
450  out.print("{}", out_string);
451  out_string.clear();
452  }
453  counts[0] = counts[0] + 1;
454  #pragma omp flush(counts, out)
455  }
456 
457  for (typename std::vector<label_type>::size_type l = 1; l < label_order.size(); ++l) {
458  // the support vectors with the i-th class
459 
460  #pragma omp for nowait
461  for (typename std::vector<real_type>::size_type i = 0; i < alpha.size(); ++i) {
462  if (labels[i] == label_order[l]) {
463  format_libsvm_line(out_string, alpha[i], support_vectors[i]);
464 
465  // if the buffer is full, write it to the file
466  if (out_string.size() > STRING_BUFFER_SIZE) {
467  #pragma omp critical
468  {
469  out.print("{}", out_string);
470  #pragma omp flush(out)
471  }
472  // clear buffer
473  out_string.clear();
474  }
475  }
476  }
477  // wait for all threads to write support vectors for previous class
478 #ifdef _OPENMP
479  while (counts[l - 1] < omp_get_num_threads()) {
480  }
481 #else
482  #pragma omp barrier
483 #endif
484 
485  #pragma omp critical
486  {
487  if (!out_string.empty()) {
488  out.print("{}", out_string);
489  out_string.clear();
490  }
491  counts[l] = counts[l] + 1;
492  #pragma omp flush(counts, out)
493  }
494  }
495  }
496 }
497 
498 } // namespace plssvm::detail::io
499 
500 #endif // PLSSVM_DETAIL_IO_LIBSVM_MODEL_PARSING_HPP_
Implements a custom assert macro PLSSVM_ASSERT.
#define PLSSVM_ASSERT(cond, msg,...)
Defines the PLSSVM_ASSERT macro if PLSSVM_ASSERT_ENABLED is defined.
Definition: assert.hpp:74
bool has_labels() const noexcept
Returns whether this data set contains labels or not.
Definition: data_set.hpp:194
size_type num_different_labels() const noexcept
Returns the number of different labels in this data set.
Definition: data_set.hpp:225
const std::vector< std::vector< real_type > > & data() const noexcept
Return the data points in this data set.
Definition: data_set.hpp:189
size_type num_features() const noexcept
Returns the number of features in this data set.
Definition: data_set.hpp:218
std::optional< std::vector< label_type > > different_labels() const
Returns an optional to the different labels in this data set.
Definition: data_set.hpp:633
optional_ref< const std::vector< label_type > > labels() const noexcept
Returns an optional reference to the labels in this data set.
Definition: data_set.hpp:625
size_type num_data_points() const noexcept
Returns the number of data points in this data set.
Definition: data_set.hpp:213
Exception type thrown if the provided file has an invalid format for the selected parser (e....
Definition: exceptions.hpp:114
Implements a data set class encapsulating all data points, features, and potential labels.
Defines universal utility functions.
Defines a simple logging function.
Namespace containing implementation details for the IO related functions. Should not directly be used...
Definition: core.hpp:44
std::vector< label_type > write_libsvm_model_header(fmt::ostream &out, const plssvm::parameter &params, const real_type rho, const data_set< real_type, label_type > &data)
Write the LIBSVM model file header to out.
Definition: libsvm_model_parsing.hpp:296
void write_libsvm_model_data(const std::string &filename, const plssvm::parameter &params, const real_type rho, const std::vector< real_type > &alpha, const data_set< real_type, label_type > &data)
Write the LIBSVM model to the file filename.
Definition: libsvm_model_parsing.hpp:371
std::tuple< plssvm::parameter, real_type, std::vector< label_type >, std::size_t > parse_libsvm_model_header(const std::vector< std::string_view > &lines)
Parse the LIBSVM model file header.
Definition: libsvm_model_parsing.hpp:83
bool starts_with(std::string_view str, std::string_view sv) noexcept
Checks if the string str starts with the prefix sv.
void log(const verbosity_level verb, const std::string_view msg, Args &&...args)
Definition: logger.hpp:109
T convert_to(const std::string_view str)
Converts the string str to a value of type T.
Definition: string_conversion.hpp:47
std::string_view trim(std::string_view str) noexcept
Returns a new std::string_view equal to str where all leading and trailing whitespaces are removed.
std::string_view trim_left(std::string_view str) noexcept
Returns a new std::string_view equal to str where all leading whitespaces are removed.
std::string current_date_time()
Return the current date time in the format "YYYY-MM-DD hh:mm:ss".
std::string & to_lower_case(std::string &str)
Convert the string str to its all lower case representation.
Implements the parameter class encapsulating all important C-SVM parameters.
default_value< real_type > coef0
The coef0 parameter used in the polynomial kernel function.
Definition: parameter.hpp:163
default_value< int > degree
The degree parameter used in the polynomial kernel function.
Definition: parameter.hpp:159
default_value< kernel_function_type > kernel_type
The used kernel function: linear, polynomial, or radial basis functions (rbf).
Definition: parameter.hpp:157
default_value< real_type > gamma
The gamma parameter used in the polynomial and rbf kernel functions.
Definition: parameter.hpp:161