12 #ifndef PLSSVM_DETAIL_IO_LIBSVM_PARSING_HPP_
13 #define PLSSVM_DETAIL_IO_LIBSVM_PARSING_HPP_
22 #include "fmt/compile.h"
23 #include "fmt/format.h"
30 #include <string_view>
// Determine the largest feature index that occurs in the given LIBSVM lines.
//
// lines:         the lines to inspect (one string_view per line).
// skipped_lines: number of leading entries of `lines` that are not inspected
//                (the loop below starts at this index); defaults to 0.
// Returns a std::size_t; the return statement itself is not part of this listing,
// presumably it returns `num_features` -- TODO confirm against the full source.
//
// Any std::exception raised while processing a line is captured in
// `parallel_exception` and rethrown after the loop.
//
// NOTE(review): this listing omits several original source lines (opening braces,
// the `try`, the bodies of the two `if`s below, closing braces, the return).
47 [[nodiscard]]
inline std::size_t
parse_libsvm_num_features(
const std::vector<std::string_view> &lines,
const std::size_t skipped_lines = 0) {
// running maximum of all parsed feature indices (combined via the OpenMP max reduction below)
48 std::size_t num_features = 0;
// holds an exception caught inside the loop so it can be rethrown afterwards
49 std::exception_ptr parallel_exception;
51 #pragma omp parallel default(none) shared(lines, parallel_exception, num_features) firstprivate(skipped_lines)
53 #pragma omp for reduction(max : num_features)
54 for (std::size_t i = skipped_lines; i < lines.size(); ++i) {
56 const std::string_view line = lines[i];
// the last ':' in the line belongs to the last "index:value" entry
59 const std::string_view::size_type pos_colon = line.find_last_of(
':');
// no ':' in this line at all; the body of this branch is not shown in this listing
// (presumably the line is skipped -- TODO confirm)
60 if (pos_colon == std::string_view::npos) {
// search backwards from the colon for the blank that precedes the last index
64 std::string_view::size_type pos_whitespace = line.find_last_of(
' ', pos_colon);
// no blank before the last colon; the body of this branch is not shown in this listing.
// `pos_whitespace` is non-const, so it is presumably reset here -- TODO confirm
65 if (pos_whitespace == std::string_view::npos) {
// convert the characters between the blank and the colon to the feature index;
// the second template argument is presumably the exception type raised on a failed conversion -- TODO confirm
70 const auto index = detail::convert_to<unsigned long, invalid_file_format_exception>(line.substr(pos_whitespace, pos_colon - pos_whitespace));
71 num_features = std::max<std::size_t>(num_features, index);
72 }
catch (
const std::exception &) {
// keep only the first captured exception.
// NOTE(review): `parallel_exception` is shared between threads; whether this write is
// protected (e.g. by a critical section) is in lines not shown here -- TODO confirm
76 if (!parallel_exception) {
77 parallel_exception = std::current_exception();
// after the loop: propagate a captured exception to the caller
85 if (parallel_exception) {
86 std::rethrow_exception(parallel_exception);
// Parse the data points, and the labels if present, of a LIBSVM file held by `reader`.
//
// real_type:     type of the parsed feature values.
// label_type:    type of the parsed labels.
// reader:        file_reader providing the lines via `line(pos)` and `num_lines()`.
// skipped_lines: number of leading lines that are not parsed as data points; defaults to 0.
// Returns a tuple of (number of data points, number of features, data, labels),
// where the label vector is empty if any parsed line set `has_no_label`.
//
// Any std::exception raised while parsing a line is captured and rethrown after the loop.
//
// NOTE(review): this listing omits several original source lines (the definition of
// `num_features`, opening/closing braces, the `try`, and the bodies of several branches).
117 template <
typename real_type,
typename label_type>
118 [[nodiscard]]
inline std::tuple<std::size_t, std::size_t, std::vector<std::vector<real_type>>, std::vector<label_type>>
parse_libsvm_data(
const file_reader &reader,
const std::size_t skipped_lines = 0) {
// NOTE(review): unsigned subtraction -- wraps around if skipped_lines > reader.num_lines();
// whether a guard exists is not visible in this listing -- TODO confirm
124 const std::size_t num_data_points = reader.
num_lines() - skipped_lines;
// `num_features` is defined in lines not shown here (presumably via parse_libsvm_num_features
// -- TODO confirm); the body of this branch is also not shown
128 if (num_features == 0) {
// one (initially empty) feature vector and one label slot per data point
133 std::vector<std::vector<real_type>> data(num_data_points);
134 std::vector<label_type> label(num_data_points);
// holds an exception caught inside the loop so it can be rethrown afterwards
136 std::exception_ptr parallel_exception;
// flags combined over all threads with a logical-or reduction; the statements that set
// them are not shown in this listing
137 bool has_label =
false;
138 bool has_no_label =
false;
140 #pragma omp parallel default(none) shared(reader, skipped_lines, data, label, parallel_exception, has_label, has_no_label) firstprivate(num_features)
142 #pragma omp for reduction(|| : has_label) reduction(|| : has_no_label)
143 for (
typename std::vector<std::vector<real_type>>::size_type i = 0; i < data.size(); ++i) {
// data point i is read from line (skipped_lines + i) of the reader
145 std::string_view line = reader.
line(skipped_lines + i);
// feature index remembered for the "strictly increasing" check below; where it is
// updated is not shown in this listing
146 unsigned long last_index = 0;
// position of the first blank/newline and of the first colon/newline in the line
149 std::string_view::size_type pos = line.find_first_of(
" \n");
150 const std::string_view::size_type first_colon = line.find_first_of(
":\n");
// the first token (up to `pos`) contains no ':'; presumably this means the line starts
// with a label -- the lines directly following this condition are not shown, TODO confirm
151 if (first_colon >= pos) {
// the label is converted from the first token; bool labels take a separate compile-time branch
154 if constexpr (std::is_same_v<label_type, bool>) {
157 label[i] = detail::convert_to<bool, invalid_file_format_exception>(line.substr(0, pos));
159 label[i] = detail::convert_to<label_type, invalid_file_format_exception>(line.substr(0, pos));
// feature vector of this data point with num_features value-initialized entries
167 std::vector<real_type> vline(num_features);
// locate the ':' of the next "index:value" entry, searching from `pos`
169 std::string_view::size_type next_pos = line.find_first_of(
':', pos);
// no further ':' found; the body of this branch is not shown (presumably ends the entry loop -- TODO confirm)
171 if (next_pos == std::string_view::npos) {
// the characters between `pos` and the colon are converted to the feature index
176 auto index = detail::convert_to<unsigned long, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
// reject an index that is not larger than the previously remembered one
183 if (last_index >= index) {
184 throw invalid_file_format_exception{ fmt::format(
"The features indices must be strictly increasing, but {} is smaller or equal than {}!", index, last_index) };
// the value extends up to the next blank (searched from `pos`)
193 next_pos = line.find_first_of(
' ', pos);
// NOTE(review): unchecked write; any adjustment of `index` or of `pos` before this point,
// and any bounds check against num_features, would be in lines not shown here -- TODO confirm
194 vline[index] = detail::convert_to<real_type, invalid_file_format_exception>(line.substr(pos, next_pos - pos));
// hand the parsed feature vector over to the result without copying
198 data[i] = std::move(vline);
199 }
catch (
const std::exception &) {
// keep only the first captured exception.
// NOTE(review): `parallel_exception` is shared between threads; whether this write is
// protected (e.g. by a critical section) is in lines not shown here -- TODO confirm
203 if (!parallel_exception) {
204 parallel_exception = std::current_exception();
// after the loop: propagate a captured exception to the caller
212 if (parallel_exception) {
213 std::rethrow_exception(parallel_exception);
// both flags set at once; the body of this branch is not shown
// (presumably reports the inconsistency as an error -- TODO confirm)
215 if (has_label && has_no_label) {
// the label vector is only handed out if has_no_label is false; otherwise an empty vector is returned
220 return std::make_tuple(num_data_points, num_features, std::move(data), !has_no_label ? std::move(label) : std::vector<label_type>{});
// Write the data points, and the labels if `has_label` is true, to `filename` in LIBSVM format.
//
// real_type:  type of the feature values.
// label_type: type of the labels.
// has_label:  compile-time switch; if true a label is written at the start of each line.
// filename:   path of the file that is opened for writing.
// data:       the data points, one inner vector of feature values per data point.
// label:      the labels; asserted to match `data` in size if has_label is true and
//             asserted to be empty in the second assertion group below.
//
// Only feature values that compare unequal to zero are written, as "index:value" entries
// with a 1-based index and the value formatted as "{:.10e}".
//
// NOTE(review): this listing omits several original source lines (braces, the `else`
// between the assertion groups, the body of the empty-data branch, and whatever
// surrounds the final `out.print`).
243 template <
typename real_type,
typename label_type,
bool has_label>
244 inline void write_libsvm_data_impl(
const std::string &filename,
const std::vector<std::vector<real_type>> &data,
const std::vector<label_type> &label) {
245 if constexpr (has_label) {
246 PLSSVM_ASSERT(data.empty() || !label.empty(),
"has_label is 'true' but no labels were provided!");
247 PLSSVM_ASSERT(data.size() == label.size(),
"Number of data points ({}) and number of labels ({}) mismatch!", data.size(), label.size());
// presumably the `else` branch (has_label == false); the `else` line itself is not shown -- TODO confirm
249 PLSSVM_ASSERT(label.empty(),
"has_label is 'false' but labels were provided!");
// open the output file
253 fmt::ostream out = fmt::output_file(filename);
257 const std::size_t num_data_points = data.size();
// nothing to write; the body of this branch is not shown (presumably an early return,
// which would also protect the data.front() access below -- TODO confirm)
258 if (num_data_points == 0) {
// the number of features is taken from the first data point
262 const std::size_t num_features = data.front().size();
// header line: "# <number of data points>x<number of features>"
263 out.print(
"# {}x{}\n", num_data_points, num_features);
// helper appending the "index:value" entries of one data point plus a newline to `output`
266 auto format_libsvm_line = [](std::string &output,
const std::vector<real_type> &data_point) {
// features are formatted in blocks of BLOCK_SIZE entries into a fixed buffer of
// BLOCK_SIZE * CHARS_PER_BLOCK characters
267 static constexpr std::size_t BLOCK_SIZE = 64;
268 static constexpr std::size_t CHARS_PER_BLOCK = 128;
269 static constexpr std::size_t BUFFER_SIZE = BLOCK_SIZE * CHARS_PER_BLOCK;
// static buffer, declared threadprivate for OpenMP on the next line
270 static std::array<char, BUFFER_SIZE> buffer;
271 #pragma omp threadprivate(buffer)
273 for (
typename std::vector<real_type>::size_type j = 0; j < data_point.size(); j += BLOCK_SIZE) {
// start writing at the beginning of the buffer for every block
274 char *ptr = buffer.data();
// the last block may contain fewer than BLOCK_SIZE entries
275 for (std::size_t i = 0; i < std::min<std::size_t>(BLOCK_SIZE, data_point.size() - j); ++i) {
// entries equal to zero are not written
276 if (data_point[j + i] != real_type{ 0.0 }) {
// the written index is the 0-based position plus one
277 ptr = fmt::format_to(ptr, FMT_COMPILE(
"{}:{:.10e} "), j + i + 1, data_point[j + i]);
// append the characters written for this block to the output string
280 output.append(buffer.data(), ptr - buffer.data());
// terminate the line of this data point
282 output.push_back(
'\n');
285 #pragma omp parallel default(none) shared(out, data, label, format_libsvm_line)
// declared inside the parallel region: collects the formatted lines before they are printed
288 std::string out_string;
// NOTE(review): with schedule(dynamic) and per-thread strings the order of data points in
// the file presumably depends on the thread schedule -- TODO confirm whether that is intended
289 #pragma omp for schedule(dynamic) nowait
290 for (
typename std::vector<real_type>::size_type i = 0; i < data.size(); ++i) {
// the label followed by a blank is written in front of the features
291 if constexpr (has_label) {
292 out_string.append(fmt::format(FMT_COMPILE(
"{} "), label[i]));
294 format_libsvm_line(out_string, data[i]);
// write the collected lines to the file.
// NOTE(review): `out` is shared between threads; whether this call is protected
// (e.g. by a critical section) is in lines not shown here -- TODO confirm
298 out.print(
"{}", out_string);
// Write the data points together with their labels to `filename` in LIBSVM format.
//
// real_type:  type of the feature values.
// label_type: type of the labels.
// filename:   path of the output file.
// data:       the data points to write.
// label:      the labels to write.
//
// NOTE(review): the closing brace of this function is not part of this listing.
321 template <
typename real_type,
typename label_type>
322 inline void write_libsvm_data(
const std::string &filename,
const std::vector<std::vector<real_type>> &data,
const std::vector<label_type> &label) {
// forward to the shared implementation with has_label = true
323 write_libsvm_data_impl<real_type, label_type, true>(filename, data, label);
// Write the data points without labels to `filename` in LIBSVM format.
//
// real_type: type of the feature values.
// filename:  path of the output file.
// data:      the data points to write.
//
// NOTE(review): the closing brace of this function is not part of this listing.
343 template <
typename real_type>
344 inline void write_libsvm_data(
const std::string &filename,
const std::vector<std::vector<real_type>> &data) {
// forward to the shared implementation with has_label = false; real_type stands in as the
// label type and an empty label vector is passed
345 write_libsvm_data_impl<real_type, real_type, false>(filename, data, {});
Implements a custom assert macro PLSSVM_ASSERT.
#define PLSSVM_ASSERT(cond, msg,...)
Defines the PLSSVM_ASSERT macro if PLSSVM_ASSERT_ENABLED is defined.
Definition: assert.hpp:74
The plssvm::detail::file_reader class is responsible for reading a file and splitting it into its lines.
Definition: file_reader.hpp:42
std::string_view line(typename std::vector< std::string_view >::size_type pos) const
Return the line at position pos of the parsed file.
bool is_open() const noexcept
Checks whether this file_reader is currently associated with a file.
const std::vector< std::string_view > & lines() const noexcept
Return all lines present after the preprocessing.
std::vector< std::string_view >::size_type num_lines() const noexcept
Return the number of parsed lines (where all empty lines or lines starting with a comment are ignored).
Defines universal utility functions.
Implements custom exception classes derived from std::runtime_error including source location information.
Implements a file reader class responsible for reading the input file and parsing it into lines.
Namespace containing implementation details for the IO related functions. Should not directly be used...
Definition: core.hpp:44
std::tuple< std::size_t, std::size_t, std::vector< std::vector< real_type > >, std::vector< label_type > > parse_libsvm_data(const file_reader &reader, const std::size_t skipped_lines=0)
Parse all data points and potential label using the file reader, ignoring all empty lines and lines starting with a comment.
Definition: libsvm_parsing.hpp:118
void write_libsvm_data_impl(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the LIBSVM file filename.
Definition: libsvm_parsing.hpp:244
void write_libsvm_data(const std::string &filename, const std::vector< std::vector< real_type >> &data, const std::vector< label_type > &label)
Write the provided data and labels to the LIBSVM file filename.
Definition: libsvm_parsing.hpp:322
std::size_t parse_libsvm_num_features(const std::vector< std::string_view > &lines, const std::size_t skipped_lines=0)
Parse the maximum number of features per data point given in lines, where the first skipped_lines are skipped.
Definition: libsvm_parsing.hpp:47
std::string current_date_time()
Return the current date time in the format "YYYY-MM-DD hh:mm:ss".
Implements a conversion function from a string to an arithmetic type.