CPPuddle
buffer_management.hpp
Go to the documentation of this file.
1 // Copyright (c) 2020-2024 Gregor Daiß
2 //
3 // Distributed under the Boost Software License, Version 1.0. (See accompanying
4 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5 
6 #ifndef BUFFER_MANAGEMENT_HPP
7 #define BUFFER_MANAGEMENT_HPP
8 
9 #include <atomic>
10 #include <cassert>
11 #include <functional>
12 #include <iostream>
13 #include <list>
14 #include <memory>
15 #include <mutex>
16 #include <optional>
17 #include <stdexcept>
18 #include <type_traits>
19 #include <unordered_map>
20 
21 // Warn about suboptimal performance without correct HPX-aware allocators
22 #ifdef CPPUDDLE_HAVE_HPX
23 #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
24 #pragma message \
25 "Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \
26 For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!"
27 #else
28 // include runtime to get HPX thread IDs required for the HPX-aware allocators
29 #include <hpx/include/runtime.hpp>
30 #endif
31 #endif
32 
33 #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
34 // For builds with The HPX mutex
35 #include <hpx/mutex.hpp>
36 #endif
37 
38 #ifdef CPPUDDLE_HAVE_COUNTERS
39 #include <boost/core/demangle.hpp>
40 #if defined(CPPUDDLE_HAVE_HPX)
41 #include <hpx/include/performance_counters.hpp>
42 #endif
43 #endif
44 
46 
47 namespace cppuddle {
48 namespace memory_recycling {
49 
50 namespace device_selection {
52 
55 template <typename T, typename Allocator> struct select_device_functor {
56  void operator()(const size_t device_id) {
57  if constexpr (max_number_gpus > 1)
58  throw std::runtime_error(
59  "Allocators used in Multi-GPU builds need explicit Multi-GPU support "
60  "(by having a select_device_functor overload");
61  }
62 };
63 } // namespace device_selection
64 
65 namespace detail {
66 
69 public:
70 #if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING)
71 
72 // Warn about suboptimal performance without recycling
73 #pragma message \
74 "Warning: Building without buffer recycling! Use only for performance testing! \
75 For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON!"
76 
  /// Allocate a fresh buffer of number_elements T (recycling-disabled build).
  ///
  /// NOTE(review): manage_content_lifetime, location_hint and device_id are
  /// accepted only for interface compatibility and are ignored here --
  /// presumably fine for trivially constructible T, but confirm for the
  /// aggressive allocators which rely on get() initializing contents.
  template <typename T, typename Host_Allocator>
  static T *get(size_t number_elements, bool manage_content_lifetime = false,
                std::optional<size_t> location_hint = std::nullopt,
                std::optional<size_t> device_id = std::nullopt) {

    return Host_Allocator{}.allocate(number_elements);
  }
  /// Immediately deallocate the buffer (recycling-disabled build).
  /// The hint parameters are ignored; the buffer goes straight back to
  /// Host_Allocator instead of being kept for reuse.
  template <typename T, typename Host_Allocator>
  static void mark_unused(T *p, size_t number_elements,
                          std::optional<size_t> location_hint = std::nullopt,
                          std::optional<size_t> device_id = std::nullopt) {
    return Host_Allocator{}.deallocate(p, number_elements);
  }
91 #else
96  template <typename T, typename Host_Allocator>
97  static T *get(size_t number_elements, bool manage_content_lifetime = false,
98  std::optional<size_t> location_hint = std::nullopt,
99  std::optional<size_t> device_id = std::nullopt) {
100  try {
101  return buffer_manager<T, Host_Allocator>::get(
102  number_elements, manage_content_lifetime, location_hint, device_id);
103  } catch (const std::exception &exc) {
104  std::cerr << "ERROR: Encountered unhandled exception in cppuddle get: " << exc.what() << std::endl;
105  std::cerr << "Rethrowing exception... " << std::endl;;
106  throw;
107  }
108  }
113  template <typename T, typename Host_Allocator>
114  static void mark_unused(T *p, size_t number_elements,
115  std::optional<size_t> location_hint = std::nullopt,
116  std::optional<size_t> device_id = std::nullopt) {
117  try {
118  return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements,
119  location_hint, device_id);
120  } catch (const std::exception &exc) {
121  std::cerr << "ERROR: Encountered unhandled exception in cppuddle mark_unused: " << exc.what() << std::endl;
122  std::cerr << "Rethrowing exception... " << std::endl;;
123  throw;
124  }
125  }
126 #endif
128  template <typename T, typename Host_Allocator>
130 #ifdef CPPUDDLE_HAVE_COUNTERS
131  buffer_manager<T, Host_Allocator>::register_counters_with_hpx();
132 #else
133  std::cerr << "Warning: Trying to register allocator performance counters "
134  "with HPX but CPPuddle was built "
135  "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!"
136  << std::endl;
137 #endif
138  }
139 
141  static void clean_all() {
142  std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
143  for (const auto &clean_function :
144  instance().total_cleanup_callbacks) {
145  clean_function();
146  }
147  }
149  static void clean_unused_buffers() {
150  std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
151  for (const auto &clean_function :
152  instance().partial_cleanup_callbacks) {
153  clean_function();
154  }
155  }
157  static void finalize() {
158  std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
159  for (const auto &finalize_function :
160  instance().finalize_callbacks) {
161  finalize_function();
162  }
163  }
164 
166 #ifdef CPPUDDLE_HAVE_COUNTERS
167  std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
168  for (const auto &print_function :
169  instance().print_callbacks) {
170  print_function();
171  }
172 #else
173  std::cerr << "Warning: Trying to print allocator performance counters but CPPuddle was built "
174  "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!"
175  << std::endl;
176 #endif
177  }
178 
179  // Member variables and methods
180 private:
181 
183  static buffer_interface& instance() {
184  static buffer_interface singleton{};
185  return singleton;
186  }
  // Callbacks that print the performance counters of each buffer_manager.
  std::list<std::function<void()>> print_callbacks;
  // Callbacks that finalize (clean + deactivate) each buffer_manager.
  std::list<std::function<void()>> finalize_callbacks;
  // Callbacks that deallocate every buffer (used and unused) per manager.
  std::list<std::function<void()>> total_cleanup_callbacks;
  // Callbacks that deallocate only the unused buffers per manager.
  std::list<std::function<void()>> partial_cleanup_callbacks;
  // Private: instances are created exclusively through instance().
  buffer_interface() = default;

  // Protects all callback lists above.
  mutex_t callback_protection_mut;
204  static void add_total_cleanup_callback(const std::function<void()> &func) {
205  std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
206  instance().total_cleanup_callbacks.push_back(func);
207  }
  /// Register a callback that deallocates only the unused buffers of one
  /// buffer_manager.
  static void add_partial_cleanup_callback(const std::function<void()> &func) {
    std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
    instance().partial_cleanup_callbacks.push_back(func);
  }
  /// Register a callback that finalizes one buffer_manager.
  static void add_finalize_callback(const std::function<void()> &func) {
    std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
    instance().finalize_callbacks.push_back(func);
  }
  /// Register a callback that prints one buffer_manager's counters.
  static void add_print_callback(const std::function<void()> &func) {
    std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
    instance().print_callbacks.push_back(func);
  }
226 
227 public:
228  ~buffer_interface() = default;
229 
230  // Subclasses
231 private:
233  template <typename T, typename Host_Allocator> class buffer_manager {
234  private:
235  // Tuple content: Pointer to buffer, buffer_size, location ID, Flag
236  // The flag at the end controls whether to buffer content is to be reused as
237  // well
238  using buffer_entry_type = std::tuple<T *, size_t, size_t, bool>;
239 
240 
241  public:
243  static void clean() {
244  assert(instance() && !is_finalized);
245  for (auto i = 0; i < number_instances * max_number_gpus; i++) {
246  std::lock_guard<mutex_t> guard(instance()[i].mut);
247  instance()[i].clean_all_buffers();
248  }
249  }
250  static void print_performance_counters() {
251  assert(instance() && !is_finalized);
252  for (auto i = 0; i < number_instances * max_number_gpus; i++) {
253  std::lock_guard<mutex_t> guard(instance()[i].mut);
254  instance()[i].print_counters();
255  }
256  }
257  static void finalize() {
258  assert(instance() && !is_finalized);
259  is_finalized = true;
260  for (auto i = 0; i < number_instances * max_number_gpus; i++) {
261  std::lock_guard<mutex_t> guard(instance()[i].mut);
262  instance()[i].clean_all_buffers();
263  }
264  instance().reset();
265  }
267  static void clean_unused_buffers_only() {
268  assert(instance() && !is_finalized);
269  for (auto i = 0; i < number_instances * max_number_gpus; i++) {
270  std::lock_guard<mutex_t> guard(instance()[i].mut);
271  for (auto &buffer_tuple : instance()[i].unused_buffer_list) {
272  Host_Allocator alloc;
273  if (std::get<3>(buffer_tuple)) {
274  std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
275  }
276  alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
277  }
278  instance()[i].unused_buffer_list.clear();
279  }
280  }
281 #if defined(CPPUDDLE_HAVE_COUNTERS) && defined(CPPUDDLE_HAVE_HPX)
    /// Counter getters used as HPX performance-counter sources.  Each returns
    /// the process-wide total over all instances of this manager type;
    /// passing reset=true zeroes the counter first (and hence returns 0).
    static size_t get_sum_number_recycling(bool reset) {
      if (reset)
        sum_number_recycling = 0;
      return sum_number_recycling;
    }
    static size_t get_sum_number_allocation(bool reset) {
      if (reset)
        sum_number_allocation = 0;
      return sum_number_allocation;
    }
    static size_t get_sum_number_creation(bool reset) {
      if (reset)
        sum_number_creation = 0;
      return sum_number_creation;
    }
    static size_t get_sum_number_deallocation(bool reset) {
      if (reset)
        sum_number_deallocation = 0;
      return sum_number_deallocation;
    }
    static size_t get_sum_number_wrong_hints(bool reset) {
      if (reset)
        sum_number_wrong_hints = 0;
      return sum_number_wrong_hints;
    }
307  static size_t get_sum_number_wrong_device_hints(bool reset) {
308  if (reset)
309  sum_number_wrong_hints = 0;
310  return sum_number_wrong_device_hints;
311  }
    /// Process-wide number of bad_allocs that triggered a cleanup of unused
    /// buffers; reset=true zeroes the counter first (and hence returns 0).
    static size_t get_sum_number_bad_allocs(bool reset) {
      if (reset)
        sum_number_bad_allocs = 0;
      return sum_number_bad_allocs;
    }
317 
318  static void register_counters_with_hpx(void) {
319  std::string alloc_name =
320  boost::core::demangle(typeid(Host_Allocator).name()) +
321  std::string("_") + boost::core::demangle(typeid(T).name());
322  hpx::performance_counters::install_counter_type(
323  std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_recycling/"),
324  &get_sum_number_recycling,
325  "Number of allocations using a recycled buffer with this "
326  "allocator");
327  hpx::performance_counters::install_counter_type(
328  std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_allocations/"),
329  &get_sum_number_allocation,
330  "Number of allocations with this allocator");
331  hpx::performance_counters::install_counter_type(
332  std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_creations/"),
333  &get_sum_number_creation,
334  "Number of allocations not using a recycled buffer with this "
335  "allocator");
336  hpx::performance_counters::install_counter_type(
337  std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_deallocations/"),
338  &get_sum_number_deallocation,
339  "Number of deallocations yielding buffers to be recycled with this "
340  "allocator");
341  hpx::performance_counters::install_counter_type(
342  std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"),
343  &get_sum_number_wrong_hints,
344  "Number of wrong hints supplied to the dealloc method with this allocator");
345  hpx::performance_counters::install_counter_type(
346  std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_device_hints/"),
347  &get_sum_number_wrong_device_hints,
348  "Number of wrong device hints supplied to the dealloc method with this allocator");
349  hpx::performance_counters::install_counter_type(
350  std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"),
351  &get_sum_number_bad_allocs,
352  "Number of wrong bad allocs which triggered a cleanup of unused buffers");
353  }
354 #endif
355 
357  static T *get(size_t number_of_elements, bool manage_content_lifetime,
358  std::optional<size_t> location_hint = std::nullopt,
359  std::optional<size_t> gpu_device_id = std::nullopt) {
360  init_callbacks_once();
361  if (is_finalized) {
362  throw std::runtime_error("Tried allocation after finalization");
363  }
364  assert(instance() && !is_finalized);
365 
366  size_t location_id = 0;
367  if (location_hint) {
368  location_id = *location_hint;
369  }
370  if (location_id >= number_instances) {
371  throw std::runtime_error("Tried to create buffer with invalid location_id [get]");
372  }
373  size_t device_id = 0;
374  if (gpu_device_id) {
375  device_id = *gpu_device_id;
376  }
377  if (device_id >= max_number_gpus) {
378  throw std::runtime_error("Tried to create buffer with invalid device id [get]! "
379  "Is multigpu support enabled with the correct number "
380  "of GPUs?");
381  }
382 
383  location_id = location_id + device_id * number_instances;
384  std::lock_guard<mutex_t> guard(instance()[location_id].mut);
385 
386 
387 #ifdef CPPUDDLE_HAVE_COUNTERS
388  instance()[location_id].number_allocation++;
389  sum_number_allocation++;
390 #endif
391  // Check for unused buffers we can recycle:
392  for (auto iter = instance()[location_id].unused_buffer_list.begin();
393  iter != instance()[location_id].unused_buffer_list.end(); iter++) {
394  auto tuple = *iter;
395  if (std::get<1>(tuple) == number_of_elements) {
396  instance()[location_id].unused_buffer_list.erase(iter);
397 
398  // handle the switch from aggressive to non aggressive reusage (or
399  // vice-versa)
400  if (manage_content_lifetime && !std::get<3>(tuple)) {
401  std::uninitialized_value_construct_n(std::get<0>(tuple),
402  number_of_elements);
403  std::get<3>(tuple) = true;
404  } else if (!manage_content_lifetime && std::get<3>(tuple)) {
405  std::destroy_n(std::get<0>(tuple), std::get<1>(tuple));
406  std::get<3>(tuple) = false;
407  }
408  instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple});
409 #ifdef CPPUDDLE_HAVE_COUNTERS
410  instance()[location_id].number_recycling++;
411  sum_number_recycling++;
412 #endif
413  return std::get<0>(tuple);
414  }
415  }
416 
417  // No unused buffer found -> Create new one and return it
418  try {
420  T, Host_Allocator>{}(device_id);
421  Host_Allocator alloc;
422  T *buffer = alloc.allocate(number_of_elements);
423  instance()[location_id].buffer_map.insert(
424  {buffer, std::make_tuple(buffer, number_of_elements, 1,
425  manage_content_lifetime)});
426 #ifdef CPPUDDLE_HAVE_COUNTERS
427  instance()[location_id].number_creation++;
428  sum_number_creation++;
429 #endif
430  if (manage_content_lifetime) {
431  std::uninitialized_value_construct_n(buffer, number_of_elements);
432  }
433  return buffer;
434  } catch (std::bad_alloc &e) {
435  // not enough memory left! Cleanup and attempt again:
436  std::cerr
437  << "Not enough memory left. Cleaning up unused buffers now..."
438  << std::endl;
440  std::cerr << "Buffers cleaned! Try allocation again..." << std::endl;
441 
442  // If there still isn't enough memory left, the caller has to handle it
443  // We've done all we can in here
444  Host_Allocator alloc;
446  T, Host_Allocator>{}(device_id);
447  T *buffer = alloc.allocate(number_of_elements);
448  instance()[location_id].buffer_map.insert(
449  {buffer, std::make_tuple(buffer, number_of_elements, 1,
450  manage_content_lifetime)});
451 #ifdef CPPUDDLE_HAVE_COUNTERS
452  instance()[location_id].number_creation++;
453  sum_number_creation++;
454  instance()[location_id].number_bad_alloc++;
455  sum_number_bad_allocs++;
456 #endif
457  std::cerr << "Second attempt allocation successful!" << std::endl;
458  if (manage_content_lifetime) {
459  std::uninitialized_value_construct_n(buffer, number_of_elements);
460  }
461  return buffer;
462  }
463  }
464 
465  static void mark_unused(T *memory_location, size_t number_of_elements,
466  std::optional<size_t> location_hint = std::nullopt,
467  std::optional<size_t> device_hint = std::nullopt) {
468  if (is_finalized)
469  return;
470  assert(instance() && !is_finalized);
471 
472  size_t location_id = 0;
473  if (location_hint) {
474  location_id = *location_hint;
475  if (location_id >= number_instances) {
476  throw std::runtime_error(
477  "Buffer recylcer received invalid location hint [mark_unused]");
478  }
479  }
480  size_t device_id = 0;
481  if (device_hint) {
482  device_id = *device_hint;
483  if (device_id >= max_number_gpus) {
484  throw std::runtime_error(
485  "Buffer recylcer received invalid devce hint [mark_unused]");
486  }
487  }
488 
489  // Attempt 1 to find the correct bucket/location: Look at provided hint:
490  if (location_hint) {
491  size_t location_id = location_hint.value() + device_id * number_instances;
492  std::lock_guard<mutex_t> guard(instance()[location_id].mut);
493  if (instance()[location_id].buffer_map.find(memory_location) !=
494  instance()[location_id].buffer_map.end()) {
495 #ifdef CPPUDDLE_HAVE_COUNTERS
496  instance()[location_id].number_deallocation++;
497  sum_number_deallocation++;
498 #endif
499  auto it = instance()[location_id].buffer_map.find(memory_location);
500  assert(it != instance()[location_id].buffer_map.end());
501  auto &tuple = it->second;
502  // sanity checks:
503  assert(std::get<1>(tuple) == number_of_elements);
504  // move to the unused_buffer list
505  instance()[location_id].unused_buffer_list.push_front(tuple);
506  instance()[location_id].buffer_map.erase(memory_location);
507  return; // Success
508  }
509  // hint was wrong
510 #ifdef CPPUDDLE_HAVE_COUNTERS
511  instance()[location_id].number_wrong_hints++;
512  sum_number_wrong_hints++;
513 #endif
514  }
515  // Failed to find buffer in the specified localtion/device!
516  // Attempt 2 - Look for buffer other locations on the same device...
517  for (size_t location_id = device_id * number_instances;
518  location_id < (device_id + 1) * number_instances; location_id++) {
519  if (location_hint) {
520  if (*location_hint + device_id * max_number_gpus == location_id) {
521  continue; // already tried this -> skip
522  }
523  }
524  std::lock_guard<mutex_t> guard(instance()[location_id].mut);
525  if (instance()[location_id].buffer_map.find(memory_location) !=
526  instance()[location_id].buffer_map.end()) {
527 #ifdef CPPUDDLE_HAVE_COUNTERS
528  instance()[location_id].number_deallocation++;
529  sum_number_deallocation++;
530 #endif
531  auto it = instance()[location_id].buffer_map.find(memory_location);
532  assert(it != instance()[location_id].buffer_map.end());
533  auto &tuple = it->second;
534  // sanity checks:
535  assert(std::get<1>(tuple) == number_of_elements);
536  // move to the unused_buffer list
537  instance()[location_id].unused_buffer_list.push_front(tuple);
538  instance()[location_id].buffer_map.erase(memory_location);
539  return; // Success
540  }
541  }
542  // device hint was wrong
543 #ifdef CPPUDDLE_HAVE_COUNTERS
544  if (device_hint) {
545  sum_number_wrong_device_hints++;
546  }
547 #endif
548  // Failed to find buffer on the specified device!
549  // Attempt 3 - Look for buffer on other devices...
550  for (size_t local_device_id = 0; local_device_id < max_number_gpus;
551  local_device_id++) {
552  if (local_device_id == device_id)
553  continue; // aldready tried this device
554 
555  // Try hint localtion first yet again (though on different device)
556  if (location_hint) {
557  size_t location_id = location_hint.value() + local_device_id * number_instances;
558  std::lock_guard<mutex_t> guard(instance()[location_id].mut);
559  if (instance()[location_id].buffer_map.find(memory_location) !=
560  instance()[location_id].buffer_map.end()) {
561 #ifdef CPPUDDLE_HAVE_COUNTERS
562  instance()[location_id].number_deallocation++;
563  sum_number_deallocation++;
564 #endif
565  auto it = instance()[location_id].buffer_map.find(memory_location);
566  assert(it != instance()[location_id].buffer_map.end());
567  auto &tuple = it->second;
568  // sanity checks:
569  assert(std::get<1>(tuple) == number_of_elements);
570  // move to the unused_buffer list
571  instance()[location_id].unused_buffer_list.push_front(tuple);
572  instance()[location_id].buffer_map.erase(memory_location);
573  return; // Success
574  }
575  }
576  // Failed - check all other localtions on device
577  for (size_t location_id = local_device_id * number_instances;
578  location_id < (local_device_id + 1) * number_instances; location_id++) {
579  if (location_hint) {
580  if (*location_hint + local_device_id * max_number_gpus == location_id) {
581  continue; // already tried this -> skip
582  }
583  }
584  std::lock_guard<mutex_t> guard(instance()[location_id].mut);
585  if (instance()[location_id].buffer_map.find(memory_location) !=
586  instance()[location_id].buffer_map.end()) {
587 #ifdef CPPUDDLE_HAVE_COUNTERS
588  instance()[location_id].number_deallocation++;
589  sum_number_deallocation++;
590 #endif
591  auto it = instance()[location_id].buffer_map.find(memory_location);
592  assert(it != instance()[location_id].buffer_map.end());
593  auto &tuple = it->second;
594  // sanity checks:
595  assert(std::get<1>(tuple) == number_of_elements);
596  // move to the unused_buffer list
597  instance()[location_id].unused_buffer_list.push_front(tuple);
598  instance()[location_id].buffer_map.erase(memory_location);
599  return; // Success
600  }
601  }
602  }
603  // Buffer that is to be deleted is nowhere to be found - we looked everywhere!
604  // =>
605  // Failure! Handle here...
606 
607  // TODO Throw exception instead in the futures, as soon as the recycler finalize is
608  // in all user codes
609  /* throw std::runtime_error("Tried to delete non-existing buffer"); */
610 
611  // This is odd: Print warning -- however, might also happen with static
612  // buffers using these allocators IF the new finalize was not called. For
613  // now, print warning until all user-code is upgraded to the finalize method.
614  // This allows using current versions of cppuddle with older application code
615  std::cerr
616  << "Warning! Tried to delete non-existing buffer within CPPuddle!"
617  << std::endl;
618  std::cerr << "Did you forget to call recycler::finalize?" << std::endl;
619  }
620 
  private:
    /// Buffers currently handed out: pointer -> (ptr, size, location,
    /// content-managed flag).
    std::unordered_map<T *, buffer_entry_type> buffer_map{};
    /// Buffers returned via mark_unused, available for recycling.
    std::list<buffer_entry_type> unused_buffer_list{};
    /// Protects buffer_map / unused_buffer_list / per-instance counters.
    mutex_t mut;
#ifdef CPPUDDLE_HAVE_COUNTERS
    // Per-instance counters (guarded by mut).
    size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0},
        number_recycling{0}, number_creation{0}, number_bad_alloc{0};

    // Process-wide totals across all instances; atomic so the getters used as
    // HPX performance counters can read/reset them without holding mut.
    static inline std::atomic<size_t> sum_number_allocation{0},
        sum_number_deallocation{0}, sum_number_wrong_hints{0},
        sum_number_wrong_device_hints{0}, sum_number_recycling{0},
        sum_number_creation{0}, sum_number_bad_allocs{0};
#endif
    // Construction only happens inside instance() below.
    buffer_manager() = default;
    // NOTE(review): copy assignment is defaulted while the copy constructor
    // (declared below) is deleted -- presumably this should be deleted too;
    // confirm before changing (it is private and apparently unused).
    buffer_manager&
    operator=(buffer_manager<T, Host_Allocator> const &other) = default;
    buffer_manager&
    operator=(buffer_manager<T, Host_Allocator> &&other) = delete;
    /// Access the static array of number_instances * max_number_gpus manager
    /// instances (one bucket per worker/location per GPU).
    static std::unique_ptr<buffer_manager[]>& instance(void) {
      static std::unique_ptr<buffer_manager[]> instances{
          new buffer_manager[number_instances * max_number_gpus]};
      return instances;
    }
650  static void init_callbacks_once(void) {
651  assert(instance());
652 #if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
653  static hpx::once_flag flag;
654  hpx::call_once(flag, []() {
655 #else
656  static std::once_flag flag;
657  std::call_once(flag, []() {
658 #endif
659  is_finalized = false;
660  buffer_interface::add_total_cleanup_callback(clean);
661  buffer_interface::add_partial_cleanup_callback(
662  clean_unused_buffers_only);
663  buffer_interface::add_finalize_callback(
664  finalize);
665 #ifdef CPPUDDLE_HAVE_COUNTERS
666  buffer_interface::add_print_callback(
668 #endif
669  });
670  }
671  static inline std::atomic<bool> is_finalized;
672 
673 #ifdef CPPUDDLE_HAVE_COUNTERS
    /// Print this instance's statistics to stdout; no-op if the instance was
    /// never used.  Only compiled with CPPUDDLE_HAVE_COUNTERS.
    void print_counters(void) {
      if (number_allocation == 0)
        return;
      // Print performance counters
      // NOTE(review): despite the "cleaned up" label below this is the number
      // of buffers still held (unused + in-use), i.e. those that will be
      // cleaned on destruction.
      size_t number_cleaned = unused_buffer_list.size() + buffer_map.size();
      std::cout << "\nBuffer manager destructor for (Alloc: "
                << boost::core::demangle(typeid(Host_Allocator).name()) << ", Type: "
                << boost::core::demangle(typeid(T).name())
                << "):" << std::endl
                << "--------------------------------------------------------------------"
                << std::endl
                << "--> Number of bad_allocs that triggered garbage "
                   "collection: "
                << number_bad_alloc << std::endl
                << "--> Number of buffers that got requested from this "
                   "manager: "
                << number_allocation << std::endl
                << "--> Number of times an unused buffer got recycled for a "
                   "request: "
                << number_recycling << std::endl
                << "--> Number of times a new buffer had to be created for a "
                   "request: "
                << number_creation << std::endl
                << "--> Number cleaned up buffers: "
                   " "
                << number_cleaned << std::endl
                << "--> Number wrong deallocation hints: "
                   " "
                << number_wrong_hints << std::endl
                << "--> Number of buffers that were marked as used upon "
                   "cleanup: "
                << buffer_map.size() << std::endl
                << "==> Recycle rate: "
                   " "
                << static_cast<float>(number_recycling) / number_allocation *
                       100.0f
                << "%" << std::endl;
    }
712 #endif
713 
714  void clean_all_buffers(void) {
715 #ifdef CPPUDDLE_HAVE_COUNTERS
716  if (number_allocation == 0 && number_recycling == 0 &&
717  number_bad_alloc == 0 && number_creation == 0 &&
718  unused_buffer_list.empty() && buffer_map.empty()) {
719  return;
720  }
721 #endif
722  for (auto &buffer_tuple : unused_buffer_list) {
723  Host_Allocator alloc;
724  if (std::get<3>(buffer_tuple)) {
725  std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
726  }
727  alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
728  }
729  for (auto &map_tuple : buffer_map) {
730  auto buffer_tuple = map_tuple.second;
731  Host_Allocator alloc;
732  if (std::get<3>(buffer_tuple)) {
733  std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
734  }
735  alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
736  }
737  unused_buffer_list.clear();
738  buffer_map.clear();
739 #ifdef CPPUDDLE_HAVE_COUNTERS
740  number_allocation = 0;
741  number_recycling = 0;
742  number_bad_alloc = 0;
743  number_creation = 0;
744  number_wrong_hints = 0;
745 #endif
746  }
  public:
    /// Destructor releases any buffers this instance still holds.
    ~buffer_manager() {
      clean_all_buffers();
    }
751 
752  public: // Putting deleted constructors in public gives more useful error
753  // messages
754  // Bunch of constructors we don't need
755  buffer_manager(
756  buffer_manager<T, Host_Allocator> const &other) = delete;
757  buffer_manager(
758  buffer_manager<T, Host_Allocator> &&other) = delete;
759  };
760 
761 public:
762  // Putting deleted constructors in public gives more useful error messages
763  // Bunch of constructors we don't need
764  buffer_interface(buffer_interface const &other) = delete;
765  buffer_interface& operator=(buffer_interface const &other) = delete;
768 };
769 
770 template <typename T, typename Host_Allocator> struct recycle_allocator {
771  using value_type = T;
772  using underlying_allocator_type = Host_Allocator;
773  static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>);
774  const std::optional<size_t> dealloc_hint;
775  const std::optional<size_t> device_id;
776 
777 #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
778  recycle_allocator() noexcept
779  : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
780  explicit recycle_allocator(size_t hint) noexcept
781  : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
783  recycle_allocator<T, Host_Allocator> const &other) noexcept
784  : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
785  T *allocate(std::size_t n) {
786  T *data = buffer_interface::get<T, Host_Allocator>(n);
787  return data;
788  }
789  void deallocate(T *p, std::size_t n) {
790  buffer_interface::mark_unused<T, Host_Allocator>(p, n);
791  }
792 #else
793  recycle_allocator() noexcept
794  : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {}
795  explicit recycle_allocator(const size_t device_id) noexcept
796  : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {}
797  explicit recycle_allocator(const size_t device_i, const size_t location_id) noexcept
798  : dealloc_hint(location_id), device_id(device_id) {}
799  explicit recycle_allocator(
800  recycle_allocator<T, Host_Allocator> const &other) noexcept
801  : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {}
802  T *allocate(std::size_t n) {
803  T *data = buffer_interface::get<T, Host_Allocator>(
804  n, false, hpx::get_worker_thread_num() % number_instances, device_id);
805  return data;
806  }
807  void deallocate(T *p, std::size_t n) {
808  buffer_interface::mark_unused<T, Host_Allocator>(p, n, dealloc_hint,
809  device_id);
810  }
811 #endif
812 
813  template <typename... Args>
814  inline void construct(T *p, Args... args) noexcept {
815  ::new (static_cast<void *>(p)) T(std::forward<Args>(args)...);
816  }
817  void destroy(T *p) { p->~T(); }
818 };
819 template <typename T, typename U, typename Host_Allocator>
820 constexpr bool
822  recycle_allocator<U, Host_Allocator> const &) noexcept {
823  if constexpr (std::is_same_v<T, U>)
824  return true;
825  else
826  return false;
827 }
828 template <typename T, typename U, typename Host_Allocator>
829 constexpr bool
831  recycle_allocator<U, Host_Allocator> const &) noexcept {
832  if constexpr (std::is_same_v<T, U>)
833  return false;
834  else
835  return true;
836 }
837 
839 template <typename T, typename Host_Allocator>
841  using value_type = T;
842  using underlying_allocator_type = Host_Allocator;
843  static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>);
844  const std::optional<size_t> dealloc_hint;
845  const std::optional<size_t> device_id;
846 
847 #ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
849  : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
850  explicit aggressive_recycle_allocator(size_t hint) noexcept
851  : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
854  : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
855  T *allocate(std::size_t n) {
856  T *data = buffer_interface::get<T, Host_Allocator>(
857  n, true); // also initializes the buffer if it isn't reused
858  return data;
859  }
860  void deallocate(T *p, std::size_t n) {
861  buffer_interface::mark_unused<T, Host_Allocator>(p, n);
862  }
863 #else
865  : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {}
866  explicit aggressive_recycle_allocator(const size_t device_id) noexcept
867  : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {}
868  explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept
869  : dealloc_hint(location_id), device_id(device_id) {}
870  explicit aggressive_recycle_allocator(
871  recycle_allocator<T, Host_Allocator> const &other) noexcept
872  : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {}
873  T *allocate(std::size_t n) {
874  T *data = buffer_interface::get<T, Host_Allocator>(
875  n, true, dealloc_hint, device_id); // also initializes the buffer
876  // if it isn't reused
877  return data;
878  }
879  void deallocate(T *p, std::size_t n) {
880  buffer_interface::mark_unused<T, Host_Allocator>(p, n, dealloc_hint,
881  device_id);
882  }
883 #endif
884 
885 #ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS
886  template <typename... Args>
887  inline void construct(T *p, Args... args) noexcept {
888  // Do nothing here - we reuse the content of the last owner
889  }
890  void destroy(T *p) {
891  // Do nothing here - Contents will be destroyed when the buffer manager is
892  // destroyed, not before
893  }
894 #else
895 // Warn about suboptimal performance without recycling
896 #pragma message \
897 "Warning: Building without content reusage for aggressive allocators! \
898 For better performance configure with CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING=ON !"
899  template <typename... Args>
900  inline void construct(T *p, Args... args) noexcept {
901  ::new (static_cast<void *>(p)) T(std::forward<Args>(args)...);
902  }
903  void destroy(T *p) { p->~T(); }
904 #endif
905 };
906 
907 template <typename T, typename U, typename Host_Allocator>
908 constexpr bool
911  if constexpr (std::is_same_v<T, U>)
912  return true;
913  else
914  return false;
915 }
916 template <typename T, typename U, typename Host_Allocator>
917 constexpr bool
920  if constexpr (std::is_same_v<T, U>)
921  return false;
922  else
923  return true;
924 }
925 } // namespace detail
926 } // namespace memory_recycling
927 } // end namespace cppuddle
928 
929 #endif
Singleton interface to all buffer_managers.
Definition: buffer_management.hpp:68
static void print_performance_counters()
Definition: buffer_management.hpp:165
static void register_allocator_counters_with_hpx(void)
Register all CPPuddle counters as HPX performance counters.
Definition: buffer_management.hpp:129
buffer_interface(buffer_interface const &other)=delete
static T * get(size_t number_elements, bool manage_content_lifetime=false, std::optional< size_t > location_hint=std::nullopt, std::optional< size_t > device_id=std::nullopt)
Definition: buffer_management.hpp:97
static void clean_all()
Deallocate all buffers, no matter whether they are marked as used or not.
Definition: buffer_management.hpp:141
buffer_interface & operator=(buffer_interface &&other)=delete
buffer_interface & operator=(buffer_interface const &other)=delete
static void finalize()
Deallocate all buffers, no matter whether they are marked as used or not.
Definition: buffer_management.hpp:157
buffer_interface(buffer_interface &&other)=delete
static void clean_unused_buffers()
Deallocates all currently unused buffers.
Definition: buffer_management.hpp:149
static void mark_unused(T *p, size_t number_elements, std::optional< size_t > location_hint=std::nullopt, std::optional< size_t > device_id=std::nullopt)
Definition: buffer_management.hpp:114
constexpr bool operator!=(recycle_allocator< T, Host_Allocator > const &, recycle_allocator< U, Host_Allocator > const &) noexcept
Definition: buffer_management.hpp:830
constexpr bool operator==(recycle_allocator< T, Host_Allocator > const &, recycle_allocator< U, Host_Allocator > const &) noexcept
Definition: buffer_management.hpp:821
Definition: config.hpp:31
constexpr size_t max_number_gpus
Definition: config.hpp:52
std::mutex mutex_t
Definition: config.hpp:36
constexpr size_t number_instances
Definition: config.hpp:50
Definition: aggregation_executors_and_allocators.hpp:1042
Recycles not only allocations but also the contents of a buffer.
Definition: buffer_management.hpp:840
const std::optional< size_t > dealloc_hint
Definition: buffer_management.hpp:843
aggressive_recycle_allocator(size_t hint) noexcept
Definition: buffer_management.hpp:850
void destroy(T *p)
Definition: buffer_management.hpp:890
void deallocate(T *p, std::size_t n)
Definition: buffer_management.hpp:860
void construct(T *p, Args... args) noexcept
Definition: buffer_management.hpp:887
T * allocate(std::size_t n)
Definition: buffer_management.hpp:855
aggressive_recycle_allocator(aggressive_recycle_allocator< T, Host_Allocator > const &) noexcept
Definition: buffer_management.hpp:852
const std::optional< size_t > device_id
Definition: buffer_management.hpp:845
Host_Allocator underlying_allocator_type
Definition: buffer_management.hpp:842
aggressive_recycle_allocator() noexcept
Definition: buffer_management.hpp:848
Definition: buffer_management.hpp:770
T * allocate(std::size_t n)
Definition: buffer_management.hpp:785
T value_type
Definition: buffer_management.hpp:771
recycle_allocator(size_t hint) noexcept
Definition: buffer_management.hpp:780
void destroy(T *p)
Definition: buffer_management.hpp:817
recycle_allocator() noexcept
Definition: buffer_management.hpp:778
Host_Allocator underlying_allocator_type
Definition: buffer_management.hpp:772
void construct(T *p, Args... args) noexcept
Definition: buffer_management.hpp:814
void deallocate(T *p, std::size_t n)
Definition: buffer_management.hpp:789
const std::optional< size_t > device_id
Definition: buffer_management.hpp:775
recycle_allocator(recycle_allocator< T, Host_Allocator > const &other) noexcept
Definition: buffer_management.hpp:782
const std::optional< size_t > dealloc_hint
Definition: buffer_management.hpp:773
Default device selector - no Multi-GPU support.
Definition: buffer_management.hpp:55
void operator()(const size_t device_id)
Definition: buffer_management.hpp:56