CPPuddle
Loading...
Searching...
No Matches
buffer_management.hpp
Go to the documentation of this file.
1// Copyright (c) 2020-2024 Gregor Daiß
2//
3// Distributed under the Boost Software License, Version 1.0. (See accompanying
4// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5
6#ifndef BUFFER_MANAGEMENT_HPP
7#define BUFFER_MANAGEMENT_HPP
8
9#include <atomic>
10#include <cassert>
11#include <functional>
12#include <iostream>
13#include <list>
14#include <memory>
15#include <mutex>
16#include <optional>
17#include <stdexcept>
18#include <type_traits>
19#include <unordered_map>
20
21// Warn about suboptimal performance without correct HPX-aware allocators
22#ifdef CPPUDDLE_HAVE_HPX
23#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
24#pragma message \
25"Warning: CPPuddle build with HPX support but without HPX-aware allocators enabled. \
26For better performance configure CPPuddle with CPPUDDLE_WITH_HPX_AWARE_ALLOCATORS=ON!"
27#else
28// include runtime to get HPX thread IDs required for the HPX-aware allocators
29#include <hpx/include/runtime.hpp>
30#endif
31#endif
32
33#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
34// For builds with The HPX mutex
35#include <hpx/mutex.hpp>
36#endif
37
38#ifdef CPPUDDLE_HAVE_COUNTERS
39#include <boost/core/demangle.hpp>
40#if defined(CPPUDDLE_HAVE_HPX)
41#include <hpx/include/performance_counters.hpp>
42#endif
43#endif
44
46
47namespace cppuddle {
48namespace memory_recycling {
49
50namespace device_selection {
52
55template <typename T, typename Allocator> struct select_device_functor {
56 void operator()(const size_t device_id) {
57 if constexpr (max_number_gpus > 1)
58 throw std::runtime_error(
59 "Allocators used in Multi-GPU builds need explicit Multi-GPU support "
60 "(by having a select_device_functor overload");
61 }
62};
63} // namespace device_selection
64
65namespace detail {
66
69public:
70#if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING)
71
72// Warn about suboptimal performance without recycling
73#pragma message \
74"Warning: Building without buffer recycling! Use only for performance testing! \
75For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON!"
76
// Recycling disabled: get() is a thin pass-through to the underlying
// allocator. The location/device hints are accepted but ignored.
77 template <typename T, typename Host_Allocator>
78 static T *get(size_t number_elements, bool manage_content_lifetime = false,
79 std::optional<size_t> location_hint = std::nullopt,
80 std::optional<size_t> device_id = std::nullopt) {
81
// NOTE(review): manage_content_lifetime is ignored in this build mode, so
// content-managed (aggressive) buffers are returned without constructed
// elements -- confirm against the aggressive_recycle_allocator contract.
82 return Host_Allocator{}.allocate(number_elements);
83 }
// Recycling disabled: mark_unused() frees the buffer immediately instead of
// keeping it for reuse.
85 template <typename T, typename Host_Allocator>
86 static void mark_unused(T *p, size_t number_elements,
87 std::optional<size_t> location_hint = std::nullopt,
88 std::optional<size_t> device_id = std::nullopt) {
89 return Host_Allocator{}.deallocate(p, number_elements);
90 }
91#else
96 template <typename T, typename Host_Allocator>
97 static T *get(size_t number_elements, bool manage_content_lifetime = false,
98 std::optional<size_t> location_hint = std::nullopt,
99 std::optional<size_t> device_id = std::nullopt) {
100 try {
101 return buffer_manager<T, Host_Allocator>::get(
102 number_elements, manage_content_lifetime, location_hint, device_id);
103 } catch (const std::exception &exc) {
104 std::cerr << "ERROR: Encountered unhandled exception in cppuddle get: " << exc.what() << std::endl;
105 std::cerr << "Rethrowing exception... " << std::endl;;
106 throw;
107 }
108 }
113 template <typename T, typename Host_Allocator>
114 static void mark_unused(T *p, size_t number_elements,
115 std::optional<size_t> location_hint = std::nullopt,
116 std::optional<size_t> device_id = std::nullopt) {
117 try {
118 return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements,
119 location_hint, device_id);
120 } catch (const std::exception &exc) {
121 std::cerr << "ERROR: Encountered unhandled exception in cppuddle mark_unused: " << exc.what() << std::endl;
122 std::cerr << "Rethrowing exception... " << std::endl;;
123 throw;
124 }
125 }
126#endif
// Registers all counters of buffer_manager<T, Host_Allocator> as HPX
// performance counters; warns and does nothing when counters are disabled.
// NOTE(review): the signature line
// "static void register_allocator_counters_with_hpx(void) {" (orig line 129,
// per the doc index below) was lost in extraction -- restore it upstream.
128 template <typename T, typename Host_Allocator>
130#ifdef CPPUDDLE_HAVE_COUNTERS
131 buffer_manager<T, Host_Allocator>::register_counters_with_hpx();
132#else
133 std::cerr << "Warning: Trying to register allocator performance counters "
134 "with HPX but CPPuddle was built "
135 "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!"
136 << std::endl;
137#endif
138 }
139
141 static void clean_all() {
142 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
143 for (const auto &clean_function :
144 instance().total_cleanup_callbacks) {
145 clean_function();
146 }
147 }
149 static void clean_unused_buffers() {
150 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
151 for (const auto &clean_function :
152 instance().partial_cleanup_callbacks) {
153 clean_function();
154 }
155 }
157 static void finalize() {
158 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
159 for (const auto &finalize_function :
160 instance().finalize_callbacks) {
161 finalize_function();
162 }
163 }
164
// Prints the counters of every registered manager (counters build only).
// NOTE(review): the signature line "static void print_performance_counters()"
// (orig line 165, per the doc index) was lost in extraction.
166#ifdef CPPUDDLE_HAVE_COUNTERS
167 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
168 for (const auto &print_function :
169 instance().print_callbacks) {
170 print_function();
171 }
172#else
173 std::cerr << "Warning: Trying to print allocator performance counters but CPPuddle was built "
174 "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!"
175 << std::endl;
176#endif
177 }
178
179 // Member variables and methods
180private:
181
// Meyers singleton: exactly one buffer_interface per process,
// constructed on first use (thread-safe since C++11).
183 static buffer_interface& instance() {
184 static buffer_interface singleton{};
185 return singleton;
186 }
// One callback per instantiated buffer_manager<T, Alloc>; invoked by
// print_performance_counters / finalize / clean_all / clean_unused_buffers
// respectively.
188 std::list<std::function<void()>> print_callbacks;
191 std::list<std::function<void()>> finalize_callbacks;
194 std::list<std::function<void()>> total_cleanup_callbacks;
197 std::list<std::function<void()>> partial_cleanup_callbacks;
200 buffer_interface() = default;
201
// Protects all four callback lists above.
202 mutex_t callback_protection_mut;
// Register a hook that frees ALL buffers of one manager.
204 static void add_total_cleanup_callback(const std::function<void()> &func) {
205 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
206 instance().total_cleanup_callbacks.push_back(func);
207 }
// Register a hook that frees only the currently unused buffers of one manager.
210 static void add_partial_cleanup_callback(const std::function<void()> &func) {
211 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
212 instance().partial_cleanup_callbacks.push_back(func);
213 }
// Register a hook that performs the final teardown of one manager.
216 static void add_finalize_callback(const std::function<void()> &func) {
217 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
218 instance().finalize_callbacks.push_back(func);
219 }
// Register a hook that prints one manager's performance counters.
222 static void add_print_callback(const std::function<void()> &func) {
223 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
224 instance().print_callbacks.push_back(func);
225 }
226
227public:
228 ~buffer_interface() = default;
229
230 // Subclasses
231private:
// Manages all buffers of one (T, Host_Allocator) combination, partitioned
// into number_instances * max_number_gpus independently locked buckets.
233 template <typename T, typename Host_Allocator> class buffer_manager {
234 private:
235 // Tuple content: Pointer to buffer, buffer_size, location ID, Flag
236 // The flag at the end controls whether the buffer content is to be reused as
237 // well
238 using buffer_entry_type = std::tuple<T *, size_t, size_t, bool>;
239
240
241 public:
243 static void clean() {
244 assert(instance() && !is_finalized);
245 for (auto i = 0; i < number_instances * max_number_gpus; i++) {
246 std::lock_guard<mutex_t> guard(instance()[i].mut);
247 instance()[i].clean_all_buffers();
248 }
249 }
250 static void print_performance_counters() {
251 assert(instance() && !is_finalized);
252 for (auto i = 0; i < number_instances * max_number_gpus; i++) {
253 std::lock_guard<mutex_t> guard(instance()[i].mut);
254 instance()[i].print_counters();
255 }
256 }
257 static void finalize() {
258 assert(instance() && !is_finalized);
259 is_finalized = true;
260 for (auto i = 0; i < number_instances * max_number_gpus; i++) {
261 std::lock_guard<mutex_t> guard(instance()[i].mut);
262 instance()[i].clean_all_buffers();
263 }
264 instance().reset();
265 }
267 static void clean_unused_buffers_only() {
268 assert(instance() && !is_finalized);
269 for (auto i = 0; i < number_instances * max_number_gpus; i++) {
270 std::lock_guard<mutex_t> guard(instance()[i].mut);
271 for (auto &buffer_tuple : instance()[i].unused_buffer_list) {
272 Host_Allocator alloc;
273 if (std::get<3>(buffer_tuple)) {
274 std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
275 }
276 alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
277 }
278 instance()[i].unused_buffer_list.clear();
279 }
280 }
// Accessors used by the HPX performance counter framework. Each returns the
// process-wide total for this (T, Host_Allocator) combination; when reset is
// true the counter is cleared first, so the call then returns 0 (the reset
// happens before the read in all of these getters).
281#if defined(CPPUDDLE_HAVE_COUNTERS) && defined(CPPUDDLE_HAVE_HPX)
282 static size_t get_sum_number_recycling(bool reset) {
283 if (reset)
284 sum_number_recycling = 0;
285 return sum_number_recycling;
286 }
287 static size_t get_sum_number_allocation(bool reset) {
288 if (reset)
289 sum_number_allocation = 0;
290 return sum_number_allocation;
291 }
292 static size_t get_sum_number_creation(bool reset) {
293 if (reset)
294 sum_number_creation = 0;
295 return sum_number_creation;
296 }
297 static size_t get_sum_number_deallocation(bool reset) {
298 if (reset)
299 sum_number_deallocation = 0;
300 return sum_number_deallocation;
301 }
302 static size_t get_sum_number_wrong_hints(bool reset) {
303 if (reset)
304 sum_number_wrong_hints = 0;
305 return sum_number_wrong_hints;
306 }
307 static size_t get_sum_number_wrong_device_hints(bool reset) {
308 if (reset)
309 sum_number_wrong_hints = 0;
310 return sum_number_wrong_device_hints;
311 }
// Number of std::bad_alloc occurrences that triggered an emergency cleanup
// of unused buffers (see the catch block in get()); reset-before-read like
// the other getters.
312 static size_t get_sum_number_bad_allocs(bool reset) {
313 if (reset)
314 sum_number_bad_allocs = 0;
315 return sum_number_bad_allocs;
316 }
317
// Installs one HPX performance counter type per counter of this
// (T, Host_Allocator) combination. The counter path embeds the demangled
// allocator and element type names, e.g.
// /cppuddle/allocators/<Alloc>_<T>/number_recycling/.
// Kept byte-identical: the path strings and descriptions are runtime-visible
// counter names.
318 static void register_counters_with_hpx(void) {
319 std::string alloc_name =
320 boost::core::demangle(typeid(Host_Allocator).name()) +
321 std::string("_") + boost::core::demangle(typeid(T).name());
322 hpx::performance_counters::install_counter_type(
323 std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_recycling/"),
324 &get_sum_number_recycling,
325 "Number of allocations using a recycled buffer with this "
326 "allocator");
327 hpx::performance_counters::install_counter_type(
328 std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_allocations/"),
329 &get_sum_number_allocation,
330 "Number of allocations with this allocator");
331 hpx::performance_counters::install_counter_type(
332 std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_creations/"),
333 &get_sum_number_creation,
334 "Number of allocations not using a recycled buffer with this "
335 "allocator");
336 hpx::performance_counters::install_counter_type(
337 std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_deallocations/"),
338 &get_sum_number_deallocation,
339 "Number of deallocations yielding buffers to be recycled with this "
340 "allocator");
341 hpx::performance_counters::install_counter_type(
342 std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_hints/"),
343 &get_sum_number_wrong_hints,
344 "Number of wrong hints supplied to the dealloc method with this allocator");
345 hpx::performance_counters::install_counter_type(
346 std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_wrong_device_hints/"),
347 &get_sum_number_wrong_device_hints,
348 "Number of wrong device hints supplied to the dealloc method with this allocator");
349 hpx::performance_counters::install_counter_type(
350 std::string("/cppuddle/allocators/") + alloc_name + std::string("/number_bad_allocs/"),
351 &get_sum_number_bad_allocs,
352 "Number of wrong bad allocs which triggered a cleanup of unused buffers");
353 }
354#endif
355
// Hand out a buffer of number_of_elements elements: recycle a matching
// unused buffer from the requested bucket when possible, otherwise allocate
// a new one (with a second attempt after cleaning unused buffers on
// std::bad_alloc).
// NOTE(review): orig lines 419, 439 and 445 are missing from this extraction
// (the "420/446 T, Host_Allocator>{}(device_id);" fragments suggest a
// "device_selection::select_device_functor<" call, and orig 439 presumably
// the cleanup invocation) -- restore them from the upstream header.
357 static T *get(size_t number_of_elements, bool manage_content_lifetime,
358 std::optional<size_t> location_hint = std::nullopt,
359 std::optional<size_t> gpu_device_id = std::nullopt) {
360 init_callbacks_once();
361 if (is_finalized) {
362 throw std::runtime_error("Tried allocation after finalization");
363 }
364 assert(instance() && !is_finalized);
365
// Validate and combine the location/device hints into a bucket index.
366 size_t location_id = 0;
367 if (location_hint) {
368 location_id = *location_hint;
369 }
370 if (location_id >= number_instances) {
371 throw std::runtime_error("Tried to create buffer with invalid location_id [get]");
372 }
373 size_t device_id = 0;
374 if (gpu_device_id) {
375 device_id = *gpu_device_id;
376 }
377 if (device_id >= max_number_gpus) {
378 throw std::runtime_error("Tried to create buffer with invalid device id [get]! "
379 "Is multigpu support enabled with the correct number "
380 "of GPUs?");
381 }
382
383 location_id = location_id + device_id * number_instances;
384 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
385
386
387#ifdef CPPUDDLE_HAVE_COUNTERS
388 instance()[location_id].number_allocation++;
389 sum_number_allocation++;
390#endif
391 // Check for unused buffers we can recycle:
392 for (auto iter = instance()[location_id].unused_buffer_list.begin();
393 iter != instance()[location_id].unused_buffer_list.end(); iter++) {
// Copy of the entry is taken BEFORE erase() invalidates the iterator.
394 auto tuple = *iter;
395 if (std::get<1>(tuple) == number_of_elements) {
396 instance()[location_id].unused_buffer_list.erase(iter);
397
398 // handle the switch from aggressive to non aggressive reusage (or
399 // vice-versa)
400 if (manage_content_lifetime && !std::get<3>(tuple)) {
401 std::uninitialized_value_construct_n(std::get<0>(tuple),
402 number_of_elements);
403 std::get<3>(tuple) = true;
404 } else if (!manage_content_lifetime && std::get<3>(tuple)) {
405 std::destroy_n(std::get<0>(tuple), std::get<1>(tuple));
406 std::get<3>(tuple) = false;
407 }
408 instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple});
409#ifdef CPPUDDLE_HAVE_COUNTERS
410 instance()[location_id].number_recycling++;
411 sum_number_recycling++;
412#endif
413 return std::get<0>(tuple);
414 }
415 }
416
417 // No unused buffer found -> Create new one and return it
418 try {
420 T, Host_Allocator>{}(device_id);
421 Host_Allocator alloc;
422 T *buffer = alloc.allocate(number_of_elements);
423 instance()[location_id].buffer_map.insert(
424 {buffer, std::make_tuple(buffer, number_of_elements, 1,
425 manage_content_lifetime)});
426#ifdef CPPUDDLE_HAVE_COUNTERS
427 instance()[location_id].number_creation++;
428 sum_number_creation++;
429#endif
430 if (manage_content_lifetime) {
431 std::uninitialized_value_construct_n(buffer, number_of_elements);
432 }
433 return buffer;
434 } catch (std::bad_alloc &e) {
435 // not enough memory left! Cleanup and attempt again:
436 std::cerr
437 << "Not enough memory left. Cleaning up unused buffers now..."
438 << std::endl;
440 std::cerr << "Buffers cleaned! Try allocation again..." << std::endl;
441
442 // If there still isn't enough memory left, the caller has to handle it
443 // We've done all we can in here
444 Host_Allocator alloc;
446 T, Host_Allocator>{}(device_id);
447 T *buffer = alloc.allocate(number_of_elements);
448 instance()[location_id].buffer_map.insert(
449 {buffer, std::make_tuple(buffer, number_of_elements, 1,
450 manage_content_lifetime)});
451#ifdef CPPUDDLE_HAVE_COUNTERS
452 instance()[location_id].number_creation++;
453 sum_number_creation++;
454 instance()[location_id].number_bad_alloc++;
455 sum_number_bad_allocs++;
456#endif
457 std::cerr << "Second attempt allocation successful!" << std::endl;
458 if (manage_content_lifetime) {
459 std::uninitialized_value_construct_n(buffer, number_of_elements);
460 }
461 return buffer;
462 }
463 }
464
// Move a buffer from the in-use map to the unused list so it can be
// recycled. The buffer is searched in widening circles: (1) the hinted
// bucket on the hinted device, (2) every bucket on the hinted device,
// (3) every bucket on every other device. A not-found buffer only produces
// a warning (see the trailing comment about legacy static buffers).
// NOTE(review): runtime strings below contain typos ("recylcer", "devce");
// left untouched here since they are runtime-visible messages.
465 static void mark_unused(T *memory_location, size_t number_of_elements,
466 std::optional<size_t> location_hint = std::nullopt,
467 std::optional<size_t> device_hint = std::nullopt) {
// After finalize() all buffers are already gone; dropping the call is safe.
468 if (is_finalized)
469 return;
470 assert(instance() && !is_finalized);
471
472 size_t location_id = 0;
473 if (location_hint) {
474 location_id = *location_hint;
475 if (location_id >= number_instances) {
476 throw std::runtime_error(
477 "Buffer recylcer received invalid location hint [mark_unused]");
478 }
479 }
480 size_t device_id = 0;
481 if (device_hint) {
482 device_id = *device_hint;
483 if (device_id >= max_number_gpus) {
484 throw std::runtime_error(
485 "Buffer recylcer received invalid devce hint [mark_unused]");
486 }
487 }
488
489 // Attempt 1 to find the correct bucket/location: Look at provided hint:
490 if (location_hint) {
491 size_t location_id = location_hint.value() + device_id * number_instances;
492 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
493 if (instance()[location_id].buffer_map.find(memory_location) !=
494 instance()[location_id].buffer_map.end()) {
495#ifdef CPPUDDLE_HAVE_COUNTERS
496 instance()[location_id].number_deallocation++;
497 sum_number_deallocation++;
498#endif
499 auto it = instance()[location_id].buffer_map.find(memory_location);
500 assert(it != instance()[location_id].buffer_map.end());
501 auto &tuple = it->second;
502 // sanity checks:
503 assert(std::get<1>(tuple) == number_of_elements);
504 // move to the unused_buffer list
505 instance()[location_id].unused_buffer_list.push_front(tuple);
506 instance()[location_id].buffer_map.erase(memory_location);
507 return; // Success
508 }
509 // hint was wrong
510#ifdef CPPUDDLE_HAVE_COUNTERS
511 instance()[location_id].number_wrong_hints++;
512 sum_number_wrong_hints++;
513#endif
514 }
515 // Failed to find buffer in the specified location/device!
516 // Attempt 2 - Look for buffer other locations on the same device...
517 for (size_t location_id = device_id * number_instances;
518 location_id < (device_id + 1) * number_instances; location_id++) {
519 if (location_hint) {
// NOTE(review): this skip-index uses "* max_number_gpus", but bucket
// indices above are formed with "* number_instances" -- looks like the
// already-searched bucket is not actually skipped (harmless re-check,
// but the counters/locking do redundant work). Confirm upstream.
520 if (*location_hint + device_id * max_number_gpus == location_id) {
521 continue; // already tried this -> skip
522 }
523 }
524 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
525 if (instance()[location_id].buffer_map.find(memory_location) !=
526 instance()[location_id].buffer_map.end()) {
527#ifdef CPPUDDLE_HAVE_COUNTERS
528 instance()[location_id].number_deallocation++;
529 sum_number_deallocation++;
530#endif
531 auto it = instance()[location_id].buffer_map.find(memory_location);
532 assert(it != instance()[location_id].buffer_map.end());
533 auto &tuple = it->second;
534 // sanity checks:
535 assert(std::get<1>(tuple) == number_of_elements);
536 // move to the unused_buffer list
537 instance()[location_id].unused_buffer_list.push_front(tuple);
538 instance()[location_id].buffer_map.erase(memory_location);
539 return; // Success
540 }
541 }
542 // device hint was wrong
543#ifdef CPPUDDLE_HAVE_COUNTERS
544 if (device_hint) {
545 sum_number_wrong_device_hints++;
546 }
547#endif
548 // Failed to find buffer on the specified device!
549 // Attempt 3 - Look for buffer on other devices...
550 for (size_t local_device_id = 0; local_device_id < max_number_gpus;
551 local_device_id++) {
552 if (local_device_id == device_id)
553 continue; // already tried this device
554
555 // Try hint location first yet again (though on different device)
556 if (location_hint) {
557 size_t location_id = location_hint.value() + local_device_id * number_instances;
558 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
559 if (instance()[location_id].buffer_map.find(memory_location) !=
560 instance()[location_id].buffer_map.end()) {
561#ifdef CPPUDDLE_HAVE_COUNTERS
562 instance()[location_id].number_deallocation++;
563 sum_number_deallocation++;
564#endif
565 auto it = instance()[location_id].buffer_map.find(memory_location);
566 assert(it != instance()[location_id].buffer_map.end());
567 auto &tuple = it->second;
568 // sanity checks:
569 assert(std::get<1>(tuple) == number_of_elements);
570 // move to the unused_buffer list
571 instance()[location_id].unused_buffer_list.push_front(tuple);
572 instance()[location_id].buffer_map.erase(memory_location);
573 return; // Success
574 }
575 }
576 // Failed - check all other locations on device
577 for (size_t location_id = local_device_id * number_instances;
578 location_id < (local_device_id + 1) * number_instances; location_id++) {
579 if (location_hint) {
580 if (*location_hint + local_device_id * max_number_gpus == location_id) {
581 continue; // already tried this -> skip
582 }
583 }
584 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
585 if (instance()[location_id].buffer_map.find(memory_location) !=
586 instance()[location_id].buffer_map.end()) {
587#ifdef CPPUDDLE_HAVE_COUNTERS
588 instance()[location_id].number_deallocation++;
589 sum_number_deallocation++;
590#endif
591 auto it = instance()[location_id].buffer_map.find(memory_location);
592 assert(it != instance()[location_id].buffer_map.end());
593 auto &tuple = it->second;
594 // sanity checks:
595 assert(std::get<1>(tuple) == number_of_elements);
596 // move to the unused_buffer list
597 instance()[location_id].unused_buffer_list.push_front(tuple);
598 instance()[location_id].buffer_map.erase(memory_location);
599 return; // Success
600 }
601 }
602 }
603 // Buffer that is to be deleted is nowhere to be found - we looked everywhere!
604 // =>
605 // Failure! Handle here...
606
607 // TODO Throw exception instead in the futures, as soon as the recycler finalize is
608 // in all user codes
609 /* throw std::runtime_error("Tried to delete non-existing buffer"); */
610
611 // This is odd: Print warning -- however, might also happen with static
612 // buffers using these allocators IF the new finalize was not called. For
613 // now, print warning until all user-code is upgraded to the finalize method.
614 // This allows using current versions of cppuddle with older application code
615 std::cerr
616 << "Warning! Tried to delete non-existing buffer within CPPuddle!"
617 << std::endl;
618 std::cerr << "Did you forget to call recycler::finalize?" << std::endl;
619 }
620
621 private:
// Buffers currently handed out to users, keyed by their data pointer.
623 std::unordered_map<T *, buffer_entry_type> buffer_map{};
// Buffers returned via mark_unused, waiting to be recycled or freed.
625 std::list<buffer_entry_type> unused_buffer_list{};
// Protects buffer_map / unused_buffer_list / per-bucket counters.
627 mutex_t mut;
628#ifdef CPPUDDLE_HAVE_COUNTERS
// Per-bucket counters (guarded by mut).
630 size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0},
631 number_recycling{0}, number_creation{0}, number_bad_alloc{0};
632
// Process-wide totals across all buckets, hence atomic.
633 static inline std::atomic<size_t> sum_number_allocation{0},
634 sum_number_deallocation{0}, sum_number_wrong_hints{0},
635 sum_number_wrong_device_hints{0}, sum_number_recycling{0},
636 sum_number_creation{0}, sum_number_bad_allocs{0};
637#endif
640 buffer_manager() = default;
// NOTE(review): copy assignment is defaulted here although copy
// construction is deleted below -- presumably this was meant to be
// "= delete" as well; confirm upstream.
641 buffer_manager&
642 operator=(buffer_manager<T, Host_Allocator> const &other) = default;
643 buffer_manager&
644 operator=(buffer_manager<T, Host_Allocator> &&other) = delete;
645 static std::unique_ptr<buffer_manager[]>& instance(void) {
646 static std::unique_ptr<buffer_manager[]> instances{
647 new buffer_manager[number_instances * max_number_gpus]};
648 return instances;
649 }
// Register this manager's cleanup/finalize/print hooks with the
// buffer_interface exactly once per (T, Host_Allocator) instantiation.
// Uses the HPX once_flag when the HPX mutex is configured, so HPX worker
// threads are not blocked by a std::call_once.
650 static void init_callbacks_once(void) {
651 assert(instance());
652#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
653 static hpx::once_flag flag;
654 hpx::call_once(flag, []() {
655#else
656 static std::once_flag flag;
657 std::call_once(flag, []() {
658#endif
659 is_finalized = false;
660 buffer_interface::add_total_cleanup_callback(clean);
661 buffer_interface::add_partial_cleanup_callback(
662 clean_unused_buffers_only);
663 buffer_interface::add_finalize_callback(
664 finalize);
665#ifdef CPPUDDLE_HAVE_COUNTERS
666 buffer_interface::add_print_callback(
667 print_performance_counters);
668#endif
669 });
670 }
// Set by finalize(); get() throws and mark_unused() no-ops once true.
671 static inline std::atomic<bool> is_finalized;
672
673#ifdef CPPUDDLE_HAVE_COUNTERS
// Dump this bucket's counters to stdout; silent if never used.
674 void print_counters(void) {
675 if (number_allocation == 0)
676 return;
677 // Print performance counters
678 size_t number_cleaned = unused_buffer_list.size() + buffer_map.size();
679 std::cout << "\nBuffer manager destructor for (Alloc: "
680 << boost::core::demangle(typeid(Host_Allocator).name()) << ", Type: "
681 << boost::core::demangle(typeid(T).name())
682 << "):" << std::endl
683 << "--------------------------------------------------------------------"
684 << std::endl
685 << "--> Number of bad_allocs that triggered garbage "
686 "collection: "
687 << number_bad_alloc << std::endl
688 << "--> Number of buffers that got requested from this "
689 "manager: "
690 << number_allocation << std::endl
691 << "--> Number of times an unused buffer got recycled for a "
692 "request: "
693 << number_recycling << std::endl
694 << "--> Number of times a new buffer had to be created for a "
695 "request: "
696 << number_creation << std::endl
697 << "--> Number cleaned up buffers: "
698 " "
699 << number_cleaned << std::endl
700 << "--> Number wrong deallocation hints: "
701 " "
702 << number_wrong_hints << std::endl
703 << "--> Number of buffers that were marked as used upon "
704 "cleanup: "
705 << buffer_map.size() << std::endl
706 << "==> Recycle rate: "
707 " "
708 << static_cast<float>(number_recycling) / number_allocation *
709 100.0f
710 << "%" << std::endl;
711 }
712#endif
713
714 void clean_all_buffers(void) {
715#ifdef CPPUDDLE_HAVE_COUNTERS
716 if (number_allocation == 0 && number_recycling == 0 &&
717 number_bad_alloc == 0 && number_creation == 0 &&
718 unused_buffer_list.empty() && buffer_map.empty()) {
719 return;
720 }
721#endif
722 for (auto &buffer_tuple : unused_buffer_list) {
723 Host_Allocator alloc;
724 if (std::get<3>(buffer_tuple)) {
725 std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
726 }
727 alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
728 }
729 for (auto &map_tuple : buffer_map) {
730 auto buffer_tuple = map_tuple.second;
731 Host_Allocator alloc;
732 if (std::get<3>(buffer_tuple)) {
733 std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
734 }
735 alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
736 }
737 unused_buffer_list.clear();
738 buffer_map.clear();
739#ifdef CPPUDDLE_HAVE_COUNTERS
740 number_allocation = 0;
741 number_recycling = 0;
742 number_bad_alloc = 0;
743 number_creation = 0;
744 number_wrong_hints = 0;
745#endif
746 }
747 public:
// RAII backstop: a destroyed manager frees everything it still tracks.
748 ~buffer_manager() {
749 clean_all_buffers();
750 }
751
752 public: // Putting deleted constructors in public gives more useful error
753 // messages
754 // Bunch of constructors we don't need
755 buffer_manager(
756 buffer_manager<T, Host_Allocator> const &other) = delete;
757 buffer_manager(
758 buffer_manager<T, Host_Allocator> &&other) = delete;
759 };
760
761public:
762 // Putting deleted constructors in public gives more useful error messages
763 // Bunch of constructors we don't need
764 buffer_interface(buffer_interface const &other) = delete;
// NOTE(review): orig lines 765-767 (deleted move ctor and both deleted
// assignment operators, per the doc index) are missing from this extraction.
768};
769
// Allocator that recycles raw buffers through buffer_interface; element
// contents are NOT reused (construct/destroy behave like std::allocator).
770template <typename T, typename Host_Allocator> struct recycle_allocator {
771 using value_type = T;
772 using underlying_allocator_type = Host_Allocator;
773 static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>);
// Location/device used when returning buffers; nullopt without HPX-aware
// allocators.
774 const std::optional<size_t> dealloc_hint;
775 const std::optional<size_t> device_id;
776
777#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
// NOTE(review): the constructor signature lines (orig 778 default ctor and
// orig 782 copy ctor head, per the doc index) are missing from this
// extraction -- the initializer lists below belong to them.
779 : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
780 explicit recycle_allocator(size_t hint) noexcept
781 : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
783 recycle_allocator<T, Host_Allocator> const &other) noexcept
784 : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
785 T *allocate(std::size_t n) {
786 T *data = buffer_interface::get<T, Host_Allocator>(n);
787 return data;
788 }
789 void deallocate(T *p, std::size_t n) {
790 buffer_interface::mark_unused<T, Host_Allocator>(p, n);
791 }
792#else
// HPX-aware: the current worker thread number picks the location bucket,
// spreading allocations (and lock contention) across buckets.
793 recycle_allocator() noexcept
794 : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {}
795 explicit recycle_allocator(const size_t device_id) noexcept
796 : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {}
797 explicit recycle_allocator(const size_t device_i, const size_t location_id) noexcept
798 : dealloc_hint(location_id), device_id(device_id) {}
// Copy constructor: keep the source allocator's hints.
799 explicit recycle_allocator(
800 recycle_allocator<T, Host_Allocator> const &other) noexcept
801 : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {}
// NOTE(review): allocate() hints with the CURRENT worker thread, while
// deallocate() uses the dealloc_hint fixed at construction -- these can
// differ; mark_unused tolerates wrong hints (counted as such). Confirm this
// is intentional upstream.
802 T *allocate(std::size_t n) {
803 T *data = buffer_interface::get<T, Host_Allocator>(
804 n, false, hpx::get_worker_thread_num() % number_instances, device_id);
805 return data;
806 }
807 void deallocate(T *p, std::size_t n) {
808 buffer_interface::mark_unused<T, Host_Allocator>(p, n, dealloc_hint,
809 device_id);
810 }
811#endif
812
813 template <typename... Args>
814 inline void construct(T *p, Args... args) noexcept {
815 ::new (static_cast<void *>(p)) T(std::forward<Args>(args)...);
816 }
817 void destroy(T *p) { p->~T(); }
818};
// Equality: recycle_allocators of the same element type are interchangeable
// (all state lives in the shared buffer_manager singletons).
// NOTE(review): the operator==/operator!= signature lines (orig 821 and 830,
// per the doc index) are missing from this extraction.
819template <typename T, typename U, typename Host_Allocator>
820constexpr bool
822 recycle_allocator<U, Host_Allocator> const &) noexcept {
823 if constexpr (std::is_same_v<T, U>)
824 return true;
825 else
826 return false;
827}
828template <typename T, typename U, typename Host_Allocator>
829constexpr bool
831 recycle_allocator<U, Host_Allocator> const &) noexcept {
832 if constexpr (std::is_same_v<T, U>)
833 return false;
834 else
835 return true;
836}
837
// Recycles not only allocations but also the contents of a buffer:
// construct()/destroy() are no-ops, so recycled buffers keep the previous
// owner's element values.
// NOTE(review): the struct head "struct aggressive_recycle_allocator {"
// (orig 840) and several constructor signature lines (orig 848, 852-853,
// 864, per the doc index) are missing from this extraction.
839template <typename T, typename Host_Allocator>
841 using value_type = T;
842 using underlying_allocator_type = Host_Allocator;
843 static_assert(std::is_same_v<value_type, typename underlying_allocator_type::value_type>);
844 const std::optional<size_t> dealloc_hint;
845 const std::optional<size_t> device_id;
846
847#ifndef CPPUDDLE_HAVE_HPX_AWARE_ALLOCATORS
849 : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
850 explicit aggressive_recycle_allocator(size_t hint) noexcept
851 : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
854 : dealloc_hint(std::nullopt), device_id(std::nullopt) {}
855 T *allocate(std::size_t n) {
856 T *data = buffer_interface::get<T, Host_Allocator>(
857 n, true); // also initializes the buffer if it isn't reused
858 return data;
859 }
860 void deallocate(T *p, std::size_t n) {
861 buffer_interface::mark_unused<T, Host_Allocator>(p, n);
862 }
863#else
865 : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(0) {}
866 explicit aggressive_recycle_allocator(const size_t device_id) noexcept
867 : dealloc_hint(hpx::get_worker_thread_num() % number_instances), device_id(device_id) {}
868 explicit aggressive_recycle_allocator(const size_t device_id, const size_t location_id) noexcept
869 : dealloc_hint(location_id), device_id(device_id) {}
// NOTE(review): this takes recycle_allocator (not
// aggressive_recycle_allocator), making it a converting constructor rather
// than a copy constructor -- likely a copy-paste slip (compare the non-HPX
// branch's copy ctor per the doc index); confirm upstream before changing,
// since callers may rely on the conversion.
870 explicit aggressive_recycle_allocator(
871 recycle_allocator<T, Host_Allocator> const &other) noexcept
872 : dealloc_hint(other.dealloc_hint), device_id(other.device_id) {}
873 T *allocate(std::size_t n) {
874 T *data = buffer_interface::get<T, Host_Allocator>(
875 n, true, dealloc_hint, device_id); // also initializes the buffer
876 // if it isn't reused
877 return data;
878 }
879 void deallocate(T *p, std::size_t n) {
880 buffer_interface::mark_unused<T, Host_Allocator>(p, n, dealloc_hint,
881 device_id);
882 }
883#endif
884
885#ifndef CPPUDDLE_DEACTIVATE_AGGRESSIVE_ALLOCATORS
886 template <typename... Args>
887 inline void construct(T *p, Args... args) noexcept {
888 // Do nothing here - we reuse the content of the last owner
889 }
890 void destroy(T *p) {
891 // Do nothing here - Contents will be destroyed when the buffer manager is
892 // destroyed, not before
893 }
894#else
895// Warn about suboptimal performance without recycling
896#pragma message \
897"Warning: Building without content reusage for aggressive allocators! \
898For better performance configure with CPPUDDLE_WITH_AGGRESSIVE_CONTENT_RECYCLING=ON !"
899 template <typename... Args>
900 inline void construct(T *p, Args... args) noexcept {
901 ::new (static_cast<void *>(p)) T(std::forward<Args>(args)...);
902 }
903 void destroy(T *p) { p->~T(); }
904#endif
905};
906
// Equality for aggressive_recycle_allocator: same-type allocators compare
// equal (shared singleton state), different element types compare unequal.
// NOTE(review): the operator signature lines (orig 909-910 and 918-919) are
// missing from this extraction.
907template <typename T, typename U, typename Host_Allocator>
908constexpr bool
911 if constexpr (std::is_same_v<T, U>)
912 return true;
913 else
914 return false;
915}
916template <typename T, typename U, typename Host_Allocator>
917constexpr bool
920 if constexpr (std::is_same_v<T, U>)
921 return false;
922 else
923 return true;
924}
925} // namespace detail
926} // namespace memory_recycling
927} // end namespace cppuddle
928
929#endif
Singleton interface to all buffer_managers.
Definition buffer_management.hpp:68
static void print_performance_counters()
Definition buffer_management.hpp:165
static void register_allocator_counters_with_hpx(void)
Register all CPPuddle counters as HPX performance counters.
Definition buffer_management.hpp:129
buffer_interface & operator=(buffer_interface &&other)=delete
buffer_interface & operator=(buffer_interface const &other)=delete
buffer_interface(buffer_interface const &other)=delete
static void clean_all()
Deallocate all buffers, no matter whether they are marked as used or not.
Definition buffer_management.hpp:141
static T * get(size_t number_elements, bool manage_content_lifetime=false, std::optional< size_t > location_hint=std::nullopt, std::optional< size_t > device_id=std::nullopt)
Definition buffer_management.hpp:97
static void finalize()
Deallocate all buffers, no matter whether they are marked as used or not.
Definition buffer_management.hpp:157
buffer_interface(buffer_interface &&other)=delete
static void clean_unused_buffers()
Deallocated all currently unused buffer.
Definition buffer_management.hpp:149
static void mark_unused(T *p, size_t number_elements, std::optional< size_t > location_hint=std::nullopt, std::optional< size_t > device_id=std::nullopt)
Definition buffer_management.hpp:114
constexpr bool operator!=(recycle_allocator< T, Host_Allocator > const &, recycle_allocator< U, Host_Allocator > const &) noexcept
Definition buffer_management.hpp:830
constexpr bool operator==(recycle_allocator< T, Host_Allocator > const &, recycle_allocator< U, Host_Allocator > const &) noexcept
Definition buffer_management.hpp:821
Definition config.hpp:31
constexpr size_t max_number_gpus
Definition config.hpp:52
std::mutex mutex_t
Definition config.hpp:36
constexpr size_t number_instances
Definition config.hpp:50
Recycles not only allocations but also the contents of a buffer.
Definition buffer_management.hpp:840
const std::optional< size_t > dealloc_hint
Definition buffer_management.hpp:844
aggressive_recycle_allocator(size_t hint) noexcept
Definition buffer_management.hpp:850
void destroy(T *p)
Definition buffer_management.hpp:890
void deallocate(T *p, std::size_t n)
Definition buffer_management.hpp:860
void construct(T *p, Args... args) noexcept
Definition buffer_management.hpp:887
aggressive_recycle_allocator(aggressive_recycle_allocator< T, Host_Allocator > const &) noexcept
Definition buffer_management.hpp:852
T * allocate(std::size_t n)
Definition buffer_management.hpp:855
const std::optional< size_t > device_id
Definition buffer_management.hpp:845
Host_Allocator underlying_allocator_type
Definition buffer_management.hpp:842
aggressive_recycle_allocator() noexcept
Definition buffer_management.hpp:848
Definition buffer_management.hpp:770
T * allocate(std::size_t n)
Definition buffer_management.hpp:785
T value_type
Definition buffer_management.hpp:771
recycle_allocator(size_t hint) noexcept
Definition buffer_management.hpp:780
void destroy(T *p)
Definition buffer_management.hpp:817
recycle_allocator() noexcept
Definition buffer_management.hpp:778
Host_Allocator underlying_allocator_type
Definition buffer_management.hpp:772
void construct(T *p, Args... args) noexcept
Definition buffer_management.hpp:814
void deallocate(T *p, std::size_t n)
Definition buffer_management.hpp:789
const std::optional< size_t > device_id
Definition buffer_management.hpp:775
recycle_allocator(recycle_allocator< T, Host_Allocator > const &other) noexcept
Definition buffer_management.hpp:782
const std::optional< size_t > dealloc_hint
Definition buffer_management.hpp:774
Default device selector - No MultiGPU support.
Definition buffer_management.hpp:55
void operator()(const size_t device_id)
Definition buffer_management.hpp:56