70#if defined(CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING)
74"Warning: Building without buffer recycling! Use only for performance testing! \
75For better performance configure CPPuddle with CPPUDDLE_WITH_BUFFER_RECYCLING=ON!"
/// Pass-through `get` for builds with CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING:
/// the request goes straight to a default-constructed Host_Allocator.
/// @param number_elements          element count passed to Host_Allocator::allocate
/// @param manage_content_lifetime  part of the common signature; not read by the visible statements
/// @param location_hint            part of the common signature; not read by the visible statements
/// @param device_id                part of the common signature; not read by the visible statements
/// @return whatever Host_Allocator{}.allocate(number_elements) returns
77 template <
typename T,
typename Host_Allocator>
78 static T *
get(
size_t number_elements,
bool manage_content_lifetime =
false,
79 std::optional<size_t> location_hint = std::nullopt,
80 std::optional<size_t> device_id = std::nullopt) {
82 return Host_Allocator{}.allocate(number_elements);
/// Pass-through `mark_unused` for builds with CPPUDDLE_DEACTIVATE_BUFFER_RECYCLING:
/// the buffer is handed back to a default-constructed Host_Allocator right away.
/// @param p                pointer passed to Host_Allocator::deallocate
/// @param number_elements  element count passed to Host_Allocator::deallocate
/// @param location_hint    part of the common signature; not read by the visible statements
/// @param device_id        part of the common signature; not read by the visible statements
85 template <
typename T,
typename Host_Allocator>
86 static void mark_unused(T *p,
size_t number_elements,
87 std::optional<size_t> location_hint = std::nullopt,
88 std::optional<size_t> device_id = std::nullopt) {
89 return Host_Allocator{}.deallocate(p, number_elements);
/// Recycling-enabled `get`: forwards all arguments unchanged to
/// buffer_manager<T, Host_Allocator>::get and returns its result.
/// Any std::exception escaping that call is reported on std::cerr together
/// with its what() text; the handler then announces a rethrow (the rethrow
/// statement itself is not among the visible lines).
/// @param number_elements          number of elements requested
/// @param manage_content_lifetime  forwarded as-is (defaults to false)
/// @param location_hint            forwarded as-is
/// @param device_id                forwarded as-is
96 template <
typename T,
typename Host_Allocator>
97 static T *
get(
size_t number_elements,
bool manage_content_lifetime =
false,
98 std::optional<size_t> location_hint = std::nullopt,
99 std::optional<size_t> device_id = std::nullopt) {
101 return buffer_manager<T, Host_Allocator>::get(
102 number_elements, manage_content_lifetime, location_hint, device_id);
103 }
catch (
const std::exception &exc) {
104 std::cerr <<
"ERROR: Encountered unhandled exception in cppuddle get: " << exc.what() << std::endl;
105 std::cerr <<
"Rethrowing exception... " << std::endl;;
/// Recycling-enabled `mark_unused` (its signature line with `p` and
/// `number_elements` is not among the visible lines): forwards to
/// buffer_manager<T, Host_Allocator>::mark_unused.
/// Any std::exception escaping that call is reported on std::cerr together
/// with its what() text; the handler then announces a rethrow (the rethrow
/// statement itself is not among the visible lines).
113 template <
typename T,
typename Host_Allocator>
115 std::optional<size_t> location_hint = std::nullopt,
116 std::optional<size_t> device_id = std::nullopt) {
118 return buffer_manager<T, Host_Allocator>::mark_unused(p, number_elements,
119 location_hint, device_id);
120 }
catch (
const std::exception &exc) {
121 std::cerr <<
"ERROR: Encountered unhandled exception in cppuddle mark_unused: " << exc.what() << std::endl;
122 std::cerr <<
"Rethrowing exception... " << std::endl;;
/// Templated entry point for HPX counter registration (its signature line is
/// not among the visible lines).
/// With CPPUDDLE_HAVE_COUNTERS defined it delegates to
/// buffer_manager<T, Host_Allocator>::register_counters_with_hpx(); the other
/// visible statement prints a warning that the request is ignored.
128 template <
typename T,
typename Host_Allocator>
130#ifdef CPPUDDLE_HAVE_COUNTERS
131 buffer_manager<T, Host_Allocator>::register_counters_with_hpx();
133 std::cerr <<
"Warning: Trying to register allocator performance counters "
134 "with HPX but CPPuddle was built "
135 "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!"
// Body fragment of a routine whose signature is not among the visible lines.
// callback_protection_mut is held while total_cleanup_callbacks is traversed;
// the loop body is not visible here.
142 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
143 for (
const auto &clean_function :
144 instance().total_cleanup_callbacks) {
// Body fragment of a routine whose signature is not among the visible lines.
// callback_protection_mut is held while partial_cleanup_callbacks is
// traversed; the loop body is not visible here.
150 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
151 for (
const auto &clean_function :
152 instance().partial_cleanup_callbacks) {
// Body fragment of a routine whose signature is not among the visible lines.
// callback_protection_mut is held while finalize_callbacks is traversed; the
// loop body is not visible here.
158 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
159 for (
const auto &finalize_function :
160 instance().finalize_callbacks) {
// Body fragment of a routine whose signature is not among the visible lines.
// With CPPUDDLE_HAVE_COUNTERS defined, callback_protection_mut is held while
// print_callbacks is traversed (loop body not visible); the other visible
// statement prints a warning that the request is ignored.
166#ifdef CPPUDDLE_HAVE_COUNTERS
167 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
168 for (
const auto &print_function :
169 instance().print_callbacks) {
173 std::cerr <<
"Warning: Trying to print allocator performance counters but CPPuddle was built "
174 "without CPPUDDLE_WITH_COUNTERS -- operation will be ignored!"
// Callback registries. Each list is appended to by the matching add_*_callback
// function and traversed under callback_protection_mut.
// Callbacks added through add_print_callback.
188 std::list<std::function<void()>> print_callbacks;
// Callbacks added through add_finalize_callback.
191 std::list<std::function<void()>> finalize_callbacks;
// Callbacks added through add_total_cleanup_callback.
194 std::list<std::function<void()>> total_cleanup_callbacks;
// Callbacks added through add_partial_cleanup_callback.
197 std::list<std::function<void()>> partial_cleanup_callbacks;
// Defaulted default constructor.
200 buffer_interface() =
default;
// Guards every visible read and write of the four callback lists above.
202 mutex_t callback_protection_mut;
204 static void add_total_cleanup_callback(
const std::function<
void()> &func) {
205 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
206 instance().total_cleanup_callbacks.push_back(func);
210 static void add_partial_cleanup_callback(
const std::function<
void()> &func) {
211 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
212 instance().partial_cleanup_callbacks.push_back(func);
216 static void add_finalize_callback(
const std::function<
void()> &func) {
217 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
218 instance().finalize_callbacks.push_back(func);
222 static void add_print_callback(
const std::function<
void()> &func) {
223 std::lock_guard<mutex_t> guard(instance().callback_protection_mut);
224 instance().print_callbacks.push_back(func);
/// Per-(T, Host_Allocator) buffer bookkeeping: tracks buffers that are handed
/// out (buffer_map) and buffers parked for reuse (unused_buffer_list).
233 template <
typename T,
typename Host_Allocator>
class buffer_manager {
// Bookkeeping record for one buffer, as used by the visible code:
//   <0> pointer to the storage (passed to Host_Allocator::deallocate)
//   <1> element count (compared against the requested size when recycling)
//   <2> set to 1 when a buffer is created; not read in the visible code
//   <3> true while the elements are constructed and must be destroyed
//       before the storage is released
238 using buffer_entry_type = std::tuple<T *, size_t, size_t, bool>;
/// Calls clean_all_buffers() on manager instances, holding each instance's
/// mutex during its call. The loop that provides `i` is not among the visible
/// lines. Asserts that the instance array exists and that the manager has not
/// been finalized.
243 static void clean() {
244 assert(instance() && !is_finalized);
246 std::lock_guard<mutex_t> guard(instance()[i].mut);
247 instance()[i].clean_all_buffers();
/// Calls print_counters() on manager instances, holding each instance's mutex
/// during its call. The loop that provides `i` is not among the visible lines.
/// Asserts that the instance array exists and that the manager has not been
/// finalized.
250 static void print_performance_counters() {
251 assert(instance() && !is_finalized);
253 std::lock_guard<mutex_t> guard(instance()[i].mut);
254 instance()[i].print_counters();
/// Finalization hook: the visible statements call clean_all_buffers() on
/// manager instances under each instance's mutex. The loop that provides `i`
/// and whatever follows the cleanup are not among the visible lines.
/// NOTE(review): presumably is_finalized is set afterwards — confirm.
257 static void finalize() {
258 assert(instance() && !is_finalized);
261 std::lock_guard<mutex_t> guard(instance()[i].mut);
262 instance()[i].clean_all_buffers();
/// Releases only the parked buffers (unused_buffer_list) of manager instances;
/// buffer_map is not touched by the visible statements.
/// Under the instance's mutex, each parked entry has its elements destroyed
/// when its content-lifetime flag (std::get<3>) is set, then its storage is
/// returned through Host_Allocator::deallocate with the recorded element count.
/// Finally the list is cleared. The loop that provides `i` is not among the
/// visible lines.
267 static void clean_unused_buffers_only() {
268 assert(instance() && !is_finalized);
270 std::lock_guard<mutex_t> guard(instance()[i].mut);
271 for (
auto &buffer_tuple : instance()[i].unused_buffer_list) {
272 Host_Allocator alloc;
273 if (std::get<3>(buffer_tuple)) {
274 std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
276 alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
278 instance()[i].unused_buffer_list.clear();
281#if defined(CPPUDDLE_HAVE_COUNTERS) && defined(CPPUDDLE_HAVE_HPX)
/// Counter accessor registered with HPX in register_counters_with_hpx.
/// Returns sum_number_recycling; the visible assignment zeroes it first.
/// NOTE(review): the zeroing is presumably guarded by `reset` on a line not
/// shown here — confirm.
282 static size_t get_sum_number_recycling(
bool reset) {
284 sum_number_recycling = 0;
285 return sum_number_recycling;
/// Counter accessor registered with HPX in register_counters_with_hpx.
/// Returns sum_number_allocation; the visible assignment zeroes it first.
/// NOTE(review): the zeroing is presumably guarded by `reset` on a line not
/// shown here — confirm.
287 static size_t get_sum_number_allocation(
bool reset) {
289 sum_number_allocation = 0;
290 return sum_number_allocation;
/// Counter accessor registered with HPX in register_counters_with_hpx.
/// Returns sum_number_creation; the visible assignment zeroes it first.
/// NOTE(review): the zeroing is presumably guarded by `reset` on a line not
/// shown here — confirm.
292 static size_t get_sum_number_creation(
bool reset) {
294 sum_number_creation = 0;
295 return sum_number_creation;
/// Counter accessor registered with HPX in register_counters_with_hpx.
/// Returns sum_number_deallocation; the visible assignment zeroes it first.
/// NOTE(review): the zeroing is presumably guarded by `reset` on a line not
/// shown here — confirm.
297 static size_t get_sum_number_deallocation(
bool reset) {
299 sum_number_deallocation = 0;
300 return sum_number_deallocation;
/// Counter accessor registered with HPX in register_counters_with_hpx.
/// Returns sum_number_wrong_hints; the visible assignment zeroes it first.
/// NOTE(review): the zeroing is presumably guarded by `reset` on a line not
/// shown here — confirm.
302 static size_t get_sum_number_wrong_hints(
bool reset) {
304 sum_number_wrong_hints = 0;
305 return sum_number_wrong_hints;
307 static size_t get_sum_number_wrong_device_hints(
bool reset) {
309 sum_number_wrong_hints = 0;
310 return sum_number_wrong_device_hints;
/// Counter accessor registered with HPX in register_counters_with_hpx.
/// Returns sum_number_bad_allocs; the visible assignment zeroes it first.
/// NOTE(review): the zeroing is presumably guarded by `reset` on a line not
/// shown here — confirm.
312 static size_t get_sum_number_bad_allocs(
bool reset) {
314 sum_number_bad_allocs = 0;
315 return sum_number_bad_allocs;
/// Installs one HPX counter type per global sum_* statistic of this manager.
/// Counter paths have the form "/cppuddle/allocators/<alloc_name>/<counter>/",
/// where alloc_name is the demangled Host_Allocator type name, an underscore,
/// and the demangled T type name. Each counter is backed by the matching
/// get_sum_number_* accessor.
318 static void register_counters_with_hpx(
void) {
// Unique name per (allocator, element type) combination.
319 std::string alloc_name =
320 boost::core::demangle(
typeid(Host_Allocator).name()) +
321 std::string(
"_") + boost::core::demangle(
typeid(T).name());
322 hpx::performance_counters::install_counter_type(
323 std::string(
"/cppuddle/allocators/") + alloc_name + std::string(
"/number_recycling/"),
324 &get_sum_number_recycling,
325 "Number of allocations using a recycled buffer with this "
327 hpx::performance_counters::install_counter_type(
328 std::string(
"/cppuddle/allocators/") + alloc_name + std::string(
"/number_allocations/"),
329 &get_sum_number_allocation,
330 "Number of allocations with this allocator");
331 hpx::performance_counters::install_counter_type(
332 std::string(
"/cppuddle/allocators/") + alloc_name + std::string(
"/number_creations/"),
333 &get_sum_number_creation,
334 "Number of allocations not using a recycled buffer with this "
336 hpx::performance_counters::install_counter_type(
337 std::string(
"/cppuddle/allocators/") + alloc_name + std::string(
"/number_deallocations/"),
338 &get_sum_number_deallocation,
339 "Number of deallocations yielding buffers to be recycled with this "
341 hpx::performance_counters::install_counter_type(
342 std::string(
"/cppuddle/allocators/") + alloc_name + std::string(
"/number_wrong_hints/"),
343 &get_sum_number_wrong_hints,
344 "Number of wrong hints supplied to the dealloc method with this allocator");
345 hpx::performance_counters::install_counter_type(
346 std::string(
"/cppuddle/allocators/") + alloc_name + std::string(
"/number_wrong_device_hints/"),
347 &get_sum_number_wrong_device_hints,
348 "Number of wrong device hints supplied to the dealloc method with this allocator");
349 hpx::performance_counters::install_counter_type(
350 std::string(
"/cppuddle/allocators/") + alloc_name + std::string(
"/number_bad_allocs/"),
351 &get_sum_number_bad_allocs,
352 "Number of wrong bad allocs which triggered a cleanup of unused buffers");
/// Hands out a buffer of `number_of_elements` elements of T, reusing a parked
/// buffer of exactly that size when one exists.
///
/// Visible flow:
///  1. init_callbacks_once() makes sure this manager's hooks are registered.
///  2. The hints are copied into location_id / device_id (both start at 0).
///     The three std::runtime_error throws report use after finalization, an
///     invalid location id and an invalid device id; their conditions are on
///     lines not shown here.
///  3. With instance()[location_id].mut held, unused_buffer_list is scanned
///     for an entry whose element count (std::get<1>) equals
///     number_of_elements. A match is erased from the list, its
///     content-lifetime flag (std::get<3>) is brought in line with
///     manage_content_lifetime by value-constructing or destroying the
///     elements, and the entry is inserted into buffer_map and returned.
///  4. Otherwise a new buffer is allocated through Host_Allocator, recorded in
///     buffer_map and value-constructed when manage_content_lifetime is set.
///  5. The std::bad_alloc handler reports a cleanup of unused buffers and then
///     repeats the allocation once.
///
/// @param number_of_elements       requested element count
/// @param manage_content_lifetime  whether the elements are constructed here
/// @param location_hint            optional bucket id to use
/// @param gpu_device_id            optional device id to use
/// @return pointer to the recycled or newly allocated buffer
///
/// NOTE(review): `guard` appears to still be in scope inside the bad_alloc
/// handler. If the cleanup step (not visible here) ends up in
/// clean_unused_buffers_only, which locks every instance's mutex, this would
/// re-lock a mutex already held by this thread — confirm.
357 static T *get(
size_t number_of_elements,
bool manage_content_lifetime,
358 std::optional<size_t> location_hint = std::nullopt,
359 std::optional<size_t> gpu_device_id = std::nullopt) {
360 init_callbacks_once();
362 throw std::runtime_error(
"Tried allocation after finalization");
364 assert(instance() && !is_finalized);
366 size_t location_id = 0;
368 location_id = *location_hint;
371 throw std::runtime_error(
"Tried to create buffer with invalid location_id [get]");
373 size_t device_id = 0;
375 device_id = *gpu_device_id;
378 throw std::runtime_error(
"Tried to create buffer with invalid device id [get]! "
379 "Is multigpu support enabled with the correct number "
// Everything below touches the bucket's containers and counters.
384 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
387#ifdef CPPUDDLE_HAVE_COUNTERS
388 instance()[location_id].number_allocation++;
389 sum_number_allocation++;
// Recycling path: look for a parked buffer of exactly the requested size.
392 for (
auto iter = instance()[location_id].unused_buffer_list.begin();
393 iter != instance()[location_id].unused_buffer_list.end(); iter++) {
395 if (std::get<1>(tuple) == number_of_elements) {
396 instance()[location_id].unused_buffer_list.erase(iter);
// Construct or destroy the elements so the flag matches what the caller asked for.
400 if (manage_content_lifetime && !std::get<3>(tuple)) {
401 std::uninitialized_value_construct_n(std::get<0>(tuple),
403 std::get<3>(tuple) =
true;
404 }
else if (!manage_content_lifetime && std::get<3>(tuple)) {
405 std::destroy_n(std::get<0>(tuple), std::get<1>(tuple));
406 std::get<3>(tuple) =
false;
408 instance()[location_id].buffer_map.insert({std::get<0>(tuple), tuple});
409#ifdef CPPUDDLE_HAVE_COUNTERS
410 instance()[location_id].number_recycling++;
411 sum_number_recycling++;
413 return std::get<0>(tuple);
// Creation path: nothing to recycle, allocate a fresh buffer.
420 T, Host_Allocator>{}(device_id);
421 Host_Allocator alloc;
422 T *buffer = alloc.allocate(number_of_elements);
423 instance()[location_id].buffer_map.insert(
424 {buffer, std::make_tuple(buffer, number_of_elements, 1,
425 manage_content_lifetime)});
426#ifdef CPPUDDLE_HAVE_COUNTERS
427 instance()[location_id].number_creation++;
428 sum_number_creation++;
430 if (manage_content_lifetime) {
431 std::uninitialized_value_construct_n(buffer, number_of_elements);
434 }
catch (std::bad_alloc &e) {
// Out of memory: report, clean up, and retry the allocation once.
437 <<
"Not enough memory left. Cleaning up unused buffers now..."
440 std::cerr <<
"Buffers cleaned! Try allocation again..." << std::endl;
444 Host_Allocator alloc;
446 T, Host_Allocator>{}(device_id);
447 T *buffer = alloc.allocate(number_of_elements);
448 instance()[location_id].buffer_map.insert(
449 {buffer, std::make_tuple(buffer, number_of_elements, 1,
450 manage_content_lifetime)});
451#ifdef CPPUDDLE_HAVE_COUNTERS
452 instance()[location_id].number_creation++;
453 sum_number_creation++;
454 instance()[location_id].number_bad_alloc++;
455 sum_number_bad_allocs++;
457 std::cerr <<
"Second attempt allocation successful!" << std::endl;
458 if (manage_content_lifetime) {
459 std::uninitialized_value_construct_n(buffer, number_of_elements);
465 static void mark_unused(T *memory_location,
size_t number_of_elements,
466 std::optional<size_t> location_hint = std::nullopt,
467 std::optional<size_t> device_hint = std::nullopt) {
470 assert(instance() && !is_finalized);
472 size_t location_id = 0;
474 location_id = *location_hint;
476 throw std::runtime_error(
477 "Buffer recylcer received invalid location hint [mark_unused]");
480 size_t device_id = 0;
482 device_id = *device_hint;
484 throw std::runtime_error(
485 "Buffer recylcer received invalid devce hint [mark_unused]");
492 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
493 if (instance()[location_id].buffer_map.find(memory_location) !=
494 instance()[location_id].buffer_map.end()) {
495#ifdef CPPUDDLE_HAVE_COUNTERS
496 instance()[location_id].number_deallocation++;
497 sum_number_deallocation++;
499 auto it = instance()[location_id].buffer_map.find(memory_location);
500 assert(it != instance()[location_id].buffer_map.end());
501 auto &tuple = it->second;
503 assert(std::get<1>(tuple) == number_of_elements);
505 instance()[location_id].unused_buffer_list.push_front(tuple);
506 instance()[location_id].buffer_map.erase(memory_location);
510#ifdef CPPUDDLE_HAVE_COUNTERS
511 instance()[location_id].number_wrong_hints++;
512 sum_number_wrong_hints++;
524 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
525 if (instance()[location_id].buffer_map.find(memory_location) !=
526 instance()[location_id].buffer_map.end()) {
527#ifdef CPPUDDLE_HAVE_COUNTERS
528 instance()[location_id].number_deallocation++;
529 sum_number_deallocation++;
531 auto it = instance()[location_id].buffer_map.find(memory_location);
532 assert(it != instance()[location_id].buffer_map.end());
533 auto &tuple = it->second;
535 assert(std::get<1>(tuple) == number_of_elements);
537 instance()[location_id].unused_buffer_list.push_front(tuple);
538 instance()[location_id].buffer_map.erase(memory_location);
543#ifdef CPPUDDLE_HAVE_COUNTERS
545 sum_number_wrong_device_hints++;
552 if (local_device_id == device_id)
557 size_t location_id = location_hint.value() + local_device_id *
number_instances;
558 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
559 if (instance()[location_id].buffer_map.find(memory_location) !=
560 instance()[location_id].buffer_map.end()) {
561#ifdef CPPUDDLE_HAVE_COUNTERS
562 instance()[location_id].number_deallocation++;
563 sum_number_deallocation++;
565 auto it = instance()[location_id].buffer_map.find(memory_location);
566 assert(it != instance()[location_id].buffer_map.end());
567 auto &tuple = it->second;
569 assert(std::get<1>(tuple) == number_of_elements);
571 instance()[location_id].unused_buffer_list.push_front(tuple);
572 instance()[location_id].buffer_map.erase(memory_location);
580 if (*location_hint + local_device_id *
max_number_gpus == location_id) {
584 std::lock_guard<mutex_t> guard(instance()[location_id].mut);
585 if (instance()[location_id].buffer_map.find(memory_location) !=
586 instance()[location_id].buffer_map.end()) {
587#ifdef CPPUDDLE_HAVE_COUNTERS
588 instance()[location_id].number_deallocation++;
589 sum_number_deallocation++;
591 auto it = instance()[location_id].buffer_map.find(memory_location);
592 assert(it != instance()[location_id].buffer_map.end());
593 auto &tuple = it->second;
595 assert(std::get<1>(tuple) == number_of_elements);
597 instance()[location_id].unused_buffer_list.push_front(tuple);
598 instance()[location_id].buffer_map.erase(memory_location);
616 <<
"Warning! Tried to delete non-existing buffer within CPPuddle!"
618 std::cerr <<
"Did you forget to call recycler::finalize?" << std::endl;
// Buffers currently handed out, keyed by their storage pointer. get() inserts
// entries and mark_unused() removes them.
623 std::unordered_map<T *, buffer_entry_type> buffer_map{};
// Buffers parked for reuse. mark_unused() pushes to the front; get() scans
// from the beginning.
625 std::list<buffer_entry_type> unused_buffer_list{};
628#ifdef CPPUDDLE_HAVE_COUNTERS
// Per-instance statistics. The visible updates happen while the instance's
// mutex is held.
630 size_t number_allocation{0}, number_deallocation{0}, number_wrong_hints{0},
631 number_recycling{0}, number_creation{0}, number_bad_alloc{0};
// Totals across all instances of this (T, Host_Allocator) manager. They are
// read by the get_sum_number_* accessors.
633 static inline std::atomic<size_t> sum_number_allocation{0},
634 sum_number_deallocation{0}, sum_number_wrong_hints{0},
635 sum_number_wrong_device_hints{0}, sum_number_recycling{0},
636 sum_number_creation{0}, sum_number_bad_allocs{0};
// Defaulted default constructor.
640 buffer_manager() =
default;
// Copy assignment is declared defaulted (its return-type line is not among the
// visible lines).
// NOTE(review): the copy constructor further down is deleted — confirm the
// defaulted copy assignment is intentional.
642 operator=(buffer_manager<T, Host_Allocator>
const &other) =
default;
// Move assignment is deleted.
644 operator=(buffer_manager<T, Host_Allocator> &&other) =
delete;
/// Returns a reference to the function-local static unique_ptr that owns the
/// array of manager instances. The initializer of `instances` is not among the
/// visible lines.
645 static std::unique_ptr<buffer_manager[]>& instance(
void) {
646 static std::unique_ptr<buffer_manager[]> instances{
/// Registers this manager's hooks with buffer_interface exactly once, using
/// hpx::call_once when both CPPUDDLE_HAVE_HPX and CPPUDDLE_HAVE_HPX_MUTEX are
/// defined; the std::call_once variant is the other visible branch.
/// The once-only body sets is_finalized to false and registers clean,
/// clean_unused_buffers_only, a finalize callback (its argument is not among
/// the visible lines) and, with counters enabled, print_performance_counters.
650 static void init_callbacks_once(
void) {
652#if defined(CPPUDDLE_HAVE_HPX) && defined(CPPUDDLE_HAVE_HPX_MUTEX)
653 static hpx::once_flag flag;
654 hpx::call_once(flag, []() {
656 static std::once_flag flag;
657 std::call_once(flag, []() {
659 is_finalized =
false;
660 buffer_interface::add_total_cleanup_callback(clean);
661 buffer_interface::add_partial_cleanup_callback(
662 clean_unused_buffers_only);
663 buffer_interface::add_finalize_callback(
665#ifdef CPPUDDLE_HAVE_COUNTERS
666 buffer_interface::add_print_callback(
667 print_performance_counters);
// Checked in the asserts of the static entry points; set to false inside
// init_callbacks_once. Declared without an explicit initializer.
671 static inline std::atomic<bool> is_finalized;
673#ifdef CPPUDDLE_HAVE_COUNTERS
/// Writes this instance's statistics to std::cout.
/// A check for `number_allocation == 0` comes first; its consequence is on a
/// line not shown here (NOTE(review): presumably an early return, which would
/// also protect the division below — confirm).
/// number_cleaned is the sum of parked and still-handed-out buffers.
/// The recycle rate is number_recycling divided by number_allocation, computed
/// in float; the rest of that expression is not among the visible lines.
674 void print_counters(
void) {
675 if (number_allocation == 0)
678 size_t number_cleaned = unused_buffer_list.size() + buffer_map.size();
679 std::cout <<
"\nBuffer manager destructor for (Alloc: "
680 << boost::core::demangle(
typeid(Host_Allocator).name()) <<
", Type: "
681 << boost::core::demangle(
typeid(T).name())
683 <<
"--------------------------------------------------------------------"
685 <<
"--> Number of bad_allocs that triggered garbage "
687 << number_bad_alloc << std::endl
688 <<
"--> Number of buffers that got requested from this "
690 << number_allocation << std::endl
691 <<
"--> Number of times an unused buffer got recycled for a "
693 << number_recycling << std::endl
694 <<
"--> Number of times a new buffer had to be created for a "
696 << number_creation << std::endl
697 <<
"--> Number cleaned up buffers: "
699 << number_cleaned << std::endl
700 <<
"--> Number wrong deallocation hints: "
702 << number_wrong_hints << std::endl
703 <<
"--> Number of buffers that were marked as used upon "
705 << buffer_map.size() << std::endl
706 <<
"==> Recycle rate: "
708 <<
static_cast<float>(number_recycling) / number_allocation *
/// Releases every buffer this instance tracks: both the parked ones
/// (unused_buffer_list) and the ones still handed out (buffer_map).
/// For each entry the elements are destroyed first when the content-lifetime
/// flag (std::get<3>) is set; the storage is then returned through
/// Host_Allocator::deallocate with the recorded element count.
/// Afterwards the unused list is cleared and, with counters enabled, the
/// per-instance counters listed below are zeroed.
/// NOTE(review): clearing buffer_map and resetting number_creation /
/// number_deallocation are not among the visible lines — confirm they happen.
714 void clean_all_buffers(
void) {
715#ifdef CPPUDDLE_HAVE_COUNTERS
// Check for a completely idle instance; the consequence is on a line not shown here.
716 if (number_allocation == 0 && number_recycling == 0 &&
717 number_bad_alloc == 0 && number_creation == 0 &&
718 unused_buffer_list.empty() && buffer_map.empty()) {
// Parked buffers.
722 for (
auto &buffer_tuple : unused_buffer_list) {
723 Host_Allocator alloc;
724 if (std::get<3>(buffer_tuple)) {
725 std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
727 alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
// Buffers that are still handed out.
729 for (
auto &map_tuple : buffer_map) {
730 auto buffer_tuple = map_tuple.second;
731 Host_Allocator alloc;
732 if (std::get<3>(buffer_tuple)) {
733 std::destroy_n(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
735 alloc.deallocate(std::get<0>(buffer_tuple), std::get<1>(buffer_tuple));
737 unused_buffer_list.clear();
739#ifdef CPPUDDLE_HAVE_COUNTERS
740 number_allocation = 0;
741 number_recycling = 0;
742 number_bad_alloc = 0;
744 number_wrong_hints = 0;
// Tail of two deleted declarations taking a const reference and an rvalue
// reference to buffer_manager; their leading lines are not among the visible
// lines (presumably the copy and move constructors — confirm).
756 buffer_manager<T, Host_Allocator>
const &other) =
delete;
758 buffer_manager<T, Host_Allocator> &&other) =
delete;