CPPuddle
aggregation_executor_pools.hpp
// Copyright (c) 2022-2024 Gregor Daiß
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

#ifndef AGGREGATION_EXECUTOR_POOL_HPP
#define AGGREGATION_EXECUTOR_POOL_HPP
namespace cppuddle {
namespace kernel_aggregation {
namespace detail {

template <const char *kernelname, class Interface, class Pool>
class aggregation_pool {
public:
  /// interface
  template <typename... Ts>
  static void init(size_t number_of_executors, size_t slices_per_executor,
                   aggregated_executor_modes mode, size_t num_devices = 1) {
    if (is_initialized) {
      throw std::runtime_error(
          std::string("Trying to initialize cppuddle aggregation pool twice") +
          " Agg pool name: " + std::string(kernelname));
    }
    if (num_devices > cppuddle::max_number_gpus) {
      throw std::runtime_error(
          std::string(
              "Trying to initialize aggregation with more devices than the "
              "maximum number of GPUs given at compiletime") +
          " Agg pool name: " + std::string(kernelname));
    }
    number_devices = num_devices;
    for (size_t gpu_id = 0; gpu_id < number_devices; gpu_id++) {
      std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
      assert(instance()[gpu_id].aggregation_executor_pool.empty());
      for (size_t i = 0; i < number_of_executors; i++) {
        instance()[gpu_id].aggregation_executor_pool.emplace_back(
            slices_per_executor, mode, gpu_id);
      }
      instance()[gpu_id].slices_per_executor = slices_per_executor;
      instance()[gpu_id].mode = mode;
    }
    is_initialized = true;
  }
48 
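  // Slice-request strategy (summary of the code below): first try the
  // executor at the current round-robin position; if it has no free slice,
  // scan the whole pool once; if everything is saturated and growing_pool
  // is set, append a fresh aggregated_executor and hand out its slice.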
  /// Will always return a valid executor slice
  static decltype(auto) request_executor_slice(void) {
    if (!is_initialized) {
      throw std::runtime_error(
          std::string("ERROR: Trying to use cppuddle aggregation pool "
                      "without first calling init!\n") +
          " Agg pool name: " + std::string(kernelname));
    }
    const size_t gpu_id = cppuddle::get_device_id(number_devices);
    std::lock_guard<aggregation_mutex_t> guard(instance()[gpu_id].pool_mutex);
    assert(!instance()[gpu_id].aggregation_executor_pool.empty());
    std::optional<hpx::lcos::future<
        typename aggregated_executor<Interface>::executor_slice>>
        ret;
    size_t local_id = (instance()[gpu_id].current_interface) %
                      instance()[gpu_id].aggregation_executor_pool.size();
    ret = instance()[gpu_id]
              .aggregation_executor_pool[local_id]
              .request_executor_slice();
    // Expected case: current aggregation executor is free
    if (ret.has_value()) {
      return ret;
    }
    // current interface is busy -> find a free one
    size_t abort_counter = 0;
    const size_t abort_number = instance()[gpu_id].aggregation_executor_pool.size() + 1;
    do {
      local_id = (++(instance()[gpu_id].current_interface)) % // increment interface
                 instance()[gpu_id].aggregation_executor_pool.size();
      ret = instance()[gpu_id]
                .aggregation_executor_pool[local_id]
                .request_executor_slice();
      if (ret.has_value()) {
        return ret;
      }
      abort_counter++;
    } while (abort_counter <= abort_number);
    // Everything's busy -> create a new aggregation executor (growing pool) OR
    // return an empty optional
    if (instance()[gpu_id].growing_pool) {
      instance()[gpu_id].aggregation_executor_pool.emplace_back(
          instance()[gpu_id].slices_per_executor, instance()[gpu_id].mode, gpu_id);
      instance()[gpu_id].current_interface =
          instance()[gpu_id].aggregation_executor_pool.size() - 1;
      assert(instance()[gpu_id].aggregation_executor_pool.size() < 20480);
      ret = instance()[gpu_id]
                .aggregation_executor_pool[instance()[gpu_id].current_interface]
                .request_executor_slice();
      assert(ret.has_value()); // fresh executor -- should always have slices
                               // available
    }
    return ret;
  }

private:
  std::deque<aggregated_executor<Interface>> aggregation_executor_pool;
  std::atomic<size_t> current_interface{0};
  size_t slices_per_executor;
  aggregated_executor_modes mode;
  bool growing_pool{true};

private:
  /// Protects the executor pool of this device from concurrent access
  aggregation_mutex_t pool_mutex;
  /// One aggregation_pool instance per GPU
  static std::unique_ptr<aggregation_pool[]>& instance(void) {
    static std::unique_ptr<aggregation_pool[]> pool_instances{
        new aggregation_pool[cppuddle::max_number_gpus]};
    return pool_instances;
  }
  static inline size_t number_devices = 1;
  static inline bool is_initialized = false;
  aggregation_pool() = default;

public:
  ~aggregation_pool() = default;
  // Bunch of constructors we don't need
  aggregation_pool(aggregation_pool const &other) = delete;
  aggregation_pool &operator=(aggregation_pool const &other) = delete;
  aggregation_pool(aggregation_pool &&other) = delete;
  aggregation_pool &operator=(aggregation_pool &&other) = delete;
};
129 
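// Convenience helper: initializes a given aggregation_pool instantiation
// (aggregation_region_t) with a fixed number of executors on all GPUs known
// at compile time. With only one slice per executor, aggregation is
// effectively disabled, hence STRICT mode is selected in that case.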
template <typename aggregation_region_t>
void init_area_aggregation_pool(
    const size_t max_slices) {
  constexpr size_t number_aggregation_executors = 128;
  constexpr size_t number_gpus = cppuddle::max_number_gpus;
  aggregated_executor_modes executor_mode = aggregated_executor_modes::EAGER;
  if (max_slices == 1) {
    executor_mode = aggregated_executor_modes::STRICT;
  }
  aggregation_region_t::init(
      number_aggregation_executors, max_slices, executor_mode, number_gpus);
}

} // namespace detail
} // namespace kernel_aggregation
} // namespace cppuddle

#endif
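
For orientation, a minimal usage sketch of this pool follows. It is not part
of the header above: the pool name string, the placeholder Interface/Pool
types (device_executor_t, device_pool_t) and the launch code are illustrative
assumptions; only aggregation_pool::init and
aggregation_pool::request_executor_slice are taken from the source.

// Illustrative sketch, not verbatim CPPuddle code.
// device_executor_t / device_pool_t are hypothetical placeholders for the
// Interface and Pool template parameters.
static const char my_kernel_pool_name[] = "my_kernel_agg_pool";
using my_agg_pool = cppuddle::kernel_aggregation::detail::aggregation_pool<
    my_kernel_pool_name, device_executor_t, device_pool_t>;

void example(void) {
  // Once at startup: 8 aggregated executors with up to 4 slices each, 1 GPU.
  // EAGER mode is assumed here to launch aggregated work as soon as possible.
  my_agg_pool::init(
      8, 4,
      cppuddle::kernel_aggregation::detail::aggregated_executor_modes::EAGER,
      1);

  // Per kernel launch: request a slice future; with a growing pool this
  // should always yield a value (see request_executor_slice above).
  auto slice_future = my_agg_pool::request_executor_slice();
  if (slice_future.has_value()) {
    auto slice = slice_future.value().get(); // wait for the slice
    // ... launch the kernel and allocate buffers through `slice` so that
    // work from up to 4 callers is aggregated into one device launch ...
  }
}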