15#ifndef KOKKOS_AGGREGATION_UTIL_HPP
16#define KOKKOS_AGGREGATION_UTIL_HPP
17#include <hpx/futures/future.hpp>
19#include <hpx/kokkos/executors.hpp>
20#include <Kokkos_Core.hpp>
21#include <hpx/kokkos.hpp>
26#include <cuda/std/tuple>
27#if defined(HPX_CUDA_VERSION) && (HPX_CUDA_VERSION < 1202)
35 template<
size_t _Ip,
class... _Tp>
36 struct tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>>
37 : _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>> {};
38 template <
class... _Tp>
39 struct tuple_size<_CUDA_VSTD::tuple<_Tp...>>
40 : _CUDA_VSTD::tuple_size<_CUDA_VSTD::tuple<_Tp...>> {};
45#if defined(__CUDACC__)
46#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
47#elif (defined(__clang__) && defined(__HIP__))
48#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
50#define CPPUDDLE_HOST_DEVICE_METHOD
54namespace kernel_aggregation {
57template <
typename Agg_view_t>
60 const Agg_view_t &agg_view) {
61 const size_t slice_size = agg_view.size() / max_slices;
62 return Kokkos::subview(
63 agg_view, std::make_pair<size_t, size_t>(slice_id * slice_size,
64 (slice_id + 1) * slice_size));
69template <
typename Integer,
70 std::enable_if_t<std::is_integral<Integer>::value,
bool> =
true,
71 typename Agg_view_t,
typename... Args>
74 const Agg_view_t ¤t_arg,
const Args &...rest) {
75 static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
76 "Argument not an aggregated view");
77#if defined(HPX_COMPUTE_DEVICE_CODE) && defined(__NVCC__)
78 if constexpr (
sizeof...(Args) > 0) {
79 return cuda::std::tuple_cat(
80 cuda::std::make_tuple(
84 return cuda::std::make_tuple(
88 if constexpr (
sizeof...(Args) > 0) {
89 return std::tuple_cat(
93 return std::make_tuple(
102 typename Agg_executor_t,
typename Agg_view_t,
103 std::enable_if_t<Kokkos::is_view<typename Agg_view_t::view_type>::value,
108 const Agg_view_t ¤t_arg,
const Args &...rest) {
109 const size_t slice_id = agg_exec.id;
110 const size_t max_slices = agg_exec.max_slices;
111 static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
112 "Argument not an aggregated view");
113 if constexpr (
sizeof...(Args) > 0) {
114 return std::tuple_cat(
118 return std::make_tuple(
124template <
typename Agg_executor_t,
typename TargetView_t,
typename SourceView_t>
126 SourceView_t &source) {
127 if (agg_exec.sync_aggregation_slices()) {
128 Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(), target,
134template <
typename Agg_executor_t,
typename TargetView_t,
typename SourceView_t>
136 SourceView_t &source,
int elements_per_slice) {
137 if (agg_exec.sync_aggregation_slices()) {
138 const size_t number_slices = agg_exec.number_slices;
139 auto target_slices = Kokkos::subview(
141 std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
142 auto source_slices = Kokkos::subview(
144 std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
145 Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(),
146 target_slices, source_slices);
151template <
typename executor_t,
typename TargetView_t,
typename SourceView_t>
154 TargetView_t &target, SourceView_t &source) {
155 const size_t gpu_id = agg_exec.
parent.gpu_id;
156 auto launch_copy_lambda =
157 [gpu_id](TargetView_t &target, SourceView_t &source,
158 executor_t &exec) -> hpx::shared_future<void> {
161 return hpx::kokkos::deep_copy_async(exec.instance(), target, source);
163 return agg_exec.
wrap_async(launch_copy_lambda, target, source,
168template <
typename executor_t,
typename TargetView_t,
typename SourceView_t>
171 TargetView_t &target, SourceView_t &source,
int elements_per_slice) {
173 const size_t gpu_id = agg_exec.
parent.gpu_id;
174 auto launch_copy_lambda = [gpu_id, elements_per_slice, number_slices](
175 TargetView_t &target, SourceView_t &source,
176 executor_t &exec) -> hpx::shared_future<void> {
181 auto target_slices = Kokkos::subview(
183 std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
184 auto source_slices = Kokkos::subview(
186 std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
187 return hpx::kokkos::deep_copy_async(exec.instance(), target_slices,
190 return agg_exec.
wrap_async(launch_copy_lambda, target, source,
static void select_device(size_t gpu_id)
Definition executor_pools_management.hpp:189
Definition executor_pools_management.hpp:54
Slice class - meant as a scope interface to the aggregated executor.
Definition aggregation_executors_and_allocators.hpp:420
aggregated_executor< Executor > & parent
Definition aggregation_executors_and_allocators.hpp:422
Executor & get_underlying_executor(void)
Definition aggregation_executors_and_allocators.hpp:549
const size_t number_slices
How many slices are there overall - required to check the launch criteria.
Definition aggregation_executors_and_allocators.hpp:434
hpx::lcos::shared_future< void > wrap_async(F &&f, Ts &&...ts)
Definition aggregation_executors_and_allocators.hpp:528
#define CPPUDDLE_HOST_DEVICE_METHOD
Definition kokkos_aggregation_util.hpp:50
hpx::shared_future< void > aggregrated_deep_copy_async(typename Aggregated_Executor< executor_t >::Executor_Slice &agg_exec, TargetView_t &target, SourceView_t &source)
Convenience function to launch an aggregated kernel and get a future back.
Definition kokkos_aggregation_util.hpp:152
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target, SourceView_t &source)
Convenience function to perform an aggregated deep copy.
Definition kokkos_aggregation_util.hpp:125
CPPUDDLE_HOST_DEVICE_METHOD auto map_views_to_slice(const Integer slice_id, const Integer max_slices, const Agg_view_t ¤t_arg, const Args &...rest)
Definition kokkos_aggregation_util.hpp:73
CPPUDDLE_HOST_DEVICE_METHOD Agg_view_t::view_type get_slice_subview(const size_t slice_id, const size_t max_slices, const Agg_view_t &agg_view)
Get subview for the current slice.
Definition kokkos_aggregation_util.hpp:59