CPPuddle
kokkos_aggregation_util.hpp
// Copyright (c) 2022-2024 Gregor Daiß
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

// I originally developed and tested these utilities within Octotiger. See:
// STEllAR-GROUP/octotiger/pull/469 and STEllAR-GROUP/octotiger/pull/487
// However, I think they are a better fit for CPPuddle, as they can be used
// with the work aggregation independently of Octotiger.
#ifndef KOKKOS_AGGREGATION_UTIL_HPP
#define KOKKOS_AGGREGATION_UTIL_HPP
#include <hpx/futures/future.hpp>
//#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#include <hpx/kokkos/executors.hpp>
#include <Kokkos_Core.hpp>
#include <hpx/kokkos.hpp>

#include <aggregation_manager.hpp>
#ifdef __NVCC__
#include <cuda/std/tuple>
#if defined(HPX_CUDA_VERSION) && (HPX_CUDA_VERSION < 1202)
// cuda::std::tuple structured bindings are broken in CUDA versions before 12.2
// (HPX_CUDA_VERSION < 1202).
// See https://github.com/NVIDIA/libcudacxx/issues/316
// According to https://github.com/NVIDIA/libcudacxx/pull/317, the fix is to
// specialize tuple_element and tuple_size in the std namespace, which the
// following snippet does. This is only necessary for old CUDA versions;
// newer ones already contain the fix.
namespace std {
template <size_t _Ip, class... _Tp>
struct tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>>
    : _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>> {};
template <class... _Tp>
struct tuple_size<_CUDA_VSTD::tuple<_Tp...>>
    : _CUDA_VSTD::tuple_size<_CUDA_VSTD::tuple<_Tp...>> {};
} // namespace std
#endif
#endif

#if defined(__CUDACC__)
#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
#elif (defined(__clang__) && defined(__HIP__)) // for HIP compilation
#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
#else
#define CPPUDDLE_HOST_DEVICE_METHOD
#endif

namespace cppuddle {
namespace kernel_aggregation {

/// Get subview for the current slice
template <typename Agg_view_t>
CPPUDDLE_HOST_DEVICE_METHOD typename Agg_view_t::view_type
get_slice_subview(const size_t slice_id, const size_t max_slices,
                  const Agg_view_t &agg_view) {
  const size_t slice_size = agg_view.size() / max_slices;
  return Kokkos::subview(
      agg_view, std::make_pair<size_t, size_t>(slice_id * slice_size,
                                               (slice_id + 1) * slice_size));
}
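
// Usage sketch: an aggregated view provides backing memory for up to
// max_slices aggregated kernel launches; each launch only touches its own
// chunk. The names (agg_view, slice_id) below are illustrative:
//
//   auto my_part = get_slice_subview(slice_id, max_slices, agg_view);
//   // my_part now covers the index range
//   // [slice_id * slice_size, (slice_id + 1) * slice_size)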

/// Get the subviews of all given aggregated views for the current slice
/// (integer slice ID variant, usable in device code)
template <typename Integer,
          std::enable_if_t<std::is_integral<Integer>::value, bool> = true,
          typename Agg_view_t, typename... Args>
CPPUDDLE_HOST_DEVICE_METHOD auto
map_views_to_slice(const Integer slice_id, const Integer max_slices,
                   const Agg_view_t &current_arg, const Args &...rest) {
  static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
                "Argument not an aggregated view");
#if defined(HPX_COMPUTE_DEVICE_CODE) && defined(__NVCC__)
  if constexpr (sizeof...(Args) > 0) {
    return cuda::std::tuple_cat(
        cuda::std::make_tuple(
            get_slice_subview(slice_id, max_slices, current_arg)),
        map_views_to_slice(slice_id, max_slices, rest...));
  } else {
    return cuda::std::make_tuple(
        get_slice_subview(slice_id, max_slices, current_arg));
  }
#else
  if constexpr (sizeof...(Args) > 0) {
    return std::tuple_cat(
        std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
        map_views_to_slice(slice_id, max_slices, rest...));
  } else {
    return std::make_tuple(
        get_slice_subview(slice_id, max_slices, current_arg));
  }
#endif
}
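
// Usage sketch: inside a kernel, unpack the per-slice subviews of several
// aggregated views at once. The tuple return type (cuda::std::tuple in NVCC
// device code, std::tuple otherwise) makes structured bindings work on both
// host and device. The view names here are illustrative:
//
//   auto [rho_slice, flux_slice] =
//       map_views_to_slice(slice_id, max_slices, rho_agg, flux_agg);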

/// Get the subviews of all given aggregated views for the slice belonging to
/// the given aggregation executor
template <
    typename Agg_executor_t, typename Agg_view_t,
    std::enable_if_t<Kokkos::is_view<typename Agg_view_t::view_type>::value,
                     bool> = true,
    typename... Args>
auto map_views_to_slice(const Agg_executor_t &agg_exec,
                        const Agg_view_t &current_arg, const Args &...rest) {
  const size_t slice_id = agg_exec.id;
  const size_t max_slices = agg_exec.max_slices;
  static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
                "Argument not an aggregated view");
  if constexpr (sizeof...(Args) > 0) {
    return std::tuple_cat(
        std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
        map_views_to_slice(agg_exec, rest...));
  } else {
    return std::make_tuple(
        get_slice_subview(slice_id, max_slices, current_arg));
  }
}
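
// Usage sketch: the same mapping, but host-side, with slice ID and slice count
// taken from an executor slice (illustrative names):
//
//   auto [rho_slice, flux_slice] =
//       map_views_to_slice(agg_exec, rho_agg, flux_agg);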

/// Convenience function to perform an aggregated deep copy
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
                          SourceView_t &source) {
  if (agg_exec.sync_aggregation_slices()) {
    Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(), target,
                      source);
  }
}
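
// Usage sketch: every slice calls this, but sync_aggregation_slices() ensures
// the deep copy is only issued once per aggregated launch (illustrative
// names):
//
//   aggregated_deep_copy(agg_exec, device_agg_view, host_agg_view);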

/// Convenience function to perform an aggregated deep copy, restricted to the
/// used part of the views (number_slices * elements_per_slice elements)
template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
                          SourceView_t &source, int elements_per_slice) {
  if (agg_exec.sync_aggregation_slices()) {
    const size_t number_slices = agg_exec.number_slices;
    auto target_slices = Kokkos::subview(
        target,
        std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
    auto source_slices = Kokkos::subview(
        source,
        std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
    Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(),
                      target_slices, source_slices);
  }
}
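
// Usage sketch: as above, but copying only the first
// number_slices * elements_per_slice elements, which avoids moving the unused
// tail of a partially filled aggregated buffer (illustrative names):
//
//   aggregated_deep_copy(agg_exec, device_agg_view, host_agg_view,
//                        elements_per_slice);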

/// Convenience function to launch an aggregated deep copy and get a future
/// back
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
    typename Aggregated_Executor<executor_t>::Executor_Slice &agg_exec,
    TargetView_t &target, SourceView_t &source) {
  const size_t gpu_id = agg_exec.parent.gpu_id;
  auto launch_copy_lambda =
      [gpu_id](TargetView_t &target, SourceView_t &source,
               executor_t &exec) -> hpx::shared_future<void> {
    cppuddle::executor_recycling::executor_pool::select_device<
        executor_t,
        cppuddle::executor_recycling::round_robin_pool_impl<executor_t>>(
        gpu_id);
    return hpx::kokkos::deep_copy_async(exec.instance(), target, source);
  };
  return agg_exec.wrap_async(launch_copy_lambda, target, source,
                             agg_exec.get_underlying_executor());
}
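
// Usage sketch: the asynchronous variant returns a shared_future tied to the
// single aggregated copy. Note that executor_t appears in a non-deduced
// context and must be spelled out (illustrative names):
//
//   auto copy_done = aggregrated_deep_copy_async<executor_t>(
//       agg_exec, device_agg_view, host_agg_view);
//   copy_done.get(); // block until the aggregated copy has finished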

/// Convenience function to launch an aggregated deep copy of
/// number_slices * elements_per_slice elements and get a future back
template <typename executor_t, typename TargetView_t, typename SourceView_t>
hpx::shared_future<void> aggregrated_deep_copy_async(
    typename Aggregated_Executor<executor_t>::Executor_Slice &agg_exec,
    TargetView_t &target, SourceView_t &source, int elements_per_slice) {
  const size_t number_slices = agg_exec.number_slices;
  const size_t gpu_id = agg_exec.parent.gpu_id;
  auto launch_copy_lambda = [gpu_id, elements_per_slice, number_slices](
                                TargetView_t &target, SourceView_t &source,
                                executor_t &exec) -> hpx::shared_future<void> {
    cppuddle::executor_recycling::executor_pool::select_device<
        executor_t,
        cppuddle::executor_recycling::round_robin_pool_impl<executor_t>>(
        gpu_id);
    auto target_slices = Kokkos::subview(
        target,
        std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
    auto source_slices = Kokkos::subview(
        source,
        std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
    return hpx::kokkos::deep_copy_async(exec.instance(), target_slices,
                                        source_slices);
  };
  return agg_exec.wrap_async(launch_copy_lambda, target, source,
                             agg_exec.get_underlying_executor());
}
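
// Usage sketch: same as the plain asynchronous variant, restricted to the used
// part of the buffers (illustrative names):
//
//   auto copy_done = aggregrated_deep_copy_async<executor_t>(
//       agg_exec, device_agg_view, host_agg_view, elements_per_slice);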

} // namespace kernel_aggregation
} // namespace cppuddle

#endif