CPPuddle
Loading...
Searching...
No Matches
kokkos_aggregation_util.hpp
Go to the documentation of this file.
1// Copyright (c) 2022-2024 Gregor Daiß
2//
3// Distributed under the Boost Software License, Version 1.0. (See accompanying
4// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5
10
11// I originally developed and tested these utilities within Octotiger. See:
12// STEllAR-GROUP/octotiger/pull/469 and STEllAR-GROUP/octotiger/pull/487
// However, these utilities are a better fit for CPPuddle, as they can be used
// independently of Octotiger together with the work-aggregation functionality
15#ifndef KOKKOS_AGGREGATION_UTIL_HPP
16#define KOKKOS_AGGREGATION_UTIL_HPP
17#include <hpx/futures/future.hpp>
18//#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
19#include <hpx/kokkos/executors.hpp>
20#include <Kokkos_Core.hpp>
21#include <hpx/kokkos.hpp>
22
25#ifdef __NVCC__
26#include <cuda/std/tuple>
27#if defined(HPX_CUDA_VERSION) && (HPX_CUDA_VERSION < 1202)
28// cuda::std::tuple structured bindings are broken in CUDA < 1202
29// See https://github.com/NVIDIA/libcudacxx/issues/316
30// According to https://github.com/NVIDIA/libcudacxx/pull/317 the fix for this
31// is to move tuple element and tuple size into the std namespace
32// which the following snippet does. This is only necessary for old CUDA versions
33// the newer ones contain a fix for this issue
34namespace std {
35 template<size_t _Ip, class... _Tp>
36 struct tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>>
37 : _CUDA_VSTD::tuple_element<_Ip, _CUDA_VSTD::tuple<_Tp...>> {};
38 template <class... _Tp>
39 struct tuple_size<_CUDA_VSTD::tuple<_Tp...>>
40 : _CUDA_VSTD::tuple_size<_CUDA_VSTD::tuple<_Tp...>> {};
41}
42#endif
43#endif
44
45#if defined(__CUDACC__)
46#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
47#elif (defined(__clang__) && defined(__HIP__)) // for HIP compilation
48#define CPPUDDLE_HOST_DEVICE_METHOD __host__ __device__
49#else
50#define CPPUDDLE_HOST_DEVICE_METHOD
51#endif
52
53namespace cppuddle {
54namespace kernel_aggregation {
55
57template <typename Agg_view_t>
58CPPUDDLE_HOST_DEVICE_METHOD typename Agg_view_t::view_type
59get_slice_subview(const size_t slice_id, const size_t max_slices,
60 const Agg_view_t &agg_view) {
61 const size_t slice_size = agg_view.size() / max_slices;
62 return Kokkos::subview(
63 agg_view, std::make_pair<size_t, size_t>(slice_id * slice_size,
64 (slice_id + 1) * slice_size));
65}
66
69template <typename Integer,
70 std::enable_if_t<std::is_integral<Integer>::value, bool> = true,
71 typename Agg_view_t, typename... Args>
73map_views_to_slice(const Integer slice_id, const Integer max_slices,
74 const Agg_view_t &current_arg, const Args &...rest) {
75 static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
76 "Argument not an aggregated view");
77#if defined(HPX_COMPUTE_DEVICE_CODE) && defined(__NVCC__)
78 if constexpr (sizeof...(Args) > 0) {
79 return cuda::std::tuple_cat(
80 cuda::std::make_tuple(
81 get_slice_subview(slice_id, max_slices, current_arg)),
82 map_views_to_slice(slice_id, max_slices, rest...));
83 } else {
84 return cuda::std::make_tuple(
85 get_slice_subview(slice_id, max_slices, current_arg));
86 }
87#else
88 if constexpr (sizeof...(Args) > 0) {
89 return std::tuple_cat(
90 std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
91 map_views_to_slice(slice_id, max_slices, rest...));
92 } else {
93 return std::make_tuple(
94 get_slice_subview(slice_id, max_slices, current_arg));
95 }
96#endif
97}
98
101template <
102 typename Agg_executor_t, typename Agg_view_t,
103 std::enable_if_t<Kokkos::is_view<typename Agg_view_t::view_type>::value,
104 bool> = true,
105 typename... Args>
107map_views_to_slice(const Agg_executor_t &agg_exec,
108 const Agg_view_t &current_arg, const Args &...rest) {
109 const size_t slice_id = agg_exec.id;
110 const size_t max_slices = agg_exec.max_slices;
111 static_assert(Kokkos::is_view<typename Agg_view_t::view_type>::value,
112 "Argument not an aggregated view");
113 if constexpr (sizeof...(Args) > 0) {
114 return std::tuple_cat(
115 std::make_tuple(get_slice_subview(slice_id, max_slices, current_arg)),
116 map_views_to_slice(agg_exec, rest...));
117 } else {
118 return std::make_tuple(
119 get_slice_subview(slice_id, max_slices, current_arg));
120 }
121}
122
124template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
125void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
126 SourceView_t &source) {
127 if (agg_exec.sync_aggregation_slices()) {
128 Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(), target,
129 source);
130 }
131}
132
134template <typename Agg_executor_t, typename TargetView_t, typename SourceView_t>
135void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target,
136 SourceView_t &source, int elements_per_slice) {
137 if (agg_exec.sync_aggregation_slices()) {
138 const size_t number_slices = agg_exec.number_slices;
139 auto target_slices = Kokkos::subview(
140 target,
141 std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
142 auto source_slices = Kokkos::subview(
143 source,
144 std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
145 Kokkos::deep_copy(agg_exec.get_underlying_executor().instance(),
146 target_slices, source_slices);
147 }
148}
149
151template <typename executor_t, typename TargetView_t, typename SourceView_t>
152hpx::shared_future<void> aggregrated_deep_copy_async(
154 TargetView_t &target, SourceView_t &source) {
155 const size_t gpu_id = agg_exec.parent.gpu_id;
156 auto launch_copy_lambda =
157 [gpu_id](TargetView_t &target, SourceView_t &source,
158 executor_t &exec) -> hpx::shared_future<void> {
161 return hpx::kokkos::deep_copy_async(exec.instance(), target, source);
162 };
163 return agg_exec.wrap_async(launch_copy_lambda, target, source,
164 agg_exec.get_underlying_executor());
165}
166
168template <typename executor_t, typename TargetView_t, typename SourceView_t>
169hpx::shared_future<void> aggregrated_deep_copy_async(
171 TargetView_t &target, SourceView_t &source, int elements_per_slice) {
172 const size_t number_slices = agg_exec.number_slices;
173 const size_t gpu_id = agg_exec.parent.gpu_id;
174 auto launch_copy_lambda = [gpu_id, elements_per_slice, number_slices](
175 TargetView_t &target, SourceView_t &source,
176 executor_t &exec) -> hpx::shared_future<void> {
178 executor_t,
180 gpu_id);
181 auto target_slices = Kokkos::subview(
182 target,
183 std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
184 auto source_slices = Kokkos::subview(
185 source,
186 std::make_pair<size_t, size_t>(0, number_slices * elements_per_slice));
187 return hpx::kokkos::deep_copy_async(exec.instance(), target_slices,
188 source_slices);
189 };
190 return agg_exec.wrap_async(launch_copy_lambda, target, source,
191 agg_exec.get_underlying_executor());
192}
193
194} // namespace kernel_aggregation
195} // namespace cppuddle
196
197#endif
static void select_device(size_t gpu_id)
Definition executor_pools_management.hpp:189
Definition executor_pools_management.hpp:54
Slice class - meant as a scope interface to the aggregated executor.
Definition aggregation_executors_and_allocators.hpp:420
aggregated_executor< Executor > & parent
Definition aggregation_executors_and_allocators.hpp:422
Executor & get_underlying_executor(void)
Definition aggregation_executors_and_allocators.hpp:549
const size_t number_slices
How many slices are there overall - required to check the launch criteria.
Definition aggregation_executors_and_allocators.hpp:434
hpx::lcos::shared_future< void > wrap_async(F &&f, Ts &&...ts)
Definition aggregation_executors_and_allocators.hpp:528
#define CPPUDDLE_HOST_DEVICE_METHOD
Definition kokkos_aggregation_util.hpp:50
hpx::shared_future< void > aggregrated_deep_copy_async(typename Aggregated_Executor< executor_t >::Executor_Slice &agg_exec, TargetView_t &target, SourceView_t &source)
Convenience function to launch an aggregated kernel and get a future back.
Definition kokkos_aggregation_util.hpp:152
void aggregated_deep_copy(Agg_executor_t &agg_exec, TargetView_t &target, SourceView_t &source)
Convenience function to perform an aggregated deep copy.
Definition kokkos_aggregation_util.hpp:125
CPPUDDLE_HOST_DEVICE_METHOD auto map_views_to_slice(const Integer slice_id, const Integer max_slices, const Agg_view_t &current_arg, const Args &...rest)
Definition kokkos_aggregation_util.hpp:73
CPPUDDLE_HOST_DEVICE_METHOD Agg_view_t::view_type get_slice_subview(const size_t slice_id, const size_t max_slices, const Agg_view_t &agg_view)
Get subview for the current slice.
Definition kokkos_aggregation_util.hpp:59
Definition config.hpp:31