diff options
author | Pablo Reble <pablo@reble.org> | 2021-02-24 17:58:55 -0600 |
---|---|---|
committer | Mikhail Dvorskiy <mikhail.dvorskiy@intel.com> | 2021-02-25 09:59:30 +0300 |
commit | 3cfce2571f01315c5651a70558023ef57f85ca19 (patch) | |
tree | c89fd2ac9faee2b2b72e2a928967c1c0cff1bf59 | |
parent | Turn off strict aliasing optimization if used (#137) (diff) | |
download | llvm-project-3cfce2571f01315c5651a70558023ef57f85ca19.tar.gz llvm-project-3cfce2571f01315c5651a70558023ef57f85ca19.tar.bz2 llvm-project-3cfce2571f01315c5651a70558023ef57f85ca19.zip |
Async api extensions (#78)
* adding async API as an experimental feature
* initial support for DPCPP backend only
* implementation for subset of algorithm/numeric (copy,fill,for_each,sort,reduce,transform,transform_reduce) with suffix async; accepting an arbitrary number of sycl::event's as last argument to express input dependencies
* returning a future-like object of undefined type that is convertible into a sycl::event.
* lifetime of temporary storage is bound to lifetime of returned object
-rw-r--r-- | include/oneapi/dpl/async | 45 | ||||
-rw-r--r-- | include/oneapi/dpl/internal/async_extension_defs.h | 120 | ||||
-rw-r--r-- | include/oneapi/dpl/internal/async_impl/async_backend_sycl.h | 137 | ||||
-rw-r--r-- | include/oneapi/dpl/internal/async_impl/async_impl.h | 93 | ||||
-rw-r--r-- | include/oneapi/dpl/internal/async_impl/async_impl_hetero.h | 201 | ||||
-rw-r--r-- | include/oneapi/dpl/internal/async_impl/async_utils.h | 128 | ||||
-rw-r--r-- | include/oneapi/dpl/internal/async_impl/glue_async_impl.h | 165 | ||||
-rw-r--r-- | include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h | 23 | ||||
-rw-r--r-- | include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 1 | ||||
-rw-r--r-- | test/xpu_api/asynch.pass.cpp | 153 |
10 files changed, 1064 insertions, 2 deletions
diff --git a/include/oneapi/dpl/async b/include/oneapi/dpl/async new file mode 100644 index 000000000000..d76d2b6185db --- /dev/null +++ b/include/oneapi/dpl/async @@ -0,0 +1,45 @@ +// -*- C++ -*- +//===-- async -------------------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_ASYNC +#define _ONEDPL_ASYNC + +// Workarounds for libstdc++9, libstdc++10 when new TBB version is found in the environment +#if __cplusplus >= 201703L +# if __has_include(<tbb/version.h>) +# ifndef PSTL_USE_PARALLEL_POLICIES +# define PSTL_USE_PARALLEL_POLICIES (_GLIBCXX_RELEASE != 9) +# endif +# ifndef _GLIBCXX_USE_TBB_PAR_BACKEND +# define _GLIBCXX_USE_TBB_PAR_BACKEND (_GLIBCXX_RELEASE > 10) +# endif +# endif // __has_include(<tbb/version.h>) +#endif // __cplusplus >= 201703L + +#include "oneapi/dpl/pstl/onedpl_config.h" + +#if !_ONEDPL_ASYNC_FORWARD_DECLARED +// If not declared, pull in forward declarations +# include "oneapi/dpl/internal/async_extension_defs.h" +# define _ONEDPL_ASYNC_FORWARD_DECLARED 1 +#endif + +#if _ONEDPL_EXECUTION_POLICIES_DEFINED +// If <execution> has already been included, pull in implementations +# include "oneapi/dpl/internal/async_impl/async_impl.h" +# include "oneapi/dpl/internal/async_impl/glue_async_impl.h" +#endif + +#endif /* _ONEDPL_ASYNC */ diff --git a/include/oneapi/dpl/internal/async_extension_defs.h b/include/oneapi/dpl/internal/async_extension_defs.h new file mode 100644 index 000000000000..abf10896d4af --- /dev/null +++ b/include/oneapi/dpl/internal/async_extension_defs.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ONEDPL_ASYNC_EXTENSION_DEFS_H +#define _ONEDPL_ASYNC_EXTENSION_DEFS_H + +#include "async_impl/async_utils.h" + +namespace oneapi +{ +namespace dpl +{ + +// Public API for asynch algorithms: +namespace experimental +{ + +template <typename... _Ts> +oneapi::dpl::__internal::__enable_if_convertible_to_events<void, _Ts...> +wait_for_all(_Ts&&... __events); + +template <class _ExecutionPolicy, class _ForwardIterator1, class _ForwardIterator2, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIterator2>, _Events...> +copy_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __result, + _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIterator, class _Function, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...> +for_each_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Function __f, + _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIt, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__internal::__future<typename std::iterator_traits<_ForwardIt>::value_type>, + _Events...> +reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIt, class _T, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _T, _Events...> +reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _T init, _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_double_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>, _Tp, _BinaryOperation, _Events...> +reduce_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Tp __init, + _BinaryOperation __binary_op, _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _UnaryOperation, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, + oneapi::dpl::__internal::__future<_ForwardIt2>, _Events...> +transform_async(_ExecutionPolicy&& __exec, _ForwardIt1 first1, _ForwardIt1 last1, _ForwardIt2 d_first, + _UnaryOperation unary_op, _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _ForwardIt3, class _BinaryOperation, + class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIt3>, _BinaryOperation, _Events...> +transform_async(_ExecutionPolicy&& __exec, _ForwardIt1 first1, _ForwardIt1 last1, _ForwardIt2 first2, + _ForwardIt3 d_first, _BinaryOperation binary_op, _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _T, class _BinaryOp1, class _BinaryOp2, + class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_double_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _BinaryOp1, _BinaryOp2, _Events...> +transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt1 __first1, _ForwardIt1 __last1, _ForwardIt2 __first2, + _T __init, _BinaryOp1 __binary_op1, _BinaryOp2 __binary_op2, _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _T, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, + _Events...> +transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt1 __first1, _ForwardIt1 __last1, _ForwardIt2 __first2, + _T __init, _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIt, class _T, class _BinaryOp, class _UnaryOp, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _UnaryOp, _Events...> +transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _T __init, + _BinaryOp __binary_op, _UnaryOp __unary_op, _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _RandomAccessIterator, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...> +sort_async(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, + _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _RandomAccessIterator, class _Compare, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default< + _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Compare, _Events...> +sort_async(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, + _Events&&... __dependencies); + +template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...> +fill_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, + _Events&&... __dependencies); + +} // namespace experimental + +} // namespace dpl + +} // namespace oneapi + +#endif /* _ONEDPL_ASYNC_EXTENSION_DEFS_H */ diff --git a/include/oneapi/dpl/internal/async_impl/async_backend_sycl.h b/include/oneapi/dpl/internal/async_impl/async_backend_sycl.h new file mode 100644 index 000000000000..7d048e6c8ffc --- /dev/null +++ b/include/oneapi/dpl/internal/async_impl/async_backend_sycl.h @@ -0,0 +1,137 @@ +// -*- C++ -*- +//===-- async_backend_sycl.h ----------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +//!!! NOTE: This file should be included under the macro _ONEDPL_BACKEND_SYCL +#ifndef _ONEDPL_async_backend_sycl_H +#define _ONEDPL_async_backend_sycl_H + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +// TODO: Merge experimental async pattern into dpcpp backend +//------------------------------------------------------------------------ +// parallel_transform_reduce - async pattern +//------------------------------------------------------------------------ + +template <typename _Tp, ::std::size_t __grainsize = 4, typename _ExecutionPolicy, typename _Up, typename _Cp, + typename _Rp, typename... _Ranges> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>> +__parallel_transform_reduce_async(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _Rp __brick_reduce, + _Ranges&&... __rngs) +{ + auto __n = __get_first_range(__rngs...).size(); + assert(__n > 0); + + using _Size = decltype(__n); + using _Policy = typename ::std::decay<_ExecutionPolicy>::type; + using __kernel_name = typename _Policy::kernel_name; +#if __SYCL_UNNAMED_LAMBDA__ + using __kernel_name_t = __parallel_reduce_kernel<_Up, _Cp, _Rp, __kernel_name, _Ranges...>; +#else + using __kernel_name_t = __parallel_reduce_kernel<__kernel_name>; +#endif + + sycl::cl_uint __max_compute_units = oneapi::dpl::__internal::__max_compute_units(__exec); + ::std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec); + // change __work_group_size according to local memory limit + __work_group_size = oneapi::dpl::__internal::__max_local_allocation_size<_ExecutionPolicy, _Tp>( + ::std::forward<_ExecutionPolicy>(__exec), __work_group_size); +#if _ONEDPL_COMPILE_KERNEL + sycl::kernel __kernel = __kernel_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec)); + __work_group_size = ::std::min(__work_group_size, oneapi::dpl::__internal::__kernel_work_group_size( + ::std::forward<_ExecutionPolicy>(__exec), __kernel)); +#endif + ::std::size_t __iters_per_work_item = __grainsize; + // distribution is ~1 work groups per compute init + if (__exec.queue().get_device().is_cpu()) + __iters_per_work_item = (__n - 1) / (__max_compute_units * __work_group_size) + 1; + ::std::size_t __size_per_work_group = + __iters_per_work_item * __work_group_size; // number of buffer elements processed within workgroup + _Size __n_groups = (__n - 1) / __size_per_work_group + 1; // number of work groups + _Size __n_items = (__n - 1) / __iters_per_work_item + 1; // number of work items + + _PRINT_INFO_IN_DEBUG_MODE(__exec, __work_group_size, __max_compute_units); + + // Create temporary global buffers to store temporary values + sycl::buffer<_Tp> __temp(sycl::range<1>(2 * __n_groups)); + // __is_first == true. Reduce over each work_group + // __is_first == false. Reduce between work groups + bool __is_first = true; + + // For memory utilization it's better to use one big buffer instead of two small because size of the buffer is close to a few MB + _Size __offset_1 = 0; + _Size __offset_2 = __n_groups; + + sycl::event __reduce_event; + do + { + __reduce_event = __exec.queue().submit([&](sycl::handler& __cgh) { + __cgh.depends_on(__reduce_event); + + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); //get an access to data under SYCL buffer + auto __temp_acc = __temp.template get_access<access_mode::read_write>(__cgh); + sycl::accessor<_Tp, 1, access_mode::read_write, sycl::access::target::local> __temp_local( + sycl::range<1>(__work_group_size), __cgh); + __cgh.parallel_for<__kernel_name_t>( +#if _ONEDPL_COMPILE_KERNEL + __kernel, +#endif + sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), sycl::range<1>(__work_group_size)), + [=](sycl::nd_item<1> __item_id) { + ::std::size_t __global_idx = __item_id.get_global_id(0); + ::std::size_t __local_idx = __item_id.get_local_id(0); + // 1. Initialization (transform part). Fill local memory + if (__is_first) + { + __u(__item_id, __n, __iters_per_work_item, __global_idx, __temp_local, __rngs...); + } + else + { + // TODO: check the approach when we use grainsize here too + if (__global_idx < __n_items) + __temp_local[__local_idx] = __temp_acc[__offset_2 + __global_idx]; + __item_id.barrier(sycl::access::fence_space::local_space); + } + // 2. Reduce within work group using local memory + _Tp __result = __brick_reduce(__item_id, __global_idx, __n_items, __temp_local); + if (__local_idx == 0) + { + __temp_acc[__offset_1 + __item_id.get_group(0)] = __result; + } + }); + }); + if (__is_first) + { + __is_first = false; + } + ::std::swap(__offset_1, __offset_2); + __n_items = __n_groups; + __n_groups = (__n_items - 1) / __work_group_size + 1; + } while (__n_items > 1); + //return future to postpone implicit synchronization point accessing return value + return ::oneapi::dpl::__internal::__future<_Tp>(__reduce_event, __temp, __combine, __offset_2); +} + +} // namespace __par_backend_hetero + +} // namespace dpl + +} // namespace oneapi + +#endif /* _ONEDPL_async_backend_sycl_H */ diff --git a/include/oneapi/dpl/internal/async_impl/async_impl.h b/include/oneapi/dpl/internal/async_impl/async_impl.h new file mode 100644 index 000000000000..ac92bc9a7a32 --- /dev/null +++ b/include/oneapi/dpl/internal/async_impl/async_impl.h @@ -0,0 +1,93 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_ASYNC_IMPL_H +#define _ONEDPL_ASYNC_IMPL_H + +#if _ONEDPL_HETERO_BACKEND +# include "async_impl_hetero.h" +#endif + +#include "glue_async_impl.h" + +namespace oneapi +{ +namespace dpl +{ +namespace experimental +{ + +// [wait_for_all] +template <typename... _Ts> +oneapi::dpl::__internal::__enable_if_convertible_to_events<void, _Ts...> +wait_for_all(_Ts&&... __events) +{ + ::std::initializer_list<int> i = {0, (__events.wait(), 0)...}; + (void)i; +} + +// [async.reduce] +template <class _ExecutionPolicy, class _ForwardIt, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__internal::__future<typename std::iterator_traits<_ForwardIt>::value_type>, + _Events...> +reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _Events&&... __dependencies) +{ + using _Tp = typename std::iterator_traits<_ForwardIt>::value_type; + return reduce_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last, _Tp(0), ::std::plus<_Tp>(), + ::std::forward<_Events>(__dependencies)...); +} + +template <class _ExecutionPolicy, class _ForwardIt, class _T, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _T, _Events...> +reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _T __init, _Events&&... __dependencies) +{ + return reduce_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last, __init, ::std::plus<_T>(), + ::std::forward<_Events>(__dependencies)...); +} + +// [async.transform_reduce] +template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _T, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, + _Events...> +transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt1 __first1, _ForwardIt1 __last1, _ForwardIt2 __first2, + _T __init, _Events&&... __dependencies) +{ + using __T1 = typename ::std::iterator_traits<_ForwardIt1>::value_type; + return transform_reduce_async(::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __init, + ::std::plus<_T>(), ::std::multiplies<__T1>(), + ::std::forward<_Events>(__dependencies)...); +} + +// [async.sort] +template <class _ExecutionPolicy, class _RandomAccessIterator, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...> +sort_async(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, + _Events&&... __dependencies) +{ + using __T = typename ::std::iterator_traits<_RandomAccessIterator>::value_type; + return sort_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last, ::std::less<__T>(), + ::std::forward<_Events>(__dependencies)...); +} + +} // namespace experimental + +} // namespace dpl + +} // namespace oneapi + +#endif /* _ONEDPL_GLUE_ASYNC_IMPL_H */ diff --git a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h new file mode 100644 index 000000000000..3ffe694bdb4d --- /dev/null +++ b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ONEDPL_ASYNC_IMPL_HETERO_H +#define _ONEDPL_ASYNC_IMPL_HETERO_H + +#if _ONEDPL_BACKEND_SYCL +# include "async_backend_sycl.h" +#endif + +namespace oneapi +{ +namespace dpl +{ +namespace __internal +{ + +template <typename _ExecutionPolicy, typename _ForwardIterator, typename _Function> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, + oneapi::dpl::__par_backend_hetero::__future<void>> +__pattern_walk1_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Function __f) +{ + auto __n = __last - __first; + if (__n <= 0) + return oneapi::dpl::__par_backend_hetero::__future<void>(sycl::event{}); + + auto __keep = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator>(); + auto __buf = __keep(__first, __last); + + auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for( + ::std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, + __buf.all_view()); + return __future_obj; +} + +template <typename _IsSync = ::std::false_type, + __par_backend_hetero::access_mode __acc_mode1 = __par_backend_hetero::access_mode::read, + __par_backend_hetero::access_mode __acc_mode2 = __par_backend_hetero::access_mode::write, + typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2, typename _Function> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, + oneapi::dpl::__internal::__future<_ForwardIterator2>> +__pattern_walk2_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _Function __f) +{ + auto __n = __last1 - __first1; + if (__n <= 0) + return oneapi::dpl::__internal::__future<_ForwardIterator2>(sycl::event{}, __first2); + + auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode1, _ForwardIterator1>(); + auto __buf1 = __keep1(__first1, __last1); + + auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode2, _ForwardIterator2>(); + auto __buf2 = __keep2(__first2, __first2 + __n); + + auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for( + ::std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, + __buf1.all_view(), __buf2.all_view()); + oneapi::dpl::__internal::__invoke_if(_IsSync(), [&__future_obj]() { __future_obj.wait(); }); + + return oneapi::dpl::__internal::__future<_ForwardIterator2>(__future_obj, __first2 + __n); +} + +template <typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2, typename _ForwardIterator3, + typename _Function> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, + oneapi::dpl::__internal::__future<_ForwardIterator3>> +__pattern_walk3_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _ForwardIterator3 __first3, _Function __f) +{ + auto __n = __last1 - __first1; + if (__n <= 0) + return oneapi::dpl::__internal::__future<_ForwardIterator3>(sycl::event{}, __first3); + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator1>(); + auto __buf1 = __keep1(__first1, __last1); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator2>(); + auto __buf2 = __keep2(__first2, __first2 + __n); + auto __keep3 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator3>(); + auto __buf3 = __keep3(__first3, __first3 + __n); + + auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for( + ::std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, + __buf1.all_view(), __buf2.all_view(), __buf3.all_view()); + + return oneapi::dpl::__internal::__future<_ForwardIterator3>(__future_obj, __first3 + __n); +} + +template <typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2, typename _Brick> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, + oneapi::dpl::__internal::__future<_ForwardIterator2>> +__pattern_walk2_brick_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _Brick __brick) +{ + return __pattern_walk2_async( + __par_backend_hetero::make_wrapped_policy<__walk2_brick_wrapper>(::std::forward<_ExecutionPolicy>(__exec)), + __first1, __last1, __first2, __brick); +} + +//------------------------------------------------------------------------ +// transform_reduce (version with two binary functions) +//------------------------------------------------------------------------ + +template <typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Tp, + typename _BinaryOperation1, typename _BinaryOperation2> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>> +__pattern_transform_reduce_async(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _Tp __init, + _BinaryOperation1 __binary_op1, _BinaryOperation2 __binary_op2) +{ + if (__first1 == __last1) + return oneapi::dpl::__internal::__future<_Tp>(__init); + + using _Policy = _ExecutionPolicy; + using _Functor = unseq_backend::walk_n<_Policy, _BinaryOperation2>; + using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_Tp>; + + auto __n = __last1 - __first1; + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _RandomAccessIterator1>(); + auto __buf1 = __keep1(__first1, __last1); + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _RandomAccessIterator2>(); + auto __buf2 = __keep2(__first2, __first2 + __n); + + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce_async<_RepackedTp>( + ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::transform_init<_Policy, _BinaryOperation1, _Functor>{__binary_op1, + _Functor{__binary_op2}}, // transform + __binary_op1, // combine + unseq_backend::reduce<_Policy, _BinaryOperation1, _RepackedTp>{__binary_op1}, // reduce + __buf1.all_view(), __buf2.all_view()); + __res.set(__init); + return __res; +} + +//------------------------------------------------------------------------ +// transform_reduce (with unary and binary functions) +//------------------------------------------------------------------------ + +template <typename _ExecutionPolicy, typename _ForwardIterator, typename _Tp, typename _BinaryOperation, + typename _UnaryOperation> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>> +__pattern_transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, + _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __unary_op) +{ + if (__first == __last) + return oneapi::dpl::__internal::__future<_Tp>(__init); + + using _Policy = _ExecutionPolicy; + using _Functor = unseq_backend::walk_n<_Policy, _UnaryOperation>; + using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_Tp>; + + auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator>(); + auto __buf = __keep(__first, __last); + + auto res = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce_async<_RepackedTp>( + ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::transform_init<_Policy, _BinaryOperation, _Functor>{__binary_op, + _Functor{__unary_op}}, // transform + __binary_op, // combine + unseq_backend::reduce<_Policy, _BinaryOperation, _RepackedTp>{__binary_op}, // reduce + __buf.all_view()); + res.set(__init); + return res; +} + +template <typename _ExecutionPolicy, typename _ForwardIterator, typename _T> +oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, + oneapi::dpl::__par_backend_hetero::__future<void>> +__pattern_fill_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, const _T& __value) +{ + auto ret_val = + __pattern_walk1_async(::std::forward<_ExecutionPolicy>(__exec), + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__first), + __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__last), + fill_functor<_T>{__value}); + return ret_val; +} + +} // namespace __internal +} // namespace dpl +} // namespace oneapi + +#endif /* _ONEDPL_ASYNC_IMPL_HETERO_H */ diff --git a/include/oneapi/dpl/internal/async_impl/async_utils.h b/include/oneapi/dpl/internal/async_impl/async_utils.h new file mode 100644 index 000000000000..d4198e74b1c6 --- /dev/null +++ b/include/oneapi/dpl/internal/async_impl/async_utils.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ONEDPL_ASYNC_UTILS_H +#define _ONEDPL_ASYNC_UTILS_H + +#if _ONEDPL_BACKEND_SYCL +# include <CL/sycl.hpp> +#endif + +namespace oneapi +{ +namespace dpl +{ +namespace __internal +{ + +template <typename _T> +struct async_value_base +{ + virtual ~async_value_base() = default; + virtual _T data(_T) = 0; +}; + +template <typename _T, typename _Buf, typename _Op> +class async_value : public async_value_base<_T> +{ + _Buf __my_buffer; + _Op __my_op; + size_t __my_offset; + + public: + async_value(_Buf __b, _Op __o, size_t __i) : __my_buffer(__b), __my_op(__o), __my_offset(__i) {} + _T + data(_T __init) + { + return __my_op(__my_buffer.template get_access<access_mode::read>()[__my_offset], __init); + } +}; + +template <typename _T> +class __future : public __par_backend_hetero::__future_base +{ + ::std::unique_ptr<async_value_base<_T>> __ret_val; + _T __init; + + public: + __future(_T __i) : __par_backend_hetero::__future_base(), __init(__i) {} + + template <typename _Event, typename _Op, typename _Buf> + __future(_Event __e, _Buf __b, _Op __o, size_t __offset) : __par_backend_hetero::__future_base(__e) + { + __ret_val = ::std::unique_ptr<async_value<_T, _Buf, _Op>>(new async_value<_T, _Buf, _Op>(__b, __o, __offset)); + } + void + set(_T __i) + { + __init = __i; + } + _T + get() + { + this->wait(); + return __ret_val->data(__init); + } +}; + +#if _ONEDPL_BACKEND_SYCL +// Specialization for sycl_iterator +template <typename _T> +class __future<sycl_iterator<sycl::access::mode::read_write, _T, sycl::buffer_allocator>> + : public __par_backend_hetero::__future_base +{ + using _Tp = sycl_iterator<sycl::access::mode::read_write, _T, sycl::buffer_allocator>; + _Tp __data; + ::std::unique_ptr<__par_backend_hetero::__lifetime_keeper_base> __tmp; + + public: + template <typename... _Ts> + __future(sycl::event __e, _Tp __d, _Ts... __t) : __par_backend_hetero::__future_base(__e), __data(__d) + { + if (sizeof...(_Ts) != 0) + __tmp = ::std::unique_ptr<__par_backend_hetero::__lifetime_keeper<_Ts...>>( + new __par_backend_hetero::__lifetime_keeper<_Ts...>(__t...)); + } + _Tp + get() + { + this->wait(); + return __data; + } +}; +#endif + +template <typename _ExecPolicy, typename _T, typename _Op1, typename... _Events> +using __enable_if_device_execution_policy_single_no_default = typename ::std::enable_if< + oneapi::dpl::__internal::__is_device_execution_policy<typename ::std::decay<_ExecPolicy>::type>::value && + !::std::is_convertible<_Op1, sycl::event>::value && + oneapi::dpl::__internal::__is_convertible_to_event<_Events...>::value, + _T>::type; + +template <typename _ExecPolicy, typename _T, typename _Op1, typename _Op2, typename... _Events> +using __enable_if_device_execution_policy_double_no_default = typename ::std::enable_if< + oneapi::dpl::__internal::__is_device_execution_policy<typename ::std::decay<_ExecPolicy>::type>::value && + !::std::is_convertible<_Op1, sycl::event>::value && !::std::is_convertible<_Op2, sycl::event>::value && + oneapi::dpl::__internal::__is_convertible_to_event<_Events...>::value, + _T>::type; + +} // namespace __internal + +} // namespace dpl + +} // namespace oneapi + +#endif /* _ONEDPL_ASYNC_UTILS_H */ diff --git a/include/oneapi/dpl/internal/async_impl/glue_async_impl.h b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h new file mode 100644 index 000000000000..e6b25e0e7c90 --- /dev/null +++ b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h @@ -0,0 +1,165 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_GLUE_ASYNC_IMPL_H +#define _ONEDPL_GLUE_ASYNC_IMPL_H + +#if _ONEDPL_HETERO_BACKEND +# include "async_impl_hetero.h" +#endif + +namespace oneapi +{ +namespace dpl +{ +namespace experimental +{ + +// [async.transform] +template <class _ExecutionPolicy, class _ForwardIterator1, class _ForwardIterator2, class _UnaryOperation, + class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIterator2>, _Events...> +transform_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterator1 __last, + _ForwardIterator2 __result, _UnaryOperation __op, _Events&&... __dependencies) +{ + wait_for_all(::std::forward<_Events>(__dependencies)...); + auto ret_val = oneapi::dpl::__internal::__pattern_walk2_async( + ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result, + oneapi::dpl::__internal::__invoke_unary_op<_UnaryOperation>{::std::move(__op)}); + return ret_val; +} + +template <class _ExecutionPolicy, class _ForwardIterator1, class _ForwardIterator2, class _ForwardIterator, + class _BinaryOperation, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIterator>, _BinaryOperation, _Events...> +transform_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _ForwardIterator __result, _BinaryOperation __op, + _Events&&... __dependencies) +{ + wait_for_all(::std::forward<_Events>(__dependencies)...); + auto ret_val = oneapi::dpl::__internal::__pattern_walk3_async( + ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __result, + oneapi::dpl::__internal::__transform_functor< + oneapi::dpl::__internal::__ref_or_copy<_ExecutionPolicy, _BinaryOperation>>(__op)); + return ret_val; +} + +// [async.copy] +template <class _ExecutionPolicy, class _ForwardIterator1, class _ForwardIterator2, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIterator2>, _Events...> +copy_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __result, + _Events&&... __dependencies) +{ + wait_for_all(::std::forward<_Events>(__dependencies)...); + auto ret_val = oneapi::dpl::__internal::__pattern_walk2_brick_async( + ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result, + oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{}); + return ret_val; +} + +// [async.sort] +template <class _ExecutionPolicy, class _Iterator, class _Compare, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default< + _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Compare, _Events...> +sort_async(_ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _Compare __comp, _Events&&... __dependencies) +{ + wait_for_all(::std::forward<_Events>(__dependencies)...); + if (__last - __first < 2) + return oneapi::dpl::__par_backend_hetero::__future<void>(sycl::event{}); + + auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>(); + auto __buf = __keep(__first, __last); + + return __par_backend_hetero::__parallel_stable_sort(::std::forward<_ExecutionPolicy>(__exec), __buf.all_view(), + __comp); +} + +// [async.for_each] +template <class _ExecutionPolicy, class _ForwardIterator, class _Function, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...> +for_each_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Function __f, + _Events&&... __dependencies) +{ + wait_for_all(::std::forward<_Events>(__dependencies)...); + auto ret_val = + oneapi::dpl::__internal::__pattern_walk1_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last, __f); + return ret_val; +} + +// [async.reduce] + +template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_double_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>, _Tp, _BinaryOperation, _Events...> +reduce_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Tp __init, + _BinaryOperation __binary_op, _Events&&... __dependencies) +{ + wait_for_all(::std::forward<_Events>(__dependencies)...); + typedef typename ::std::iterator_traits<_ForwardIterator>::value_type _InputType; + auto ret_val = oneapi::dpl::__internal::__pattern_transform_reduce_async( + ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __init, ::std::plus<_InputType>(), + oneapi::dpl::__internal::__no_op()); + return ret_val; +} + +// [async.fill] + +template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy< + _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...> +fill_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, + _Events&&... __dependencies) +{ + wait_for_all(::std::forward<_Events>(__dependencies)...); + return oneapi::dpl::__internal::__pattern_fill_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last, + __value); +} + +// [async.transform_reduce] + +template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _T, class _BinaryOp1, class _BinaryOp2, + class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_double_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _BinaryOp1, _BinaryOp2, _Events...> +transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt1 __first1, _ForwardIt1 __last1, _ForwardIt2 __first2, + _T __init, _BinaryOp1 __binary_op1, _BinaryOp2 __binary_op2, _Events&&... __dependencies) +{ + wait_for_all(::std::forward<_Events>(__dependencies)...); + return oneapi::dpl::__internal::__pattern_transform_reduce_async( + ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __init, __binary_op1, __binary_op2); +} + +template <class _ExecutionPolicy, class _ForwardIt, class _T, class _BinaryOp, class _UnaryOp, class... _Events> +oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default< + _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _UnaryOp, _Events...> +transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _T __init, + _BinaryOp __binary_op, _UnaryOp __unary_op, _Events&&... __dependencies) +{ + wait_for_all(::std::forward<_Events>(__dependencies)...); + return oneapi::dpl::__internal::__pattern_transform_reduce_async(::std::forward<_ExecutionPolicy>(__exec), __first, + __last, __init, __binary_op, __unary_op); +} + +} // namespace experimental + +} // namespace dpl + +} // namespace oneapi + +#endif /* _ONEDPL_GLUE_ASYNC_IMPL_H */ diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h index b871719d4d28..c989ab40c405 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h @@ -280,10 +280,29 @@ struct __ref_or_copy_impl<execution::device_policy<PolicyParams...>, _T> using type = _T; }; +// Extension: check if parameter pack is convertible to events +template <bool...> +struct __is_true_helper +{ +}; + +template <bool... _Ts> +using __is_all_true = ::std::is_same<__is_true_helper<_Ts..., true>, __is_true_helper<true, _Ts...>>; + +template <class... _Ts> +using __is_convertible_to_event = + __is_all_true<::std::is_convertible<typename ::std::decay<_Ts>::type, sycl::event>::value...>; + +template <typename _T, typename... _Events> +using __enable_if_convertible_to_events = + typename ::std::enable_if<oneapi::dpl::__internal::__is_convertible_to_event<_Events...>::value, _T>::type; + // Extension: execution policies type traits -template <typename _ExecPolicy, typename _T> +template <typename _ExecPolicy, typename _T, typename... _Events> using __enable_if_device_execution_policy = typename ::std::enable_if< - oneapi::dpl::__internal::__is_device_execution_policy<typename ::std::decay<_ExecPolicy>::type>::value, _T>::type; + oneapi::dpl::__internal::__is_device_execution_policy<typename ::std::decay<_ExecPolicy>::type>::value && + oneapi::dpl::__internal::__is_convertible_to_event<_Events...>::value, + _T>::type; template <typename _ExecPolicy, typename _T> using __enable_if_hetero_execution_policy = typename ::std::enable_if< diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 81595360d37c..c3dfc4564917 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -386,6 +386,7 @@ class __future_base sycl::event __my_event; public: + __future_base() : __my_event(sycl::event{}) {} __future_base(sycl::event __e) : __my_event(__e) {} void wait() diff --git a/test/xpu_api/asynch.pass.cpp b/test/xpu_api/asynch.pass.cpp new file mode 100644 index 000000000000..0ba6d5cc19e3 --- /dev/null +++ b/test/xpu_api/asynch.pass.cpp @@ -0,0 +1,153 @@ +// -*- C++ -*- +//===-- async.pass.cpp ----------------------------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#include "oneapi/dpl/execution" +#include "oneapi/dpl/async" +#include "oneapi/dpl/iterator" + +#include "support/pstl_test_config.h" + +#include <iostream> +#include <iomanip> +#include <numeric> + +#if TEST_SYCL_PRESENT +# include <CL/sycl.hpp> +#endif + +template <typename _T1, typename _T2> +void +ASSERT_EQUAL(_T1&& X, _T2&& Y) +{ + if (X != Y) + std::cout << "CHECK CORRECTNESS (ASYNC): fail (" << X << "," << Y << ")" << std::endl; +} + +#if TEST_DPCPP_BACKEND_PRESENT +void +test_with_buffers() +{ + const int n = 100; + { + //sycl::queue q; + sycl::buffer<int> x{n}; + sycl::buffer<int> y{n}; + + //auto my_policy = oneapi::dpl::execution::make_device_policy(q); + auto my_policy = oneapi::dpl::execution::make_device_policy<class Copy1>(oneapi::dpl::execution::dpcpp_default); + auto res_1a = oneapi::dpl::experimental::copy_async(my_policy, oneapi::dpl::counting_iterator<int>(0), + oneapi::dpl::counting_iterator<int>(n), + oneapi::dpl::begin(x)); // x = [0..n] + auto my_policy1 = oneapi::dpl::execution::make_device_policy<class Fill1>(my_policy); + auto res_1b = oneapi::dpl::experimental::fill_async(my_policy1, oneapi::dpl::begin(y), oneapi::dpl::end(y), + 7); // y = [7..7] + auto my_policy2 = oneapi::dpl::execution::make_device_policy<class ForEach1>(my_policy); + auto res_2a = oneapi::dpl::experimental::for_each_async( + my_policy2, oneapi::dpl::begin(x), oneapi::dpl::end(x), [](int& e) { ++e; }, res_1a); // x = [1..n] + auto my_policy3 = oneapi::dpl::execution::make_device_policy<class Transform1>(my_policy); + auto res_2b = oneapi::dpl::experimental::transform_async( + my_policy3, oneapi::dpl::begin(y), oneapi::dpl::end(y), oneapi::dpl::begin(y), + [](const int& e) { return e / 2; }, + res_1b); // y = [3..3] + + sycl::buffer<int> z{n}; //std::vector<int> z(n); + auto my_policy4 = oneapi::dpl::execution::make_device_policy<class Transform2>(my_policy); + auto res_3 = oneapi::dpl::experimental::transform_async(my_policy4, oneapi::dpl::begin(x), oneapi::dpl::end(x), + oneapi::dpl::begin(y), oneapi::dpl::begin(z), + std::plus<int>(), res_2a, res_2b); // z = [4..n+3] + auto my_policy5 = oneapi::dpl::execution::make_device_policy<class Reduce1>(my_policy); + auto alpha = oneapi::dpl::experimental::reduce_async(my_policy5, oneapi::dpl::begin(x), oneapi::dpl::end(x), 0, + std::plus<int>(), + res_2a) + .get(); // alpha = n*(n+1)/2 + auto my_policy6 = oneapi::dpl::execution::make_device_policy<class Reduce2>(my_policy); + auto beta = + oneapi::dpl::experimental::transform_reduce_async(my_policy6, oneapi::dpl::begin(z), oneapi::dpl::end(z), 0, + std::plus<int>(), [=](int e) { return alpha * e; }) + .get(); + + ASSERT_EQUAL(beta, (n * (n + 1) / 2) * ((n + 3) * (n + 4) / 2 - 6)); + } +} + +void +test_with_usm() +{ + cl::sycl::queue q; + const int n = 1024; + const int n_small = 13; + + // ASYNC TEST USING USM // + // TODO: Extend tests by checking true async behavior in more detail + { + // Allocate space for data using USM. + uint64_t* data1 = + static_cast<uint64_t*>(cl::sycl::malloc_shared(n * sizeof(uint64_t), q.get_device(), q.get_context())); + uint64_t* data2 = + static_cast<uint64_t*>(cl::sycl::malloc_shared(n * sizeof(uint64_t), q.get_device(), q.get_context())); + + // Initialize data + for (int i = 0; i != n - 1; ++i) + { + data1[i] = i % 4 + 1; + data2[i] = data1[i] + 1; + if (i > 3 && i != n - 2) + { + ++i; + data1[i] = data1[i - 1]; + data2[i] = data2[i - 1]; + } + } + data1[n - 1] = 0; + data2[n - 1] = 0; + + // compute reference values + const uint64_t ref1 = std::inner_product(data2, data2 + n, data1, 0); + const uint64_t ref2 = std::accumulate(data1, data1 + n_small, 0); + + // call first algorithm + auto new_policy1 = oneapi::dpl::execution::make_device_policy<class async1>(q); + auto fut1 = oneapi::dpl::experimental::transform_reduce_async( + new_policy1, data2, data2 + n, data1, 0, std::plus<uint64_t>(), std::multiplies<uint64_t>()); + + // call second algorithm and wait for result + auto new_policy2 = oneapi::dpl::execution::make_device_policy<class async2>(q); + auto res2 = oneapi::dpl::experimental::reduce_async(new_policy2, data1, data1 + n_small).get(); + + // call third algorithm that has to wait for first to complete + auto new_policy3 = oneapi::dpl::execution::make_device_policy<class async3>(q); + oneapi::dpl::experimental::sort_async(new_policy3, data2, data2 + n, fut1); + + // check values + auto res1 = fut1.get(); + ASSERT_EQUAL(res1, ref1); + ASSERT_EQUAL(res2, ref2); + + sycl::free(data1, q); + sycl::free(data2, q); + } +} +#endif + +int +main() +{ +#if TEST_DPCPP_BACKEND_PRESENT + test_with_buffers(); + test_with_usm(); +#endif + std::cout << "done" << std::endl; + return 0; +} |