@@ -483,27 +483,25 @@ struct __parallel_transform_scan_static_single_group_submitter<_Inclusive, _Elem
     }
 };
 
-template <typename _Size, std::uint16_t _ElemsPerItem, std::uint16_t _WGSize, typename _KernelName>
+template <typename _Size, typename _KernelName>
 struct __parallel_copy_if_static_single_group_submitter;
 
-template <typename _Size, std::uint16_t _ElemsPerItem, std::uint16_t _WGSize, typename... _ScanKernelName>
-struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _WGSize,
-                                                        __internal::__optional_kernel_name<_ScanKernelName...>>
+template <typename _Size, typename... _ScanKernelName>
+struct __parallel_copy_if_static_single_group_submitter<_Size, __internal::__optional_kernel_name<_ScanKernelName...>>
 {
     template <typename _InRng, typename _OutRng, typename _UnaryOp, typename _Assign>
     __future<sycl::event, __result_and_scratch_storage<_Size>>
     operator()(sycl::queue& __q, _InRng&& __in_rng, _OutRng&& __out_rng, std::size_t __n, _UnaryOp __unary_op,
-               _Assign __assign)
+               _Assign __assign, std::uint16_t __n_uniform, std::uint16_t __wg_size)
     {
-        using _ValueType = ::std::uint16_t;
+        using _ValueType = std::uint16_t;
 
-        // This type is used as a workaround for when an internal tuple is assigned to ::std::tuple, such as
+        // This type is used as a workaround for when an internal tuple is assigned to std::tuple, such as
         // with zip_iterator
         using __tuple_type =
             typename ::oneapi::dpl::__internal::__get_tuple_type<std::decay_t<decltype(__in_rng[0])>,
                                                                  std::decay_t<decltype(__out_rng[0])>>::__type;
 
-        constexpr ::std::uint32_t __elems_per_wg = _ElemsPerItem * _WGSize;
         using __result_and_scratch_storage_t = __result_and_scratch_storage<_Size>;
         __result_and_scratch_storage_t __result{__q, 0};
 
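Note on the change above: the compile-time `_ElemsPerItem`/`_WGSize` template parameters become runtime arguments (`__n_uniform`, `__wg_size`), so one kernel instantiation can serve every size the single-group path accepts. This relies on SYCL 2020 local accessors being sized at submission time. A minimal standalone sketch of that mechanism, with illustrative values rather than oneDPL code:

```cpp
#include <cstddef>
#include <cstdint>
#include <sycl/sycl.hpp>

int main()
{
    sycl::queue __q;
    // Runtime values playing the role of __n_uniform and __wg_size.
    const std::uint16_t __n_uniform = 2048, __wg_size = 1024;
    __q.submit([&](sycl::handler& __hdl) {
        // SLM sized at submission time: two halves of __n_uniform elements each,
        // mirroring the flags/offsets layout used by the submitter below.
        sycl::local_accessor<std::uint16_t, 1> __lacc(sycl::range<1>(std::size_t(__n_uniform) * 2), __hdl);
        __hdl.parallel_for(sycl::nd_range<1>(__wg_size, __wg_size), [=](sycl::nd_item<1> __item) {
            __lacc[__item.get_local_linear_id()] = 0; // kernel body elided
        });
    });
    __q.wait();
}
```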
@@ -513,37 +511,37 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
             // Local memory is split into two parts. The first half stores the result of applying the
             // predicate on each element of the input range. The second half stores the index of the output
             // range to copy elements of the input range.
-            auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg * 2}, __hdl);
+            auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>(std::size_t(__n_uniform) * 2), __hdl);
             auto __res_acc =
                 __result.template __get_result_acc<sycl::access_mode::write>(__hdl, __dpl_sycl::__no_init{});
 
             __hdl.parallel_for<_ScanKernelName...>(
-                sycl::nd_range<1>(_WGSize, _WGSize), [=](sycl::nd_item<1> __self_item) {
+                sycl::nd_range<1>(__wg_size, __wg_size), [=](sycl::nd_item<1> __self_item) {
                     auto __res_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__res_acc);
                     const auto& __group = __self_item.get_group();
                     // This kernel is only launched for sizes less than 2^16
-                    const ::std::uint16_t __item_id = __self_item.get_local_linear_id();
+                    const std::uint16_t __item_id = __self_item.get_local_linear_id();
                     auto __lacc_ptr = __dpl_sycl::__get_accessor_ptr(__lacc);
-                    for (std::uint16_t __idx = __item_id; __idx < __n; __idx += _WGSize)
+                    for (std::uint16_t __idx = __item_id; __idx < __n; __idx += __wg_size)
                     {
                         __lacc[__idx] = __unary_op(__in_rng[__idx]);
                     }
 
                     __scan_work_group<_ValueType, /* _Inclusive */ false>(
-                        __group, __lacc_ptr, __lacc_ptr + __elems_per_wg, __lacc_ptr + __elems_per_wg,
+                        __group, __lacc_ptr, __lacc_ptr + __n, __lacc_ptr + __n_uniform,
                         sycl::plus<_ValueType>{});
 
-                    for (::std::uint16_t __idx = __item_id; __idx < __n; __idx += _WGSize)
+                    for (std::uint16_t __idx = __item_id; __idx < __n; __idx += __wg_size)
                     {
                         if (__lacc[__idx])
                             __assign(static_cast<__tuple_type>(__in_rng[__idx]),
-                                     __out_rng[__lacc[__idx + __elems_per_wg]]);
+                                     __out_rng[__lacc[__idx + __n_uniform]]);
                     }
 
                     if (__item_id == 0)
                     {
                         // Add predicate of last element to account for the scan's exclusivity
-                        *__res_ptr = __lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1];
+                        *__res_ptr = __lacc[__n_uniform + __n - 1] + __lacc[__n - 1];
                     }
                 });
         });
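For readers following the kernel logic: the first half of `__lacc` holds predicate flags, the second half holds exclusive-scan offsets, and the scatter plus the final `flags[n-1]` correction together reproduce copy_if. A host-side model of the same three steps, a sketch for illustration rather than library code:

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Host-side model of the single-work-group kernel: __flags mirrors the first
// half of __lacc, __offsets mirrors the second half.
template <typename _T, typename _Pred>
std::uint16_t
copy_if_model(const std::vector<_T>& __in, std::vector<_T>& __out, _Pred __pred)
{
    const std::uint16_t __n = static_cast<std::uint16_t>(__in.size());
    std::vector<std::uint16_t> __flags(__n), __offsets(__n);
    for (std::uint16_t __i = 0; __i < __n; ++__i)
        __flags[__i] = __pred(__in[__i]) ? 1 : 0; // predicate pass
    // Exclusive scan: __offsets[i] is the output slot of a kept element.
    std::exclusive_scan(__flags.begin(), __flags.end(), __offsets.begin(), std::uint16_t{0});
    for (std::uint16_t __i = 0; __i < __n; ++__i)
        if (__flags[__i])
            __out[__offsets[__i]] = __in[__i]; // scatter
    // The scan is exclusive, so add the last flag back to get the count.
    return __n == 0 ? 0 : __offsets[__n - 1] + __flags[__n - 1];
}
```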
@@ -726,30 +724,6 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag, _Execut
         unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init});
 }
 
-template <typename _CustomName, typename _SizeType>
-struct __invoke_single_group_copy_if
-{
-    // Specialization for devices that have a max work-group size of at least 1024
-    static constexpr ::std::uint16_t __targeted_wg_size = 1024;
-
-    template <std::uint16_t _Size, typename _InRng, typename _OutRng, typename _Pred,
-              typename _Assign = oneapi::dpl::__internal::__pstl_assign>
-    auto
-    operator()(sycl::queue& __q, std::size_t __n, _InRng&& __in_rng, _OutRng&& __out_rng, _Pred __pred,
-               _Assign __assign)
-    {
-        constexpr ::std::uint16_t __wg_size = ::std::min(_Size, __targeted_wg_size);
-        constexpr ::std::uint16_t __num_elems_per_item = ::oneapi::dpl::__internal::__dpl_ceiling_div(_Size, __wg_size);
-
-        using _KernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
-            __scan_copy_single_wg_kernel<std::integral_constant<std::uint16_t, __wg_size>,
-                                         std::integral_constant<std::uint16_t, __num_elems_per_item>, _CustomName>>;
-        return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter<
-            _SizeType, __num_elems_per_item, __wg_size, _KernelName>()
-            (__q, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __pred, __assign);
-    }
-};
-
 template <typename _CustomName, typename _InRng, typename _OutRng, typename _Size, typename _GenMask, typename _WriteOp,
           typename _IsUniquePattern>
 __future<sycl::event, __result_and_scratch_storage<_Size>>
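Context on the deletion above: `__invoke_single_group_copy_if` existed only to feed `__static_monotonic_dispatcher`, which walks a list of compile-time size breakpoints and instantiates the callee (and therefore a kernel) once per breakpoint. A simplified sketch of that dispatch idiom, with illustrative names rather than the library's API, shows why the old path compiled one kernel per breakpoint where the new path compiles one total:

```cpp
#include <cstddef>
#include <cstdint>

// Pick the first compile-time breakpoint that covers __n and instantiate the
// callee with it; every breakpoint yields a distinct template instantiation.
// Assumes the callee returns the same type for every breakpoint.
template <std::uint16_t _First, std::uint16_t... _Rest, typename _F>
auto
dispatch_by_size(std::size_t __n, _F __f)
{
    if constexpr (sizeof...(_Rest) == 0)
        return __f.template operator()<_First>();
    else if (__n <= _First)
        return __f.template operator()<_First>();
    else
        return dispatch_by_size<_Rest...>(__n, __f);
}
// Usage: dispatch_by_size<16, 32, 64, 128, 256, 512, 1024, 2048>(__n, __invoker);
```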
@@ -920,8 +894,6 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
 {
     using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
 
-    using _SingleGroupInvoker = __invoke_single_group_copy_if<_CustomName, _Size>;
-
     // Next power of 2 greater than or equal to __n
     auto __n_uniform = ::oneapi::dpl::__internal::__dpl_bit_ceil(static_cast<std::make_unsigned_t<_Size>>(__n));
 
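Aside: `__dpl_bit_ceil` rounds `__n` up to the next power of two, so the padded `__n_uniform` keeps the work-group scan operating on a power-of-two range. Its behavior matches C++20 `std::bit_ceil`, used here as an assumed equivalent:

```cpp
#include <bit>

static_assert(std::bit_ceil(1u) == 1u);       // already a power of two
static_assert(std::bit_ceil(1500u) == 2048u); // rounded up
static_assert(std::bit_ceil(2048u) == 2048u); // exact powers are unchanged
```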
@@ -934,18 +906,18 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPoli
     // The kernel stores n integers for the predicate and another n integers for the offsets
     const auto __req_slm_size = sizeof(std::uint16_t) * __n_uniform * 2;
 
-    constexpr std::uint16_t __single_group_upper_limit = 2048;
+    // constexpr std::uint16_t __single_group_upper_limit = 2048;
+    constexpr std::uint16_t __max_elem_per_item = 2;
 
     std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__q_local);
 
-    if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size &&
-        __max_wg_size >= _SingleGroupInvoker::__targeted_wg_size)
+    if (__n <= __max_wg_size * __max_elem_per_item && __max_slm_size >= __req_slm_size)
     {
-        using _SizeBreakpoints = std::integer_sequence<std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>;
-
-        return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch(
-            _SingleGroupInvoker{}, __n, __q_local, __n, std::forward<_InRng>(__in_rng),
-            std::forward<_OutRng>(__out_rng), __pred, __assign);
+        using _KernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+            __scan_copy_single_wg_kernel<_CustomName>>;
+        return __par_backend_hetero::__parallel_copy_if_static_single_group_submitter<_Size, _KernelName>()(
+            __q_local, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, __pred, __assign,
+            static_cast<std::uint16_t>(__n_uniform), static_cast<std::uint16_t>(std::min(__n_uniform, __max_wg_size)));
     }
     else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__q_local))
     {
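Worked example of the new launch gate: with `__max_elem_per_item = 2`, the single-group path now covers any `__n` up to twice the device's maximum work-group size, provided the padded SLM footprint fits. The numbers below assume a device with a 1024-item work-group limit and 64 KiB of SLM (illustrative values, not queried from a real device):

```cpp
#include <algorithm>
#include <bit>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    const std::size_t __max_wg_size = 1024;       // assumed device limit
    const std::size_t __max_slm_size = 64 * 1024; // assumed SLM capacity, bytes
    const std::size_t __n = 1500;
    const std::size_t __n_uniform = std::bit_ceil(__n);                         // 2048
    const std::size_t __req_slm_size = sizeof(std::uint16_t) * __n_uniform * 2; // 8192 bytes
    const bool __single_group =
        __n <= __max_wg_size * 2 /* __max_elem_per_item */ && __max_slm_size >= __req_slm_size;
    std::cout << std::boolalpha << __single_group << '\n'; // prints: true
    // The submitter then receives __wg_size = min(__n_uniform, __max_wg_size) = 1024,
    // so each work-item strides over at most two of the 2048 padded slots.
}
```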