KernelTuner
diff --git a/‎docs/guides/accuracy.md
Lines changed: 13 additions & 8 deletions b/‎docs/guides/accuracy.md
Lines changed: 13 additions & 8 deletions
diff --git a/‎include/kernel_float/apply.h
Lines changed: 57 additions & 36 deletions b/‎include/kernel_float/apply.h
Lines changed: 57 additions & 36 deletions
diff --git a/‎include/kernel_float/approx.h
Lines changed: 19 additions & 19 deletions b/‎include/kernel_float/approx.h
Lines changed: 19 additions & 19 deletions
diff --git a/‎include/kernel_float/bf16.h
Lines changed: 33 additions & 20 deletions b/‎include/kernel_float/bf16.h
Lines changed: 33 additions & 20 deletions
diff --git a/‎include/kernel_float/binops.h
Lines changed: 11 additions & 7 deletions b/‎include/kernel_float/binops.h
Lines changed: 11 additions & 7 deletions
@@ -25,13 +25,13 @@ kf::vec<float, 4> c = kf::fast_rcp(x);
 kf::vec<float, 4> d = kf::fast_div(a, b);
 ```
 
-These functions are only functional for 32-bit and 16-bit floats. 
+These functions are only functional for 32-bit and 16-bit floats.
 For other input types, the operation falls back to the regular version.
 
 ## Approximate Math
 
-For 16-bit floats, several approximate functions are provided. 
-These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions. 
+For 16-bit floats, several approximate functions are provided.
+These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions.
 This can be very fast but also less accurate.
 
 
@@ -69,14 +69,15 @@ kf::vec<half, 4> a = kf::approx_sin<3>(x);
 
 ## Tuning Accuracy Level
 
-Many functions in Kernel Float accept an additional Accuracy option as a template parameter. 
+Many functions in Kernel Float accept an additional `Accuracy` option as a template parameter.
 This allows you to tune the accuracy level without changing the function name.
 
-There are four possible values for this parameter:
+There are five possible values for this parameter:
 
 - `kf::accurate_policy`: Use the most accurate version of the function available.
 - `kf::fast_policy`: Use the "fast math" version.
-- `kf::approx_policy<N>`: Use the approximate version with degree `N`.
+- `kf::approx_level_policy<N>`: Use the approximate version with accuracy level `N` (higher is more accurate).
+- `kf::approx_policy`: Use the approximate version with a default accuracy level.
 - `kf::default_policy`: Use a global default policy (see the next section).
 
 For example, consider this code:
@@ -97,15 +98,19 @@ kf::vec<float, 2> c = kf::cos<kf::accurate_policy>(input);
 kf::vec<float, 2> d = kf::cos<kf::fast_policy>(input);
 
 // Use the approximate policy
-kf::vec<float, 2> e = kf::cos<kf::approx_policy<3>>(input);
+kf::vec<float, 2> e = kf::cos<kf::approx_policy>(input);
+
+// Use the approximate policy with degree 3 polynomial.
+kf::vec<float, 2> f = kf::cos<kf::approx_level_policy<3>>(input);
 
 // You can use aliases to define your own policy
 using my_own_policy = kf::fast_policy;
-kf::vec<float, 2> f = kf::cos<my_own_policy>(input);
+kf::vec<float, 2> g = kf::cos<my_own_policy>(input);
 ```
 
 ## Setting `default_policy`
 
+If no policy is explicitly set, every function uses `kf::default_policy`.
 By default, `kf::default_policy` is set to `kf::accurate_policy`.
 
 Set the preprocessor option `KERNEL_FLOAT_FAST_MATH=1` to change the default policy to `kf::fast_policy`.
 
@@ -116,10 +116,49 @@ broadcast_like(const V& input, const R& other) {
     return broadcast(input, vector_extent_type<R> {});
 }
 
+/**
+ * The accurate_policy is designed for computations where maximum accuracy is essential. This policy ensures that all
+ * operations are performed without any approximations or optimizations that could potentially alter the precise
+ * outcome of the computations.
+ */
+struct accurate_policy {};
+
+/**
+ * The fast_policy is intended for scenarios where performance and execution speed are more critical than achieving
+ * the utmost accuracy. This policy leverages optimizations to accelerate computations, which may involve
+ * approximations that slightly compromise precision.
+ */
+struct fast_policy {};
+
+/**
+ * This template policy allows developers to specify a custom degree of approximation for their computations. By
+ * adjusting the `Level` parameter, you can fine-tune the balance between accuracy and performance to meet the
+ * specific needs of your application. Higher values mean more precision.
+ */
+template<int Level = -1>
+struct approx_level_policy {};
+
+/**
+ * The approx_policy serves as the default approximation policy, providing a standard level of approximation
+ * without requiring explicit configuration. It balances accuracy and performance, making it suitable for
+ * general-purpose use cases where neither extreme precision nor maximum speed is necessary.
+ */
+using approx_policy = approx_level_policy<>;
+
+#ifndef KERNEL_FLOAT_POLICY
+#define KERNEL_FLOAT_POLICY accurate_policy
+#endif
+
+/**
+ * The `default_policy` acts as the standard computation policy. It can be configured externally using the
+ * `KERNEL_FLOAT_POLICY` macro (a bare policy type name, without a trailing semicolon); it defaults to `accurate_policy`.
+ */
+using default_policy = KERNEL_FLOAT_POLICY;
+
 namespace detail {
 
-template<typename F, size_t N, typename Output, typename... Args>
-struct apply_impl {
+template<typename Policy, typename F, size_t N, typename Output, typename... Args>
+struct apply_base_impl {
     KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) {
 #pragma unroll
         for (size_t i = 0; i < N; i++) {
@@ -128,49 +167,31 @@ struct apply_impl {
     }
 };
 
-template<typename F, size_t N, typename Output, typename... Args>
-struct apply_fastmath_impl: apply_impl<F, N, Output, Args...> {};
-
-template<int Deg, typename F, size_t N, typename Output, typename... Args>
-struct apply_approx_impl: apply_fastmath_impl<F, N, Output, Args...> {};
-}  // namespace detail
-
-struct accurate_policy {
-    template<typename F, size_t N, typename Output, typename... Args>
-    using type = detail::apply_impl<F, N, Output, Args...>;
-};
-
-struct fast_policy {
-    template<typename F, size_t N, typename Output, typename... Args>
-    using type = detail::apply_fastmath_impl<F, N, Output, Args...>;
-};
-
-template<int Degree = -1>
-struct approximate_policy {
-    template<typename F, size_t N, typename Output, typename... Args>
-    using type = detail::apply_approx_impl<Degree, F, N, Output, Args...>;
-};
+template<typename Policy, typename F, size_t N, typename Output, typename... Args>
+struct apply_impl: apply_base_impl<Policy, F, N, Output, Args...> {};
 
-using default_approximate_policy = approximate_policy<>;
+template<typename F, size_t N, typename Output, typename... Args>
+struct apply_base_impl<fast_policy, F, N, Output, Args...>:
+    apply_impl<accurate_policy, F, N, Output, Args...> {};
 
-#ifdef KERNEL_FLOAT_POLICY
-using default_policy = KERNEL_FLOAT_POLICY;
-#else
-using default_policy = accurate_policy;
-#endif
+template<typename F, size_t N, typename Output, typename... Args>
+struct apply_base_impl<approx_policy, F, N, Output, Args...>:
+    apply_impl<fast_policy, F, N, Output, Args...> {};
 
-namespace detail {
+template<int Level, typename F, size_t N, typename Output, typename... Args>
+struct apply_base_impl<approx_level_policy<Level>, F, N, Output, Args...>:
+    apply_impl<approx_policy, F, N, Output, Args...> {};
 
 template<typename Policy, typename F, size_t N, typename Output, typename... Args>
-struct map_policy_impl {
+struct map_impl {
     static constexpr size_t packet_size = preferred_vector_size<Output>::value;
     static constexpr size_t remainder = N % packet_size;
 
     KERNEL_FLOAT_INLINE static void call(F fun, Output* output, const Args*... args) {
         if constexpr (N / packet_size > 0) {
 #pragma unroll
             for (size_t i = 0; i < N - remainder; i += packet_size) {
-                Policy::template type<F, packet_size, Output, Args...>::call(
+                apply_impl<Policy, F, packet_size, Output, Args...>::call(
                     fun,
                     output + i,
                     (args + i)...);
@@ -180,14 +201,14 @@ struct map_policy_impl {
         if constexpr (remainder > 0) {
 #pragma unroll
             for (size_t i = N - remainder; i < N; i++) {
-                Policy::template type<F, 1, Output, Args...>::call(fun, output + i, (args + i)...);
+                apply_impl<Policy, F, 1, Output, Args...>::call(fun, output + i, (args + i)...);
             }
         }
     }
 };
 
 template<typename F, size_t N, typename Output, typename... Args>
-using map_impl = map_policy_impl<default_policy, F, N, Output, Args...>;
+using default_map_impl = map_impl<default_policy, F, N, Output, Args...>;
 
 }  // namespace detail
 
@@ -211,7 +232,7 @@ KERNEL_FLOAT_INLINE map_type<F, Args...> map(F fun, const Args&... args) {
     using E = broadcast_vector_extent_type<Args...>;
     vector_storage<Output, extent_size<E>> result;
 
-    detail::map_policy_impl<Accuracy, F, extent_size<E>, Output, vector_value_type<Args>...>::call(
+    detail::map_impl<Accuracy, F, extent_size<E>, Output, vector_value_type<Args>...>::call(
         fun,
         result.data(),
         (detail::broadcast_impl<vector_value_type<Args>, vector_extent_type<Args>, E>::call(
 
@@ -359,25 +359,25 @@ KERNEL_FLOAT_DEVICE __bfloat162 exp(__bfloat162 arg) {
 #endif
 }  // namespace approx
 
-#define KERNEL_FLOAT_DEFINE_APPROX_FUN(FULL_NAME, FUN, DEG)                        \
-    namespace detail {                                                             \
-    template<int Degree>                                                           \
-    struct apply_approx_impl<Deg, ops::FUN<__half>, 2, __half, __half> {           \
-        KERNEL_FLOAT_INLINE static void                                            \
-        call(ops::FUN<__half> fun, __half* output, const __half* input) {          \
-            __half2 res = approx::FUN<Degree>(__half2 {input[0], input[1]});       \
-            output[0] = res.x;                                                     \
-            output[1] = res.y;                                                     \
-        }                                                                          \
-    };                                                                             \
-    template<>                                                                     \
-    struct apply_approx_impl<-1, ops::FUN<__half>, 2, __half, __half>:             \
-        apply_approx_impl<DEG, ops::FUN<__half>, 2, __half, __half> {};            \
-    }                                                                              \
-                                                                                   \
-    template<typename V>                                                           \
-    KERNEL_FLOAT_INLINE into_vector_type<V> approx_##FUN(const V& args) {          \
-        return map<approximate_policy<>>(ops::FUN<vector_value_type<V>> {}, args); \
+#define KERNEL_FLOAT_DEFINE_APPROX_FUN(FULL_NAME, FUN, DEG)                               \
+    namespace detail {                                                                    \
+    template<int Degree>                                                                  \
+    struct apply_impl<approx_level_policy<Degree>, ops::FUN<__half>, 2, __half, __half> { \
+        KERNEL_FLOAT_INLINE static void                                                   \
+        call(ops::FUN<__half> fun, __half* output, const __half* input) {                 \
+            __half2 res = approx::FUN<Degree>(__half2 {input[0], input[1]});              \
+            output[0] = res.x;                                                            \
+            output[1] = res.y;                                                            \
+        }                                                                                 \
+    };                                                                                    \
+    template<>                                                                            \
+    struct apply_impl<approx_policy, ops::FUN<__half>, 2, __half, __half>:                \
+        apply_impl<approx_level_policy<DEG>, ops::FUN<__half>, 2, __half, __half> {};     \
+    }                                                                                     \
+                                                                                          \
+    template<int Level = -1, typename V>                                                  \
+    KERNEL_FLOAT_INLINE into_vector_type<V> approx_##FUN(const V& args) {                 \
+        return map<approx_level_policy<Level>>(ops::FUN<vector_value_type<V>> {}, args);  \
     }
 
 KERNEL_FLOAT_DEFINE_APPROX_FUN(approx_sin, sin, 4)
 
@@ -61,24 +61,24 @@ struct allow_float_fallback<__bfloat16> {
 };  // namespace detail
 
 #if KERNEL_FLOAT_BF16_OPS_SUPPORTED
-#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)                          \
-    namespace ops {                                                            \
-    template<>                                                                 \
-    struct NAME<__bfloat16> {                                                  \
-        KERNEL_FLOAT_INLINE __bfloat16 operator()(__bfloat16 input) {          \
-            return FUN1(input);                                                \
-        }                                                                      \
-    };                                                                         \
-    }                                                                          \
-    namespace detail {                                                         \
-    template<>                                                                 \
-    struct apply_impl<ops::NAME<__bfloat16>, 2, __bfloat16, __bfloat16> {      \
-        KERNEL_FLOAT_INLINE static void                                        \
-        call(ops::NAME<__bfloat16>, __bfloat16* result, const __bfloat16* a) { \
-            __bfloat162 r = FUN2(__bfloat162 {a[0], a[1]});                    \
-            result[0] = r.x, result[1] = r.y;                                  \
-        }                                                                      \
-    };                                                                         \
+#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)                                      \
+    namespace ops {                                                                        \
+    template<>                                                                             \
+    struct NAME<__bfloat16> {                                                              \
+        KERNEL_FLOAT_INLINE __bfloat16 operator()(__bfloat16 input) {                      \
+            return FUN1(input);                                                            \
+        }                                                                                  \
+    };                                                                                     \
+    }                                                                                      \
+    namespace detail {                                                                     \
+    template<>                                                                             \
+    struct apply_impl<accurate_policy, ops::NAME<__bfloat16>, 2, __bfloat16, __bfloat16> { \
+        KERNEL_FLOAT_INLINE static void                                                    \
+        call(ops::NAME<__bfloat16>, __bfloat16* result, const __bfloat16* a) {             \
+            __bfloat162 r = FUN2(__bfloat162 {a[0], a[1]});                                \
+            result[0] = r.x, result[1] = r.y;                                              \
+        }                                                                                  \
+    };                                                                                     \
     }
 
 KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin)
@@ -115,7 +115,13 @@ KERNEL_FLOAT_BF16_UNARY_FUN(negate, ::__hneg, ::__hneg2)
     }                                                                                        \
     namespace detail {                                                                       \
     template<>                                                                               \
-    struct apply_impl<ops::NAME<__bfloat16>, 2, __bfloat16, __bfloat16, __bfloat16> {        \
+    struct apply_impl<                                                                       \
+        accurate_policy,                                                                     \
+        ops::NAME<__bfloat16>,                                                               \
+        2,                                                                                   \
+        __bfloat16,                                                                          \
+        __bfloat16,                                                                          \
+        __bfloat16> {                                                                        \
         KERNEL_FLOAT_INLINE static void call(                                                \
             ops::NAME<__bfloat16>,                                                           \
             __bfloat16* result,                                                              \
@@ -154,7 +160,14 @@ struct fma<__bfloat16> {
 
 namespace detail {
 template<>
-struct apply_impl<ops::fma<__bfloat16>, 2, __bfloat16, __bfloat16, __bfloat16, __bfloat16> {
+struct apply_impl<
+    accurate_policy,
+    ops::fma<__bfloat16>,
+    2,
+    __bfloat16,
+    __bfloat16,
+    __bfloat16,
+    __bfloat16> {
     KERNEL_FLOAT_INLINE static void call(
         ops::fma<__bfloat16>,
         __bfloat16* result,
 
@@ -52,7 +52,7 @@ KERNEL_FLOAT_INLINE zip_common_type<F, L, R> zip_common(F fun, const L& left, co
 
     vector_storage<O, extent_size<E>> result;
 
-    detail::map_impl<F, extent_size<E>, O, T, T>::call(
+    detail::default_map_impl<F, extent_size<E>, O, T, T>::call(
         fun,
         result.data(),
         detail::convert_impl<vector_value_type<L>, vector_extent_type<L>, T, E>::call(
@@ -290,21 +290,25 @@ struct multiply<bool> {
 };  // namespace ops
 
 namespace detail {
-template<typename T, size_t N>
-struct apply_fastmath_impl<ops::divide<T>, N, T, T, T> {
+template<typename Policy, typename T, size_t N>
+struct apply_impl<Policy, ops::divide<T>, N, T, T, T> {
     KERNEL_FLOAT_INLINE static void
     call(ops::divide<T> fun, T* result, const T* lhs, const T* rhs) {
         T rhs_rcp[N];
 
         // Fast way to perform division is to multiply by the reciprocal
-        apply_fastmath_impl<ops::rcp<T>, N, T, T>::call({}, rhs_rcp, rhs);
-        apply_fastmath_impl<ops::multiply<T>, N, T, T, T>::call({}, result, lhs, rhs_rcp);
+        apply_impl<Policy, ops::rcp<T>, N, T, T>::call({}, rhs_rcp, rhs);
+        apply_impl<Policy, ops::multiply<T>, N, T, T, T>::call({}, result, lhs, rhs_rcp);
     }
 };
 
+template<typename T, size_t N>
+struct apply_impl<accurate_policy, ops::divide<T>, N, T, T, T>:
+    apply_base_impl<accurate_policy, ops::divide<T>, N, T, T, T> {};
+
 #if KERNEL_FLOAT_IS_DEVICE
 template<>
-struct apply_fastmath_impl<ops::divide<float>, 1, float, float, float> {
+struct apply_impl<fast_policy, ops::divide<float>, 1, float, float, float> {
     KERNEL_FLOAT_INLINE static void
     call(ops::divide<float> fun, float* result, const float* lhs, const float* rhs) {
         *result = __fdividef(*lhs, *rhs);
@@ -319,7 +323,7 @@ fast_divide(const L& left, const R& right) {
     using E = broadcast_vector_extent_type<L, R>;
     vector_storage<T, extent_size<E>> result;
 
-    detail::map_policy_impl<fast_policy, ops::divide<T>, extent_size<E>, T, T, T>::call(
+    detail::map_impl<fast_policy, ops::divide<T>, extent_size<E>, T, T, T>::call(
         ops::divide<T> {},
         result.data(),
         detail::convert_impl<vector_value_type<L>, vector_extent_type<L>, T, E>::call(