Rollback exp 1.5 ulp

awxkee · Jun 16, 2024 · 355a4d8 · 355a4d8
1 parent 9390e06
commit 355a4d8
Show file tree

Hide file tree

Showing 7 changed files with 78 additions and 21 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,7 +2,7 @@ workspace = { members = ["src/app"] }
 
 [package]
 name = "colorutils-rs"
-version = "0.4.6"
+version = "0.4.7"
 edition = "2021"
 description = "High performance utilities for color format handling and conversion."
 readme = "README.md"

diff --git a/src/app/src/main.rs b/src/app/src/main.rs
@@ -59,7 +59,7 @@ fn main() {
         lab_store.resize(width as usize * components * height as usize, 0f32);
         let src_stride = width * components as u32;
         let start_time = Instant::now();
-        rgb_to_sigmoidal(
+        rgb_to_lab(
             src_bytes,
             src_stride,
             &mut lab_store,
@@ -93,7 +93,7 @@ fn main() {
         // }
 
         let start_time = Instant::now();
-        sigmoidal_to_rgb(
+        lab_to_srgb(
             &lab_store,
             store_stride as u32,
             &mut dst_slice,

diff --git a/src/avx/math.rs b/src/avx/math.rs
@@ -105,7 +105,68 @@ pub unsafe fn _mm256_select_si256(
 
 #[inline(always)]
 pub unsafe fn _mm256_exp_ps(x: __m256) -> __m256 {
-    _mm256_exp_ps_impl::<false>(x)
+    _mm256_exp_ps_ulp_1_5::<false>(x)
+}
+
+#[inline(always)]
+pub unsafe fn _mm256_exp_ps_ulp_1_5<const HANDLE_NAN: bool>(x: __m256) -> __m256 {
+    let c1 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
+    let c2 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
+    let c3 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3e2aaf33)); // x^3: 0x1.555e66p-3f
+    let c4 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3d2b9f17)); // x^4: 0x1.573e2ep-5f
+    let c5 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3c072010)); // x^5: 0x1.0e4020p-7f
+
+    let shift = _mm256_castsi256_ps(_mm256_set1_epi32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+    let inv_ln2 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+    let neg_ln2_hi = _mm256_castsi256_ps(_mm256_set1_epi32(-1087278592i32)); // -ln(2) from bits  -1 to -19: -0x1.62e400p-1f
+    let neg_ln2_lo = _mm256_castsi256_ps(_mm256_set1_epi32(-1245725042i32)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+
+    // Range reduction:
+    //   e^x = 2^n * e^r
+    // where:
+    //   n = floor(x / ln(2))
+    //   r = x - n * ln(2)
+    //
+    // By adding x / ln(2) with 2^23 + 127 (shift):
+    //   * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127 forces decimal part
+    //     of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. n) + 127 will occupy
+    //     the whole fraction part of z in FP32 format.
+    //     Subtracting 2^23 + 127 (shift) from z will result in the integer part of x / ln(2)
+    //     (i.e. n) because the decimal part has been pushed out and lost.
+    //   * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
+    //     in FP32 format. Left shifting z by 23 bits will result in 2^n.
+    let z = _mm256_prefer_fma_ps(shift, x, inv_ln2);
+    let n = _mm256_sub_ps(z, shift);
+    let scale = _mm256_castsi256_ps(_mm256_slli_epi32::<23>(_mm256_castps_si256(z))); // 2^n
+
+    // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
+    // This outperforms longer Taylor series (3-4 tabs) both in terms of accuracy and performance.
+    let r_hi = _mm256_prefer_fma_ps(x, n, neg_ln2_hi);
+    let r = _mm256_prefer_fma_ps(r_hi, n, neg_ln2_lo);
+
+    // Compute the truncated Taylor series of e^r.
+    //   poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
+    let r2 = _mm256_mul_ps(r, r);
+
+    let p1 = _mm256_mul_ps(c1, r);
+    let p23 = _mm256_prefer_fma_ps(c2, c3, r);
+    let p45 = _mm256_prefer_fma_ps(c4, c5, r);
+    let p2345 = _mm256_prefer_fma_ps(p23, p45, r2);
+    let p12345 = _mm256_prefer_fma_ps(p1, p2345, r2);
+
+    let mut poly = _mm256_prefer_fma_ps(scale, p12345, scale);
+
+    if HANDLE_NAN {
+        let inf = _mm256_set1_ps(f32::INFINITY);
+        let max_input = _mm256_set1_ps(88.37f32); // Approximately ln(2^127.5)
+        let zero = _mm256_set1_ps(0f32);
+        let min_input = _mm256_set1_ps(-86.64f32); // Approximately ln(2^-125)
+        // Handle underflow and overflow.
+        poly = _mm256_select_ps(_mm256_cmp_ps::<_CMP_LT_OS>(x, min_input), zero, poly);
+        poly = _mm256_select_ps(_mm256_cmp_ps::<_CMP_GT_OS>(x, max_input), inf, poly);
+    }
+
+    return poly;
 }
 
 #[inline(always)]
@@ -245,7 +306,7 @@ pub(crate) unsafe fn _mm256_neg_epi32(x: __m256i) -> __m256i {
 
 #[inline(always)]
 /// This is Cube Root using Pow functions,
-/// it also precise however due to of inexact nature of power 1/3 result slightly differ
+/// it is also precise however due to of inexact nature of power 1/3 result slightly differ
 /// from real cbrt with about ULP 3-4, but this is almost 2 times faster than cbrt with real ULP 3.5
 pub unsafe fn _mm256_cbrt_ps(d: __m256) -> __m256 {
     _mm256_pow_n_ps(d, 1f32 / 3f32)

diff --git a/src/avx/xyz_lab_to_image.rs b/src/avx/xyz_lab_to_image.rs
@@ -68,11 +68,10 @@ unsafe fn avx_xyz_lab_vld<
     r_f32 = _mm256_mul_ps(r_f32, v_scale_color);
     g_f32 = _mm256_mul_ps(g_f32, v_scale_color);
     b_f32 = _mm256_mul_ps(b_f32, v_scale_color);
-    const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
     (
-        _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(r_f32)),
-        _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(g_f32)),
-        _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(b_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)),
     )
 }
 
@@ -204,28 +203,27 @@ pub unsafe fn avx_xyz_to_channels<
         let dst_ptr = dst.add(dst_offset + cx * channels);
 
         if USE_ALPHA {
-            const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
             let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx);
             let a_low_0_f = _mm256_loadu_ps(offset_a_src_ptr);
-            let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(_mm256_mul_ps(
+            let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps(
                 a_low_0_f,
                 color_rescale,
             )));
 
             let a_low_1_f = _mm256_loadu_ps(offset_a_src_ptr.add(8));
-            let a_row1_ = _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(_mm256_mul_ps(
+            let a_row1_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps(
                 a_low_1_f,
                 color_rescale,
             )));
 
             let a_low_2_f = _mm256_loadu_ps(offset_a_src_ptr.add(16));
-            let a_row2_ = _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(_mm256_mul_ps(
+            let a_row2_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps(
                 a_low_2_f,
                 color_rescale,
             )));
 
             let a_low_3_f = _mm256_loadu_ps(offset_a_src_ptr.add(24));
-            let a_row3_ = _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(_mm256_mul_ps(
+            let a_row3_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps(
                 a_low_3_f,
                 color_rescale,
             )));

diff --git a/src/avx/xyza_laba_to_image.rs b/src/avx/xyza_laba_to_image.rs
@@ -67,12 +67,11 @@ unsafe fn avx_xyza_lab_vld<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
     g_f32 = _mm256_mul_ps(g_f32, v_scale_color);
     b_f32 = _mm256_mul_ps(b_f32, v_scale_color);
     let a_f32 = _mm256_mul_ps(a_f32, v_scale_color);
-    const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
     (
-        _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(r_f32)),
-        _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(g_f32)),
-        _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(b_f32)),
-        _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(a_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(a_f32)),
     )
 }
 

diff --git a/src/sse/math.rs b/src/sse/math.rs
@@ -284,7 +284,6 @@ pub unsafe fn _mm_cbrt_ps(d: __m128) -> __m128 {
     _mm_pow_n_ps(d, 1f32 / 3f32)
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 #[allow(dead_code)]
 /// Precise version of Cube Root, ULP 3.5