Increase speed and precision of cube root

awxkee · Jun 16, 2024 · 329cc05 · 329cc05
1 parent 050bb5b
commit 329cc05
Show file tree

Hide file tree

Showing 7 changed files with 27 additions and 36 deletions.
diff --git a/src/app/src/main.rs b/src/app/src/main.rs
@@ -1,4 +1,3 @@
-
 use std::time::Instant;
 
 use image::io::Reader as ImageReader;

diff --git a/src/avx/math.rs b/src/avx/math.rs
@@ -161,7 +161,7 @@ pub unsafe fn _mm256_exp_ps_ulp_1_5<const HANDLE_NAN: bool>(x: __m256) -> __m256
         let max_input = _mm256_set1_ps(88.37f32); // Approximately ln(2^127.5)
         let zero = _mm256_set1_ps(0f32);
         let min_input = _mm256_set1_ps(-86.64f32); // Approximately ln(2^-125)
-        // Handle underflow and overflow.
+                                                   // Handle underflow and overflow.
         poly = _mm256_select_ps(_mm256_cmp_ps::<_CMP_LT_OS>(x, min_input), zero, poly);
         poly = _mm256_select_ps(_mm256_cmp_ps::<_CMP_GT_OS>(x, max_input), inf, poly);
     }

diff --git a/src/avx/mod.rs b/src/avx/mod.rs
@@ -5,27 +5,27 @@
  * // license that can be found in the LICENSE file.
  */
 
-mod to_xyz_lab;
-mod utils;
 mod color;
+mod from_sigmoidal;
 mod gamma_curves;
+mod linear_to_image;
 mod math;
+mod sigmoidal;
 mod support;
-mod xyz_lab_to_image;
-mod linear_to_image;
-mod xyza_laba_to_image;
 mod to_linear;
-mod sigmoidal;
 mod to_sigmoidal;
-mod from_sigmoidal;
+mod to_xyz_lab;
+mod utils;
+mod xyz_lab_to_image;
+mod xyza_laba_to_image;
 
+pub use from_sigmoidal::avx_from_sigmoidal_row;
 pub use linear_to_image::avx_linear_to_gamma;
 pub use math::*;
 pub use support::*;
+pub use to_linear::avx_channels_to_linear;
+pub use to_sigmoidal::avx_image_to_sigmoidal_row;
 pub use to_xyz_lab::*;
 pub use utils::*;
 pub use xyz_lab_to_image::*;
 pub use xyza_laba_to_image::*;
-pub use to_linear::avx_channels_to_linear;
-pub use to_sigmoidal::avx_image_to_sigmoidal_row;
-pub use from_sigmoidal::avx_from_sigmoidal_row;
diff --git a/src/avx/sigmoidal.rs b/src/avx/sigmoidal.rs
@@ -1,9 +1,8 @@
-
+use crate::avx::{_mm256_exp_ps, _mm256_log_ps, _mm256_neg_ps, _mm256_select_ps};
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
-use crate::avx::{_mm256_exp_ps, _mm256_log_ps, _mm256_neg_ps, _mm256_select_ps};
 
 #[inline(always)]
 pub(crate) unsafe fn avx_color_to_sigmoidal(x: __m256) -> __m256 {

diff --git a/src/image_to_sigmoidal.rs b/src/image_to_sigmoidal.rs
@@ -1,3 +1,7 @@
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    target_feature = "avx2"
+))]
 use crate::avx::avx_image_to_sigmoidal_row;
 use std::slice;
 

diff --git a/src/neon/math.rs b/src/neon/math.rs
@@ -401,27 +401,15 @@ pub(crate) unsafe fn vmlafq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t)
     prefer_vfmaq_f32(c, b, a)
 }
 
-#[cfg(all(
-    any(target_arch = "aarch64", target_arch = "arm"),
-    target_feature = "neon"
-))]
 #[inline(always)]
 #[allow(dead_code)]
-/// This is Cube Root using Pow functions,
-/// it also precise however due to of inexact nature of power 1/3 result slightly differ
-/// from real cbrt with about ULP 3-4, but this is almost 2 times faster than cbrt with real ULP 3.5
 pub unsafe fn vcbrtq_f32(d: float32x4_t) -> float32x4_t {
-    vpowq_n_f32(d, 1f32 / 3f32)
+    vcbrtq_f32_ulp2::<false>(d)
 }
 
-#[cfg(all(
-    any(target_arch = "aarch64", target_arch = "arm"),
-    target_feature = "neon"
-))]
 #[inline(always)]
-#[allow(dead_code)]
 /// Precise version of Cube Root with ULP 2
-pub unsafe fn vcbrtq_f32_ulp2(x: float32x4_t) -> float32x4_t {
+pub unsafe fn vcbrtq_f32_ulp2<const HANDLE_NAN: bool>(x: float32x4_t) -> float32x4_t {
     let x1p24 = vreinterpretq_f32_u32(vdupq_n_u32(0x4b800000)); // 0x1p24f === 2 ^ 24
 
     let mut ui = vreinterpretq_u32_f32(x);
@@ -462,15 +450,13 @@ pub unsafe fn vcbrtq_f32_ulp2(x: float32x4_t) -> float32x4_t {
         vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)),
         t,
     );
-    t = vbslq_f32(nan_mask, vdupq_n_f32(f32::NAN), t);
-    t = vbslq_f32(is_zero_mask, vdupq_n_f32(0f32), t);
+    if HANDLE_NAN {
+        t = vbslq_f32(nan_mask, vdupq_n_f32(f32::NAN), t);
+        t = vbslq_f32(is_zero_mask, vdupq_n_f32(0f32), t);
+    }
     t
 }
 
-#[cfg(all(
-    any(target_arch = "aarch64", target_arch = "arm"),
-    target_feature = "neon"
-))]
 #[inline(always)]
 #[allow(dead_code)]
 /// Precise version of Cube Root with ULP 3.5

diff --git a/src/sigmoidal_to_image.rs b/src/sigmoidal_to_image.rs
@@ -1,5 +1,7 @@
-use std::slice;
-
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    target_feature = "avx2"
+))]
 use crate::avx::avx_from_sigmoidal_row;
 use crate::image::ImageConfiguration;
 #[cfg(all(
@@ -13,6 +15,7 @@ use crate::neon::neon_from_sigmoidal_row;
 ))]
 use crate::sse::sse_from_sigmoidal_row;
 use crate::{Rgb, Sigmoidal};
+use std::slice;
 
 #[inline]
 fn sigmoidal_to_image<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@

		use std::time::Instant;

		use image::io::Reader as ImageReader;
Expand Down