Skip to content

Commit

Permalink
Increase speed and precision of cube root
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jun 16, 2024
1 parent 050bb5b commit 329cc05
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 36 deletions.
1 change: 0 additions & 1 deletion src/app/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

use std::time::Instant;

use image::io::Reader as ImageReader;
Expand Down
2 changes: 1 addition & 1 deletion src/avx/math.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ pub unsafe fn _mm256_exp_ps_ulp_1_5<const HANDLE_NAN: bool>(x: __m256) -> __m256
let max_input = _mm256_set1_ps(88.37f32); // Approximately ln(2^127.5)
let zero = _mm256_set1_ps(0f32);
let min_input = _mm256_set1_ps(-86.64f32); // Approximately ln(2^-125)
// Handle underflow and overflow.
// Handle underflow and overflow.
poly = _mm256_select_ps(_mm256_cmp_ps::<_CMP_LT_OS>(x, min_input), zero, poly);
poly = _mm256_select_ps(_mm256_cmp_ps::<_CMP_GT_OS>(x, max_input), inf, poly);
}
Expand Down
20 changes: 10 additions & 10 deletions src/avx/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,27 @@
* // license that can be found in the LICENSE file.
*/

mod to_xyz_lab;
mod utils;
mod color;
mod from_sigmoidal;
mod gamma_curves;
mod linear_to_image;
mod math;
mod sigmoidal;
mod support;
mod xyz_lab_to_image;
mod linear_to_image;
mod xyza_laba_to_image;
mod to_linear;
mod sigmoidal;
mod to_sigmoidal;
mod from_sigmoidal;
mod to_xyz_lab;
mod utils;
mod xyz_lab_to_image;
mod xyza_laba_to_image;

pub use from_sigmoidal::avx_from_sigmoidal_row;
pub use linear_to_image::avx_linear_to_gamma;
pub use math::*;
pub use support::*;
pub use to_linear::avx_channels_to_linear;
pub use to_sigmoidal::avx_image_to_sigmoidal_row;
pub use to_xyz_lab::*;
pub use utils::*;
pub use xyz_lab_to_image::*;
pub use xyza_laba_to_image::*;
pub use to_linear::avx_channels_to_linear;
pub use to_sigmoidal::avx_image_to_sigmoidal_row;
pub use from_sigmoidal::avx_from_sigmoidal_row;
3 changes: 1 addition & 2 deletions src/avx/sigmoidal.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@

use crate::avx::{_mm256_exp_ps, _mm256_log_ps, _mm256_neg_ps, _mm256_select_ps};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use crate::avx::{_mm256_exp_ps, _mm256_log_ps, _mm256_neg_ps, _mm256_select_ps};

#[inline(always)]
pub(crate) unsafe fn avx_color_to_sigmoidal(x: __m256) -> __m256 {
Expand Down
4 changes: 4 additions & 0 deletions src/image_to_sigmoidal.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
#[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "avx2"
))]
use crate::avx::avx_image_to_sigmoidal_row;
use std::slice;

Expand Down
26 changes: 6 additions & 20 deletions src/neon/math.rs
Original file line number Diff line number Diff line change
Expand Up @@ -401,27 +401,15 @@ pub(crate) unsafe fn vmlafq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t)
prefer_vfmaq_f32(c, b, a)
}

#[cfg(all(
any(target_arch = "aarch64", target_arch = "arm"),
target_feature = "neon"
))]
#[inline(always)]
#[allow(dead_code)]
/// This is a cube root computed using pow functions.
/// It is also precise; however, due to the inexact nature of the power 1/3, the result differs slightly
/// from the real cbrt by about ULP 3-4, but this is almost 2 times faster than the cbrt with real ULP 3.5
pub unsafe fn vcbrtq_f32(d: float32x4_t) -> float32x4_t {
vpowq_n_f32(d, 1f32 / 3f32)
vcbrtq_f32_ulp2::<false>(d)
}

#[cfg(all(
any(target_arch = "aarch64", target_arch = "arm"),
target_feature = "neon"
))]
#[inline(always)]
#[allow(dead_code)]
/// Precise version of Cube Root with ULP 2
pub unsafe fn vcbrtq_f32_ulp2(x: float32x4_t) -> float32x4_t {
pub unsafe fn vcbrtq_f32_ulp2<const HANDLE_NAN: bool>(x: float32x4_t) -> float32x4_t {
let x1p24 = vreinterpretq_f32_u32(vdupq_n_u32(0x4b800000)); // 0x1p24f === 2 ^ 24

let mut ui = vreinterpretq_u32_f32(x);
Expand Down Expand Up @@ -462,15 +450,13 @@ pub unsafe fn vcbrtq_f32_ulp2(x: float32x4_t) -> float32x4_t {
vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)),
t,
);
t = vbslq_f32(nan_mask, vdupq_n_f32(f32::NAN), t);
t = vbslq_f32(is_zero_mask, vdupq_n_f32(0f32), t);
if HANDLE_NAN {
t = vbslq_f32(nan_mask, vdupq_n_f32(f32::NAN), t);
t = vbslq_f32(is_zero_mask, vdupq_n_f32(0f32), t);
}
t
}

#[cfg(all(
any(target_arch = "aarch64", target_arch = "arm"),
target_feature = "neon"
))]
#[inline(always)]
#[allow(dead_code)]
/// Precise version of Cube Root with ULP 3.5
Expand Down
7 changes: 5 additions & 2 deletions src/sigmoidal_to_image.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::slice;

#[cfg(all(
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "avx2"
))]
use crate::avx::avx_from_sigmoidal_row;
use crate::image::ImageConfiguration;
#[cfg(all(
Expand All @@ -13,6 +15,7 @@ use crate::neon::neon_from_sigmoidal_row;
))]
use crate::sse::sse_from_sigmoidal_row;
use crate::{Rgb, Sigmoidal};
use std::slice;

#[inline]
fn sigmoidal_to_image<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
Expand Down

0 comments on commit 329cc05

Please sign in to comment.