Skip to content

Commit

Permalink
Rollback exp 1.5 ulp
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jun 16, 2024
1 parent 9390e06 commit 355a4d8
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] }

[package]
name = "colorutils-rs"
version = "0.4.6"
version = "0.4.7"
edition = "2021"
description = "High performance utilities for color format handling and conversion."
readme = "README.md"
Expand Down
4 changes: 2 additions & 2 deletions src/app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ fn main() {
lab_store.resize(width as usize * components * height as usize, 0f32);
let src_stride = width * components as u32;
let start_time = Instant::now();
rgb_to_sigmoidal(
rgb_to_lab(
src_bytes,
src_stride,
&mut lab_store,
Expand Down Expand Up @@ -93,7 +93,7 @@ fn main() {
// }

let start_time = Instant::now();
sigmoidal_to_rgb(
lab_to_srgb(
&lab_store,
store_stride as u32,
&mut dst_slice,
Expand Down
65 changes: 63 additions & 2 deletions src/avx/math.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,68 @@ pub unsafe fn _mm256_select_si256(

#[inline(always)]
pub unsafe fn _mm256_exp_ps(x: __m256) -> __m256 {
_mm256_exp_ps_impl::<false>(x)
_mm256_exp_ps_ulp_1_5::<false>(x)
}

#[inline(always)]
pub unsafe fn _mm256_exp_ps_ulp_1_5<const HANDLE_NAN: bool>(x: __m256) -> __m256 {
let c1 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
let c2 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
let c3 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3e2aaf33)); // x^3: 0x1.555e66p-3f
let c4 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3d2b9f17)); // x^4: 0x1.573e2ep-5f
let c5 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3c072010)); // x^5: 0x1.0e4020p-7f

let shift = _mm256_castsi256_ps(_mm256_set1_epi32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
let inv_ln2 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
let neg_ln2_hi = _mm256_castsi256_ps(_mm256_set1_epi32(-1087278592i32)); // -ln(2) from bits -1 to -19: -0x1.62e400p-1f
let neg_ln2_lo = _mm256_castsi256_ps(_mm256_set1_epi32(-1245725042i32)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f

// Range reduction:
// e^x = 2^n * e^r
// where:
// n = floor(x / ln(2))
// r = x - n * ln(2)
//
// By adding x / ln(2) with 2^23 + 127 (shift):
// * As FP32 fraction part only has 23-bits, the addition of 2^23 + 127 forces decimal part
// of x / ln(2) out of the result. The integer part of x / ln(2) (i.e. n) + 127 will occupy
// the whole fraction part of z in FP32 format.
// Subtracting 2^23 + 127 (shift) from z will result in the integer part of x / ln(2)
// (i.e. n) because the decimal part has been pushed out and lost.
// * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
// in FP32 format. Left shifting z by 23 bits will result in 2^n.
let z = _mm256_prefer_fma_ps(shift, x, inv_ln2);
let n = _mm256_sub_ps(z, shift);
let scale = _mm256_castsi256_ps(_mm256_slli_epi32::<23>(_mm256_castps_si256(z))); // 2^n

// The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
// This outperforms longer Taylor series (3-4 tabs) both in terms of accuracy and performance.
let r_hi = _mm256_prefer_fma_ps(x, n, neg_ln2_hi);
let r = _mm256_prefer_fma_ps(r_hi, n, neg_ln2_lo);

// Compute the truncated Taylor series of e^r.
// poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
let r2 = _mm256_mul_ps(r, r);

let p1 = _mm256_mul_ps(c1, r);
let p23 = _mm256_prefer_fma_ps(c2, c3, r);
let p45 = _mm256_prefer_fma_ps(c4, c5, r);
let p2345 = _mm256_prefer_fma_ps(p23, p45, r2);
let p12345 = _mm256_prefer_fma_ps(p1, p2345, r2);

let mut poly = _mm256_prefer_fma_ps(scale, p12345, scale);

if HANDLE_NAN {
let inf = _mm256_set1_ps(f32::INFINITY);
let max_input = _mm256_set1_ps(88.37f32); // Approximately ln(2^127.5)
let zero = _mm256_set1_ps(0f32);
let min_input = _mm256_set1_ps(-86.64f32); // Approximately ln(2^-125)
// Handle underflow and overflow.
poly = _mm256_select_ps(_mm256_cmp_ps::<_CMP_LT_OS>(x, min_input), zero, poly);
poly = _mm256_select_ps(_mm256_cmp_ps::<_CMP_GT_OS>(x, max_input), inf, poly);
}

return poly;
}

#[inline(always)]
Expand Down Expand Up @@ -245,7 +306,7 @@ pub(crate) unsafe fn _mm256_neg_epi32(x: __m256i) -> __m256i {

#[inline(always)]
/// This is Cube Root using Pow functions,
/// it also precise however due to of inexact nature of power 1/3 result slightly differ
/// it is also precise however due to of inexact nature of power 1/3 result slightly differ
/// from real cbrt with about ULP 3-4, but this is almost 2 times faster than cbrt with real ULP 3.5
pub unsafe fn _mm256_cbrt_ps(d: __m256) -> __m256 {
_mm256_pow_n_ps(d, 1f32 / 3f32)
Expand Down
16 changes: 7 additions & 9 deletions src/avx/xyz_lab_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,10 @@ unsafe fn avx_xyz_lab_vld<
r_f32 = _mm256_mul_ps(r_f32, v_scale_color);
g_f32 = _mm256_mul_ps(g_f32, v_scale_color);
b_f32 = _mm256_mul_ps(b_f32, v_scale_color);
const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
(
_mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(r_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(g_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(b_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)),
)
}

Expand Down Expand Up @@ -204,28 +203,27 @@ pub unsafe fn avx_xyz_to_channels<
let dst_ptr = dst.add(dst_offset + cx * channels);

if USE_ALPHA {
const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx);
let a_low_0_f = _mm256_loadu_ps(offset_a_src_ptr);
let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(_mm256_mul_ps(
let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps(
a_low_0_f,
color_rescale,
)));

let a_low_1_f = _mm256_loadu_ps(offset_a_src_ptr.add(8));
let a_row1_ = _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(_mm256_mul_ps(
let a_row1_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps(
a_low_1_f,
color_rescale,
)));

let a_low_2_f = _mm256_loadu_ps(offset_a_src_ptr.add(16));
let a_row2_ = _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(_mm256_mul_ps(
let a_row2_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps(
a_low_2_f,
color_rescale,
)));

let a_low_3_f = _mm256_loadu_ps(offset_a_src_ptr.add(24));
let a_row3_ = _mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(_mm256_mul_ps(
let a_row3_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps(
a_low_3_f,
color_rescale,
)));
Expand Down
9 changes: 4 additions & 5 deletions src/avx/xyza_laba_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,11 @@ unsafe fn avx_xyza_lab_vld<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
g_f32 = _mm256_mul_ps(g_f32, v_scale_color);
b_f32 = _mm256_mul_ps(b_f32, v_scale_color);
let a_f32 = _mm256_mul_ps(a_f32, v_scale_color);
const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
(
_mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(r_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(g_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(b_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<ROUNDING_FLAGS>(a_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)),
_mm256_cvtps_epi32(_mm256_round_ps::<0>(a_f32)),
)
}

Expand Down
1 change: 0 additions & 1 deletion src/sse/math.rs
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,6 @@ pub unsafe fn _mm_cbrt_ps(d: __m128) -> __m128 {
_mm_pow_n_ps(d, 1f32 / 3f32)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
/// Precise version of Cube Root, ULP 3.5
Expand Down

0 comments on commit 355a4d8

Please sign in to comment.