LUV bugfixes, improve LCh

awxkee · Jun 5, 2024 · 69af0da · 69af0da
1 parent 41d2886
commit 69af0da
Show file tree

Hide file tree

Showing 20 changed files with 714 additions and 111 deletions.
diff --git a/src/app/src/main.rs b/src/app/src/main.rs
@@ -27,13 +27,13 @@ fn main() {
     //     _mm_storeu_ps(dst.as_mut_ptr() as *mut f32, ln);
     //     println!("{:?}", dst);
     // }
-    // #[cfg(target_arch = "aarch64")]
-    // unsafe {
-    //     let m = vdupq_n_f32(std::f32::consts::E);
-    //     let cbrt = vlogq_f32_ulp35(m);
-    //     let l = vgetq_lane_f32::<0>(cbrt);
-    //     println!("Exp {}", l);
-    // }
+    #[cfg(target_arch = "aarch64")]
+    unsafe {
+        let m = vdupq_n_f32(27f32);
+        let cbrt = vcbrtq_f32_ulp2(m);
+        let l = vgetq_lane_f32::<0>(cbrt);
+        println!("Cbrt {}", l);
+    }
 
     let img = ImageReader::open("./assets/asset_middle.jpg")
         .unwrap()
@@ -68,9 +68,7 @@ fn main() {
         let mut lab_store: Vec<f32> = vec![];
         let store_stride = width as usize * 4usize * std::mem::size_of::<f32>();
         lab_store.resize(width as usize * 4usize * height as usize, 0f32);
-        let mut alpha_store: Vec<f32> = vec![];
-        let alpha_stride = width as usize * std::mem::size_of::<f32>();
-        alpha_store.resize(width as usize * height as usize, 0f32);
+        let start_time = Instant::now();
         rgba_to_lab_with_alpha(
             src_bytes,
             4u32 * width,
@@ -110,6 +108,10 @@ fn main() {
             height,
         );
 
+        let elapsed_time = start_time.elapsed();
+        // Print the elapsed time (`{:.2?}` on a Duration chooses the unit itself, not always ms)
+        println!("Fast image resize: {:.2?}", elapsed_time);
+
         // laba_to_srgb(
         //     &lab_store,
         //     lab_stride as u32,
@@ -124,51 +126,16 @@ fn main() {
         src_bytes = &dst_slice;
     }
 
-    let mut xyz: Vec<f32> = vec![];
-    xyz.resize(4 * width as usize * height as usize, 0f32);
-
-    let mut a_plane: Vec<f32> = vec![];
-    a_plane.resize(width as usize * height as usize, 0f32);
-
-    for i in 0..1 {
-        let start_time = Instant::now();
-        // srgba_to_xyza(
-        //     src_bytes,
-        //     width * components,
-        //     &mut xyz,
-        //     width * 3 * std::mem::size_of::<f32>() as u32,
-        //     &mut a_plane,
-        //     width as u32 * std::mem::size_of::<f32>() as u32,
-        //     width,
-        //     height,
-        // );
-        // rgba_to_linear(
-        //     src_bytes,
-        //     width * components,
-        //     &mut xyz,
-        //     width * 3 * std::mem::size_of::<f32>() as u32,
-        //     width,
-        //     height,
-        //     TransferFunction::Srgb,
-        // );
-        rgba_to_linear(
-            src_bytes,
-            width * components,
-            &mut xyz,
-            width * 4 * std::mem::size_of::<f32>() as u32,
-            width,
-            height,
-            TransferFunction::Srgb,
-        );
-        let elapsed_time = start_time.elapsed();
-        // Print the elapsed time in milliseconds
-        println!("sRGB to XYZ: {:.2?}", elapsed_time);
-    }
-
-    let mut dst_bytes: Vec<u8> = vec![];
-    dst_bytes.resize(width as usize * components as usize * height as usize, 0u8);
-
-    let start_time = Instant::now();
+    // let mut xyz: Vec<f32> = vec![];
+    // xyz.resize(4 * width as usize * height as usize, 0f32);
+    //
+    // let mut a_plane: Vec<f32> = vec![];
+    // a_plane.resize(width as usize * height as usize, 0f32);
+    //
+    // let mut dst_bytes: Vec<u8> = vec![];
+    // dst_bytes.resize(width as usize * components as usize * height as usize, 0u8);
+    //
+    // let start_time = Instant::now();
     // xyz_to_srgb(
     //     &xyz,
     //     width * 3 * std::mem::size_of::<f32>() as u32,
@@ -177,16 +144,16 @@ fn main() {
     //     width,
     //     height,
     // );
-
-    linear_to_rgba(
-        &xyz,
-        width * 4 * std::mem::size_of::<f32>() as u32,
-        &mut dst_bytes,
-        width * components,
-        width,
-        height,
-        TransferFunction::Srgb,
-    );
+    //
+    // linear_to_rgba(
+    //     &xyz,
+    //     width * 4 * std::mem::size_of::<f32>() as u32,
+    //     &mut dst_bytes,
+    //     width * components,
+    //     width,
+    //     height,
+    //     TransferFunction::Srgb,
+    // );
 
     // linear_to_rgb(
     //     &xyz,
@@ -198,16 +165,16 @@ fn main() {
     //     TransferFunction::Srgb,
     // );
 
-    let elapsed_time = start_time.elapsed();
+    // let elapsed_time = start_time.elapsed();
     // Print the elapsed time in milliseconds
-    println!("XYZ to sRGB: {:.2?}", elapsed_time);
+    // println!("XYZ to sRGB: {:.2?}", elapsed_time);
 
     // let rgba = rgb_to_rgba(&dst_bytes, width, height);
 
     if components == 4 {
         image::save_buffer(
             "converted.png",
-            dst_bytes.as_bytes(),
+            src_bytes.as_bytes(),
             dimensions.0,
             dimensions.1,
             image::ExtendedColorType::Rgba8,
@@ -216,7 +183,7 @@ fn main() {
     } else {
         image::save_buffer(
             "converted.jpg",
-            dst_bytes.as_bytes(),
+            src_bytes.as_bytes(),
             dimensions.0,
             dimensions.1,
             image::ExtendedColorType::Rgb8,

diff --git a/src/avx2_to_xyz_lab.rs b/src/avx2_to_xyz_lab.rs
@@ -21,6 +21,7 @@ use crate::x86_64_simd_support::*;
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
+use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y};
 
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
@@ -66,6 +67,37 @@ unsafe fn avx2_triple_to_xyz(
     (x, y, z)
 }
 
+/// Converts XYZ values held in AVX registers to CIE L*u*v*, one pixel per lane.
+///
+/// `x`, `y` and `z` each carry a single channel for eight pixels (eight `f32`
+/// lanes per `__m256`). The returned tuple is `(l, u, v)` in the same lane order.
+///
+/// Lanes whose chromaticity denominator is zero or negative get `u = v = 0`
+/// instead of the NaN/garbage the division would otherwise produce; a pure
+/// black pixel (`x = y = z = 0`) is the common case that hits this path.
+///
+/// # Safety
+///
+/// Uses AVX intrinsics directly; the caller must only invoke this on a CPU
+/// that supports them (and whatever extra features the crate helpers called
+/// here rely on — NOTE(review): confirm against `x86_64_simd_support`).
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(always)]
+pub(crate) unsafe fn avx2_triple_to_luv(
+    x: __m256,
+    y: __m256,
+    z: __m256,
+) -> (__m256, __m256, __m256) {
+    let zeros = _mm256_setzero_ps();
+    // Chromaticity denominator; presumably `_mm256_prefer_fma_ps(a, b, c)` is
+    // `a + b * c`, which makes this `x + 3z + 15y` — confirm against the helper.
+    let den = _mm256_prefer_fma_ps(
+        _mm256_prefer_fma_ps(x, z, _mm256_set1_ps(3f32)),
+        y,
+        _mm256_set1_ps(15f32),
+    );
+    // `<=` rather than `<`: a zero denominator is the case that actually yields
+    // NaN (0 / 0, then 0 * NaN), so it has to be masked out as well.
+    let nan_mask = _mm256_cmp_ps::<_CMP_LE_OS>(den, zeros);
+    // Below the cutoff L is linear in Y, above it L follows the cube-root curve.
+    let l_low_mask = _mm256_cmp_ps::<_CMP_LT_OS>(y, _mm256_set1_ps(LUV_CUTOFF_FORWARD_Y));
+    let y_cbrt = _mm256_cbrt_ps(y);
+    let l = _mm256_select_ps(
+        l_low_mask,
+        _mm256_mul_ps(y, _mm256_set1_ps(LUV_MULTIPLIER_FORWARD_Y)),
+        _mm256_prefer_fma_ps(_mm256_set1_ps(-16f32), y_cbrt, _mm256_set1_ps(116f32)),
+    );
+    let u_prime = _mm256_div_ps(_mm256_mul_ps(x, _mm256_set1_ps(4f32)), den);
+    let v_prime = _mm256_div_ps(_mm256_mul_ps(y, _mm256_set1_ps(9f32)), den);
+    // Offsets from the reference white point's u'/v'.
+    let sub_u_prime = _mm256_sub_ps(u_prime, _mm256_set1_ps(crate::luv::LUV_WHITE_U_PRIME));
+    let sub_v_prime = _mm256_sub_ps(v_prime, _mm256_set1_ps(crate::luv::LUV_WHITE_V_PRIME));
+    let l13 = _mm256_mul_ps(l, _mm256_set1_ps(13f32));
+    // Masked lanes are forced to zero so the invalid quotient never reaches the output.
+    let u = _mm256_select_ps(nan_mask, zeros, _mm256_mul_ps(l13, sub_u_prime));
+    let v = _mm256_select_ps(nan_mask, zeros, _mm256_mul_ps(l13, sub_v_prime));
+    (l, u, v)
+}
+
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 #[allow(dead_code)]
@@ -191,6 +223,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab<
                 z_low_low = b;
             }
             XyzTarget::XYZ => {}
+            XyzTarget::LUV => {
+                let (l, u, v) = avx2_triple_to_luv(x_low_low, y_low_low, z_low_low);
+                x_low_low = l;
+                y_low_low = u;
+                z_low_low = v;
+            }
         }
 
         let write_dst_ptr = dst_ptr.add(cx * 3);
@@ -218,6 +256,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab<
                 z_low_high = b;
             }
             XyzTarget::XYZ => {}
+            XyzTarget::LUV => {
+                let (l, u, v) = avx2_triple_to_luv(x_low_high, y_low_high, z_low_high);
+                x_low_high = l;
+                y_low_high = u;
+                z_low_high = v;
+            }
         }
 
         let (v0, v1, v2) = avx2_interleave_rgb_ps(x_low_high, y_low_high, z_low_high);
@@ -246,6 +290,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab<
                 z_high_low = b;
             }
             XyzTarget::XYZ => {}
+            XyzTarget::LUV => {
+                let (l, u, v) = avx2_triple_to_luv(x_high_low, y_high_low, z_high_low);
+                x_high_low = l;
+                y_high_low = u;
+                z_high_low = v;
+            }
         }
 
         let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_low, y_high_low, z_high_low);
@@ -281,6 +331,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab<
                 z_high_high = b;
             }
             XyzTarget::XYZ => {}
+            XyzTarget::LUV => {
+                let (l, u, v) = avx2_triple_to_luv(x_high_high, y_high_high, z_high_high);
+                x_high_high = l;
+                y_high_high = u;
+                z_high_high = v;
+            }
         }
 
         let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_high, y_high_high, z_high_high);

diff --git a/src/concat_alpha.rs b/src/concat_alpha.rs
@@ -37,6 +37,7 @@ pub fn append_alpha(
 
     #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
     {
+        #[cfg(target_feature = "sse4.1")]
         if is_x86_feature_detected!("sse4.1") {
             _use_sse = true;
         }

diff --git a/src/image_to_linear.rs b/src/image_to_linear.rs
@@ -38,7 +38,7 @@ fn channels_to_linear<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
     #[cfg(target_arch = "x86_64")]
     let mut has_sse = false;
 
-    #[cfg(target_arch = "x86_64")]
+    #[cfg(all(target_arch = "x86_64", target_feature = "sse4.1"))]
     if is_x86_feature_detected!("sse4.1") {
         has_sse = true;
     }

diff --git a/src/image_to_linear_u8.rs b/src/image_to_linear_u8.rs
@@ -45,7 +45,10 @@ fn channels_to_linear<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
     #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
     let mut _has_sse = false;
 
-    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
     if is_x86_feature_detected!("sse4.1") {
         _has_sse = true;
     }