From 69af0da8174d4277334a03acacdb174a0758169e Mon Sep 17 00:00:00 2001 From: awxkee Date: Wed, 5 Jun 2024 23:57:40 +0100 Subject: [PATCH] LUV bugfixes, improve LCh --- src/app/src/main.rs | 105 +++++++++++---------------------- src/avx2_to_xyz_lab.rs | 56 ++++++++++++++++++ src/concat_alpha.rs | 1 + src/image_to_linear.rs | 2 +- src/image_to_linear_u8.rs | 5 +- src/image_to_xyz_lab.rs | 77 +++++++++++++++++++++++- src/image_xyza_laba.rs | 82 ++++++++++++++++++++++++++- src/lib.rs | 8 +++ src/linear_to_image.rs | 5 +- src/linear_to_image_u8.rs | 7 ++- src/luv.rs | 28 ++++----- src/neon_math.rs | 87 +++++++++++++++++++++++---- src/neon_to_xyz_lab.rs | 60 +++++++++++++++++++ src/neon_to_xyza_laba.rs | 29 +++++++++ src/neon_xyz_lab_to_image.rs | 62 +++++++++++++++++++-- src/neon_xyza_laba_to_image.rs | 11 ++++ src/rgb_expand.rs | 1 + src/sse_to_xyz_lab.rs | 56 ++++++++++++++++++ src/xyz_lab_to_image.rs | 72 +++++++++++++++++++++- src/xyza_laba_to_image.rs | 71 +++++++++++++++++++++- 20 files changed, 714 insertions(+), 111 deletions(-) diff --git a/src/app/src/main.rs b/src/app/src/main.rs index ecbc2ff..dd643a4 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -27,13 +27,13 @@ fn main() { // _mm_storeu_ps(dst.as_mut_ptr() as *mut f32, ln); // println!("{:?}", dst); // } - // #[cfg(target_arch = "aarch64")] - // unsafe { - // let m = vdupq_n_f32(std::f32::consts::E); - // let cbrt = vlogq_f32_ulp35(m); - // let l = vgetq_lane_f32::<0>(cbrt); - // println!("Exp {}", l); - // } + #[cfg(target_arch = "aarch64")] + unsafe { + let m = vdupq_n_f32(27f32); + let cbrt = vcbrtq_f32_ulp2(m); + let l = vgetq_lane_f32::<0>(cbrt); + println!("Cbrt {}", l); + } let img = ImageReader::open("./assets/asset_middle.jpg") .unwrap() @@ -68,9 +68,7 @@ fn main() { let mut lab_store: Vec<f32> = vec![]; let store_stride = width as usize * 4usize * std::mem::size_of::<f32>(); lab_store.resize(width as usize * 4usize * height as usize, 0f32); - let mut alpha_store: Vec<f32> = vec![]; - let alpha_stride = width as usize * std::mem::size_of::<f32>(); - alpha_store.resize(width as usize * height as usize, 0f32); + let start_time = Instant::now(); rgba_to_lab_with_alpha( src_bytes, 4u32 * width, @@ -110,6 +108,10 @@ fn main() { height, ); + let elapsed_time = start_time.elapsed(); + // Print the elapsed time in milliseconds + println!("LUV round-trip: {:.2?}", elapsed_time); + // laba_to_srgb( // &lab_store, // lab_stride as u32, // &mut dst_slice, // width * 4, // width, // height, // ); src_bytes = &dst_slice; } - let mut xyz: Vec<f32> = vec![]; - xyz.resize(4 * width as usize * height as usize, 0f32); - - let mut a_plane: Vec<f32> = vec![]; - a_plane.resize(width as usize * height as usize, 0f32); - - for i in 0..1 { - let start_time = Instant::now(); - // srgba_to_xyza( - // src_bytes, - // width * components, - // &mut xyz, - // width * 3 * std::mem::size_of::<f32>() as u32, - // &mut a_plane, - // width as u32 * std::mem::size_of::<f32>() as u32, - // width, - // height, - // ); - // rgba_to_linear( - // src_bytes, - // width * components, - // &mut xyz, - // width * 3 * std::mem::size_of::<f32>() as u32, - // width, - // height, - // TransferFunction::Srgb, - // ); - rgba_to_linear( - src_bytes, - width * components, - &mut xyz, - width * 4 * std::mem::size_of::<f32>() as u32, - width, - height, - TransferFunction::Srgb, - ); - let elapsed_time = start_time.elapsed(); - // Print the elapsed time in milliseconds - println!("sRGB to XYZ: {:.2?}", elapsed_time); - } - - let mut dst_bytes: Vec<u8> = vec![]; - dst_bytes.resize(width as usize * components as usize * height as usize, 0u8);
- - let start_time = Instant::now(); + // let mut xyz: Vec<f32> = vec![]; + // xyz.resize(4 * width as usize * height as usize, 0f32); + // + // let mut a_plane: Vec<f32> = vec![]; + // a_plane.resize(width as usize * height as usize, 0f32); + // + // let mut dst_bytes: Vec<u8> = vec![]; + // dst_bytes.resize(width as usize * components as usize * height as usize, 0u8); + // + // let start_time = Instant::now(); // xyz_to_srgb( // &xyz, // width * 3 * std::mem::size_of::<f32>() as u32, // &mut dst_bytes, // width * components, // width, // height, // ); - - linear_to_rgba( - &xyz, - width * 4 * std::mem::size_of::<f32>() as u32, - &mut dst_bytes, - width * components, - width, - height, - TransferFunction::Srgb, - ); + // + // linear_to_rgba( + // &xyz, + // width * 4 * std::mem::size_of::<f32>() as u32, + // &mut dst_bytes, + // width * components, + // width, + // height, + // TransferFunction::Srgb, + // ); // linear_to_rgb( // &xyz, // width * 3 * std::mem::size_of::<f32>() as u32, // &mut dst_bytes, // width * components, // width, // height, // TransferFunction::Srgb, // ); - let elapsed_time = start_time.elapsed(); + // let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds - println!("XYZ to sRGB: {:.2?}", elapsed_time); + // println!("XYZ to sRGB: {:.2?}", elapsed_time); // let rgba = rgb_to_rgba(&dst_bytes, width, height); if components == 4 { image::save_buffer( "converted.png", - dst_bytes.as_bytes(), + src_bytes.as_bytes(), dimensions.0, dimensions.1, image::ExtendedColorType::Rgba8, ) .unwrap(); } else { image::save_buffer( "converted.jpg", - dst_bytes.as_bytes(), + src_bytes.as_bytes(), dimensions.0, dimensions.1, image::ExtendedColorType::Rgb8, ) diff --git a/src/avx2_to_xyz_lab.rs b/src/avx2_to_xyz_lab.rs index 6b19bc9..8a64fde 100644 --- a/src/avx2_to_xyz_lab.rs +++ b/src/avx2_to_xyz_lab.rs @@ -21,6 +21,7 @@ use crate::x86_64_simd_support::*; use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; +use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] @@ -66,6 +67,37 @@ unsafe fn avx2_triple_to_xyz( (x, y, z) } +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +pub(crate) unsafe fn avx2_triple_to_luv( + x: __m256, + y: __m256, + z: __m256, +) -> (__m256, __m256, __m256) { + let zeros = _mm256_setzero_ps(); + let den = _mm256_prefer_fma_ps( + _mm256_prefer_fma_ps(x, z, _mm256_set1_ps(3f32)), + y, + _mm256_set1_ps(15f32), + ); + let nan_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(den, _mm256_set1_ps(0f32)); + let l_low_mask = _mm256_cmp_ps::<_CMP_LT_OS>(y, _mm256_set1_ps(LUV_CUTOFF_FORWARD_Y)); + let y_cbrt = _mm256_cbrt_ps(y); + let l = _mm256_select_ps( + l_low_mask, + _mm256_mul_ps(y, _mm256_set1_ps(LUV_MULTIPLIER_FORWARD_Y)), + _mm256_prefer_fma_ps(_mm256_set1_ps(-16f32), y_cbrt, _mm256_set1_ps(116f32)), + ); + let u_prime = _mm256_div_ps(_mm256_mul_ps(x, _mm256_set1_ps(4f32)), den); + let v_prime = _mm256_div_ps(_mm256_mul_ps(y, _mm256_set1_ps(9f32)), den); + let sub_u_prime = _mm256_sub_ps(u_prime, _mm256_set1_ps(crate::luv::LUV_WHITE_U_PRIME)); + let sub_v_prime = _mm256_sub_ps(v_prime, _mm256_set1_ps(crate::luv::LUV_WHITE_V_PRIME)); + let l13 = _mm256_mul_ps(l, _mm256_set1_ps(13f32)); + let u = _mm256_select_ps(nan_mask, zeros, _mm256_mul_ps(l13, sub_u_prime)); + let v = _mm256_select_ps(nan_mask, zeros, _mm256_mul_ps(l13, sub_v_prime)); + (l, u, v) +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] #[allow(dead_code)] @@ -191,6 +223,12 @@ pub(crate) unsafe fn
avx2_channels_to_xyz_or_lab< z_low_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = avx2_triple_to_luv(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = u; + z_low_low = v; + } } let write_dst_ptr = dst_ptr.add(cx * 3); @@ -218,6 +256,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab< z_low_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = avx2_triple_to_luv(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = u; + z_low_high = v; + } } let (v0, v1, v2) = avx2_interleave_rgb_ps(x_low_high, y_low_high, z_low_high); @@ -246,6 +290,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab< z_high_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = avx2_triple_to_luv(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = u; + z_high_low = v; + } } let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_low, y_high_low, z_high_low); @@ -281,6 +331,12 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab< z_high_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = avx2_triple_to_luv(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = u; + z_high_high = v; + } } let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_high, y_high_high, z_high_high); diff --git a/src/concat_alpha.rs b/src/concat_alpha.rs index 0d657d8..30d804c 100644 --- a/src/concat_alpha.rs +++ b/src/concat_alpha.rs @@ -37,6 +37,7 @@ pub fn append_alpha( #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { + #[cfg(target_feature = "sse4.1")] if is_x86_feature_detected!("sse4.1") { _use_sse = true; } diff --git a/src/image_to_linear.rs b/src/image_to_linear.rs index 67ce497..8997066 100644 --- a/src/image_to_linear.rs +++ b/src/image_to_linear.rs @@ -38,7 +38,7 @@ fn channels_to_linear( #[cfg(target_arch = "x86_64")] let mut has_sse = false; - #[cfg(target_arch = "x86_64")] + #[cfg(all(target_arch = "x86_64", target_feature = "sse4.1"))] if is_x86_feature_detected!("sse4.1") { has_sse = true; } diff --git a/src/image_to_linear_u8.rs b/src/image_to_linear_u8.rs index 107d1d0..1928117 100644 --- a/src/image_to_linear_u8.rs +++ b/src/image_to_linear_u8.rs @@ -45,7 +45,10 @@ fn channels_to_linear( #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] let mut _has_sse = false; - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] if is_x86_feature_detected!("sse4.1") { _has_sse = true; } diff --git a/src/image_to_xyz_lab.rs b/src/image_to_xyz_lab.rs index 62725e5..3536ad8 100644 --- a/src/image_to_xyz_lab.rs +++ b/src/image_to_xyz_lab.rs @@ -3,7 +3,7 @@ use crate::avx2_to_xyz_lab::*; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -use crate::image_to_xyz_lab::XyzTarget::{LAB, XYZ}; +use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] @@ -17,6 +17,7 @@ use std::slice; pub(crate) enum XyzTarget { LAB = 0, XYZ = 1, + LUV = 2, } impl From<u8> for XyzTarget { @@ -24,6 +25,7 @@ impl From<u8> for XyzTarget { match value { 0 => LAB, 1 => XYZ, + 2 => LUV, _ => { panic!("Not implemented") } @@ -69,6 +71,7 @@ fn channels_to_xyz { + XyzTarget::LUV => { + let luv = rgb.to_luv(); + unsafe { + *dst_slice.get_unchecked_mut(x * 3) = luv.l; + *dst_slice.get_unchecked_mut(x * 3 + 1) = luv.u; + *dst_slice.get_unchecked_mut(x * 3 + 2) = luv.v; + } + } } if USE_ALPHA && image_configuration.has_alpha() { @@ -611,3
+622,67 @@ pub fn bgr_to_lab( TransferFunction::Srgb, ); } + +/// This function converts RGB to CIE L*uv against D65 white point. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGB data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive LUV data +/// * `dst_stride` - Bytes per row for dst data +pub fn rgb_to_luv( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, +) { + let mut empty_vec = vec![]; + channels_to_xyz::<{ ImageConfiguration::Rgb as u8 }, false, { LUV as u8 }>( + src, + src_stride, + dst, + dst_stride, + &mut empty_vec, + 0, + width, + height, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); +} + +/// This function converts BGR to CIE L*uv against D65 white point. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains BGR data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive LUV data +/// * `dst_stride` - Bytes per row for dst data +pub fn bgr_to_luv( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, +) { + let mut empty_vec = vec![]; + channels_to_xyz::<{ ImageConfiguration::Bgr as u8 }, false, { LUV as u8 }>( + src, + src_stride, + dst, + dst_stride, + &mut empty_vec, + 0, + width, + height, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); +} diff --git a/src/image_xyza_laba.rs b/src/image_xyza_laba.rs index acb1890..e58ac2f 100644 --- a/src/image_xyza_laba.rs +++ b/src/image_xyza_laba.rs @@ -1,6 +1,6 @@ use crate::image::ImageConfiguration; use crate::image_to_xyz_lab::XyzTarget; -use crate::image_to_xyz_lab::XyzTarget::{LAB, XYZ}; +use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; use crate::{Rgb, TransferFunction, Xyz, SRGB_TO_XYZ_D65}; use std::slice; #[cfg(all( @@ -73,11 +73,11 @@ fn channels_to_xyz_with_alpha< let rgb = Rgb::<u8>::new(r, g, b); + let px = x * CHANNELS; match target { LAB => { let lab = rgb.to_lab(); unsafe { - let px = x * CHANNELS; *dst_slice.get_unchecked_mut(px) = lab.l; *dst_slice.get_unchecked_mut(px + 1) = lab.a; *dst_slice.get_unchecked_mut(px + 2) = lab.b; @@ -85,13 +85,20 @@ fn channels_to_xyz_with_alpha< XYZ => { let xyz = Xyz::from_rgb(&rgb, &matrix, transfer_function); - let px = x * CHANNELS; unsafe { *dst_slice.get_unchecked_mut(px) = xyz.x; *dst_slice.get_unchecked_mut(px + 1) = xyz.y; *dst_slice.get_unchecked_mut(px + 2) = xyz.z; } } + XyzTarget::LUV => { + let luv = rgb.to_luv(); + unsafe { + *dst_slice.get_unchecked_mut(px) = luv.l; + *dst_slice.get_unchecked_mut(px + 1) = luv.u; + *dst_slice.get_unchecked_mut(px + 2) = luv.v; + } + } } let a = unsafe { @@ -175,3 +182,72 @@ pub fn bgra_to_lab_with_alpha( TransferFunction::Srgb, ); } + + +/// This function converts RGBA to CIE L*uv against D65 white point, preserving the alpha channel normalized to [0, 1] in the last position. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGBA data +/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive LUV data with alpha +/// * `dst_stride` - Bytes per row for dst data +/// The alpha channel is normalized to the [0, 1] range and written +/// as the fourth component of each destination pixel. +pub fn rgba_to_luv_with_alpha( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, +) { + channels_to_xyz_with_alpha::< + { ImageConfiguration::Rgba as u8 }, + { LUV as u8 }, + >( + src, + src_stride, + dst, + dst_stride, + width, + height, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); +} + +/// This function converts BGRA to CIE L*uv against D65 white point, preserving the alpha channel normalized to [0, 1] in the last position. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains BGRA data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive LUV data with alpha +/// * `dst_stride` - Bytes per row for dst data +/// The alpha channel is normalized to the [0, 1] range and written +/// as the fourth component of each destination pixel. +pub fn bgra_to_luv_with_alpha( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, +) { + channels_to_xyz_with_alpha::< + { ImageConfiguration::Bgra as u8 }, + { LUV as u8 }, + >( + src, + src_stride, + dst, + dst_stride, + width, + height, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); +} diff --git a/src/lib.rs b/src/lib.rs index dd5786d..87ada82 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -67,18 +67,26 @@ pub use image_to_xyz_lab::bgra_to_laba; pub use image_to_xyz_lab::bgr_to_lab; pub use image_to_xyz_lab::srgb_to_xyz; pub use image_to_xyz_lab::rgba_to_lab; +pub use image_to_xyz_lab::bgr_to_luv; +pub use image_to_xyz_lab::rgb_to_luv; pub use xyz_lab_to_image::xyz_to_rgb; pub use xyz_lab_to_image::lab_to_srgb; pub use xyz_lab_to_image::xyz_to_srgb; pub use xyz_lab_to_image::laba_to_srgb; pub use xyz_lab_to_image::xyza_to_rgba; +pub use xyz_lab_to_image::luv_to_rgb; +pub use xyz_lab_to_image::luv_to_bgr; pub use image_to_linear::*; pub use linear_to_image::*; pub use concat_alpha::append_alpha; pub use image_xyza_laba::rgba_to_lab_with_alpha; pub use image_xyza_laba::bgra_to_lab_with_alpha; +pub use image_xyza_laba::rgba_to_luv_with_alpha; +pub use image_xyza_laba::bgra_to_luv_with_alpha; pub use xyza_laba_to_image::lab_with_alpha_to_bgra; pub use xyza_laba_to_image::lab_with_alpha_to_rgba; +pub use xyza_laba_to_image::luv_with_alpha_to_bgra; +pub use xyza_laba_to_image::luv_with_alpha_to_rgba; pub use image_to_linear_u8::*; pub use linear_to_image_u8::*; diff --git a/src/linear_to_image.rs b/src/linear_to_image.rs index 730b0f0..4304104 100644 --- a/src/linear_to_image.rs +++ b/src/linear_to_image.rs @@ -38,7 +38,10 @@ fn linear_to_gamma_channels( @@ -45,7 +45,10 @@ fn linear_to_gamma_channels) -> Self { let xyz = Xyz::from_srgb(rgb); let [x, y, z] = [xyz.x, xyz.y, xyz.z]; let den = x + 15.0 * y + 3.0 * z; - let l = (if y < CUTOFF_FORWARD_Y { - MULTIPLIER_FORWARD_Y * y + let l = (if y < LUV_CUTOFF_FORWARD_Y { + LUV_MULTIPLIER_FORWARD_Y * y } else { 116f32 * y.cbrt() - 16f32 }) @@ -73,10 +73,10 @@ impl Luv { .max(0f32); let (u, v); if den != 0f32 { - let u_prime = 4.0 * x / den; - let v_prime = 9.0 * y / den; - u = 13f32 * l * (u_prime - WHITE_U_PRIME); - v = 13f32 * l * (v_prime - WHITE_V_PRIME); + let u_prime = 4f32 * x / den; + let v_prime = 9f32 * y / den; +
u = 13f32 * l * (u_prime - LUV_WHITE_U_PRIME); + v = 13f32 * l * (v_prime - LUV_WHITE_V_PRIME); } else { u = 0f32; v = 0f32; @@ -95,12 +95,12 @@ impl Luv { return Xyz::new(0f32, 0f32, 0f32).to_srgb(); } let l13 = 1f32 / (13f32 * self.l); - let u = self.u * l13 + WHITE_U_PRIME; - let v = self.v * l13 + WHITE_V_PRIME; + let u = self.u * l13 + LUV_WHITE_U_PRIME; + let v = self.v * l13 + LUV_WHITE_V_PRIME; let y = if self.l > 8f32 { ((self.l + 16f32) / 116f32).powi(3) } else { - self.l * MULTIPLIER_INVERSE_Y + self.l * LUV_MULTIPLIER_INVERSE_Y }; let (x, z); if v != 0f32 { diff --git a/src/neon_math.rs b/src/neon_math.rs index 76982cb..e7712b1 100644 --- a/src/neon_math.rs +++ b/src/neon_math.rs @@ -64,7 +64,7 @@ pub unsafe fn vrintq_s32(d: float32x4_t) -> int32x4_t { vreinterpretq_u32_f32(vdupq_n_f32(-0.0f32)), ), vreinterpretq_u32_f32(vdupq_n_f32(0.5f32)), - ),) + )), )); } @@ -144,7 +144,11 @@ pub unsafe fn vexpq_f32_ulp1(d: float32x4_t) -> float32x4_t { let q = vrintq_s32(vmulq_f32(d, vdupq_n_f32(std::f32::consts::LOG2_E))); let mut s = vmlafq_f32(vcvtq_f32_s32(q), vdupq_n_f32(-std::f32::consts::LN_2), d); - s = vmlafq_f32(vcvtq_f32_s32(q), vdupq_n_f32(-1.428606765330187045e-06f32), s); + s = vmlafq_f32( + vcvtq_f32_s32(q), + vdupq_n_f32(-1.428606765330187045e-06f32), + s, + ); let mut u = vdupq_n_f32(0.000198527617612853646278381f32); u = vmlafq_f32(u, s, vdupq_n_f32(0.00139304355252534151077271f32)); @@ -157,8 +161,15 @@ pub unsafe fn vexpq_f32_ulp1(d: float32x4_t) -> float32x4_t { u = vldexp2q_f32(u, q); - u = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vcltq_f32(d, vdupq_n_f32(-104f32)))); - u = vbslq_f32(vcltq_f32(vdupq_n_f32(100f32), d), vdupq_n_f32(f32::INFINITY), u); + u = vreinterpretq_f32_u32(vbicq_u32( + vreinterpretq_u32_f32(u), + vcltq_f32(d, vdupq_n_f32(-104f32)), + )); + u = vbslq_f32( + vcltq_f32(vdupq_n_f32(100f32), d), + vdupq_n_f32(f32::INFINITY), + u, + ); u } @@ -204,7 +215,9 @@ pub unsafe fn vlogq_f32(x: float32x4_t) -> float32x4_t { ))] #[inline(always)] #[allow(dead_code)] -pub unsafe fn visnanq_f32(x: float32x4_t) -> uint32x4_t { return vmvnq_u32(vceqq_f32(x, x)); } +pub unsafe fn visnanq_f32(x: float32x4_t) -> uint32x4_t { + return vmvnq_u32(vceqq_f32(x, x)); +} #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), @@ -212,7 +225,9 @@ pub unsafe fn visnanq_f32(x: float32x4_t) -> uint32x4_t { return vmvnq_u32(vceqq ))] #[inline(always)] #[allow(dead_code)] -pub unsafe fn vispinfq_f32(d: float32x4_t) -> uint32x4_t { return vceqq_f32(d, vdupq_n_f32(f32::INFINITY)); } +pub unsafe fn vispinfq_f32(d: float32x4_t) -> uint32x4_t { + return vceqq_f32(d, vdupq_n_f32(f32::INFINITY)); +} #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), @@ -225,11 +240,14 @@ pub unsafe fn vlogq_f32_ulp35(d: float32x4_t) -> float32x4_t { let o = vceqq_f32(d, vdupq_n_f32(f32::MIN)); let m = (1i64 << 32i64) as f32; let d = vbslq_f32(o, vmulq_f32(d, vdupq_n_f32(m * m)), d); - let e = vilogbk_vi2_vf(vmulq_f32(d, vdupq_n_f32(1.0f32/0.75f32))); + let e = vilogbk_vi2_vf(vmulq_f32(d, vdupq_n_f32(1.0f32 / 0.75f32))); let m = vldexp2q_f32(d, vnegq_s32(e)); let e = vbslq_s32(o, vsubq_s32(e, vdupq_n_s32(64)), e); - let mut x = vdivq_f32(vsubq_f32(m, vdupq_n_f32(1.0f32)), vaddq_f32(vdupq_n_f32(1.0f32), m)); + let mut x = vdivq_f32( + vsubq_f32(m, vdupq_n_f32(1.0f32)), + vaddq_f32(vdupq_n_f32(1.0f32), m), + ); let x2 = vmulq_f32(x, x); let mut t = vdupq_n_f32(0.2392828464508056640625f32); @@ -238,9 +256,17 @@ pub unsafe fn vlogq_f32_ulp35(d: float32x4_t) -> float32x4_t { t 
= vmlafq_f32(t, x2, vdupq_n_f32(0.666666686534881591796875f32)); t = vmlafq_f32(t, x2, vdupq_n_f32(2.0f32)); - x = vmlafq_f32(x, t, vmulq_f32(vdupq_n_f32(std::f32::consts::LN_2), vcvtq_f32_s32(e))); + x = vmlafq_f32( + x, + t, + vmulq_f32(vdupq_n_f32(std::f32::consts::LN_2), vcvtq_f32_s32(e)), + ); x = vbslq_f32(vispinfq_f32(d), vdupq_n_f32(f32::NAN), x); - x = vbslq_f32(vorrq_u32(vcltq_f32(d, vdupq_n_f32(0f32)), visnanq_f32(d)), vdupq_n_f32(f32::NAN), x); + x = vbslq_f32( + vorrq_u32(vcltq_f32(d, vdupq_n_f32(0f32)), visnanq_f32(d)), + vdupq_n_f32(f32::NAN), + x, + ); x = vbslq_f32(vceqq_f32(d, vdupq_n_f32(0f32)), vdupq_n_f32(-f32::NAN), x); return x; } @@ -351,6 +377,47 @@ pub unsafe fn vcbrtq_f32(d: float32x4_t) -> float32x4_t { vpowq_n_f32(d, 1f32 / 3f32) } +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +#[inline(always)] +#[allow(dead_code)] +/// Precise version of Cube Root with ULP 2 +pub unsafe fn vcbrtq_f32_ulp2(x: float32x4_t) -> float32x4_t { + let x1p24 = vreinterpretq_f32_u32(vdupq_n_u32(0x4b800000)); // 0x1p24f === 2 ^ 24 + + let mut ui = vreinterpretq_u32_f32(x); + let hx = vandq_u32(ui, vdupq_n_u32(0x7fffffff)); + + let nan_mask = vcgeq_u32(hx, vdupq_n_u32(0x7f800000)); + let is_zero_mask = vceqzq_u32(hx); + + let lo_mask = vcltq_u32(hx, vdupq_n_u32(0x00800000)); + let hi_ui_f = vreinterpretq_u32_f32(vmulq_f32(x, x1p24)); + let mut lo_hx = vandq_u32(hi_ui_f, vdupq_n_u32(0x7fffffff)); + lo_hx = vaddq_u32(vcvtq_u32_f32(vmulq_n_f32(vcvtq_f32_u32(lo_hx), 1f32/3f32)), vdupq_n_u32(642849266)); + let hi_hx = vaddq_u32(vcvtq_u32_f32(vmulq_n_f32(vcvtq_f32_u32(hx), 1f32/3f32)), vdupq_n_u32(709958130)); + let hx = vbslq_u32(lo_mask, lo_hx, hi_hx); + + ui = vbslq_u32(lo_mask, hi_ui_f, ui); + ui = vandq_u32(ui, vdupq_n_u32(0x80000000)); + ui = vorrq_u32(ui, hx); + + let mut t = vreinterpretq_f32_u32(ui); + let mut r = vmulq_f32(vmulq_f32(t, t), t); + + let sum_x = vaddq_f32(x, x); + + t = vmulq_f32(vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)), t); + + r = vmulq_f32(vmulq_f32(t, t), t); + t = vmulq_f32(vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)), t); + t = vbslq_f32(nan_mask, vdupq_n_f32(f32::NAN), t); + t = vbslq_f32(is_zero_mask, vdupq_n_f32(0f32), t); + t +} + #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" diff --git a/src/neon_to_xyz_lab.rs b/src/neon_to_xyz_lab.rs index 6936176..54b49e2 100644 --- a/src/neon_to_xyz_lab.rs +++ b/src/neon_to_xyz_lab.rs @@ -5,6 +5,8 @@ use crate::image::ImageConfiguration; #[allow(unused_imports)] use crate::image_to_xyz_lab::XyzTarget; #[allow(unused_imports)] +use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; +#[allow(unused_imports)] use crate::neon_gamma_curves::*; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), @@ -63,6 +65,40 @@ pub(crate) unsafe fn neon_triple_to_xyz( (x, y, z) } +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +#[inline(always)] +pub(crate) unsafe fn neon_triple_to_luv( + x: float32x4_t, + y: float32x4_t, + z: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let zeros = vdupq_n_f32(0f32); + let den = prefer_vfmaq_f32( + prefer_vfmaq_f32(x, z, vdupq_n_f32(3f32)), + y, + vdupq_n_f32(15f32), + ); + let nan_mask = vceqzq_f32(den); + let l_low_mask = vcltq_f32(y, vdupq_n_f32(LUV_CUTOFF_FORWARD_Y)); + let y_cbrt = vcbrtq_f32(y); + let l = vbslq_f32( + l_low_mask, + vmulq_n_f32(y, LUV_MULTIPLIER_FORWARD_Y), + 
prefer_vfmaq_f32(vdupq_n_f32(-16f32), y_cbrt, vdupq_n_f32(116f32)), + ); + let u_prime = vdivq_f32(vmulq_n_f32(x, 4f32), den); + let v_prime = vdivq_f32(vmulq_n_f32(y, 9f32), den); + let sub_u_prime = vsubq_f32(u_prime, vdupq_n_f32(crate::luv::LUV_WHITE_U_PRIME)); + let sub_v_prime = vsubq_f32(v_prime, vdupq_n_f32(crate::luv::LUV_WHITE_V_PRIME)); + let l13 = vmulq_n_f32(l, 13f32); + let u = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_u_prime)); + let v = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_v_prime)); + (l, u, v) +} + #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" @@ -191,6 +227,12 @@ pub(crate) unsafe fn neon_channels_to_xyz_or_lab< z_low_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = u; + z_low_low = v; + } } let xyz_low_low = float32x4x3_t(x_low_low, y_low_low, z_low_low); @@ -213,6 +255,12 @@ pub(crate) unsafe fn neon_channels_to_xyz_or_lab< z_low_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = u; + z_low_high = v; + } } let xyz_low_low = float32x4x3_t(x_low_high, y_low_high, z_low_high); @@ -239,6 +287,12 @@ pub(crate) unsafe fn neon_channels_to_xyz_or_lab< z_high_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = u; + z_high_low = v; + } } let xyz_low_low = float32x4x3_t(x_high_low, y_high_low, z_high_low); @@ -272,6 +326,12 @@ pub(crate) unsafe fn neon_channels_to_xyz_or_lab< z_high_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = u; + z_high_high = v; + } } let xyz_low_low = float32x4x3_t(x_high_high, y_high_high, z_high_high); diff --git a/src/neon_to_xyza_laba.rs b/src/neon_to_xyza_laba.rs index 963391a..4a16914 100644 --- a/src/neon_to_xyza_laba.rs +++ b/src/neon_to_xyza_laba.rs @@ -21,6 +21,11 @@ use crate::neon_to_xyz_lab::get_neon_linear_transfer; any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] +use crate::neon_to_xyz_lab::neon_triple_to_luv; +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] use crate::neon_to_xyz_lab::{neon_triple_to_lab, neon_triple_to_xyz}; #[cfg(all( @@ -113,6 +118,12 @@ pub(crate) unsafe fn neon_channels_to_xyza_or_laba< z_low_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = u; + z_low_low = v; + } } let a_low = vmovl_u8(vget_low_u8(a_chan)); @@ -139,6 +150,12 @@ pub(crate) unsafe fn neon_channels_to_xyza_or_laba< z_low_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = u; + z_low_high = v; + } } let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); @@ -167,6 +184,12 @@ pub(crate) unsafe fn neon_channels_to_xyza_or_laba< z_high_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = u; + z_high_low = v; + } } let a_high = vmovl_high_u8(a_chan); @@ -206,6 +229,12 @@ pub(crate) unsafe fn neon_channels_to_xyza_or_laba< z_high_high = b; } 
XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = neon_triple_to_luv(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = u; + z_high_high = v; + } } let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); diff --git a/src/neon_xyz_lab_to_image.rs b/src/neon_xyz_lab_to_image.rs index 24a9332..5a0bcfa 100644 --- a/src/neon_xyz_lab_to_image.rs +++ b/src/neon_xyz_lab_to_image.rs @@ -6,12 +6,17 @@ use crate::image_to_xyz_lab::XyzTarget; any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] +use crate::luv::*; +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] use crate::neon_linear_to_image::get_neon_gamma_transfer; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] -use crate::neon_math::vcolorq_matrix_f32; +use crate::neon_math::*; #[allow(unused_imports)] use crate::TransferFunction; #[cfg(all( @@ -29,6 +34,50 @@ unsafe fn vcubeq_f32(x: float32x4_t) -> float32x4_t { vmulq_f32(vmulq_f32(x, x), x) } +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +#[inline(always)] +pub(crate) unsafe fn neon_luv_to_xyz( + l: float32x4_t, + u: float32x4_t, + v: float32x4_t, +) -> (float32x4_t, float32x4_t, float32x4_t) { + let zero_mask = vclezq_f32(l); + let zeros = vdupq_n_f32(0f32); + let l13 = vrecpeq_f32(vmulq_n_f32(l, 13f32)); + let u = prefer_vfmaq_f32(vdupq_n_f32(LUV_WHITE_U_PRIME), l13, u); + let v = prefer_vfmaq_f32(vdupq_n_f32(LUV_WHITE_V_PRIME), l13, v); + let l_h = vmulq_n_f32(vaddq_f32(l, vdupq_n_f32(16f32)), 1f32 / 116f32); + let y_high = vmulq_f32(vmulq_f32(l_h, l_h), l_h); + let y_low = vmulq_n_f32(l, LUV_MULTIPLIER_INVERSE_Y); + let y = vbslq_f32( + zero_mask, + zeros, + vbslq_f32(vcgtq_f32(l, vdupq_n_f32(8f32)), y_high, y_low), + ); + let zero_mask_2 = vclezq_f32(v); + let den = vrecpeq_f32(vmulq_n_f32(v, 4f32)); + let mut x = vmulq_n_f32(vmulq_f32(vmulq_f32(y, u), den), 9f32); + x = vbslq_f32(zero_mask, zeros, x); + x = vbslq_f32(zero_mask_2, zeros, x); + let mut z = vmulq_f32( + vmulq_f32( + prefer_vfmaq_f32( + prefer_vfmaq_f32(vdupq_n_f32(12f32), vdupq_n_f32(-3f32), u), + v, + vdupq_n_f32(-20f32), + ), + y, + ), + den, + ); + z = vbslq_f32(zero_mask, zeros, z); + z = vbslq_f32(zero_mask_2, zeros, z); + (x, y, z) +} + #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" @@ -94,6 +143,12 @@ pub(crate) unsafe fn neon_xyz_lab_vld< g_f32 = y; b_f32 = z; } + XyzTarget::LUV => { + let (x, y, z) = neon_luv_to_xyz(r_f32, g_f32, b_f32); + r_f32 = x; + g_f32 = y; + b_f32 = z; + } _ => {} } @@ -246,10 +301,9 @@ pub unsafe fn neon_xyz_to_channels< let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); let dst_ptr = dst.add(dst_offset + cx * channels); - + if USE_ALPHA { - let offset_a_src_ptr = - ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); + let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); let a_low_0_f = vld1q_f32(offset_a_src_ptr); let a_row0_ = vcvtq_u32_f32(vmulq_n_f32(a_low_0_f, 255f32)); diff --git a/src/neon_xyza_laba_to_image.rs b/src/neon_xyza_laba_to_image.rs index f4c08bb..665da17 100644 --- a/src/neon_xyza_laba_to_image.rs +++ b/src/neon_xyza_laba_to_image.rs @@ -32,6 +32,11 @@ use crate::TransferFunction; target_feature = "neon" ))] use std::arch::aarch64::*; +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +use 
crate::neon_xyz_lab_to_image::*; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), @@ -64,6 +69,12 @@ pub(crate) unsafe fn neon_xyza_lab_vld { + let (x, y, z) = neon_luv_to_xyz(r_f32, g_f32, b_f32); + r_f32 = x; + g_f32 = y; + b_f32 = z; + } _ => {} } diff --git a/src/rgb_expand.rs b/src/rgb_expand.rs index 6ba0758..fe7b39e 100644 --- a/src/rgb_expand.rs +++ b/src/rgb_expand.rs @@ -31,6 +31,7 @@ pub fn rgb_to_rgba( #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { + #[cfg(target_feature = "sse4.1")] if is_x86_feature_detected!("sse4.1") { _use_sse = true; } diff --git a/src/sse_to_xyz_lab.rs b/src/sse_to_xyz_lab.rs index 0a877a4..36dba83 100644 --- a/src/sse_to_xyz_lab.rs +++ b/src/sse_to_xyz_lab.rs @@ -17,6 +17,7 @@ use crate::x86_64_simd_support::*; use std::arch::x86_64::*; #[cfg(target_arch = "x86")] use std::arch::x86::*; +use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub unsafe fn get_sse_linear_transfer( @@ -59,6 +60,37 @@ unsafe fn sse_triple_to_xyz( (x, y, z) } +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(always)] +pub(crate) unsafe fn sse_triple_to_luv( + x: __m128, + y: __m128, + z: __m128, +) -> (__m128, __m128, __m128) { + let zeros = _mm_setzero_ps(); + let den = _mm_prefer_fma_ps( + _mm_prefer_fma_ps(x, z, _mm_set1_ps(3f32)), + y, + _mm_set1_ps(15f32), + ); + let nan_mask = _mm_cmpeq_ps(den, _mm_set1_ps(0f32)); + let l_low_mask = _mm_cmplt_ps(y, _mm_set1_ps(LUV_CUTOFF_FORWARD_Y)); + let y_cbrt = _mm_cbrt_ps(y); + let l = _mm_select_ps( + l_low_mask, + _mm_mul_ps(y, _mm_set1_ps(LUV_MULTIPLIER_FORWARD_Y)), + _mm_prefer_fma_ps(_mm_set1_ps(-16f32), y_cbrt, _mm_set1_ps(116f32)), + ); + let u_prime = _mm_div_ps(_mm_mul_ps(x, _mm_set1_ps(4f32)), den); + let v_prime = _mm_div_ps(_mm_mul_ps(y, _mm_set1_ps(9f32)), den); + let sub_u_prime = _mm_sub_ps(u_prime, _mm_set1_ps(crate::luv::LUV_WHITE_U_PRIME)); + let sub_v_prime = _mm_sub_ps(v_prime, _mm_set1_ps(crate::luv::LUV_WHITE_V_PRIME)); + let l13 = _mm_mul_ps(l, _mm_set1_ps(13f32)); + let u = _mm_select_ps(nan_mask, zeros, _mm_mul_ps(l13, sub_u_prime)); + let v = _mm_select_ps(nan_mask, zeros, _mm_mul_ps(l13, sub_v_prime)); + (l, u, v) +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] unsafe fn sse_triple_to_lab(x: __m128, y: __m128, z: __m128) -> (__m128, __m128, __m128) { @@ -182,6 +214,12 @@ pub(crate) unsafe fn sse_channels_to_xyz_or_lab< z_low_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = sse_triple_to_luv(x_low_low, y_low_low, z_low_low); + x_low_low = l; + y_low_low = u; + z_low_low = v; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_low, y_low_low, z_low_low); @@ -206,6 +244,12 @@ pub(crate) unsafe fn sse_channels_to_xyz_or_lab< z_low_high = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); + x_low_high = l; + y_low_high = u; + z_low_high = v; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_high, y_low_high, z_low_high); @@ -234,6 +278,12 @@ pub(crate) unsafe fn sse_channels_to_xyz_or_lab< z_high_low = b; } XyzTarget::XYZ => {} + XyzTarget::LUV => { + let (l, u, v) = sse_triple_to_luv(x_high_low, y_high_low, z_high_low); + x_high_low = l; + y_high_low = u; + z_high_low = v; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_low, y_high_low, z_high_low); @@ -269,6 +319,12 @@ pub(crate) unsafe fn sse_channels_to_xyz_or_lab< z_high_high = b; } XyzTarget::XYZ => {} + 
XyzTarget::LUV => { + let (l, u, v) = sse_triple_to_luv(x_high_high, y_high_high, z_high_high); + x_high_high = l; + y_high_high = u; + z_high_high = v; + } } let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_high, y_high_high, z_high_high); diff --git a/src/xyz_lab_to_image.rs b/src/xyz_lab_to_image.rs index 13c8cb0..9c6156a 100644 --- a/src/xyz_lab_to_image.rs +++ b/src/xyz_lab_to_image.rs @@ -3,13 +3,13 @@ use std::slice; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::image_to_xyz_lab::XyzTarget; -use crate::image_to_xyz_lab::XyzTarget::{LAB, XYZ}; +use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] use crate::neon_xyz_lab_to_image::neon_xyz_to_channels; -use crate::{Lab, Xyz, XYZ_TO_SRGB_D65}; +use crate::{Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; fn xyz_to_channels<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, const TARGET: u8>( src: &[f32], @@ -95,6 +95,10 @@ fn xyz_to_channels { + LUV => { + let luv = Luv::new(l_x, l_y, l_z); + rgb = luv.to_rgb(); + } } dst_slice[x * channels + image_configuration.get_r_channel_offset()] = rgb.r; @@ -287,3 +291,67 @@ pub fn xyza_to_rgba( transfer_function, ); } + +/// This function converts LUV to RGB. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains LUV data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive RGB data +/// * `dst_stride` - Bytes per row for dst data +pub fn luv_to_rgb( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, +) { + let empty_vec = vec![]; + xyz_to_channels::<{ ImageConfiguration::Rgb as u8 }, false, { LUV as u8 }>( + src, + src_stride, + &empty_vec, + 0, + dst, + dst_stride, + width, + height, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); +} + +/// This function converts LUV to BGR. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains LUV data +/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive BGR data +/// * `dst_stride` - Bytes per row for dst data +pub fn luv_to_bgr( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, +) { + let empty_vec = vec![]; + xyz_to_channels::<{ ImageConfiguration::Bgr as u8 }, false, { LUV as u8 }>( + src, + src_stride, + &empty_vec, + 0, + dst, + dst_stride, + width, + height, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); +} \ No newline at end of file diff --git a/src/xyza_laba_to_image.rs b/src/xyza_laba_to_image.rs index 5181bf7..cea50a0 100644 --- a/src/xyza_laba_to_image.rs +++ b/src/xyza_laba_to_image.rs @@ -3,13 +3,13 @@ use std::slice; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::image_to_xyz_lab::XyzTarget; -use crate::image_to_xyz_lab::XyzTarget::{LAB, XYZ}; -use crate::{Lab, Xyz, XYZ_TO_SRGB_D65}; +use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ}; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] use crate::neon_xyza_laba_to_image::neon_xyza_to_image; +use crate::{Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; fn xyz_with_alpha_to_channels<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>( src: &[f32], @@ -53,7 +53,6 @@ fn xyz_with_alpha_to_channels { + LUV => { + let luv = Luv::new(l_x, l_y, l_z); + rgb = luv.to_rgb(); + } } let l_a = unsafe { *src_slice.get_unchecked(px + 3) }; @@ -160,3 +163,65 @@ pub fn lab_with_alpha_to_bgra( TransferFunction::Srgb, ); } + +/// This function converts LUV with interleaved alpha channel to RGBA. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains LUV data with an interleaved alpha channel +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive RGBA data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// The alpha channel is read from the fourth component of each pixel, +/// expected in the [0, 1] range, and rescaled to 0..=255 on output. +pub fn luv_with_alpha_to_rgba( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, +) { + xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { LUV as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); +} + +/// This function converts LUV with interleaved alpha channel to BGRA. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains LUV data with an interleaved alpha channel +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive BGRA data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// The alpha channel is read from the fourth component of each pixel, +/// expected in the [0, 1] range, and rescaled to 0..=255 on output. +pub fn luv_with_alpha_to_bgra( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, +) { + xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { LUV as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); +}
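
Usage sketch: a minimal round-trip through the new public L*u*v* API declared above, assuming the crate is linked as `colorutils_rs` and that `rgb_to_luv`/`luv_to_rgb` are re-exported from `src/lib.rs` as shown in this patch; the image dimensions and buffer contents are illustrative only.

use colorutils_rs::{luv_to_rgb, rgb_to_luv};

fn main() {
    let (width, height) = (2u32, 2u32);
    // Interleaved 8-bit RGB input, `width * 3` bytes per row.
    let src: Vec<u8> = vec![255, 0, 0, 0, 255, 0, 0, 0, 255, 128, 128, 128];
    let rgb_stride = width * 3;
    // L*u*v* output is three f32 components per pixel; strides are in bytes.
    let mut luv = vec![0f32; (width * height * 3) as usize];
    let luv_stride = width * 3 * std::mem::size_of::<f32>() as u32;
    rgb_to_luv(&src, rgb_stride, &mut luv, luv_stride, width, height);

    // Convert back to 8-bit RGB through the inverse path.
    let mut back = vec![0u8; (width * height * 3) as usize];
    luv_to_rgb(&luv, luv_stride, &mut back, rgb_stride, width, height);
}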
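
Test sketch: an aarch64-only sanity check for the new `vcbrtq_f32_ulp2`, written as a unit test that could sit at the bottom of `src/neon_math.rs`; the relative tolerance is an assumed bound for the check, not a measured ULP figure.

#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
#[cfg(test)]
mod cbrt_tests {
    use super::vcbrtq_f32_ulp2;
    use std::arch::aarch64::*;

    #[test]
    fn cbrt_ulp2_tracks_scalar_cbrt() {
        for &v in &[0.001f32, 0.5, 1.0, 27.0, 1000.0] {
            // Compare lane 0 of the vector result against the scalar cube root.
            let got = unsafe { vgetq_lane_f32::<0>(vcbrtq_f32_ulp2(vdupq_n_f32(v))) };
            let want = v.cbrt();
            assert!((got - want).abs() <= want * 1e-6, "cbrt({v}): got {got}, want {want}");
        }
    }
}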