Finalize HSV conversion

awxkee · Jun 8, 2024 · 4ceb35b · 4ceb35b
1 parent 226dc54
commit 4ceb35b
Show file tree

Hide file tree

Showing 7 changed files with 141 additions and 37 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,7 +2,7 @@ workspace = { members = ["src/app"] }
 
 [package]
 name = "colorutils-rs"
-version = "0.2.11"
+version = "0.3.0"
 edition = "2021"
 description = "High performance utilities for color format handling and conversion."
 readme = "README.md"

diff --git a/src/app/src/main.rs b/src/app/src/main.rs
@@ -1,3 +1,4 @@
+use std::arch::aarch64::{vdupq_n_f32, vdupq_n_u32, vgetq_lane_f32, vgetq_lane_u32};
 use colorutils_rs::*;
 use image::io::Reader as ImageReader;
 use image::{EncodableLayout, GenericImageView};
@@ -43,54 +44,54 @@ fn main() {
     println!("Back RGB {:?}", hsl.to_rgb8());
 
     // unsafe {
-    //     let (h, s, l) = sse_rgb_to_hsl(
-    //         _mm_set1_epi32(r as i32),
-    //         _mm_set1_epi32(g as i32),
-    //         _mm_set1_epi32(b as i32),
-    //         _mm_set1_ps(1f32),
+    //     let (h, s, l) = neon_rgb_to_hsl(
+    //         vdupq_n_u32(r as u32),
+    //         vdupq_n_u32(g as u32),
+    //         vdupq_n_u32(b as u32),
+    //         vdupq_n_f32(1f32),
     //     );
     //     println!(
     //         "NEON HSL {}, {}, {}",
-    //         f32::from_bits(_mm_extract_ps::<0>(h) as u32),
-    //         f32::from_bits(_mm_extract_ps::<0>(s) as u32),
-    //         f32::from_bits(_mm_extract_ps::<0>(l) as u32)
+    //         vgetq_lane_f32::<0>(h),
+    //         vgetq_lane_f32::<0>(s),
+    //         vgetq_lane_f32::<0>(l)
     //     );
-    //     let (r1, g1, b1) = sse_hsl_to_rgb(h, s, l, _mm_set1_ps(1f32));
+    //     let (r1, g1, b1) = neon_hsl_to_rgb(h, s, l, vdupq_n_f32(1f32));
     //
     //     println!(
     //         "NEON HSL -> RGB {}, {}, {}",
-    //         _mm_extract_epi32::<0>(r1),
-    //         _mm_extract_epi32::<0>(g1),
-    //         _mm_extract_epi32::<0>(b1)
+    //         vgetq_lane_u32::<0>(r1),
+    //         vgetq_lane_u32::<0>(g1),
+    //         vgetq_lane_u32::<0>(b1)
     //     );
     // }
     //
     // unsafe {
-    //     let (h, s, v) = sse_rgb_to_hsv(
-    //         _mm_set1_epi32(r as i32),
-    //         _mm_set1_epi32(g as i32),
-    //         _mm_set1_epi32(b as i32),
-    //         _mm_set1_ps(1f32),
+    //     let (h, s, v) = neon_rgb_to_hsv(
+    //         vdupq_n_u32(r as u32),
+    //         vdupq_n_u32(g as u32),
+    //         vdupq_n_u32(b as u32),
+    //         vdupq_n_f32(1f32),
     //     );
     //     let hsv = rgb.to_hsv();
     //     println!("HSV {:?}", hsv);
     //     println!("HSV->RBB {:?}", hsv.to_rgb8());
     //     println!(
     //         "NEON HSV {}, {}, {}",
-    //         f32::from_bits(_mm_extract_ps::<0>(h) as u32),
-    //         f32::from_bits(_mm_extract_ps::<0>(s) as u32),
-    //         f32::from_bits(_mm_extract_ps::<0>(v) as u32)
+    //         vgetq_lane_f32::<0>(h),
+    //         vgetq_lane_f32::<0>(s),
+    //         vgetq_lane_f32::<0>(v)
     //     );
-    //     let (r1, g1, b1) = sse_hsv_to_rgb(h, s, v, _mm_set1_ps(1f32));
+    //     let (r1, g1, b1) = neon_hsv_to_rgb(h, s, v, vdupq_n_f32(1f32));
     //     println!(
     //         "NEON RGB {}, {}, {}",
-    //         _mm_extract_epi32::<0>(r1),
-    //         _mm_extract_epi32::<0>(g1),
-    //         _mm_extract_epi32::<0>(b1)
+    //         vgetq_lane_u32::<0>(r1),
+    //         vgetq_lane_u32::<0>(g1),
+    //         vgetq_lane_u32::<0>(b1)
     //     );
     // }
 
-    let img = ImageReader::open("./assets/asset.jpg")
+    let img = ImageReader::open("./assets/asset_middle.jpg")
         .unwrap()
         .decode()
         .unwrap();
@@ -125,7 +126,7 @@ fn main() {
         lab_store.resize(width as usize * components * height as usize, 0u16);
         let src_stride = width * components as u32;
         let start_time = Instant::now();
-        rgb_to_hsl(
+        rgb_to_hsv(
             src_bytes,
             src_stride,
             &mut lab_store,
@@ -160,7 +161,7 @@ fn main() {
         // }
 
         let start_time = Instant::now();
-        hsl_to_rgb(
+        hsv_to_rgb(
             &lab_store,
             store_stride as u32,
             &mut dst_slice,

diff --git a/src/hsv_to_image.rs b/src/hsv_to_image.rs
@@ -75,8 +75,8 @@ fn hsv_u16_to_channels<
             target_feature = "neon"
         ))]
         unsafe {
-            cx = neon_hsv_u16_to_image::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
-                cx,
+            _cx = neon_hsv_u16_to_image::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
+                _cx,
                 src.as_ptr(),
                 src_offset,
                 width,

diff --git a/src/neon/neon_hsv_to_image.rs b/src/neon/neon_hsv_to_image.rs
@@ -1,8 +1,8 @@
 use std::arch::aarch64::*;
 
-use crate::{neon_hsl_to_rgb, neon_hsv_to_rgb};
 use crate::image::ImageConfiguration;
 use crate::image_to_hsv_support::HsvTarget;
+use crate::neon::{neon_hsl_to_rgb, neon_hsv_to_rgb};
 
 #[cfg(all(
     any(target_arch = "aarch64", target_arch = "arm"),
@@ -32,11 +32,109 @@ pub unsafe fn neon_hsv_u16_to_image<
     }
 
     let channels = image_configuration.get_channels_count();
-
     let v_scale = vdupq_n_f32(scale);
-
     let dst_ptr = dst.add(dst_offset);
 
+    while cx + 16 < width as usize {
+        let (h_chan, s_chan, v_chan, a_chan_lo);
+        let src_ptr = ((src as *const u8).add(src_offset) as *const u16).add(cx * channels);
+
+        match image_configuration {
+            ImageConfiguration::Rgb | ImageConfiguration::Bgr => {
+                let hsv_pixel = vld3q_u16(src_ptr);
+                h_chan = hsv_pixel.0;
+                s_chan = hsv_pixel.1;
+                v_chan = hsv_pixel.2;
+                a_chan_lo = vdupq_n_u16(255);
+            }
+            ImageConfiguration::Rgba | ImageConfiguration::Bgra => {
+                let hsv_pixel = vld4q_u16(src_ptr);
+                h_chan = hsv_pixel.0;
+                s_chan = hsv_pixel.1;
+                v_chan = hsv_pixel.2;
+                a_chan_lo = hsv_pixel.3;
+            }
+        }
+
+        let h_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(h_chan)));
+        let s_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(s_chan)));
+        let v_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_chan)));
+
+        let (r_low, g_low, b_low) = match target {
+            HsvTarget::HSV => neon_hsv_to_rgb(h_low, s_low, v_low, v_scale),
+            HsvTarget::HSL => neon_hsl_to_rgb(h_low, s_low, v_low, v_scale),
+        };
+
+        let h_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(h_chan)));
+        let s_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(s_chan)));
+        let v_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_chan)));
+
+        let (r_high, g_high, b_high) = match target {
+            HsvTarget::HSV => neon_hsv_to_rgb(h_high, s_high, v_high, v_scale),
+            HsvTarget::HSL => neon_hsl_to_rgb(h_high, s_high, v_high, v_scale),
+        };
+
+        let r_chan_16_lo = vcombine_u16(vmovn_u32(r_low), vmovn_u32(r_high));
+        let g_chan_16_lo = vcombine_u16(vmovn_u32(g_low), vmovn_u32(g_high));
+        let b_chan_16_lo = vcombine_u16(vmovn_u32(b_low), vmovn_u32(b_high));
+
+        let src_ptr = src_ptr.add(8 * channels);
+        let (h_chan, s_chan, v_chan, a_chan_hi);
+        match image_configuration {
+            ImageConfiguration::Rgb | ImageConfiguration::Bgr => {
+                let hsv_pixel = vld3q_u16(src_ptr);
+                h_chan = hsv_pixel.0;
+                s_chan = hsv_pixel.1;
+                v_chan = hsv_pixel.2;
+                a_chan_hi = vdupq_n_u16(255);
+            }
+            ImageConfiguration::Rgba | ImageConfiguration::Bgra => {
+                let hsv_pixel = vld4q_u16(src_ptr);
+                h_chan = hsv_pixel.0;
+                s_chan = hsv_pixel.1;
+                v_chan = hsv_pixel.2;
+                a_chan_hi = hsv_pixel.3;
+            }
+        }
+
+        let h_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(h_chan)));
+        let s_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(s_chan)));
+        let v_low = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_chan)));
+
+        let (r_low, g_low, b_low) = match target {
+            HsvTarget::HSV => neon_hsv_to_rgb(h_low, s_low, v_low, v_scale),
+            HsvTarget::HSL => neon_hsl_to_rgb(h_low, s_low, v_low, v_scale),
+        };
+
+        let h_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(h_chan)));
+        let s_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(s_chan)));
+        let v_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_chan)));
+
+        let (r_high, g_high, b_high) = match target {
+            HsvTarget::HSV => neon_hsv_to_rgb(h_high, s_high, v_high, v_scale),
+            HsvTarget::HSL => neon_hsl_to_rgb(h_high, s_high, v_high, v_scale),
+        };
+
+        let r_chan_16_hi = vcombine_u16(vmovn_u32(r_low), vmovn_u32(r_high));
+        let g_chan_16_hi = vcombine_u16(vmovn_u32(g_low), vmovn_u32(g_high));
+        let b_chan_16_hi = vcombine_u16(vmovn_u32(b_low), vmovn_u32(b_high));
+
+        let r_chan = vcombine_u8(vqmovn_u16(r_chan_16_lo), vqmovn_u16(r_chan_16_hi));
+        let g_chan = vcombine_u8(vqmovn_u16(g_chan_16_lo), vqmovn_u16(g_chan_16_hi));
+        let b_chan = vcombine_u8(vqmovn_u16(b_chan_16_lo), vqmovn_u16(b_chan_16_hi));
+
+        if USE_ALPHA {
+            let a_chan = vcombine_u8(vqmovn_u16(a_chan_lo), vqmovn_u16(a_chan_hi));
+            let pixel_set = uint8x16x4_t(r_chan, g_chan, b_chan, a_chan);
+            vst4q_u8(dst_ptr.add(cx * channels), pixel_set);
+        } else {
+            let pixel_set = uint8x16x3_t(r_chan, g_chan, b_chan);
+            vst3q_u8(dst_ptr.add(cx * channels), pixel_set);
+        }
+
+        cx += 16;
+    }
+
     while cx + 8 < width as usize {
         let (h_chan, s_chan, v_chan, a_chan);
         let src_ptr = ((src as *const u8).add(src_offset) as *const u16).add(cx * channels);

diff --git a/src/neon/neon_image_to_hsv.rs b/src/neon/neon_image_to_hsv.rs
@@ -1,6 +1,6 @@
 use crate::image::ImageConfiguration;
 use crate::image_to_hsv_support::HsvTarget;
-use crate::{neon_rgb_to_hsl, neon_rgb_to_hsv};
+use crate::neon::{neon_rgb_to_hsl, neon_rgb_to_hsv};
 use std::arch::aarch64::*;
 
 #[cfg(all(

diff --git a/src/neon/neon_math.rs b/src/neon/neon_math.rs
@@ -8,8 +8,13 @@ use std::arch::aarch64::*;
 /// Lane-wise floating-point remainder of `a` by `b` over the four `f32` lanes.
 ///
 /// As rewritten in this change, the quotient is formed as `a * vrecpeq_f32(b)`,
 /// truncated toward zero by a round trip through `i32`, and the result is
 /// `a - trunc(quotient) * b`.
 ///
 /// NOTE(review): `vrecpeq_f32` is only a reciprocal *estimate*, so the quotient may
 /// land on the wrong side of an integer when `a` is close to a multiple of `b` —
 /// confirm callers tolerate that, or refine the estimate (e.g. with `vrecpsq_f32`).
 /// NOTE(review): quotients outside the `i32` range saturate in `vcvtq_s32_f32`;
 /// presumably inputs stay small here — verify against callers.
 #[inline(always)]
 #[allow(dead_code)]
 pub(crate) unsafe fn vfmodq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    let scale = vrndq_f32(vmulq_f32(a, vrecpeq_f32(b)));
-    prefer_vfmaq_f32(a, vnegq_f32(scale), b)
+    let dividend_vec = a;
+    let divisor_vec = b;
+    let division = vmulq_f32(dividend_vec, vrecpeq_f32(divisor_vec));
+    let int_part = vcvtq_f32_s32(vcvtq_s32_f32(division));
+    let product = vmulq_f32(int_part, divisor_vec);
+    let remainder = vsubq_f32(dividend_vec, product);
+    remainder
 }
 
 #[cfg(all(