Added lalphabeta, improvements SSE

awxkee · Aug 3, 2024 · ec13222 · ec13222
1 parent 23aa806
commit ec13222
Show file tree

Hide file tree

Showing 12 changed files with 1,003 additions and 41 deletions.
diff --git a/src/avx/to_sigmoidal.rs b/src/avx/to_sigmoidal.rs
@@ -10,7 +10,10 @@ use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
-use crate::avx::routines::avx_vld_u8_and_deinterleave;
+use crate::avx::routines::{
+    avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half,
+    avx_vld_u8_and_deinterleave_quarter,
+};
 use crate::avx::sigmoidal::avx_rgb_to_sigmoidal;
 use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps};
 use crate::image::ImageConfiguration;
@@ -191,5 +194,134 @@ pub unsafe fn avx_image_to_sigmoidal_row<
         cx += 32;
     }
 
+    while cx + 16 < width as usize {
+        let src_ptr = src.add(cx * channels);
+        let (r_chan, g_chan, b_chan, a_chan) =
+            avx_vld_u8_and_deinterleave_half::<CHANNELS_CONFIGURATION>(src_ptr);
+
+        let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan));
+        let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan));
+        let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan));
+        let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan));
+
+        let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low));
+        let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low));
+        let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low));
+
+        let (x_low_low, y_low_low, z_low_low) =
+            avx_rgb_to_sigmoidal(r_low_low, g_low_low, b_low_low);
+
+        let u8_scale = _mm256_set1_ps(1f32 / 255f32);
+
+        if USE_ALPHA {
+            let a_low_low = _mm256_mul_ps(
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))),
+                u8_scale,
+            );
+
+            let ptr = dst_ptr.add(cx * channels);
+            avx_store_and_interleave_v4_f32!(
+                ptr,
+                image_configuration,
+                x_low_low,
+                y_low_low,
+                z_low_low,
+                a_low_low
+            );
+        } else {
+            let ptr = dst_ptr.add(cx * channels);
+            avx_store_and_interleave_v3_f32!(
+                ptr,
+                image_configuration,
+                x_low_low,
+                y_low_low,
+                z_low_low
+            );
+        }
+
+        let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low));
+        let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low));
+        let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low));
+
+        let (x_low_high, y_low_high, z_low_high) =
+            avx_rgb_to_sigmoidal(r_low_high, g_low_high, b_low_high);
+
+        if USE_ALPHA {
+            let a_low_high = _mm256_mul_ps(
+                _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))),
+                u8_scale,
+            );
+
+            let ptr = dst_ptr.add(cx * channels + 8 * channels);
+            avx_store_and_interleave_v4_f32!(
+                ptr,
+                image_configuration,
+                x_low_high,
+                y_low_high,
+                z_low_high,
+                a_low_high
+            );
+        } else {
+            let ptr = dst_ptr.add(cx * channels + 8 * channels);
+            avx_store_and_interleave_v3_f32!(
+                ptr,
+                image_configuration,
+                x_low_high,
+                y_low_high,
+                z_low_high
+            );
+        }
+
+        cx += 16;
+    }
+
+    while cx + 8 < width as usize {
+        let src_ptr = src.add(cx * channels);
+        let (r_chan, g_chan, b_chan, a_chan) =
+            avx_vld_u8_and_deinterleave_quarter::<CHANNELS_CONFIGURATION>(src_ptr);
+
+        let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan));
+        let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan));
+        let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan));
+        let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan));
+
+        let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low));
+        let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low));
+        let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low));
+
+        let (x_low_low, y_low_low, z_low_low) =
+            avx_rgb_to_sigmoidal(r_low_low, g_low_low, b_low_low);
+
+        let u8_scale = _mm256_set1_ps(1f32 / 255f32);
+
+        if USE_ALPHA {
+            let a_low_low = _mm256_mul_ps(
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))),
+                u8_scale,
+            );
+
+            let ptr = dst_ptr.add(cx * channels);
+            avx_store_and_interleave_v4_f32!(
+                ptr,
+                image_configuration,
+                x_low_low,
+                y_low_low,
+                z_low_low,
+                a_low_low
+            );
+        } else {
+            let ptr = dst_ptr.add(cx * channels);
+            avx_store_and_interleave_v3_f32!(
+                ptr,
+                image_configuration,
+                x_low_low,
+                y_low_low,
+                z_low_low
+            );
+        }
+
+        cx += 8;
+    }
+
     cx
 }
diff --git a/src/image_to_lalphabeta.rs b/src/image_to_lalphabeta.rs
@@ -0,0 +1,194 @@
+/*
+ * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved.
+ * //
+ * // Use of this source code is governed by a BSD-style
+ * // license that can be found in the LICENSE file.
+ */
+use crate::image::ImageConfiguration;
+use crate::{Rgb, TransferFunction};
+
+/// Converts an 8-bit image into *lαβ* samples stored as `f32`.
+///
+/// The channel layout is chosen at compile time through `CHANNELS_CONFIGURATION`, which is
+/// converted into an `ImageConfiguration`. For every pixel three floats (`l`, `alpha`, `beta`)
+/// are written to `dst`; when the configuration has an alpha channel a fourth float holding
+/// `a / 255` follows them. The destination therefore uses the same number of channels per
+/// pixel as the source.
+///
+/// # Arguments
+/// * `src` - Source pixel bytes
+/// * `src_stride` - Distance in bytes between the starts of two consecutive rows of `src`
+/// * `dst` - Destination buffer for the `f32` samples
+/// * `dst_stride` - Distance in bytes between the starts of two consecutive rows of `dst`
+/// * `width` - Image width in pixels
+/// * `height` - Image height in pixels
+/// * `transfer_function` - Transfer function forwarded to `Rgb::to_lalphabeta`
+///
+/// # Panics
+/// Panics if `src` or `dst` is too small to hold `height` rows of `width` pixels at the given
+/// strides, or if the required size does not fit in `usize`.
+#[inline(always)]
+fn channels_to_lalphabeta<const CHANNELS_CONFIGURATION: u8>(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+    transfer_function: TransferFunction,
+) {
+    let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
+
+    let channels = image_configuration.get_channels_count();
+
+    let width = width as usize;
+    let height = height as usize;
+    let src_stride = src_stride as usize;
+    let dst_stride = dst_stride as usize;
+
+    // Nothing to convert; also keeps `height - 1` below from underflowing.
+    if width == 0 || height == 0 {
+        return;
+    }
+
+    let row_samples = width
+        .checked_mul(channels)
+        .expect("image row size overflows usize");
+    let dst_row_bytes = row_samples
+        .checked_mul(std::mem::size_of::<f32>())
+        .expect("image row size overflows usize");
+
+    // Bytes needed so that the last row (starting at `(height - 1) * stride`) fits entirely.
+    let required_bytes = |stride: usize, row_bytes: usize| -> usize {
+        (height - 1)
+            .checked_mul(stride)
+            .and_then(|offset| offset.checked_add(row_bytes))
+            .expect("image size overflows usize")
+    };
+
+    assert!(
+        src.len() >= required_bytes(src_stride, row_samples),
+        "src is too small for the requested width, height and stride"
+    );
+    assert!(
+        dst.len() * std::mem::size_of::<f32>() >= required_bytes(dst_stride, dst_row_bytes),
+        "dst is too small for the requested width, height and stride"
+    );
+
+    // The layout is fixed for the whole image, so resolve the offsets once.
+    let r_offset = image_configuration.get_r_channel_offset();
+    let g_offset = image_configuration.get_g_channel_offset();
+    let b_offset = image_configuration.get_b_channel_offset();
+    let a_offset = if image_configuration.has_alpha() {
+        Some(image_configuration.get_a_channel_offset())
+    } else {
+        None
+    };
+
+    // `dst_stride` is expressed in bytes and is not required to be a multiple of 4, so rows
+    // are addressed through a byte pointer and written with unaligned stores.
+    let dst_base = dst.as_mut_ptr() as *mut u8;
+
+    for y in 0..height {
+        let src_row = &src[y * src_stride..][..row_samples];
+
+        // SAFETY: the assertion on `dst` guarantees that `y * dst_stride + dst_row_bytes`
+        // bytes lie inside the destination buffer for every `y < height`.
+        let dst_row = unsafe { dst_base.add(y * dst_stride) as *mut f32 };
+
+        for (x, px) in src_row.chunks_exact(channels).enumerate() {
+            let rgb = Rgb::<u8>::new(px[r_offset], px[g_offset], px[b_offset]);
+            let lalphabeta = rgb.to_lalphabeta(transfer_function);
+
+            // SAFETY: `x < width`, so the `channels` floats starting at `x * channels` are
+            // within the `row_samples` floats of this row, which were validated above.
+            unsafe {
+                let dst_store = dst_row.add(x * channels);
+                dst_store.write_unaligned(lalphabeta.l);
+                dst_store.add(1).write_unaligned(lalphabeta.alpha);
+                dst_store.add(2).write_unaligned(lalphabeta.beta);
+
+                if let Some(a_offset) = a_offset {
+                    let a_lin = px[a_offset] as f32 * (1f32 / 255f32);
+                    dst_store.add(3).write_unaligned(a_lin);
+                }
+            }
+        }
+    }
+}
+
+/// Converts an RGB image to *lαβ* against the D65 white point.
+///
+/// This is much more efficient than a naive direct transformation.
+///
+/// # Arguments
+/// * `src` - A slice containing RGB data
+/// * `src_stride` - Bytes per row of the source data
+/// * `dst` - A mutable slice that receives the *lαβ* data
+/// * `dst_stride` - Bytes per row of the destination data
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `transfer_function` - Transfer function used to move into linear colorspace
+pub fn rgb_to_lalphabeta(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+    transfer_function: TransferFunction,
+) {
+    // Channel layout handled by this entry point.
+    const CONFIGURATION: u8 = ImageConfiguration::Rgb as u8;
+
+    channels_to_lalphabeta::<CONFIGURATION>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        width,
+        height,
+        transfer_function,
+    );
+}
+
+/// Converts an RGBA image to *lαβ* against the D65 white point.
+///
+/// The alpha channel is preserved: it is normalized and kept in the last position of each
+/// pixel. This is much more efficient than a naive direct transformation.
+///
+/// # Arguments
+/// * `src` - A slice containing RGBA data
+/// * `src_stride` - Bytes per row of the source data
+/// * `dst` - A mutable slice that receives the *lαβ* data followed by the normalized alpha
+/// * `dst_stride` - Bytes per row of the destination data
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `transfer_function` - Transfer function used to move into linear colorspace
+pub fn rgba_to_lalphabeta(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+    transfer_function: TransferFunction,
+) {
+    // Channel layout handled by this entry point.
+    const CONFIGURATION: u8 = ImageConfiguration::Rgba as u8;
+
+    channels_to_lalphabeta::<CONFIGURATION>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        width,
+        height,
+        transfer_function,
+    );
+}
+
+/// Converts a BGRA image to *lαβ* against the D65 white point.
+///
+/// The alpha channel is preserved: it is normalized and kept in the last position of each
+/// pixel. This is much more efficient than a naive direct transformation.
+///
+/// # Arguments
+/// * `src` - A slice containing BGRA data
+/// * `src_stride` - Bytes per row of the source data
+/// * `dst` - A mutable slice that receives the *lαβ* data followed by the normalized alpha
+/// * `dst_stride` - Bytes per row of the destination data
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `transfer_function` - Transfer function used to move into linear colorspace
+pub fn bgra_to_lalphabeta(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+    transfer_function: TransferFunction,
+) {
+    // Channel layout handled by this entry point.
+    const CONFIGURATION: u8 = ImageConfiguration::Bgra as u8;
+
+    channels_to_lalphabeta::<CONFIGURATION>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        width,
+        height,
+        transfer_function,
+    );
+}
+
+/// Converts a BGR image to *lαβ* against the D65 white point.
+///
+/// This is much more efficient than a naive direct transformation.
+///
+/// # Arguments
+/// * `src` - A slice containing BGR data
+/// * `src_stride` - Bytes per row of the source data
+/// * `dst` - A mutable slice that receives the *lαβ* data
+/// * `dst_stride` - Bytes per row of the destination data
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `transfer_function` - Transfer function used to move into linear colorspace
+pub fn bgr_to_lalphabeta(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+    transfer_function: TransferFunction,
+) {
+    // Channel layout handled by this entry point.
+    const CONFIGURATION: u8 = ImageConfiguration::Bgr as u8;
+
+    channels_to_lalphabeta::<CONFIGURATION>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        width,
+        height,
+        transfer_function,
+    );
+}