Skip to content

Commit

Permalink
Added lalphabeta, improvements SSE
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Aug 3, 2024
1 parent 23aa806 commit ec13222
Show file tree
Hide file tree
Showing 12 changed files with 1,003 additions and 41 deletions.
134 changes: 133 additions & 1 deletion src/avx/to_sigmoidal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::avx::routines::avx_vld_u8_and_deinterleave;
use crate::avx::routines::{
avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half,
avx_vld_u8_and_deinterleave_quarter,
};
use crate::avx::sigmoidal::avx_rgb_to_sigmoidal;
use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps};
use crate::image::ImageConfiguration;
Expand Down Expand Up @@ -191,5 +194,134 @@ pub unsafe fn avx_image_to_sigmoidal_row<
cx += 32;
}

while cx + 16 < width as usize {
let src_ptr = src.add(cx * channels);
let (r_chan, g_chan, b_chan, a_chan) =
avx_vld_u8_and_deinterleave_half::<CHANNELS_CONFIGURATION>(src_ptr);

let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan));
let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan));
let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan));
let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan));

let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low));
let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low));
let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low));

let (x_low_low, y_low_low, z_low_low) =
avx_rgb_to_sigmoidal(r_low_low, g_low_low, b_low_low);

let u8_scale = _mm256_set1_ps(1f32 / 255f32);

if USE_ALPHA {
let a_low_low = _mm256_mul_ps(
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))),
u8_scale,
);

let ptr = dst_ptr.add(cx * channels);
avx_store_and_interleave_v4_f32!(
ptr,
image_configuration,
x_low_low,
y_low_low,
z_low_low,
a_low_low
);
} else {
let ptr = dst_ptr.add(cx * channels);
avx_store_and_interleave_v3_f32!(
ptr,
image_configuration,
x_low_low,
y_low_low,
z_low_low
);
}

let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low));
let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low));
let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low));

let (x_low_high, y_low_high, z_low_high) =
avx_rgb_to_sigmoidal(r_low_high, g_low_high, b_low_high);

if USE_ALPHA {
let a_low_high = _mm256_mul_ps(
_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))),
u8_scale,
);

let ptr = dst_ptr.add(cx * channels + 8 * channels);
avx_store_and_interleave_v4_f32!(
ptr,
image_configuration,
x_low_high,
y_low_high,
z_low_high,
a_low_high
);
} else {
let ptr = dst_ptr.add(cx * channels + 8 * channels);
avx_store_and_interleave_v3_f32!(
ptr,
image_configuration,
x_low_high,
y_low_high,
z_low_high
);
}

cx += 16;
}

while cx + 8 < width as usize {
let src_ptr = src.add(cx * channels);
let (r_chan, g_chan, b_chan, a_chan) =
avx_vld_u8_and_deinterleave_quarter::<CHANNELS_CONFIGURATION>(src_ptr);

let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan));
let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan));
let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan));
let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan));

let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low));
let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low));
let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low));

let (x_low_low, y_low_low, z_low_low) =
avx_rgb_to_sigmoidal(r_low_low, g_low_low, b_low_low);

let u8_scale = _mm256_set1_ps(1f32 / 255f32);

if USE_ALPHA {
let a_low_low = _mm256_mul_ps(
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))),
u8_scale,
);

let ptr = dst_ptr.add(cx * channels);
avx_store_and_interleave_v4_f32!(
ptr,
image_configuration,
x_low_low,
y_low_low,
z_low_low,
a_low_low
);
} else {
let ptr = dst_ptr.add(cx * channels);
avx_store_and_interleave_v3_f32!(
ptr,
image_configuration,
x_low_low,
y_low_low,
z_low_low
);
}

cx += 8;
}

cx
}
194 changes: 194 additions & 0 deletions src/image_to_lalphabeta.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
/*
* // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved.
* //
* // Use of this source code is governed by a BSD-style
* // license that can be found in the LICENSE file.
*/
use crate::image::ImageConfiguration;
use crate::{Rgb, TransferFunction};

/// Converts a strided image of interleaved 8-bit pixels into *lαβ* `f32` pixels.
///
/// The channel order is selected at compile time by `CHANNELS_CONFIGURATION`, which is
/// converted into an [`ImageConfiguration`]. Every destination pixel occupies as many
/// `f32` values as the source pixel has channels: `l`, `alpha`, `beta`, followed by the
/// source alpha multiplied by `1 / 255` when the configuration has an alpha channel.
///
/// # Arguments
/// * `src` - Interleaved 8-bit source pixels
/// * `src_stride` - Distance in bytes between the starts of consecutive `src` rows
/// * `dst` - Destination buffer receiving the converted `f32` pixels
/// * `dst_stride` - Distance in bytes between the starts of consecutive `dst` rows
/// * `width` - Number of pixels per row
/// * `height` - Number of rows
/// * `transfer_function` - Forwarded to `Rgb::to_lalphabeta` for every pixel
///
/// # Panics
/// Panics if `src` or `dst` is too small to hold `height` rows of `width` pixels at the
/// given strides, or if the required sizes overflow `usize`. Nothing is checked (and
/// nothing is touched) when `width` or `height` is zero.
#[inline(always)]
fn channels_to_lalphabeta<const CHANNELS_CONFIGURATION: u8>(
    src: &[u8],
    src_stride: u32,
    dst: &mut [f32],
    dst_stride: u32,
    width: u32,
    height: u32,
    transfer_function: TransferFunction,
) {
    let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
    let channels = image_configuration.get_channels_count();

    let width = width as usize;
    let height = height as usize;
    if width == 0 || height == 0 {
        return;
    }
    let src_stride = src_stride as usize;
    let dst_stride = dst_stride as usize;

    // Loop-invariant layout information, resolved once instead of per pixel.
    let has_alpha = image_configuration.has_alpha();
    let r_offset = image_configuration.get_r_channel_offset();
    let g_offset = image_configuration.get_g_channel_offset();
    let b_offset = image_configuration.get_b_channel_offset();
    // The alpha offset is only queried for layouts that actually carry alpha.
    let a_offset = if has_alpha {
        Some(image_configuration.get_a_channel_offset())
    } else {
        None
    };

    // Each destination pixel receives three values, plus one more for alpha.
    let written_per_pixel = if has_alpha { 4 } else { 3 };
    assert!(
        channels >= written_per_pixel,
        "image configuration has fewer channels than values written per pixel"
    );

    // Validate both buffers up front so the raw-pointer writes below stay in bounds.
    let samples_per_row = width
        .checked_mul(channels)
        .expect("row length overflows usize");
    let src_required = (height - 1)
        .checked_mul(src_stride)
        .and_then(|v| v.checked_add(samples_per_row))
        .expect("source size overflows usize");
    let dst_row_bytes = samples_per_row
        .checked_mul(std::mem::size_of::<f32>())
        .expect("destination row size overflows usize");
    let dst_required = (height - 1)
        .checked_mul(dst_stride)
        .and_then(|v| v.checked_add(dst_row_bytes))
        .expect("destination size overflows usize");
    assert!(
        src.len() >= src_required,
        "source slice is too small for the given dimensions and stride"
    );
    assert!(
        dst.len() * std::mem::size_of::<f32>() >= dst_required,
        "destination slice is too small for the given dimensions and stride"
    );

    // The destination stride is expressed in bytes and is not required to be a multiple
    // of four, so rows are addressed through a byte pointer and written unaligned.
    let dst_base = dst.as_mut_ptr().cast::<u8>();

    for row in 0..height {
        let src_start = row * src_stride;
        let src_row = &src[src_start..src_start + samples_per_row];

        // SAFETY: `row * dst_stride + dst_row_bytes <= dst_required`, and `dst_required`
        // was asserted above to fit inside `dst`, so the row start is in bounds.
        let dst_row = unsafe { dst_base.add(row * dst_stride).cast::<f32>() };

        for (x, pixel) in src_row.chunks_exact(channels).enumerate() {
            let rgb = Rgb::<u8>::new(pixel[r_offset], pixel[g_offset], pixel[b_offset]);
            let lalphabeta = rgb.to_lalphabeta(transfer_function);

            // SAFETY: pixel `x` owns `channels` consecutive `f32` slots inside the row
            // validated above, and `channels >= written_per_pixel` was asserted, so
            // every slot written here belongs to this pixel.
            unsafe {
                let dst_store = dst_row.add(x * channels);
                dst_store.write_unaligned(lalphabeta.l);
                dst_store.add(1).write_unaligned(lalphabeta.alpha);
                dst_store.add(2).write_unaligned(lalphabeta.beta);

                if let Some(a_offset) = a_offset {
                    let a_lin = f32::from(pixel[a_offset]) * (1f32 / 255f32);
                    dst_store.add(3).write_unaligned(a_lin);
                }
            }
        }
    }
}

/// Converts an 8-bit RGB image into *lαβ* `f32` data.
///
/// Thin entry point that runs the shared channel converter with the
/// `ImageConfiguration::Rgb` layout. Each pixel is written as `l`, `alpha`, `beta`.
///
/// # Arguments
/// * `src` - Slice holding the RGB source pixels
/// * `src_stride` - Bytes per row of `src`
/// * `dst` - Mutable slice receiving the *lαβ* values
/// * `dst_stride` - Bytes per row of `dst`
/// * `width` - Image width in pixels
/// * `height` - Image height in pixels
/// * `transfer_function` - Transfer function used when converting each pixel
pub fn rgb_to_lalphabeta(
    src: &[u8],
    src_stride: u32,
    dst: &mut [f32],
    dst_stride: u32,
    width: u32,
    height: u32,
    transfer_function: TransferFunction,
) {
    // Source channel layout handled by this entry point.
    const LAYOUT: u8 = ImageConfiguration::Rgb as u8;

    channels_to_lalphabeta::<LAYOUT>(
        src,
        src_stride,
        dst,
        dst_stride,
        width,
        height,
        transfer_function,
    )
}

/// Converts an 8-bit RGBA image into *lαβ* `f32` data, keeping the alpha channel.
///
/// Thin entry point that runs the shared channel converter with the
/// `ImageConfiguration::Rgba` layout. Each pixel is written as `l`, `alpha`, `beta`,
/// followed by the source alpha multiplied by `1 / 255` in the last position.
///
/// # Arguments
/// * `src` - Slice holding the RGBA source pixels
/// * `src_stride` - Bytes per row of `src`
/// * `dst` - Mutable slice receiving the *lαβ* values and the normalized alpha
/// * `dst_stride` - Bytes per row of `dst`
/// * `width` - Image width in pixels
/// * `height` - Image height in pixels
/// * `transfer_function` - Transfer function used when converting each pixel
pub fn rgba_to_lalphabeta(
    src: &[u8],
    src_stride: u32,
    dst: &mut [f32],
    dst_stride: u32,
    width: u32,
    height: u32,
    transfer_function: TransferFunction,
) {
    // Source channel layout handled by this entry point.
    const LAYOUT: u8 = ImageConfiguration::Rgba as u8;

    channels_to_lalphabeta::<LAYOUT>(
        src,
        src_stride,
        dst,
        dst_stride,
        width,
        height,
        transfer_function,
    )
}

/// Converts an 8-bit BGRA image into *lαβ* `f32` data, keeping the alpha channel.
///
/// Thin entry point that runs the shared channel converter with the
/// `ImageConfiguration::Bgra` layout. Each pixel is written as `l`, `alpha`, `beta`,
/// followed by the source alpha multiplied by `1 / 255` in the last position.
///
/// # Arguments
/// * `src` - Slice holding the BGRA source pixels
/// * `src_stride` - Bytes per row of `src`
/// * `dst` - Mutable slice receiving the *lαβ* values and the normalized alpha
/// * `dst_stride` - Bytes per row of `dst`
/// * `width` - Image width in pixels
/// * `height` - Image height in pixels
/// * `transfer_function` - Transfer function used when converting each pixel
pub fn bgra_to_lalphabeta(
    src: &[u8],
    src_stride: u32,
    dst: &mut [f32],
    dst_stride: u32,
    width: u32,
    height: u32,
    transfer_function: TransferFunction,
) {
    // Source channel layout handled by this entry point.
    const LAYOUT: u8 = ImageConfiguration::Bgra as u8;

    channels_to_lalphabeta::<LAYOUT>(
        src,
        src_stride,
        dst,
        dst_stride,
        width,
        height,
        transfer_function,
    )
}

/// Converts an 8-bit BGR image into *lαβ* `f32` data.
///
/// Thin entry point that runs the shared channel converter with the
/// `ImageConfiguration::Bgr` layout. Each pixel is written as `l`, `alpha`, `beta`.
///
/// # Arguments
/// * `src` - Slice holding the BGR source pixels
/// * `src_stride` - Bytes per row of `src`
/// * `dst` - Mutable slice receiving the *lαβ* values
/// * `dst_stride` - Bytes per row of `dst`
/// * `width` - Image width in pixels
/// * `height` - Image height in pixels
/// * `transfer_function` - Transfer function used when converting each pixel
pub fn bgr_to_lalphabeta(
    src: &[u8],
    src_stride: u32,
    dst: &mut [f32],
    dst_stride: u32,
    width: u32,
    height: u32,
    transfer_function: TransferFunction,
) {
    // Source channel layout handled by this entry point.
    const LAYOUT: u8 = ImageConfiguration::Bgr as u8;

    channels_to_lalphabeta::<LAYOUT>(
        src,
        src_stride,
        dst,
        dst_stride,
        width,
        height,
        transfer_function,
    )
}
Loading

0 comments on commit ec13222

Please sign in to comment.