diff --git a/src/app/src/main.rs b/src/app/src/main.rs
index dbf9e34..2bfae04 100644
--- a/src/app/src/main.rs
+++ b/src/app/src/main.rs
@@ -66,7 +66,7 @@ fn main() {
     lab_store.resize(width as usize * components * height as usize, 0f32);
     let src_stride = width * components as u32;
     let start_time = Instant::now();
-    rgb_to_oklab(
+    rgb_to_jzazbz(
        src_bytes,
        src_stride,
        &mut lab_store,
@@ -101,7 +101,7 @@ fn main() {
     // }
     let start_time = Instant::now();
-    oklab_to_rgb(
+    jzazbz_to_rgb(
        &lab_store,
        store_stride as u32,
        &mut dst_slice,
diff --git a/src/image_to_jzazbz.rs b/src/image_to_jzazbz.rs
new file mode 100644
index 0000000..3f14ddb
--- /dev/null
+++ b/src/image_to_jzazbz.rs
@@ -0,0 +1,407 @@
+/*
+ * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved.
+ * //
+ * // Use of this source code is governed by a BSD-style
+ * // license that can be found in the LICENSE file.
+ */
+use crate::image::ImageConfiguration;
+use crate::image_to_jzazbz::JzazbzTarget::{JZAZBZ, JZCZHZ};
+#[cfg(all(
+    any(target_arch = "aarch64", target_arch = "arm"),
+    target_feature = "neon"
+))]
+use crate::neon::neon_image_to_jzazbz;
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    target_feature = "sse4.1"
+))]
+use crate::sse::sse_image_to_jzazbz;
+use crate::{Jzazbz, Jzczhz, Rgb, TransferFunction};
+
+#[repr(u8)]
+#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+pub(crate) enum JzazbzTarget {
+    JZAZBZ = 0,
+    JZCZHZ = 1,
+}
+
+impl From<u8> for JzazbzTarget {
+    fn from(value: u8) -> Self {
+        match value {
+            0 => JZAZBZ,
+            1 => JZCZHZ,
+            _ => {
+                panic!("Not known value {}", value)
+            }
+        }
+    }
+}
+
+#[inline(always)]
+fn channels_to_jzaz<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+    transfer_function: TransferFunction,
+) {
+    let target: JzazbzTarget = TARGET.into();
+    let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
+
+    let channels = image_configuration.get_channels_count();
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    let mut _has_sse = false;
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    if is_x86_feature_detected!("sse4.1") {
+        _has_sse = true;
+    }
+
+    let mut src_offset = 0usize;
+    let mut dst_offset = 0usize;
+
+    for _ in 0..height as usize {
+        let mut _cx = 0usize;
+
+        let src_ptr = unsafe { src.as_ptr().add(src_offset) };
+        let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 };
+
+        #[cfg(all(
+            any(target_arch = "aarch64", target_arch = "arm"),
+            target_feature = "neon"
+        ))]
+        unsafe {
+            _cx = neon_image_to_jzazbz::<CHANNELS_CONFIGURATION, TARGET>(
+                _cx,
+                src.as_ptr(),
+                src_offset,
+                width,
+                dst.as_mut_ptr(),
+                dst_offset,
+                transfer_function,
+            )
+        }
+
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "sse4.1"
+        ))]
+        unsafe {
+            if _has_sse {
+                _cx = sse_image_to_jzazbz::<CHANNELS_CONFIGURATION, TARGET>(
+                    _cx,
+                    src.as_ptr(),
+                    src_offset,
+                    width,
+                    dst.as_mut_ptr(),
+                    dst_offset,
+                    transfer_function,
+                )
+            }
+        }
+
+        for x in _cx..width as usize {
+            let px = x * channels;
+
+            let src = unsafe { src_ptr.add(px) };
+            let r = unsafe {
+                src.add(image_configuration.get_r_channel_offset())
+                    .read_unaligned()
+            };
+            let g = unsafe {
+                src.add(image_configuration.get_g_channel_offset())
+                    .read_unaligned()
+            };
+            let b = unsafe {
+                src.add(image_configuration.get_b_channel_offset())
+                    .read_unaligned()
+            };
+
+            let rgb = Rgb::<u8>::new(r, g, b);
+
+            let dst_store = unsafe {
dst_ptr.add(px) }; + + match target { + JZAZBZ => { + let jzazbz = Jzazbz::from_rgb(rgb, transfer_function); + unsafe { + dst_store.write_unaligned(jzazbz.jz); + dst_store.add(1).write_unaligned(jzazbz.az); + dst_store.add(2).write_unaligned(jzazbz.bz); + } + } + JZCZHZ => { + let jzczhz = Jzczhz::from_rgb(rgb, transfer_function); + unsafe { + dst_store.write_unaligned(jzczhz.jz); + dst_store.add(1).write_unaligned(jzczhz.cz); + dst_store.add(2).write_unaligned(jzczhz.hz); + } + } + } + + if image_configuration.has_alpha() { + let a = unsafe { + src.add(image_configuration.get_a_channel_offset()) + .read_unaligned() + }; + let a_lin = a as f32 * (1f32 / 255f32); + unsafe { + dst_store.add(3).write_unaligned(a_lin); + } + } + } + + src_offset += src_stride as usize; + dst_offset += dst_stride as usize; + } +} + +/// This function converts RGB to Jzazbz against D65 white point. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGB data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive Jzazbz(a) data +/// * `dst_stride` - Bytes per row for dst data +/// * `transfer_function` - transfer function to linear colorspace +pub fn rgb_to_jzazbz( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + channels_to_jzaz::<{ ImageConfiguration::Rgb as u8 }, { JZAZBZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts RGBA to Jzazbz against D65 white point and preserving and normalizing alpha channels keeping it at last positions. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGBA data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive Jzazbz(a) data +/// * `dst_stride` - Bytes per row for dst data +/// * `transfer_function` - transfer function to linear colorspace +pub fn rgba_to_jzazbz( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + channels_to_jzaz::<{ ImageConfiguration::Rgba as u8 }, { JZAZBZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts BGRA to Jzazbz against D65 white point and preserving and normalizing alpha channels keeping it at last positions. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains BGRA data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive Jzazbz(a) data +/// * `dst_stride` - Bytes per row for dst data +/// * `transfer_function` - transfer function to linear colorspace +pub fn bgra_to_jzazbz( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + channels_to_jzaz::<{ ImageConfiguration::Bgra as u8 }, { JZAZBZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts BGR to Jzazbz against D65 white point. 
This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains BGR data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive Jzazbz(a) data +/// * `dst_stride` - Bytes per row for dst data +/// * `transfer_function` - transfer function to linear colorspace +pub fn bgr_to_jzazbz( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + channels_to_jzaz::<{ ImageConfiguration::Bgr as u8 }, { JZAZBZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts RGB to Jzczhz against D65 white point. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGB data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive Jzczhz(a) data +/// * `dst_stride` - Bytes per row for dst data +/// * `transfer_function` - transfer function to linear colorspace +pub fn rgb_to_jzczhz( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + channels_to_jzaz::<{ ImageConfiguration::Rgb as u8 }, { JZCZHZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts RGBA to Jzczhz against D65 white point and preserving and normalizing alpha channels keeping it at last positions. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGBA data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive Jzczhz(a) data +/// * `dst_stride` - Bytes per row for dst data +/// * `transfer_function` - transfer function to linear colorspace +pub fn rgba_to_jzczhz( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + channels_to_jzaz::<{ ImageConfiguration::Rgba as u8 }, { JZCZHZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts BGRA to Jzczhz against D65 white point and preserving and normalizing alpha channels keeping it at last positions. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains BGRA data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive Jzczhz(a) data +/// * `dst_stride` - Bytes per row for dst data +/// * `transfer_function` - transfer function to linear colorspace +pub fn bgra_to_jzczhz( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + channels_to_jzaz::<{ ImageConfiguration::Bgra as u8 }, { JZCZHZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts BGR to Jzczhz against D65 white point. 
This is much more effective than naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice contains BGR data
+/// * `src_stride` - Bytes per row for src data.
+/// * `width` - Image width
+/// * `height` - Image height
+/// * `dst` - A mutable slice to receive Jzczhz(a) data
+/// * `dst_stride` - Bytes per row for dst data
+/// * `transfer_function` - transfer function to linear colorspace
+pub fn bgr_to_jzczhz(
+    src: &[u8],
+    src_stride: u32,
+    dst: &mut [f32],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+    transfer_function: TransferFunction,
+) {
+    channels_to_jzaz::<{ ImageConfiguration::Bgr as u8 }, { JZCZHZ as u8 }>(
+        src,
+        src_stride,
+        dst,
+        dst_stride,
+        width,
+        height,
+        transfer_function,
+    );
+}
diff --git a/src/jzazbz.rs b/src/jzazbz.rs
index 61f73f3..1140786 100644
--- a/src/jzazbz.rs
+++ b/src/jzazbz.rs
@@ -31,8 +31,11 @@ fn perceptual_quantizer_inverse(x: f32) -> f32 {
 #[derive(Debug, Copy, Clone, PartialOrd, PartialEq)]
 /// Represents Jzazbz
 pub struct Jzazbz {
+    /// Jz (lightness), generally expected to be in the range [0; 1]
     pub jz: f32,
+    /// Az, generally expected to be in the range [-1; 1]
     pub az: f32,
+    /// Bz, generally expected to be in the range [-1; 1]
     pub bz: f32,
 }
@@ -70,7 +73,7 @@ impl Jzazbz {
     pub fn to_xyz(&self) -> Xyz {
         let jz = self.jz + 1.6295499532821566e-11;
-        let iz = jz / (0.44 + 0.56 * jz);
+        let iz = jz / (0.44f32 + 0.56f32 * jz);
         let l = perceptual_quantizer_inverse(
             iz + 1.386050432715393e-1 * self.az + 5.804731615611869e-2 * self.bz,
         );
diff --git a/src/jzazbz_to_image.rs b/src/jzazbz_to_image.rs
new file mode 100644
index 0000000..69106f4
--- /dev/null
+++ b/src/jzazbz_to_image.rs
@@ -0,0 +1,370 @@
+/*
+ * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved.
+ * //
+ * // Use of this source code is governed by a BSD-style
+ * // license that can be found in the LICENSE file.
+ */
+use crate::image::ImageConfiguration;
+use crate::image_to_jzazbz::JzazbzTarget;
+#[cfg(all(
+    any(target_arch = "aarch64", target_arch = "arm"),
+    target_feature = "neon"
+))]
+use crate::neon::neon_jzazbz_to_image;
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    target_feature = "sse4.1"
+))]
+use crate::sse::sse_jzazbz_to_image;
+use crate::{Jzazbz, Jzczhz, TransferFunction};
+
+fn jzazbz_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
+    src: &[f32],
+    src_stride: u32,
+    dst: &mut [u8],
+    dst_stride: u32,
+    width: u32,
+    height: u32,
+    transfer_function: TransferFunction,
+) {
+    let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
+    let target: JzazbzTarget = TARGET.into();
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    let mut _has_sse = false;
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    if is_x86_feature_detected!("sse4.1") {
+        _has_sse = true;
+    }
+
+    let mut src_offset = 0usize;
+    let mut dst_offset = 0usize;
+
+    let channels = image_configuration.get_channels_count();
+
+    for _ in 0..height as usize {
+        let mut _cx = 0usize;
+
+        let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *mut f32 };
+        let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) };
+
+        #[cfg(all(
+            any(target_arch = "aarch64", target_arch = "arm"),
+            target_feature = "neon"
+        ))]
+        unsafe {
+            _cx = neon_jzazbz_to_image::<CHANNELS_CONFIGURATION, TARGET>(
+                _cx,
+                src.as_ptr(),
+                src_offset as u32,
+                dst.as_mut_ptr(),
+                dst_offset as u32,
+                width,
+                transfer_function,
+            );
+        }
+
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "sse4.1"
+        ))]
+        unsafe {
+            if _has_sse {
+                _cx = sse_jzazbz_to_image::<CHANNELS_CONFIGURATION, TARGET>(
+                    _cx,
+                    src.as_ptr(),
+                    src_offset as u32,
+                    dst.as_mut_ptr(),
+                    dst_offset as u32,
+                    width,
+                    transfer_function,
+                )
+            }
+        }
+
+        for x in _cx..width as usize {
+            let px = x * channels;
+            let l_x = unsafe { src_ptr.add(px).read_unaligned() };
+            let l_y = unsafe { src_ptr.add(px + 1).read_unaligned() };
+            let l_z = unsafe { src_ptr.add(px + 2).read_unaligned() };
+            let rgb;
+
+            match target {
+                JzazbzTarget::JZAZBZ => {
+                    let jzazbz = Jzazbz::new(l_x, l_y, l_z);
+                    rgb = jzazbz.to_rgb(transfer_function);
+                }
+                JzazbzTarget::JZCZHZ => {
+                    let jzczhz = Jzczhz::new(l_x, l_y, l_z);
+                    rgb = jzczhz.to_rgb(transfer_function);
+                }
+            }
+
+            unsafe {
+                let dst = dst_ptr.add(x * channels);
+                dst.add(image_configuration.get_r_channel_offset())
+                    .write_unaligned(rgb.r);
+                dst.add(image_configuration.get_g_channel_offset())
+                    .write_unaligned(rgb.g);
+                dst.add(image_configuration.get_b_channel_offset())
+                    .write_unaligned(rgb.b);
+                if image_configuration.has_alpha() {
+                    let l_a = src_ptr.add(px + 3).read_unaligned();
+                    let a_value = (l_a * 255f32).max(0f32);
+                    dst.add(image_configuration.get_a_channel_offset())
+                        .write_unaligned(a_value as u8);
+                }
+            }
+        }
+
+        src_offset += src_stride as usize;
+        dst_offset += dst_stride as usize;
+    }
+}
+
+/// This function converts Jzazbz with interleaved alpha channel to RGBA. This is much more effective than naive direct transformation
+///
+/// # Arguments
+/// * `src` - A slice contains Jzazbz data
+/// * `src_stride` - Bytes per row for src data.
+/// * `dst` - A mutable slice to receive RGBA data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from linear colorspace to gamma +pub fn jzazbz_to_rgba( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + jzazbz_to_image::<{ ImageConfiguration::Rgba as u8 }, { JzazbzTarget::JZAZBZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts Jzazbz to RGB. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains Jzazbz data +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive RGB data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from linear colorspace to gamma +pub fn jzazbz_to_rgb( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + jzazbz_to_image::<{ ImageConfiguration::Rgb as u8 }, { JzazbzTarget::JZAZBZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts Jzazbz to BGR. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains Jzazbz data +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive BGR data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from linear colorspace to gamma +pub fn jzazbz_to_bgr( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + jzazbz_to_image::<{ ImageConfiguration::Bgr as u8 }, { JzazbzTarget::JZAZBZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts Jzazbz with interleaved alpha channel to BGRA. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains Jzazbz data +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive BGRA data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from linear colorspace to gamma +pub fn jzazbz_to_bgra( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + jzazbz_to_image::<{ ImageConfiguration::Bgra as u8 }, { JzazbzTarget::JZAZBZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts Jzczhz with interleaved alpha channel to RGBA. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains Jzczhz data +/// * `src_stride` - Bytes per row for src data. 
+/// * `dst` - A mutable slice to receive RGBA data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from linear colorspace to gamma +pub fn jzczhz_to_rgba( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + jzazbz_to_image::<{ ImageConfiguration::Rgba as u8 }, { JzazbzTarget::JZCZHZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts Jzczhz to RGB. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains LAB data +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive RGB data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from linear colorspace to gamma +pub fn jzczhz_to_rgb( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + jzazbz_to_image::<{ ImageConfiguration::Rgb as u8 }, { JzazbzTarget::JZCZHZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts Jzczhz to BGR. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains Jzczhz data +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive BGR data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from linear colorspace to gamma +pub fn jzczhz_to_bgr( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + jzazbz_to_image::<{ ImageConfiguration::Bgr as u8 }, { JzazbzTarget::JZCZHZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} + +/// This function converts Jzczhz with interleaved alpha channel to BGRA. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains Jzczhz data +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive BGRA data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from linear colorspace to gamma +pub fn jzczhz_to_bgra( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + jzazbz_to_image::<{ ImageConfiguration::Bgra as u8 }, { JzazbzTarget::JZCZHZ as u8 }>( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} diff --git a/src/jzczhz.rs b/src/jzczhz.rs index fddf974..cbab076 100644 --- a/src/jzczhz.rs +++ b/src/jzczhz.rs @@ -5,13 +5,16 @@ * // license that can be found in the LICENSE file. 
*/ use crate::{EuclideanDistance, Jzazbz, Rgb, TransferFunction, Xyz}; -use erydanos::{ehypot3f, ehypotf, Cosine, Euclidean2DDistance, Sine}; +use erydanos::{eatan2f, ehypot3f, ehypotf, Cosine, Sine}; /// Represents Jzazbz in polar coordinates as Jzczhz #[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] pub struct Jzczhz { + /// Jz(lightness) generally expects to be between [0;1] pub jz: f32, + /// Cz generally expects to be between [-1;1] pub cz: f32, + /// Hz generally expects to be between [-1;1] pub hz: f32, } @@ -36,7 +39,7 @@ impl Jzczhz { #[inline] pub fn from_jzazbz(jzazbz: Jzazbz) -> Jzczhz { let cz = ehypotf(jzazbz.az, jzazbz.bz); - let hz = jzazbz.bz.ehypot(jzazbz.az); + let hz = eatan2f(jzazbz.bz, jzazbz.az); Jzczhz::new(jzazbz.jz, cz, hz) } diff --git a/src/lib.rs b/src/lib.rs index e5a0f24..e743bd1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,6 +19,7 @@ mod hsv_to_image; mod image; mod image_to_hsv; mod image_to_hsv_support; +mod image_to_jzazbz; mod image_to_linear; mod image_to_linear_u8; mod image_to_oklab; @@ -26,6 +27,7 @@ mod image_to_sigmoidal; mod image_to_xyz_lab; mod image_xyza_laba; mod jzazbz; +mod jzazbz_to_image; mod jzczhz; mod lab; mod linear_to_image; @@ -123,6 +125,14 @@ pub use xyza_laba_to_image::xyz_with_alpha_to_bgra; pub use xyza_laba_to_image::xyz_with_alpha_to_rgba; pub use euclidean::EuclideanDistance; +pub use image_to_jzazbz::bgr_to_jzazbz; +pub use image_to_jzazbz::bgr_to_jzczhz; +pub use image_to_jzazbz::bgra_to_jzazbz; +pub use image_to_jzazbz::bgra_to_jzczhz; +pub use image_to_jzazbz::rgb_to_jzazbz; +pub use image_to_jzazbz::rgb_to_jzczhz; +pub use image_to_jzazbz::rgba_to_jzazbz; +pub use image_to_jzazbz::rgba_to_jzczhz; pub use image_to_oklab::bgr_to_oklab; pub use image_to_oklab::bgra_to_oklab; pub use image_to_oklab::rgb_to_oklab; @@ -131,6 +141,14 @@ pub use image_to_sigmoidal::bgra_to_sigmoidal; pub use image_to_sigmoidal::rgb_to_sigmoidal; pub use image_to_sigmoidal::rgba_to_sigmoidal; pub use jzazbz::Jzazbz; +pub use jzazbz_to_image::jzazbz_to_bgr; +pub use jzazbz_to_image::jzazbz_to_bgra; +pub use jzazbz_to_image::jzazbz_to_rgb; +pub use jzazbz_to_image::jzazbz_to_rgba; +pub use jzazbz_to_image::jzczhz_to_bgr; +pub use jzazbz_to_image::jzczhz_to_bgra; +pub use jzazbz_to_image::jzczhz_to_rgb; +pub use jzazbz_to_image::jzczhz_to_rgba; pub use jzczhz::Jzczhz; pub use oklab::Oklab; pub use oklab_to_image::oklab_to_bgr; diff --git a/src/neon/image_to_jzazbz.rs b/src/neon/image_to_jzazbz.rs new file mode 100644 index 0000000..e5ec5c7 --- /dev/null +++ b/src/neon/image_to_jzazbz.rs @@ -0,0 +1,234 @@ +/* + * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. + */ +use crate::image::ImageConfiguration; +use crate::image_to_jzazbz::JzazbzTarget; +use crate::neon::get_neon_linear_transfer; +use crate::neon::math::{vcolorq_matrix_f32, vmlafq_f32, vpowq_n_f32}; +use crate::{TransferFunction, SRGB_TO_XYZ_D65}; +use erydanos::{vatan2q_f32, vhypotq_fast_f32}; +use std::arch::aarch64::*; + +macro_rules! perceptual_quantizer { + ($color: expr) => {{ + let xx = vpowq_n_f32(vmulq_n_f32($color, 1e-4), 0.1593017578125); + let jx = vmlafq_f32(vdupq_n_f32(18.8515625), xx, vdupq_n_f32(0.8359375)); + let den_jx = vmlafq_f32(xx, vdupq_n_f32(18.6875), vdupq_n_f32(1.)); + vpowq_n_f32(vdivq_f32(jx, den_jx), 134.034375) + }}; +} + +macro_rules! 
triple_to_jzazbz { + ($r: expr, $g: expr, $b: expr, $transfer: expr, $target: expr + ) => {{ + let r_f = vmulq_n_f32(vcvtq_f32_u32($r), 1f32 / 255f32); + let g_f = vmulq_n_f32(vcvtq_f32_u32($g), 1f32 / 255f32); + let b_f = vmulq_n_f32(vcvtq_f32_u32($b), 1f32 / 255f32); + let dl_l = $transfer(r_f); + let dl_m = $transfer(g_f); + let dl_s = $transfer(b_f); + + let (x0, x1, x2, x3, x4, x5, x6, x7, x8) = ( + vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(0)), + vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(1)), + vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(2)), + vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(1).get_unchecked(0)), + vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(1).get_unchecked(1)), + vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(1).get_unchecked(2)), + vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(2).get_unchecked(0)), + vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(2).get_unchecked(1)), + vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(2).get_unchecked(2)), + ); + + let (x, y, z) = vcolorq_matrix_f32(dl_l, dl_m, dl_s, x0, x1, x2, x3, x4, x5, x6, x7, x8); + + let (l0, l1, l2, l3, l4, l5, l6, l7, l8) = ( + vdupq_n_f32(0.674207838), + vdupq_n_f32(0.382799340), + vdupq_n_f32(-0.047570458), + vdupq_n_f32(0.149284160), + vdupq_n_f32(0.739628340), + vdupq_n_f32(0.083327300), + vdupq_n_f32(0.070941080), + vdupq_n_f32(0.174768000), + vdupq_n_f32(0.67097002), + ); + + let (l, m, s) = vcolorq_matrix_f32(x, y, z, l0, l1, l2, l3, l4, l5, l6, l7, l8); + + let lp = perceptual_quantizer!(l); + let mp = perceptual_quantizer!(m); + let sp = perceptual_quantizer!(s); + + let iz = vmulq_n_f32(vaddq_f32(lp, mp), 0.5f32); + let az = vmlafq_f32( + vdupq_n_f32(3.524000), + lp, + vmlafq_f32(vdupq_n_f32(-4.066708), mp, vmulq_n_f32(sp, 0.542708)), + ); + let bz = vmlafq_f32( + vdupq_n_f32(0.199076), + lp, + vmlafq_f32(vdupq_n_f32(1.096799), mp, vmulq_n_f32(sp, -1.295875)), + ); + let num = vmulq_n_f32(iz, 0.44); + let den = vsubq_f32( + vmlafq_f32(iz, vdupq_n_f32(-0.56), vdupq_n_f32(1.)), + vdupq_n_f32(1.6295499532821566e-11), + ); + let jz = vdivq_f32(num, den); + + match $target { + JzazbzTarget::JZAZBZ => { + (jz, az, bz) + } + JzazbzTarget::JZCZHZ => { + let cz = vhypotq_fast_f32(az, bz); + let hz = vatan2q_f32(bz, az); + (jz, cz, hz) + } + } + }}; +} + +#[inline(always)] +pub unsafe fn neon_image_to_jzazbz( + start_cx: usize, + src: *const u8, + src_offset: usize, + width: u32, + dst: *mut f32, + dst_offset: usize, + transfer_function: TransferFunction, +) -> usize { + let target: JzazbzTarget = TARGET.into(); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + let channels = image_configuration.get_channels_count(); + let mut cx = start_cx; + + let transfer = get_neon_linear_transfer(transfer_function); + + let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; + + while cx + 16 < width as usize { + let (r_chan, g_chan, b_chan, a_chan); + let src_ptr = src.add(src_offset + cx * channels); + match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Bgr => { + let ldr = vld3q_u8(src_ptr); + if image_configuration == ImageConfiguration::Rgb { + r_chan = ldr.0; + g_chan = ldr.1; + b_chan = ldr.2; + } else { + r_chan = ldr.2; + g_chan = ldr.1; + b_chan = ldr.0; + } + a_chan = vdupq_n_u8(255); + } + ImageConfiguration::Rgba => { + let ldr = vld4q_u8(src_ptr); + r_chan = ldr.0; + g_chan = ldr.1; + b_chan = ldr.2; + a_chan = ldr.3; + } + ImageConfiguration::Bgra => { + let ldr = vld4q_u8(src_ptr); + r_chan = ldr.2; + g_chan = 
ldr.1; + b_chan = ldr.0; + a_chan = ldr.3; + } + } + + let r_low = vmovl_u8(vget_low_u8(r_chan)); + let g_low = vmovl_u8(vget_low_u8(g_chan)); + let b_low = vmovl_u8(vget_low_u8(b_chan)); + + let r_low_low = vmovl_u16(vget_low_u16(r_low)); + let g_low_low = vmovl_u16(vget_low_u16(g_low)); + let b_low_low = vmovl_u16(vget_low_u16(b_low)); + + let (x_low_low, y_low_low, z_low_low) = + triple_to_jzazbz!(r_low_low, g_low_low, b_low_low, &transfer, target); + + let a_low = vmovl_u8(vget_low_u8(a_chan)); + + if image_configuration.has_alpha() { + let a_low_low = + vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); + let xyz_low_low = float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low); + vst4q_f32(dst_ptr.add(cx * channels), xyz_low_low); + } else { + let xyz_low_low = float32x4x3_t(x_low_low, y_low_low, z_low_low); + vst3q_f32(dst_ptr.add(cx * channels), xyz_low_low); + } + + let r_low_high = vmovl_high_u16(r_low); + let g_low_high = vmovl_high_u16(g_low); + let b_low_high = vmovl_high_u16(b_low); + + let (x_low_high, y_low_high, z_low_high) = + triple_to_jzazbz!(r_low_high, g_low_high, b_low_high, &transfer, target); + + if image_configuration.has_alpha() { + let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); + let xyz_low_low = float32x4x4_t(x_low_high, y_low_high, z_low_high, a_low_high); + vst4q_f32(dst_ptr.add(cx * channels + 4 * channels), xyz_low_low); + } else { + let xyz_low_low = float32x4x3_t(x_low_high, y_low_high, z_low_high); + vst3q_f32(dst_ptr.add(cx * channels + 4 * channels), xyz_low_low); + } + + let r_high = vmovl_high_u8(r_chan); + let g_high = vmovl_high_u8(g_chan); + let b_high = vmovl_high_u8(b_chan); + + let r_high_low = vmovl_u16(vget_low_u16(r_high)); + let g_high_low = vmovl_u16(vget_low_u16(g_high)); + let b_high_low = vmovl_u16(vget_low_u16(b_high)); + + let (x_high_low, y_high_low, z_high_low) = + triple_to_jzazbz!(r_high_low, g_high_low, b_high_low, &transfer, target); + + let a_high = vmovl_high_u8(a_chan); + + if image_configuration.has_alpha() { + let a_high_low = vmulq_n_f32( + vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_high))), + 1f32 / 255f32, + ); + + let xyz_low_low = float32x4x4_t(x_high_low, y_high_low, z_high_low, a_high_low); + vst4q_f32(dst_ptr.add(cx * channels + 4 * channels * 2), xyz_low_low); + } else { + let xyz_low_low = float32x4x3_t(x_high_low, y_high_low, z_high_low); + vst3q_f32(dst_ptr.add(cx * channels + 4 * channels * 2), xyz_low_low); + } + + let r_high_high = vmovl_high_u16(r_high); + let g_high_high = vmovl_high_u16(g_high); + let b_high_high = vmovl_high_u16(b_high); + + let (x_high_high, y_high_high, z_high_high) = + triple_to_jzazbz!(r_high_high, g_high_high, b_high_high, &transfer, target); + + if image_configuration.has_alpha() { + let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); + let xyz_low_low = float32x4x4_t(x_high_high, y_high_high, z_high_high, a_high_high); + vst4q_f32(dst_ptr.add(cx * channels + 4 * channels * 3), xyz_low_low); + } else { + let xyz_low_low = float32x4x3_t(x_high_high, y_high_high, z_high_high); + vst3q_f32(dst_ptr.add(cx * channels + 4 * channels * 3), xyz_low_low); + } + + cx += 16; + } + + cx +} diff --git a/src/neon/jzazbz_to_image.rs b/src/neon/jzazbz_to_image.rs new file mode 100644 index 0000000..07d553a --- /dev/null +++ b/src/neon/jzazbz_to_image.rs @@ -0,0 +1,183 @@ +/* + * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. 
+ * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. + */ + +use std::arch::aarch64::*; + +use erydanos::{vcosq_f32, vsinq_f32}; + +use crate::image::ImageConfiguration; +use crate::image_to_jzazbz::JzazbzTarget; +use crate::neon::get_neon_gamma_transfer; +use crate::neon::math::{vcolorq_matrix_f32, vmlafq_f32, vpowq_n_f32}; +use crate::{load_f32_and_deinterleave, TransferFunction, XYZ_TO_SRGB_D65}; + +macro_rules! perceptual_quantizer_inverse { + ($color: expr) => {{ + let xx = vpowq_n_f32($color, 7.460772656268214e-03); + let num = vsubq_f32(vdupq_n_f32(0.8359375), xx); + let den = vmlafq_f32(xx, vdupq_n_f32(18.6875), vdupq_n_f32(-18.8515625)); + vmulq_n_f32(vpowq_n_f32(vdivq_f32(num, den), 6.277394636015326), 1e4) + }}; +} + +#[inline(always)] +unsafe fn neon_jzazbz_gamma_vld( + src: *const f32, + transfer_function: TransferFunction, + target: JzazbzTarget, +) -> (uint32x4_t, uint32x4_t, uint32x4_t, uint32x4_t) { + let transfer = get_neon_gamma_transfer(transfer_function); + let v_scale_alpha = vdupq_n_f32(255f32); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + let (jz, mut az, mut bz, mut a_f32) = load_f32_and_deinterleave!(src, image_configuration); + + if target == JzazbzTarget::JZCZHZ { + let cz = az; + let hz = bz; + az = vmulq_f32(cz, vcosq_f32(hz)); + bz = vmulq_f32(cz, vsinq_f32(hz)); + } + + let jz = vaddq_f32(jz, vdupq_n_f32(1.6295499532821566e-11)); + let iz = vdivq_f32( + jz, + vmlafq_f32(jz, vdupq_n_f32(0.56f32), vdupq_n_f32(0.44f32)), + ); + + let (m0, m1, m2, m3, m4, m5, m6, m7, m8) = ( + vdupq_n_f32(1f32), + vdupq_n_f32(1.386050432715393e-1), + vdupq_n_f32(5.804731615611869e-2), + vdupq_n_f32(1f32), + vdupq_n_f32(-1.386050432715393e-1), + vdupq_n_f32(-5.804731615611891e-2), + vdupq_n_f32(1f32), + vdupq_n_f32(-9.601924202631895e-2), + vdupq_n_f32(-8.118918960560390e-1), + ); + + let (mut l_l, mut l_m, mut l_s) = + vcolorq_matrix_f32(iz, az, bz, m0, m1, m2, m3, m4, m5, m6, m7, m8); + + l_l = perceptual_quantizer_inverse!(l_l); + l_m = perceptual_quantizer_inverse!(l_m); + l_s = perceptual_quantizer_inverse!(l_s); + + let (c0, c1, c2, c3, c4, c5, c6, c7, c8) = ( + vdupq_n_f32(1.661373055774069e+00), + vdupq_n_f32(-9.145230923250668e-01), + vdupq_n_f32(2.313620767186147e-01), + vdupq_n_f32(-3.250758740427037e-01), + vdupq_n_f32(1.571847038366936e+00), + vdupq_n_f32(-2.182538318672940e-01), + vdupq_n_f32(-9.098281098284756e-02), + vdupq_n_f32(-3.127282905230740e-01), + vdupq_n_f32(1.522766561305260e+00), + ); + + let (x, y, z) = vcolorq_matrix_f32(l_l, l_m, l_s, c0, c1, c2, c3, c4, c5, c6, c7, c8); + + let (x0, x1, x2, x3, x4, x5, x6, x7, x8) = ( + vdupq_n_f32(*XYZ_TO_SRGB_D65.get_unchecked(0).get_unchecked(0)), + vdupq_n_f32(*XYZ_TO_SRGB_D65.get_unchecked(0).get_unchecked(1)), + vdupq_n_f32(*XYZ_TO_SRGB_D65.get_unchecked(0).get_unchecked(2)), + vdupq_n_f32(*XYZ_TO_SRGB_D65.get_unchecked(1).get_unchecked(0)), + vdupq_n_f32(*XYZ_TO_SRGB_D65.get_unchecked(1).get_unchecked(1)), + vdupq_n_f32(*XYZ_TO_SRGB_D65.get_unchecked(1).get_unchecked(2)), + vdupq_n_f32(*XYZ_TO_SRGB_D65.get_unchecked(2).get_unchecked(0)), + vdupq_n_f32(*XYZ_TO_SRGB_D65.get_unchecked(2).get_unchecked(1)), + vdupq_n_f32(*XYZ_TO_SRGB_D65.get_unchecked(2).get_unchecked(2)), + ); + + let (r_l, g_l, b_l) = vcolorq_matrix_f32(x, y, z, x0, x1, x2, x3, x4, x5, x6, x7, x8); + + let mut r_f32 = transfer(r_l); + let mut g_f32 = transfer(g_l); + let mut b_f32 = transfer(b_l); + r_f32 = vmulq_f32(r_f32, v_scale_alpha); + 
g_f32 = vmulq_f32(g_f32, v_scale_alpha); + b_f32 = vmulq_f32(b_f32, v_scale_alpha); + if image_configuration.has_alpha() { + a_f32 = vmulq_f32(a_f32, v_scale_alpha); + } + ( + vcvtaq_u32_f32(r_f32), + vcvtaq_u32_f32(g_f32), + vcvtaq_u32_f32(b_f32), + vcvtaq_u32_f32(a_f32), + ) +} + +#[inline(always)] +pub unsafe fn neon_jzazbz_to_image( + start_cx: usize, + src: *const f32, + src_offset: u32, + dst: *mut u8, + dst_offset: u32, + width: u32, + transfer_function: TransferFunction, +) -> usize { + let target: JzazbzTarget = TARGET.into(); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + let channels = image_configuration.get_channels_count(); + let mut cx = start_cx; + + while cx + 16 < width as usize { + let offset_src_ptr = + ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); + + let src_ptr_0 = offset_src_ptr; + + let (r_row0_, g_row0_, b_row0_, a_row0_) = + neon_jzazbz_gamma_vld::(src_ptr_0, transfer_function, target); + + let src_ptr_1 = offset_src_ptr.add(4 * channels); + + let (r_row1_, g_row1_, b_row1_, a_row1_) = + neon_jzazbz_gamma_vld::(src_ptr_1, transfer_function, target); + + let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); + + let (r_row2_, g_row2_, b_row2_, a_row2_) = + neon_jzazbz_gamma_vld::(src_ptr_2, transfer_function, target); + + let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); + + let (r_row3_, g_row3_, b_row3_, a_row3_) = + neon_jzazbz_gamma_vld::(src_ptr_3, transfer_function, target); + + let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); + let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); + let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); + + let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); + let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); + let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); + + let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); + let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); + let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); + + let dst_ptr = dst.add(dst_offset as usize + cx * channels); + + if image_configuration.has_alpha() { + let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); + let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); + let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); + let store_rows = uint8x16x4_t(r_row, g_row, b_row, a_row); + vst4q_u8(dst_ptr, store_rows); + } else { + let store_rows = uint8x16x3_t(r_row, g_row, b_row); + vst3q_u8(dst_ptr, store_rows); + } + + cx += 16; + } + + cx +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 4e57171..c7c20f9 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -11,7 +11,9 @@ mod from_sigmoidal; mod gamma_curves; mod hsv_to_image; mod image_to_hsv; +mod image_to_jzazbz; mod image_to_oklab; +mod jzazbz_to_image; mod linear_to_image; pub mod linear_to_planar; mod math; @@ -32,7 +34,9 @@ pub use from_sigmoidal::neon_from_sigmoidal_row; pub use gamma_curves::*; pub use hsv_to_image::*; pub use image_to_hsv::*; +pub use image_to_jzazbz::neon_image_to_jzazbz; pub use image_to_oklab::neon_image_to_oklab; +pub use jzazbz_to_image::neon_jzazbz_to_image; pub use linear_to_image::*; pub use oklab_to_image::neon_oklab_to_image; pub use to_linear::*; diff --git a/src/rgb.rs b/src/rgb.rs index b2c7b31..e54a39d 100644 --- a/src/rgb.rs +++ b/src/rgb.rs @@ -30,6 +30,9 @@ impl Rgb { } /// Converts rgb to 
XYZ + /// + /// # Arguments + /// `transfer_function` - Transfer function to convert RGB into linear RGB #[inline] pub fn to_xyz(&self, matrix: &[[f32; 3]; 3], transfer_function: TransferFunction) -> Xyz { Xyz::from_rgb(*self, matrix, transfer_function) diff --git a/src/sse/image_to_jzazbz.rs b/src/sse/image_to_jzazbz.rs new file mode 100644 index 0000000..ab05ff3 --- /dev/null +++ b/src/sse/image_to_jzazbz.rs @@ -0,0 +1,225 @@ +/* + * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. + */ + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use erydanos::{_mm_atan2_ps, _mm_hypot_fast_ps, _mm_mlaf_ps}; + +use crate::image::ImageConfiguration; +use crate::image_to_jzazbz::JzazbzTarget; +use crate::sse::{ + _mm_color_matrix_ps, _mm_pow_n_ps, get_sse_linear_transfer, sse_deinterleave_rgb, + sse_deinterleave_rgba, sse_interleave_ps_rgb, sse_interleave_ps_rgba, +}; +use crate::{ + load_u8_and_deinterleave, store_and_interleave_v3_f32, store_and_interleave_v4_f32, + TransferFunction, SRGB_TO_XYZ_D65, +}; + +macro_rules! perceptual_quantizer { + ($color: expr) => {{ + let xx = _mm_pow_n_ps(_mm_mul_ps($color, _mm_set1_ps(1e-4)), 0.1593017578125); + let jx = _mm_mlaf_ps(_mm_set1_ps(18.8515625), xx, _mm_set1_ps(0.8359375)); + let den_jx = _mm_mlaf_ps(xx, _mm_set1_ps(18.6875), _mm_set1_ps(1.)); + _mm_pow_n_ps(_mm_div_ps(jx, den_jx), 134.034375) + }}; +} + +macro_rules! triple_to_jzazbz { + ($r: expr, $g: expr, $b: expr, $transfer: expr, $target: expr + ) => {{ + let u8_scale = _mm_set1_ps(1f32 / 255f32); + let r_f = _mm_mul_ps(_mm_cvtepi32_ps($r), u8_scale); + let g_f = _mm_mul_ps(_mm_cvtepi32_ps($g), u8_scale); + let b_f = _mm_mul_ps(_mm_cvtepi32_ps($b), u8_scale); + let r_linear = $transfer(r_f); + let g_linear = $transfer(g_f); + let b_linear = $transfer(b_f); + + let (x0, x1, x2, x3, x4, x5, x6, x7, x8) = ( + _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(0)), + _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(1)), + _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(2)), + _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(1).get_unchecked(0)), + _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(1).get_unchecked(1)), + _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(1).get_unchecked(2)), + _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(2).get_unchecked(0)), + _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(2).get_unchecked(1)), + _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(2).get_unchecked(2)), + ); + + let (x, y, z) = _mm_color_matrix_ps( + r_linear, g_linear, b_linear, x0, x1, x2, x3, x4, x5, x6, x7, x8, + ); + + let (l0, l1, l2, l3, l4, l5, l6, l7, l8) = ( + _mm_set1_ps(0.674207838), + _mm_set1_ps(0.382799340), + _mm_set1_ps(-0.047570458), + _mm_set1_ps(0.149284160), + _mm_set1_ps(0.739628340), + _mm_set1_ps(0.083327300), + _mm_set1_ps(0.070941080), + _mm_set1_ps(0.174768000), + _mm_set1_ps(0.67097002), + ); + + let (l_l, l_m, l_s) = + _mm_color_matrix_ps(x, y, z, l0, l1, l2, l3, l4, l5, l6, l7, l8); + + let lp = perceptual_quantizer!(l_l); + let mp = perceptual_quantizer!(l_m); + let sp = perceptual_quantizer!(l_s); + + let iz = _mm_mul_ps(_mm_add_ps(lp, mp), _mm_set1_ps(0.5f32)); + let az = _mm_mlaf_ps( + _mm_set1_ps(3.524000), + lp, + _mm_mlaf_ps(_mm_set1_ps(-4.066708), mp, _mm_mul_ps(sp, _mm_set1_ps(0.542708))), + ); + let bz = _mm_mlaf_ps( + _mm_set1_ps(0.199076), + lp, + 
_mm_mlaf_ps(_mm_set1_ps(1.096799), mp, _mm_mul_ps(sp, _mm_set1_ps(-1.295875))), + ); + let num = _mm_mul_ps(iz, _mm_set1_ps(0.44)); + let den = _mm_sub_ps( + _mm_mlaf_ps(iz, _mm_set1_ps(-0.56), _mm_set1_ps(1.)), + _mm_set1_ps(1.6295499532821566e-11), + ); + let jz = _mm_div_ps(num, den); + + match $target { + JzazbzTarget::JZAZBZ => { + (jz, az, bz) + } + JzazbzTarget::JZCZHZ => { + let cz = _mm_hypot_fast_ps(az, bz); + let hz = _mm_atan2_ps(bz, az); + (jz, cz, hz) + } + } + }}; +} + +#[inline(always)] +pub unsafe fn sse_image_to_jzazbz( + start_cx: usize, + src: *const u8, + src_offset: usize, + width: u32, + dst: *mut f32, + dst_offset: usize, + transfer_function: TransferFunction, +) -> usize { + let target: JzazbzTarget = TARGET.into(); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + let channels = image_configuration.get_channels_count(); + let mut cx = start_cx; + + let transfer = get_sse_linear_transfer(transfer_function); + + let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; + + while cx + 16 < width as usize { + let src_ptr = src.add(src_offset + cx * channels); + let (r_chan, g_chan, b_chan, a_chan) = + load_u8_and_deinterleave!(src_ptr, image_configuration); + + let r_low = _mm_cvtepu8_epi16(r_chan); + let g_low = _mm_cvtepu8_epi16(g_chan); + let b_low = _mm_cvtepu8_epi16(b_chan); + + let r_low_low = _mm_cvtepu16_epi32(r_low); + let g_low_low = _mm_cvtepu16_epi32(g_low); + let b_low_low = _mm_cvtepu16_epi32(b_low); + + let (x_low_low, y_low_low, z_low_low) = + triple_to_jzazbz!(r_low_low, g_low_low, b_low_low, &transfer, target); + + let a_low = _mm_cvtepu8_epi16(a_chan); + + let u8_scale = _mm_set1_ps(1f32 / 255f32); + + if image_configuration.has_alpha() { + let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); + let ptr = dst_ptr.add(cx * 4); + store_and_interleave_v4_f32!(ptr, x_low_low, y_low_low, z_low_low, a_low_low); + } else { + let ptr = dst_ptr.add(cx * 3); + store_and_interleave_v3_f32!(ptr, x_low_low, y_low_low, z_low_low); + } + + let r_low_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(r_low)); + let g_low_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(g_low)); + let b_low_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(b_low)); + + let (x_low_high, y_low_high, z_low_high) = + triple_to_jzazbz!(r_low_high, g_low_high, b_low_high, &transfer, target); + + if image_configuration.has_alpha() { + let a_low_high = _mm_mul_ps( + _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(a_low))), + u8_scale, + ); + + let ptr = dst_ptr.add(cx * 4 + 16); + store_and_interleave_v4_f32!(ptr, x_low_high, y_low_high, z_low_high, a_low_high); + } else { + let ptr = dst_ptr.add(cx * 3 + 4 * 3); + store_and_interleave_v3_f32!(ptr, x_low_high, y_low_high, z_low_high); + } + + let r_high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(r_chan)); + let g_high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(g_chan)); + let b_high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(b_chan)); + + let r_high_low = _mm_cvtepu16_epi32(r_high); + let g_high_low = _mm_cvtepu16_epi32(g_high); + let b_high_low = _mm_cvtepu16_epi32(b_high); + + let (x_high_low, y_high_low, z_high_low) = + triple_to_jzazbz!(r_high_low, g_high_low, b_high_low, &transfer, target); + + let a_high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(a_chan)); + + if image_configuration.has_alpha() { + let a_high_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); + let ptr = dst_ptr.add(cx * 4 + 4 * 4 * 2); + store_and_interleave_v4_f32!(ptr, x_high_low, y_high_low, 
z_high_low, a_high_low); + } else { + let ptr = dst_ptr.add(cx * 3 + 4 * 3 * 2); + store_and_interleave_v3_f32!(ptr, x_high_low, y_high_low, z_high_low); + } + + let r_high_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(r_high)); + let g_high_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(g_high)); + let b_high_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(b_high)); + + let (x_high_high, y_high_high, z_high_high) = + triple_to_jzazbz!(r_high_high, g_high_high, b_high_high, &transfer, target); + + if image_configuration.has_alpha() { + let a_high_high = _mm_mul_ps( + _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(a_high))), + u8_scale, + ); + let ptr = dst_ptr.add(cx * 4 + 4 * 4 * 3); + store_and_interleave_v4_f32!(ptr, x_high_high, y_high_high, z_high_high, a_high_high); + } else { + let ptr = dst_ptr.add(cx * 3 + 4 * 3 * 3); + store_and_interleave_v3_f32!(ptr, x_high_high, y_high_high, z_high_high); + } + + cx += 16; + } + + cx +} diff --git a/src/sse/jzazbz_to_image.rs b/src/sse/jzazbz_to_image.rs new file mode 100644 index 0000000..7fd9987 --- /dev/null +++ b/src/sse/jzazbz_to_image.rs @@ -0,0 +1,204 @@ +/* + * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. + */ + +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +use erydanos::{_mm_cos_ps, _mm_mlaf_ps, _mm_sin_ps}; + +use crate::{ + load_f32_and_deinterleave, store_and_interleave_v3_u8, store_and_interleave_v4_u8, + TransferFunction, XYZ_TO_SRGB_D65, +}; +use crate::image::ImageConfiguration; +use crate::image_to_jzazbz::JzazbzTarget; +use crate::sse::{ + _mm_color_matrix_ps, _mm_pow_n_ps, get_sse_gamma_transfer, + sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps, sse_interleave_rgb, sse_interleave_rgba, +}; + +macro_rules! 
perceptual_quantizer_inverse { + ($color: expr) => {{ + let xx = _mm_pow_n_ps($color, 7.460772656268214e-03); + let num = _mm_sub_ps(_mm_set1_ps(0.8359375), xx); + let den = _mm_mlaf_ps(xx, _mm_set1_ps(18.6875), _mm_set1_ps(-18.8515625)); + _mm_mul_ps( + _mm_pow_n_ps(_mm_div_ps(num, den), 6.277394636015326), + _mm_set1_ps(1e4), + ) + }}; +} + +#[inline(always)] +unsafe fn sse_jzazbz_vld( + src: *const f32, + transfer_function: TransferFunction, +) -> (__m128i, __m128i, __m128i, __m128i) { + let target: JzazbzTarget = TARGET.into(); + let transfer = get_sse_gamma_transfer(transfer_function); + let v_scale_alpha = _mm_set1_ps(255f32); + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + + let (mut jz, mut az, mut bz, mut a_f32) = load_f32_and_deinterleave!(src, image_configuration); + + if target == JzazbzTarget::JZCZHZ { + let cz = az; + let hz = bz; + az = _mm_mul_ps(cz, _mm_cos_ps(hz)); + bz = _mm_mul_ps(cz, _mm_sin_ps(hz)); + } + + let jz = _mm_add_ps(jz, _mm_set1_ps(1.6295499532821566e-11)); + let iz = _mm_div_ps( + jz, + _mm_mlaf_ps(jz, _mm_set1_ps(0.56f32), _mm_set1_ps(0.44f32)), + ); + + let (m0, m1, m2, m3, m4, m5, m6, m7, m8) = ( + _mm_set1_ps(1f32), + _mm_set1_ps(1.386050432715393e-1), + _mm_set1_ps(5.804731615611869e-2), + _mm_set1_ps(1f32), + _mm_set1_ps(-1.386050432715393e-1), + _mm_set1_ps(-5.804731615611891e-2), + _mm_set1_ps(1f32), + _mm_set1_ps(-9.601924202631895e-2), + _mm_set1_ps(-8.118918960560390e-1), + ); + + let (mut l_l, mut l_m, mut l_s) = + _mm_color_matrix_ps(iz, az, bz, m0, m1, m2, m3, m4, m5, m6, m7, m8); + + l_l = perceptual_quantizer_inverse!(l_l); + l_m = perceptual_quantizer_inverse!(l_m); + l_s = perceptual_quantizer_inverse!(l_s); + + let (c0, c1, c2, c3, c4, c5, c6, c7, c8) = ( + _mm_set1_ps(1.661373055774069e+00), + _mm_set1_ps(-9.145230923250668e-01), + _mm_set1_ps(2.313620767186147e-01), + _mm_set1_ps(-3.250758740427037e-01), + _mm_set1_ps(1.571847038366936e+00), + _mm_set1_ps(-2.182538318672940e-01), + _mm_set1_ps(-9.098281098284756e-02), + _mm_set1_ps(-3.127282905230740e-01), + _mm_set1_ps(1.522766561305260e+00), + ); + + let (x, y, z) = _mm_color_matrix_ps(l_l, l_m, l_s, c0, c1, c2, c3, c4, c5, c6, c7, c8); + + let (x0, x1, x2, x3, x4, x5, x6, x7, x8) = ( + _mm_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(0).get_unchecked(0)), + _mm_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(0).get_unchecked(1)), + _mm_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(0).get_unchecked(2)), + _mm_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(1).get_unchecked(0)), + _mm_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(1).get_unchecked(1)), + _mm_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(1).get_unchecked(2)), + _mm_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(2).get_unchecked(0)), + _mm_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(2).get_unchecked(1)), + _mm_set1_ps(*XYZ_TO_SRGB_D65.get_unchecked(2).get_unchecked(2)), + ); + + let (r_l, g_l, b_l) = _mm_color_matrix_ps(x, y, z, x0, x1, x2, x3, x4, x5, x6, x7, x8); + + let mut r_f32 = transfer(r_l); + let mut g_f32 = transfer(g_l); + let mut b_f32 = transfer(b_l); + r_f32 = _mm_mul_ps(r_f32, v_scale_alpha); + g_f32 = _mm_mul_ps(g_f32, v_scale_alpha); + b_f32 = _mm_mul_ps(b_f32, v_scale_alpha); + if image_configuration.has_alpha() { + a_f32 = _mm_mul_ps(a_f32, v_scale_alpha); + } + + const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + + if image_configuration.has_alpha() { + ( + _mm_cvtps_epi32(_mm_round_ps::(r_f32)), + _mm_cvtps_epi32(_mm_round_ps::(g_f32)), + _mm_cvtps_epi32(_mm_round_ps::(b_f32)), + 
_mm_cvtps_epi32(_mm_round_ps::(a_f32)), + ) + } else { + ( + _mm_cvtps_epi32(_mm_round_ps::(r_f32)), + _mm_cvtps_epi32(_mm_round_ps::(g_f32)), + _mm_cvtps_epi32(_mm_round_ps::(b_f32)), + _mm_set1_epi32(255), + ) + } +} + +#[inline(always)] +pub unsafe fn sse_jzazbz_to_image( + start_cx: usize, + src: *const f32, + src_offset: u32, + dst: *mut u8, + dst_offset: u32, + width: u32, + transfer_function: TransferFunction, +) -> usize { + let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); + let channels = image_configuration.get_channels_count(); + let mut cx = start_cx; + + while cx + 16 < width as usize { + let offset_src_ptr = + ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); + + let src_ptr_0 = offset_src_ptr; + + let (r_row0_, g_row0_, b_row0_, a_row0_) = + sse_jzazbz_vld::(src_ptr_0, transfer_function); + + let src_ptr_1 = offset_src_ptr.add(4 * channels); + + let (r_row1_, g_row1_, b_row1_, a_row1_) = + sse_jzazbz_vld::(src_ptr_1, transfer_function); + + let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); + + let (r_row2_, g_row2_, b_row2_, a_row2_) = + sse_jzazbz_vld::(src_ptr_2, transfer_function); + + let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); + + let (r_row3_, g_row3_, b_row3_, a_row3_) = + sse_jzazbz_vld::(src_ptr_3, transfer_function); + + let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); + let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); + let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); + + let r_row23 = _mm_packus_epi32(r_row2_, r_row3_); + let g_row23 = _mm_packus_epi32(g_row2_, g_row3_); + let b_row23 = _mm_packus_epi32(b_row2_, b_row3_); + + let r_row = _mm_packus_epi16(r_row01, r_row23); + let g_row = _mm_packus_epi16(g_row01, g_row23); + let b_row = _mm_packus_epi16(b_row01, b_row23); + + let dst_ptr = dst.add(dst_offset as usize + cx * channels); + + if image_configuration.has_alpha() { + let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); + let a_row23 = _mm_packus_epi32(a_row2_, a_row3_); + let a_row = _mm_packus_epi16(a_row01, a_row23); + store_and_interleave_v4_u8!(dst_ptr, r_row, g_row, b_row, a_row); + } else { + store_and_interleave_v3_u8!(dst_ptr, r_row, g_row, b_row); + } + + cx += 16; + } + + cx +} diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 81e3356..db85415 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -33,7 +33,9 @@ mod xyz_lab_to_image; mod cie; mod from_sigmoidal; +mod image_to_jzazbz; mod image_to_oklab; +mod jzazbz_to_image; mod linear_to_planar; mod oklab_to_image; mod planar_to_linear; @@ -46,8 +48,10 @@ pub use from_sigmoidal::sse_from_sigmoidal_row; pub use gamma_curves::*; pub use hsv_to_image::*; pub use image_to_hsv::*; +pub use image_to_jzazbz::sse_image_to_jzazbz; pub use image_to_linear_u8::*; pub use image_to_oklab::sse_image_to_oklab; +pub use jzazbz_to_image::sse_jzazbz_to_image; pub use linear_to_image::*; pub use linear_to_planar::sse_linear_plane_to_gamma; pub use math::*;
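
Usage sketch (not part of the diff): a minimal round trip through the new public API added by this patch, assuming the library is imported as `colorutils_rs` and that `TransferFunction::Srgb` is an available variant of the existing `TransferFunction` enum; buffer sizes and contents are illustrative only. Strides are given in bytes per row, as the doc comments above specify, so the f32 stride accounts for `size_of::<f32>()`.

// Assumed crate name; adjust the path to however the library is actually imported.
use colorutils_rs::{jzazbz_to_rgba, rgba_to_jzazbz, TransferFunction};

fn main() {
    let (width, height) = (4u32, 2u32);
    let channels = 4usize; // RGBA

    // Tightly packed 8-bit RGBA source; stride is in bytes per row.
    let src = vec![128u8; width as usize * channels * height as usize];
    let src_stride = width * channels as u32;

    // f32 Jzazbz(a) working buffer; its stride is also in bytes per row.
    let mut jzazbz_store = vec![0f32; width as usize * channels * height as usize];
    let jzazbz_stride = width * channels as u32 * core::mem::size_of::<f32>() as u32;

    // Forward: gamma-encoded RGBA -> Jzazbz with normalized alpha in the last channel.
    rgba_to_jzazbz(
        &src,
        src_stride,
        &mut jzazbz_store,
        jzazbz_stride,
        width,
        height,
        TransferFunction::Srgb, // assumed variant for the sRGB transfer function
    );

    // ... edit jz/az/bz values in place here ...

    // Inverse: Jzazbz(a) -> gamma-encoded RGBA.
    let mut dst = vec![0u8; width as usize * channels * height as usize];
    jzazbz_to_rgba(
        &jzazbz_store,
        jzazbz_stride,
        &mut dst,
        src_stride,
        width,
        height,
        TransferFunction::Srgb,
    );
}

The Jzczhz variants (`rgba_to_jzczhz` / `jzczhz_to_rgba`, etc.) follow the same signatures, only the middle two planes hold chroma and hue instead of az/bz.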