From fb21ab3e7eea56f2cede639d06fc6bf46f3f9b2e Mon Sep 17 00:00:00 2001 From: awxkee Date: Thu, 10 Oct 2024 23:45:25 +0100 Subject: [PATCH] Big reworking with speed increasing --- Cargo.lock | 2 +- src/app/src/main.rs | 10 +- src/avx/gamma_curves.rs | 27 +---- src/gamma_curves.rs | 214 +++++++++++++++++++++++++++++++++- src/image_to_jzazbz.rs | 211 +++++++++++++++++---------------- src/image_to_lalphabeta.rs | 162 +++++++++++++++----------- src/image_to_linear.rs | 35 +++--- src/image_to_linear_u8.rs | 45 ++++--- src/image_to_xyz_lab.rs | 68 ++++++++--- src/image_xyza_laba.rs | 104 +++++++++++++---- src/jzazbz_to_image.rs | 199 ++++++++++++++++++++----------- src/jzczhz.rs | 23 +++- src/lalphabeta.rs | 16 ++- src/lalphabeta_to_image.rs | 71 ++++++++--- src/linear_to_planar.rs | 102 +++++----------- src/neon/gamma_curves.rs | 31 +---- src/neon/image_to_jzazbz.rs | 220 ++--------------------------------- src/neon/jzazbz_to_image.rs | 208 +++------------------------------ src/neon/linear_to_planar.rs | 78 ------------- src/neon/mod.rs | 3 - src/neon/planar_to_linear.rs | 84 ------------- src/oklab_to_image.rs | 44 ++++--- src/planar_to_linear.rs | 84 ++++--------- src/sse/gamma_curves.rs | 29 +---- src/sse/image_to_jzazbz.rs | 212 +++------------------------------ src/sse/jzazbz_to_image.rs | 154 +++--------------------- src/sse/linear_to_planar.rs | 84 ------------- src/sse/planar_to_linear.rs | 92 --------------- src/xyz_lab_to_image.rs | 130 ++++++++++++++++----- src/xyza_laba_to_image.rs | 136 ++++++++++++++++------ 30 files changed, 1151 insertions(+), 1727 deletions(-) delete mode 100644 src/neon/linear_to_planar.rs delete mode 100644 src/neon/planar_to_linear.rs delete mode 100644 src/sse/linear_to_planar.rs delete mode 100644 src/sse/planar_to_linear.rs diff --git a/Cargo.lock b/Cargo.lock index 9a7e897..ccd37b4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,7 +169,7 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] 
name = "colorutils-rs" -version = "0.6.2" +version = "0.7.0" dependencies = [ "erydanos", "half", diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 0b52aab..bf7c58f 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -68,15 +68,14 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0.); let src_stride = width * components as u32; let start_time = Instant::now(); - rgb_to_jzazbz( + bgr_to_lalphabeta( src_bytes, src_stride, &mut lab_store, store_stride as u32, width, height, - 200., - TransferFunction::Srgb, + TransferFunction::Pq, ); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -104,15 +103,14 @@ fn main() { // } let start_time = Instant::now(); - jzazbz_to_rgb( + lalphabeta_to_bgr( &lab_store, store_stride as u32, &mut dst_slice, src_stride, width, height, - 200., - TransferFunction::Srgb, + TransferFunction::Pq, ); let elapsed_time = start_time.elapsed(); diff --git a/src/avx/gamma_curves.rs b/src/avx/gamma_curves.rs index 76898df..e7d8e43 100644 --- a/src/avx/gamma_curves.rs +++ b/src/avx/gamma_curves.rs @@ -4,7 +4,7 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. 
*/ - +#![allow(dead_code)] use crate::avx::math::*; #[allow(unused_imports)] use crate::gamma_curves::TransferFunction; @@ -148,28 +148,3 @@ pub unsafe fn avx2_gamma2p2_from_linear(linear: __m256) -> __m256 { pub unsafe fn avx2_gamma2p8_from_linear(linear: __m256) -> __m256 { avx2_pure_gamma(linear, 1f32 / 2.8f32) } - -#[inline(always)] -pub unsafe fn perform_avx_gamma_transfer(transfer_function: TransferFunction, v: __m256) -> __m256 { - match transfer_function { - TransferFunction::Srgb => avx2_srgb_from_linear(v), - TransferFunction::Rec709 => avx2_rec709_from_linear(v), - TransferFunction::Gamma2p2 => avx2_gamma2p2_from_linear(v), - TransferFunction::Gamma2p8 => avx2_gamma2p8_from_linear(v), - TransferFunction::Smpte428 => avx2_smpte428_from_linear(v), - } -} - -#[inline(always)] -pub unsafe fn perform_avx2_linear_transfer( - transfer_function: TransferFunction, - v: __m256, -) -> __m256 { - match transfer_function { - TransferFunction::Srgb => avx2_srgb_to_linear(v), - TransferFunction::Rec709 => avx2_rec709_to_linear(v), - TransferFunction::Gamma2p2 => avx2_gamma2p2_to_linear(v), - TransferFunction::Gamma2p8 => avx2_gamma2p8_to_linear(v), - TransferFunction::Smpte428 => avx2_smpte428_to_linear(v), - } -} diff --git a/src/gamma_curves.rs b/src/gamma_curves.rs index 01324cb..62ca494 100644 --- a/src/gamma_curves.rs +++ b/src/gamma_curves.rs @@ -75,6 +75,110 @@ pub fn smpte428_from_linear(linear: f32) -> f32 { (0.91655527974030934f32 * linear.max(0.)).powf(POWER_VALUE) } +#[inline] +/// Linear transfer function for Smpte 240 +pub fn smpte240_to_linear(gamma: f32) -> f32 { + if gamma < 0.0 { + 0.0 + } else if gamma < 4.0 * 0.022821585529445 { + gamma / 4.0 + } else if gamma < 1.0 { + f32::powf((gamma + 0.111572195921731) / 1.111572195921731, 1.0 / 0.45) + } else { + 1.0 + } +} + +#[inline] +/// Gamma transfer function for Smpte 240 +pub fn smpte240_from_linear(linear: f32) -> f32 { + if linear < 0.0 { + 0.0 + } else if linear < 0.022821585529445 { + linear * 4.0 
+ } else if linear < 1.0 { + 1.111572195921731 * f32::powf(linear, 0.45) - 0.111572195921731 + } else { + 1.0 + } +} + +#[inline] +/// Gamma transfer function for Log100 +pub fn log100_from_linear(linear: f32) -> f32 { + if linear <= 0.01f32 { + 0. + } else { + 1. + linear.min(1.).log10() / 2.0 + } +} + +#[inline] +/// Linear transfer function for Log100 +pub fn log100_to_linear(gamma: f32) -> f32 { + // The function is non-bijective so choose the middle of [0, 0.00316227766f]. + const MID_INTERVAL: f32 = 0.01 / 2.; + if gamma <= 0. { + MID_INTERVAL + } else { + 10f32.powf(2. * (gamma.min(1.) - 1.)) + } +} + +#[inline] +/// Linear transfer function for Log100Sqrt10 +pub fn log100_sqrt10_to_linear(gamma: f32) -> f32 { + // The function is non-bijective so choose the middle of [0, 0.00316227766f]. + const MID_INTERVAL: f32 = 0.00316227766 / 2.; + if gamma <= 0. { + MID_INTERVAL + } else { + 10f32.powf(2.5 * (gamma.min(1.) - 1.)) + } +} + +#[inline] +/// Gamma transfer function for Log100Sqrt10 +pub fn log100_sqrt10_from_linear(linear: f32) -> f32 { + if linear <= 0.00316227766 { + 0.0 + } else { + 1.0 + linear.min(1.).log10() / 2.5 + } +} + +#[inline] +/// Gamma transfer function for Bt.1361 +pub fn bt1361_from_linear(linear: f32) -> f32 { + if linear < -0.25 { + -0.25 + } else if linear < 0.0 { + -0.27482420670236 * f32::powf(-4.0 * linear, 0.45) + 0.02482420670236 + } else if linear < 0.018053968510807 { + linear * 4.5 + } else if linear < 1.0 { + 1.09929682680944 * f32::powf(linear, 0.45) - 0.09929682680944 + } else { + 1.0 + } +} + +#[inline] +/// Linear transfer function for Bt.1361 +pub fn bt1361_to_linear(gamma: f32) -> f32 { + if gamma < -0.25 { + -0.25 + } else if gamma < 0.0 { + f32::powf((gamma - 0.02482420670236) / -0.27482420670236, 1.0 / 0.45) / -4.0 + } else if gamma < 4.5 * 0.018053968510807 { + gamma / 4.5 + } else if gamma < 1.0 { + f32::powf((gamma + 0.09929682680944) / 1.09929682680944, 1.0 / 0.45) + } else { + 1.0 + } +} + #[inline(always)] /// 
Pure gamma transfer function for gamma 2.2 pub fn pure_gamma_function(x: f32, gamma: f32) -> f32 { @@ -111,6 +215,80 @@ pub fn gamma2p8_to_linear(gamma: f32) -> f32 { pure_gamma_function(gamma, 2.8f32) } +#[inline] +/// Linear transfer function for PQ +pub fn pq_to_linear(gamma: f32) -> f32 { + if gamma > 0.0 { + let pow_gamma = f32::powf(gamma, 1.0 / 78.84375); + let num = (pow_gamma - 0.8359375).max(0.); + let den = (18.8515625 - 18.6875 * pow_gamma).max(f32::MIN_POSITIVE); + let linear = f32::powf(num / den, 1.0 / 0.1593017578125); + // Scale so that SDR white is 1.0 (extended SDR). + const PQ_MAX_NITS: f32 = 10000.; + const SDR_WHITE_NITS: f32 = 203.; + linear * PQ_MAX_NITS / SDR_WHITE_NITS + } else { + 0.0 + } +} + +#[inline] +/// Gamma transfer function for PQ +pub fn pq_from_linear(linear: f32) -> f32 { + const PQ_MAX_NITS: f32 = 10000.; + const SDR_WHITE_NITS: f32 = 203.; + + if linear > 0.0 { + // Scale from extended SDR range to [0.0, 1.0]. + let linear = (linear * SDR_WHITE_NITS / PQ_MAX_NITS).clamp(0., 1.); + let pow_linear = f32::powf(linear, 0.1593017578125); + let num = 0.1640625 * pow_linear - 0.1640625; + let den = 1.0 + 18.6875 * pow_linear; + f32::powf(1.0 + num / den, 78.84375) + } else { + 0.0 + } +} + +#[inline] +/// Linear transfer function for HLG +pub fn hlg_to_linear(gamma: f32) -> f32 { + const SDR_WHITE_NITS: f32 = 203.; + const HLG_WHITE_NITS: f32 = 1000.; + if gamma < 0.0 { + return 0.0; + } + let linear; + if gamma <= 0.5 { + linear = f32::powf((gamma * gamma) * (1.0 / 3.0), 1.2); + } else { + linear = f32::powf( + (f32::exp((gamma - 0.55991073) / 0.17883277) + 0.28466892) / 12.0, + 1.2, + ); + } + // Scale so that SDR white is 1.0 (extended SDR). + linear * HLG_WHITE_NITS / SDR_WHITE_NITS +} + +#[inline] +/// Gamma transfer function for HLG +pub fn hlg_from_linear(linear: f32) -> f32 { + const SDR_WHITE_NITS: f32 = 203.; + const HLG_WHITE_NITS: f32 = 1000.; + // Scale from extended SDR range to [0.0, 1.0]. 
+ let mut linear = (linear * (SDR_WHITE_NITS / HLG_WHITE_NITS)).clamp(0., 1.); + // Inverse OOTF followed by OETF see Table 5 and Note 5i in ITU-R BT.2100-2 page 7-8. + linear = f32::powf(linear, 1.0 / 1.2); + if linear < 0.0 { + 0.0 + } else if linear <= (1.0 / 12.0) { + f32::sqrt(3.0 * linear) + } else { + 0.17883277 * f32::ln(12.0 * linear - 0.28466892) + 0.55991073 + } +} + #[repr(C)] #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] /// Declares transfer function for transfer components into a linear colorspace and its inverse @@ -119,12 +297,24 @@ pub enum TransferFunction { Srgb, /// Rec.709 Transfer function Rec709, - /// Pure gamma 2.2 Transfer function + /// Pure gamma 2.2 Transfer function, ITU-R 470M Gamma2p2, - /// Pure gamma 2.8 Transfer function + /// Pure gamma 2.8 Transfer function, ITU-R 470BG Gamma2p8, /// Smpte 428 Transfer function Smpte428, + /// Log100 Transfer function + Log100, + /// Log100Sqrt10 Transfer function + Log100Sqrt10, + /// Bt1361 Transfer function + Bt1361, + /// Smpte 240 Transfer function + Smpte240, + /// PQ Transfer function + Pq, + /// HLG (Hybrid log gamma) Transfer function + Hlg, } impl From for TransferFunction { @@ -136,6 +326,12 @@ impl From for TransferFunction { 2 => TransferFunction::Gamma2p2, 3 => TransferFunction::Gamma2p8, 4 => TransferFunction::Smpte428, + 5 => TransferFunction::Log100, + 6 => TransferFunction::Log100Sqrt10, + 7 => TransferFunction::Bt1361, + 8 => TransferFunction::Smpte240, + 9 => TransferFunction::Pq, + 10 => TransferFunction::Hlg, _ => TransferFunction::Srgb, } } @@ -150,6 +346,12 @@ impl TransferFunction { TransferFunction::Gamma2p8 => gamma2p8_to_linear(v), TransferFunction::Gamma2p2 => gamma2p2_to_linear(v), TransferFunction::Smpte428 => smpte428_to_linear(v), + TransferFunction::Log100 => log100_to_linear(v), + TransferFunction::Log100Sqrt10 => log100_sqrt10_to_linear(v), + TransferFunction::Bt1361 => bt1361_to_linear(v), + TransferFunction::Smpte240 => 
smpte240_to_linear(v), + TransferFunction::Pq => pq_to_linear(v), + TransferFunction::Hlg => hlg_to_linear(v), } } @@ -160,7 +362,13 @@ impl TransferFunction { TransferFunction::Rec709 => rec709_from_linear(v), TransferFunction::Gamma2p2 => gamma2p2_from_linear(v), TransferFunction::Gamma2p8 => gamma2p8_from_linear(v), - TransferFunction::Smpte428 => smpte428_to_linear(v), + TransferFunction::Smpte428 => smpte428_from_linear(v), + TransferFunction::Log100 => log100_from_linear(v), + TransferFunction::Log100Sqrt10 => log100_sqrt10_from_linear(v), + TransferFunction::Bt1361 => bt1361_from_linear(v), + TransferFunction::Smpte240 => smpte240_from_linear(v), + TransferFunction::Pq => pq_from_linear(v), + TransferFunction::Hlg => hlg_from_linear(v), } } } diff --git a/src/image_to_jzazbz.rs b/src/image_to_jzazbz.rs index dd7df02..315dc45 100644 --- a/src/image_to_jzazbz.rs +++ b/src/image_to_jzazbz.rs @@ -9,12 +9,11 @@ use crate::image::ImageConfiguration; use crate::neon::neon_image_to_jzazbz; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::sse_image_to_jzazbz; -use crate::{Jzazbz, Jzczhz, Rgb, TransferFunction}; +use crate::{Jzazbz, Jzczhz, Rgb, TransferFunction, Xyz, SRGB_TO_XYZ_D65}; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[repr(u8)] @@ -53,7 +52,7 @@ fn channels_to_jzaz( let channels = image_configuration.get_channels_count(); let mut _wide_row_handle: Option< - unsafe fn(usize, *const u8, usize, u32, *mut f32, usize, f32, TransferFunction) -> usize, + unsafe fn(usize, *const f32, usize, u32, *mut f32, usize, f32) -> usize, > = None; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -66,72 +65,88 @@ fn channels_to_jzaz( _wide_row_handle = Some(sse_image_to_jzazbz::); } + let dst_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + dst.as_mut_ptr() as 
*mut u8, + dst_stride as usize * height as usize, + ) + }; + + let mut lut_table = vec![0f32; 256]; + for i in 0..256 { + lut_table[i] = transfer_function.linearize(i as f32 * (1. / 255.0)); + } + #[cfg(feature = "rayon")] { - let dst_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - dst.as_mut_ptr() as *mut u8, - dst_stride as usize * height as usize, - ) - }; dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) .zip(src.par_chunks_exact(src_stride as usize)) .for_each(|(dst, src)| unsafe { let mut _cx = 0usize; - let src_ptr = src.as_ptr(); let dst_ptr = dst.as_mut_ptr() as *mut f32; + let mut linearized_row = vec![0f32; width as usize * channels]; + for (linear_chunk, src_chunk) in linearized_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_r_channel_offset()] as usize, + ); + linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_g_channel_offset()] as usize, + ); + linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_b_channel_offset()] as usize, + ); + if image_configuration.has_alpha() { + linear_chunk[image_configuration.get_a_channel_offset()] = + src_chunk[image_configuration.get_a_channel_offset()] as f32 + * (1. 
/ 255.0); + } + } + if let Some(dispatcher) = _wide_row_handle { _cx = dispatcher( _cx, - src.as_ptr(), + linearized_row.as_ptr(), 0, width, dst.as_mut_ptr() as *mut f32, 0, display_luminance, - transfer_function, ); } for x in _cx..width as usize { let px = x * channels; - let src = src_ptr.add(px); - let r = src - .add(image_configuration.get_r_channel_offset()) - .read_unaligned(); - let g = src - .add(image_configuration.get_g_channel_offset()) - .read_unaligned(); - let b = src - .add(image_configuration.get_b_channel_offset()) - .read_unaligned(); + let src = linearized_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); - let rgb = Rgb::::new(r, g, b); + let xyz = Xyz::from_linear_rgb(Rgb::::new(r, g, b), &SRGB_TO_XYZ_D65); let dst_store = dst_ptr.add(px); match target { JzazbzTarget::Jzazbz => { - let jzazbz = Jzazbz::from_rgb_with_luminance( - rgb, - display_luminance, - transfer_function, - ); + let jzazbz = + Jzazbz::from_xyz_with_display_luminance(xyz, display_luminance); dst_store.write_unaligned(jzazbz.jz); dst_store.add(1).write_unaligned(jzazbz.az); dst_store.add(2).write_unaligned(jzazbz.bz); } JzazbzTarget::Jzczhz => { - let jzczhz = Jzczhz::from_rgb_with_luminance( - rgb, - display_luminance, - transfer_function, - ); + let jzczhz = + Jzczhz::from_xyz_with_display_luminance(xyz, display_luminance); dst_store.write_unaligned(jzczhz.jz); dst_store.add(1).write_unaligned(jzczhz.cz); @@ -140,12 +155,8 @@ fn channels_to_jzaz( } if image_configuration.has_alpha() { - let a = src - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - let a_lin = a as f32 * (1f32 / 255f32); - - dst_store.add(3).write_unaligned(a_lin); + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } } }); @@ 
-153,92 +164,88 @@ fn channels_to_jzaz( #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; + for (dst, src) in dst_slice_safe_align + .chunks_exact_mut(dst_stride as usize) + .zip(src.chunks_exact(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; - for _ in 0..height as usize { - let mut _cx = 0usize; + let dst_ptr = dst.as_mut_ptr() as *mut f32; - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; - let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; + let mut linearized_row = vec![0f32; width as usize * channels]; + for (linear_chunk, src_chunk) in linearized_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_r_channel_offset()] as usize, + ); + linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_g_channel_offset()] as usize, + ); + linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_b_channel_offset()] as usize, + ); + if image_configuration.has_alpha() { + linear_chunk[image_configuration.get_a_channel_offset()] = + src_chunk[image_configuration.get_a_channel_offset()] as f32 + * (1. 
/ 255.0); + } + } - if let Some(dispatcher) = _wide_row_handle { - unsafe { + if let Some(dispatcher) = _wide_row_handle { _cx = dispatcher( _cx, - src.as_ptr(), - src_offset, + linearized_row.as_ptr(), + 0, width, - dst.as_mut_ptr(), - dst_offset, + dst.as_mut_ptr() as *mut f32, + 0, display_luminance, - transfer_function, ); } - } - for x in _cx..width as usize { - let px = x * channels; - - let src = unsafe { src_ptr.add(px) }; - let r = unsafe { - src.add(image_configuration.get_r_channel_offset()) - .read_unaligned() - }; - let g = unsafe { - src.add(image_configuration.get_g_channel_offset()) - .read_unaligned() - }; - let b = unsafe { - src.add(image_configuration.get_b_channel_offset()) - .read_unaligned() - }; - - let rgb = Rgb::::new(r, g, b); - - let dst_store = unsafe { dst_ptr.add(px) }; - - match target { - JzazbzTarget::Jzazbz => { - let jzazbz = Jzazbz::from_rgb_with_luminance( - rgb, - display_luminance, - transfer_function, - ); - unsafe { + for x in _cx..width as usize { + let px = x * channels; + + let src = linearized_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let xyz = Xyz::from_linear_rgb(Rgb::::new(r, g, b), &SRGB_TO_XYZ_D65); + + let dst_store = dst_ptr.add(px); + + match target { + JzazbzTarget::Jzazbz => { + let jzazbz = + Jzazbz::from_xyz_with_display_luminance(xyz, display_luminance); + dst_store.write_unaligned(jzazbz.jz); dst_store.add(1).write_unaligned(jzazbz.az); dst_store.add(2).write_unaligned(jzazbz.bz); } - } - JzazbzTarget::Jzczhz => { - let jzczhz = Jzczhz::from_rgb_with_luminance( - rgb, - display_luminance, - transfer_function, - ); - unsafe { + JzazbzTarget::Jzczhz => { + let jzczhz = + Jzczhz::from_xyz_with_display_luminance(xyz, display_luminance); + dst_store.write_unaligned(jzczhz.jz); 
dst_store.add(1).write_unaligned(jzczhz.cz); dst_store.add(2).write_unaligned(jzczhz.hz); } } - } - if image_configuration.has_alpha() { - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = a as f32 * (1f32 / 255f32); - unsafe { - dst_store.add(3).write_unaligned(a_lin); + if image_configuration.has_alpha() { + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/image_to_lalphabeta.rs b/src/image_to_lalphabeta.rs index b59e30b..08f1def 100644 --- a/src/image_to_lalphabeta.rs +++ b/src/image_to_lalphabeta.rs @@ -5,12 +5,11 @@ * // license that can be found in the LICENSE file. */ use crate::image::ImageConfiguration; -use crate::{Rgb, TransferFunction}; +use crate::{LAlphaBeta, Rgb, TransferFunction, SRGB_TO_XYZ_D65}; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[inline(always)] @@ -27,50 +26,70 @@ fn channels_to_lalphabeta( let channels = image_configuration.get_channels_count(); + let mut lut_table = vec![0f32; 256]; + for i in 0..256 { + lut_table[i] = transfer_function.linearize(i as f32 * (1. 
/ 255.0)); + } + + let dst_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + dst.as_mut_ptr() as *mut u8, + dst_stride as usize * height as usize, + ) + }; + #[cfg(feature = "rayon")] { - let dst_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - dst.as_mut_ptr() as *mut u8, - dst_stride as usize * height as usize, - ) - }; dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) .zip(src.par_chunks_exact(src_stride as usize)) .for_each(|(dst, src)| unsafe { let mut _cx = 0usize; - let src_ptr = src.as_ptr(); + let mut linearized_row = vec![0f32; width as usize * channels]; + for (linear_chunk, src_chunk) in linearized_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_r_channel_offset()] as usize, + ); + linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_g_channel_offset()] as usize, + ); + linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_b_channel_offset()] as usize, + ); + if image_configuration.has_alpha() { + linear_chunk[image_configuration.get_a_channel_offset()] = + src_chunk[image_configuration.get_a_channel_offset()] as f32 + * (1. 
/ 255.0); + } + } + let dst_ptr = dst.as_mut_ptr() as *mut f32; for x in _cx..width as usize { let px = x * channels; - let src = src_ptr.add(px); - let r = src - .add(image_configuration.get_r_channel_offset()) - .read_unaligned(); - let g = src - .add(image_configuration.get_g_channel_offset()) - .read_unaligned(); - let b = src - .add(image_configuration.get_b_channel_offset()) - .read_unaligned(); - - let rgb = Rgb::::new(r, g, b); + let src = linearized_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); let dst_store = dst_ptr.add(px); - let lalphabeta = rgb.to_lalphabeta(transfer_function); + let lalphabeta = LAlphaBeta::from_linear_rgb(rgb, &SRGB_TO_XYZ_D65); dst_store.write_unaligned(lalphabeta.l); dst_store.add(1).write_unaligned(lalphabeta.alpha); dst_store.add(2).write_unaligned(lalphabeta.beta); if image_configuration.has_alpha() { - let a = src - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - let a_lin = a as f32 * (1f32 / 255f32); - dst_store.add(3).write_unaligned(a_lin); + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } } }); @@ -78,55 +97,60 @@ fn channels_to_lalphabeta( #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; - - for _ in 0..height as usize { - let mut _cx = 0usize; - - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; - let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; - - for x in _cx..width as usize { - let px = x * channels; - - let src = unsafe { src_ptr.add(px) }; - let r = unsafe { - src.add(image_configuration.get_r_channel_offset()) - .read_unaligned() - }; - let g = unsafe { - src.add(image_configuration.get_g_channel_offset()) - 
.read_unaligned() - }; - let b = unsafe { - src.add(image_configuration.get_b_channel_offset()) - .read_unaligned() - }; - - let rgb = Rgb::::new(r, g, b); - let dst_store = unsafe { dst_ptr.add(px) }; - let lalphabeta = rgb.to_lalphabeta(transfer_function); - unsafe { + for (dst, src) in dst_slice_safe_align + .chunks_exact_mut(dst_stride as usize) + .zip(src.chunks_exact(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; + + let mut linearized_row = vec![0f32; width as usize * channels]; + for (linear_chunk, src_chunk) in linearized_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_r_channel_offset()] as usize, + ); + linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_g_channel_offset()] as usize, + ); + linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table + .get_unchecked( + src_chunk[image_configuration.get_b_channel_offset()] as usize, + ); + if image_configuration.has_alpha() { + linear_chunk[image_configuration.get_a_channel_offset()] = + src_chunk[image_configuration.get_a_channel_offset()] as f32 + * (1. 
/ 255.0); + } + } + + let dst_ptr = dst.as_mut_ptr() as *mut f32; + + for x in _cx..width as usize { + let px = x * channels; + + let src = linearized_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); + let dst_store = dst_ptr.add(px); + let lalphabeta = LAlphaBeta::from_linear_rgb(rgb, &SRGB_TO_XYZ_D65); dst_store.write_unaligned(lalphabeta.l); dst_store.add(1).write_unaligned(lalphabeta.alpha); dst_store.add(2).write_unaligned(lalphabeta.beta); - } - if image_configuration.has_alpha() { - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = a as f32 * (1f32 / 255f32); - unsafe { - dst_store.add(3).write_unaligned(a_lin); + if image_configuration.has_alpha() { + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/image_to_linear.rs b/src/image_to_linear.rs index 0e9c7bc..0d46c98 100644 --- a/src/image_to_linear.rs +++ b/src/image_to_linear.rs @@ -65,23 +65,20 @@ fn channels_to_linear( let rgb = Rgb::::new(r, g, b); - unsafe { - dst.write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); - dst.add(1) - .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); - dst.add(2) - .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); - } + dst.add(image_configuration.get_r_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); + dst.add(image_configuration.get_g_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); + dst.add(image_configuration.get_b_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); if USE_ALPHA && 
image_configuration.has_alpha() { - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; + let a = src + .add(image_configuration.get_a_channel_offset()) + .read_unaligned(); let a_lin = a as f32 * (1f32 / 255f32); - unsafe { - dst.add(3).write_unaligned(a_lin); - } + dst.add(image_configuration.get_a_channel_offset()) + .write_unaligned(a_lin); } } @@ -123,10 +120,11 @@ fn channels_to_linear( let rgb = Rgb::::new(r, g, b); - dst.write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); - dst.add(1) + dst.add(image_configuration.get_r_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); + dst.add(image_configuration.get_g_channel_offset()) .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); - dst.add(2) + dst.add(image_configuration.get_b_channel_offset()) .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); if USE_ALPHA && image_configuration.has_alpha() { @@ -134,7 +132,8 @@ fn channels_to_linear( .add(image_configuration.get_a_channel_offset()) .read_unaligned(); let a_lin = a as f32 * (1f32 / 255f32); - dst.add(3).write_unaligned(a_lin); + dst.add(image_configuration.get_a_channel_offset()) + .write_unaligned(a_lin); } } }); diff --git a/src/image_to_linear_u8.rs b/src/image_to_linear_u8.rs index 84b41ee..4db8e5a 100644 --- a/src/image_to_linear_u8.rs +++ b/src/image_to_linear_u8.rs @@ -45,31 +45,22 @@ fn channels_to_linear( for x in _cx..width as usize { let px = x * channels; - let r = - unsafe { *src_row.get_unchecked(px + image_configuration.get_r_channel_offset()) }; - let g = - unsafe { *src_row.get_unchecked(px + image_configuration.get_g_channel_offset()) }; - let b = - unsafe { *src_row.get_unchecked(px + image_configuration.get_b_channel_offset()) }; + let r = *src_row.get_unchecked(px + image_configuration.get_r_channel_offset()); + let g = *src_row.get_unchecked(px + image_configuration.get_g_channel_offset()); + let b = *src_row.get_unchecked(px + 
image_configuration.get_b_channel_offset()); let rgb = Rgb::::new(r, g, b); - let mut rgb_f32 = rgb.to_rgb_f32(); - rgb_f32 = rgb_f32.linearize(transfer_function); - let rgb = rgb_f32.to_u8(); - - unsafe { - *dst_row.get_unchecked_mut(px) = *lut_table.get_unchecked(rgb.r as usize); - *dst_row.get_unchecked_mut(px + 1) = *lut_table.get_unchecked(rgb.g as usize); - *dst_row.get_unchecked_mut(px + 2) = *lut_table.get_unchecked(rgb.b as usize); - } + + *dst_row.get_unchecked_mut(px + image_configuration.get_r_channel_offset()) = + *lut_table.get_unchecked(rgb.r as usize); + *dst_row.get_unchecked_mut(px + image_configuration.get_g_channel_offset()) = + *lut_table.get_unchecked(rgb.g as usize); + *dst_row.get_unchecked_mut(px + image_configuration.get_b_channel_offset()) = + *lut_table.get_unchecked(rgb.b as usize); if USE_ALPHA && image_configuration.has_alpha() { - let a = unsafe { - *src_row.get_unchecked(px + image_configuration.get_a_channel_offset()) - }; - unsafe { - *dst_row.get_unchecked_mut(px + 3) = a; - } + let a = *src_row.get_unchecked(px + image_configuration.get_a_channel_offset()); + *dst_row.get_unchecked_mut(px + image_configuration.get_a_channel_offset()) = a; } } } @@ -90,14 +81,18 @@ fn channels_to_linear( let rgb = Rgb::::new(r, g, b); - *dst_row.get_unchecked_mut(px) = *lut_table.get_unchecked(rgb.r as usize); - *dst_row.get_unchecked_mut(px + 1) = *lut_table.get_unchecked(rgb.g as usize); - *dst_row.get_unchecked_mut(px + 2) = *lut_table.get_unchecked(rgb.b as usize); + *dst_row.get_unchecked_mut(px + image_configuration.get_r_channel_offset()) = + *lut_table.get_unchecked(rgb.r as usize); + *dst_row.get_unchecked_mut(px + image_configuration.get_g_channel_offset()) = + *lut_table.get_unchecked(rgb.g as usize); + *dst_row.get_unchecked_mut(px + image_configuration.get_b_channel_offset()) = + *lut_table.get_unchecked(rgb.b as usize); if USE_ALPHA && image_configuration.has_alpha() { let a = *src_row.get_unchecked(px + 
image_configuration.get_a_channel_offset()); - *dst_row.get_unchecked_mut(px + 3) = a; + *dst_row + .get_unchecked_mut(px + image_configuration.get_a_channel_offset()) = a; } } }); diff --git a/src/image_to_xyz_lab.rs b/src/image_to_xyz_lab.rs index 5617d10..a0d210f 100644 --- a/src/image_to_xyz_lab.rs +++ b/src/image_to_xyz_lab.rs @@ -108,10 +108,21 @@ fn channels_to_xyz( src, @@ -257,8 +283,8 @@ pub fn rgba_to_lab_with_alpha( dst_stride, width, height, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -275,6 +301,8 @@ pub fn rgba_to_lab_with_alpha( /// * `height` - Image height /// * `dst` - A mutable slice to receive LAB(a) data /// * `dst_stride` - Bytes per row for dst data +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. If you don't have specific pick `Srgb` pub fn bgra_to_lab_with_alpha( src: &[u8], src_stride: u32, @@ -282,6 +310,8 @@ pub fn bgra_to_lab_with_alpha( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::Lab as u8 }>( src, @@ -290,8 +320,8 @@ pub fn bgra_to_lab_with_alpha( dst_stride, width, height, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -308,6 +338,8 @@ pub fn bgra_to_lab_with_alpha( /// * `height` - Image height /// * `dst` - A mutable slice to receive LAB(a) data /// * `dst_stride` - Bytes per row for dst data +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. 
If you don't have specific pick `Srgb` pub fn rgba_to_luv_with_alpha( src: &[u8], src_stride: u32, @@ -315,6 +347,8 @@ pub fn rgba_to_luv_with_alpha( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::Luv as u8 }>( src, @@ -323,8 +357,8 @@ pub fn rgba_to_luv_with_alpha( dst_stride, width, height, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -341,6 +375,8 @@ pub fn rgba_to_luv_with_alpha( /// * `height` - Image height /// * `dst` - A mutable slice to receive LAB(a) data /// * `dst_stride` - Bytes per row for dst data +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. If you don't have specific pick `Srgb` pub fn bgra_to_luv_with_alpha( src: &[u8], src_stride: u32, @@ -348,6 +384,8 @@ pub fn bgra_to_luv_with_alpha( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::Luv as u8 }>( src, @@ -356,8 +394,8 @@ pub fn bgra_to_luv_with_alpha( dst_stride, width, height, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -370,6 +408,8 @@ pub fn bgra_to_luv_with_alpha( /// * `height` - Image height /// * `dst` - A mutable slice to receive XYZ(a) data /// * `dst_stride` - Bytes per row for dst data +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. 
If you don't have specific pick `Srgb` pub fn rgba_to_xyz_with_alpha( src: &[u8], src_stride: u32, @@ -377,6 +417,8 @@ pub fn rgba_to_xyz_with_alpha( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::Xyz as u8 }>( src, @@ -385,8 +427,8 @@ pub fn rgba_to_xyz_with_alpha( dst_stride, width, height, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -399,6 +441,8 @@ pub fn rgba_to_xyz_with_alpha( /// * `height` - Image height /// * `dst` - A mutable slice to receive XYZ data /// * `dst_stride` - Bytes per row for dst data +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. If you don't have specific pick `Srgb` pub fn bgra_to_xyz_with_alpha( src: &[u8], src_stride: u32, @@ -406,6 +450,8 @@ pub fn bgra_to_xyz_with_alpha( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::Xyz as u8 }>( src, @@ -414,8 +460,8 @@ pub fn bgra_to_xyz_with_alpha( dst_stride, width, height, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -428,6 +474,8 @@ pub fn bgra_to_xyz_with_alpha( /// * `height` - Image height /// * `dst` - A mutable slice to receive LCH(a) data /// * `dst_stride` - Bytes per row for dst data +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. 
If you don't have specific pick `Srgb` pub fn rgba_to_lch_with_alpha( src: &[u8], src_stride: u32, @@ -435,6 +483,8 @@ pub fn rgba_to_lch_with_alpha( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { channels_to_xyz_with_alpha::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::Lch as u8 }>( src, @@ -443,8 +493,8 @@ pub fn rgba_to_lch_with_alpha( dst_stride, width, height, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -457,6 +507,8 @@ pub fn rgba_to_lch_with_alpha( /// * `height` - Image height /// * `dst` - A mutable slice to receive LCH data /// * `dst_stride` - Bytes per row for dst data +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. If you don't have specific pick `Srgb` pub fn bgra_to_lch_with_alpha( src: &[u8], src_stride: u32, @@ -464,6 +516,8 @@ pub fn bgra_to_lch_with_alpha( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { channels_to_xyz_with_alpha::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::Lch as u8 }>( src, @@ -472,7 +526,7 @@ pub fn bgra_to_lch_with_alpha( dst_stride, width, height, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } diff --git a/src/jzazbz_to_image.rs b/src/jzazbz_to_image.rs index ec6657a..850c038 100644 --- a/src/jzazbz_to_image.rs +++ b/src/jzazbz_to_image.rs @@ -15,7 +15,6 @@ use crate::{Jzazbz, Jzczhz, TransferFunction}; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -33,7 +32,7 @@ fn jzazbz_to_image( let target: JzazbzTarget = TARGET.into(); let mut _wide_row_handle: Option< - unsafe fn(usize, *const f32, u32, *mut u8, u32, u32, f32, 
TransferFunction) -> usize, + unsafe fn(usize, *const f32, u32, *mut f32, u32, u32, f32) -> usize, > = None; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -46,14 +45,22 @@ fn jzazbz_to_image( _wide_row_handle = Some(neon_jzazbz_to_image::); } + let mut lut_table = vec![0u8; 2049]; + for i in 0..2049 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.) + .ceil() + .min(255.) as u8; + } + + let src_slice_safe_align = unsafe { + slice::from_raw_parts( + src.as_ptr() as *const u8, + src_stride as usize * height as usize, + ) + }; + #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts( - src.as_ptr() as *const u8, - src_stride as usize * height as usize, - ) - }; dst.par_chunks_exact_mut(dst_stride as usize) .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) .for_each(|(dst, src)| unsafe { @@ -62,18 +69,18 @@ fn jzazbz_to_image( let mut _cx = 0usize; let src_ptr = src.as_ptr() as *mut f32; - let dst_ptr = dst.as_mut_ptr(); + + let mut transient_row = vec![0f32; width as usize * channels]; if let Some(dispatcher) = _wide_row_handle { _cx = dispatcher( _cx, src.as_ptr() as *const f32, 0, - dst.as_mut_ptr(), + transient_row.as_mut_ptr(), 0, width, display_luminance, - transfer_function, ); } @@ -86,26 +93,56 @@ fn jzazbz_to_image( JzazbzTarget::Jzazbz => { let jzazbz = Jzazbz::new_with_luminance(l_x, l_y, l_z, display_luminance); - jzazbz.to_rgb(transfer_function) + jzazbz.to_linear_rgb() } JzazbzTarget::Jzczhz => { let jzczhz = Jzczhz::new(l_x, l_y, l_z); - jzczhz.to_rgb_with_luminance(display_luminance, transfer_function) + jzczhz.to_linear_rgb_with_luminance(display_luminance) } }; - let dst = dst_ptr.add(x * channels); - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); + let dst = 
transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; if image_configuration.has_alpha() { let l_a = src_ptr.add(px + 3).read_unaligned(); - let a_value = (l_a * 255f32).max(0f32); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; + } + } + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[image_configuration.get_r_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + let g_cast = (src_chunks[image_configuration.get_g_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + let b_cast = (src_chunks[image_configuration.get_b_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + + dst_chunk[image_configuration.get_r_channel_offset()] = + *lut_table.get_unchecked(r_cast as usize); + dst_chunk[image_configuration.get_g_channel_offset()] = + *lut_table.get_unchecked(g_cast as usize); + dst_chunk[image_configuration.get_b_channel_offset()] = + *lut_table.get_unchecked(b_cast as usize); + + if image_configuration.has_alpha() { + let a_cast = (src_chunks[image_configuration.get_a_channel_offset()] * 255.) + .min(255.) + .max(0.) 
as u8; + dst_chunk[image_configuration.get_a_channel_offset()] = a_cast; } } }); @@ -113,67 +150,93 @@ fn jzazbz_to_image( #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; + for (dst, src) in dst + .chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) + { + unsafe { + let channels = image_configuration.get_channels_count(); - let channels = image_configuration.get_channels_count(); + let mut _cx = 0usize; - for _ in 0..height as usize { - let mut _cx = 0usize; + let src_ptr = src.as_ptr() as *mut f32; - let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *mut f32 }; - let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) }; + let mut transient_row = vec![0f32; width as usize * channels]; - if let Some(dispatcher) = _wide_row_handle { - unsafe { + if let Some(dispatcher) = _wide_row_handle { _cx = dispatcher( _cx, - src.as_ptr(), - src_offset as u32, - dst.as_mut_ptr(), - dst_offset as u32, + src.as_ptr() as *const f32, + 0, + transient_row.as_mut_ptr(), + 0, width, display_luminance, - transfer_function, ); } - } - for x in _cx..width as usize { - let px = x * channels; - let l_x = unsafe { src_ptr.add(px).read_unaligned() }; - let l_y = unsafe { src_ptr.add(px + 1).read_unaligned() }; - let l_z = unsafe { src_ptr.add(px + 2).read_unaligned() }; - let rgb = match target { - JzazbzTarget::Jzazbz => { - let jzazbz = Jzazbz::new_with_luminance(l_x, l_y, l_z, display_luminance); - jzazbz.to_rgb(transfer_function) - } - JzazbzTarget::Jzczhz => { - let jzczhz = Jzczhz::new(l_x, l_y, l_z); - jzczhz.to_rgb_with_luminance(display_luminance, transfer_function) - } - }; - - unsafe { - let dst = dst_ptr.add(x * channels); - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); + 
for x in _cx..width as usize { + let px = x * channels; + let l_x = src_ptr.add(px).read_unaligned(); + let l_y = src_ptr.add(px + 1).read_unaligned(); + let l_z = src_ptr.add(px + 2).read_unaligned(); + let rgb = match target { + JzazbzTarget::Jzazbz => { + let jzazbz = + Jzazbz::new_with_luminance(l_x, l_y, l_z, display_luminance); + jzazbz.to_linear_rgb() + } + JzazbzTarget::Jzczhz => { + let jzczhz = Jzczhz::new(l_x, l_y, l_z); + jzczhz.to_linear_rgb_with_luminance(display_luminance) + } + }; + + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; if image_configuration.has_alpha() { let l_a = src_ptr.add(px + 3).read_unaligned(); - let a_value = (l_a * 255f32).max(0f32); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; } } - } - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[image_configuration.get_r_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + let g_cast = (src_chunks[image_configuration.get_g_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + let b_cast = (src_chunks[image_configuration.get_b_channel_offset()] + .min(1.) + .max(0.) 
+ * 2048f32) + .round(); + + dst_chunk[image_configuration.get_r_channel_offset()] = + *lut_table.get_unchecked(r_cast as usize); + dst_chunk[image_configuration.get_g_channel_offset()] = + *lut_table.get_unchecked(g_cast as usize); + dst_chunk[image_configuration.get_b_channel_offset()] = + *lut_table.get_unchecked(b_cast as usize); + + if image_configuration.has_alpha() { + let a_cast = (src_chunks[image_configuration.get_a_channel_offset()] * 255.) + .min(255.) + .max(0.) as u8; + dst_chunk[image_configuration.get_a_channel_offset()] = a_cast; + } + } + } } } } diff --git a/src/jzczhz.rs b/src/jzczhz.rs index 89bbf37..ce28c9d 100644 --- a/src/jzczhz.rs +++ b/src/jzczhz.rs @@ -104,6 +104,20 @@ impl Jzczhz { jzazbz.to_rgb(transfer_function) } + /// Converts [Jzczhz] to linear [Rgb] + /// + /// # Arguments + /// `display_luminance` - display luminance + /// `transfer_function` - Transfer function to convert into linear colorspace and backwards + #[inline] + pub fn to_linear_rgb_with_luminance( + &self, + display_luminance: f32, + ) -> Rgb { + let jzazbz = self.to_jzazbz_with_luminance(display_luminance); + jzazbz.to_linear_rgb() + } + /// Converts Jzczhz to *Xyz* #[inline] pub fn to_xyz(&self) -> Xyz { @@ -111,13 +125,20 @@ impl Jzczhz { jzazbz.to_xyz() } - /// Converts *Xyz* to *Jzczhz* + /// Converts [Xyz] to [Jzczhz] #[inline] pub fn from_xyz(xyz: Xyz) -> Jzczhz { let jzazbz = Jzazbz::from_xyz(xyz); Jzczhz::from_jzazbz(jzazbz) } + /// Converts [Xyz] to [Jzczhz] + #[inline] + pub fn from_xyz_with_display_luminance(xyz: Xyz, luminance: f32) -> Jzczhz { + let jzazbz = Jzazbz::from_xyz_with_display_luminance(xyz, luminance); + Jzczhz::from_jzazbz(jzazbz) + } + /// Computes distance for *Jzczhz* #[inline] pub fn distance(&self, other: Jzczhz) -> f32 { diff --git a/src/lalphabeta.rs b/src/lalphabeta.rs index eeeeca6..79b8e43 100644 --- a/src/lalphabeta.rs +++ b/src/lalphabeta.rs @@ -30,6 +30,13 @@ impl LAlphaBeta { LAlphaBeta::from_xyz(xyz) } + #[inline] + /// 
Converts linear [Rgb] to [LAlphaBeta] using [Xyz] matrix + pub fn from_linear_rgb(rgb: Rgb, matrix: &[[f32; 3]; 3]) -> LAlphaBeta { + let xyz = Xyz::from_linear_rgb(rgb, matrix); + LAlphaBeta::from_xyz(xyz) + } + /// Converts XYZ to l-alpha-beta #[inline] pub fn from_xyz(xyz: Xyz) -> LAlphaBeta { @@ -71,12 +78,19 @@ impl LAlphaBeta { Xyz::new(x, y, z) } - /// Converts l-alpha-beta to RGB + /// Converts l-alpha-beta to [Rgb] #[inline] pub fn to_rgb(&self, transfer_function: TransferFunction) -> Rgb { let xyz = self.to_xyz(); xyz.to_rgb(&XYZ_TO_SRGB_D65, transfer_function) } + + /// Converts l-alpha-beta to Linear [Rgb] + #[inline] + pub fn to_linear_rgb(&self, matrix: &[[f32; 3]; 3]) -> Rgb { + let xyz = self.to_xyz(); + xyz.to_linear_rgb(matrix) + } } impl Index for LAlphaBeta { diff --git a/src/lalphabeta_to_image.rs b/src/lalphabeta_to_image.rs index 744e15c..9cf6656 100644 --- a/src/lalphabeta_to_image.rs +++ b/src/lalphabeta_to_image.rs @@ -5,12 +5,11 @@ * // license that can be found in the LICENSE file. */ use crate::image::ImageConfiguration; -use crate::{LAlphaBeta, TransferFunction}; +use crate::{LAlphaBeta, Rgb, TransferFunction, XYZ_TO_SRGB_D65}; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; fn lalphabeta_to_image( @@ -26,21 +25,30 @@ fn lalphabeta_to_image( let channels = image_configuration.get_channels_count(); + let mut lut_table = vec![0u8; 2049]; + for i in 0..2049 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.) + .round() + .min(255.) 
as u8; + } + + let src_slice_safe_align = unsafe { + slice::from_raw_parts( + src.as_ptr() as *const u8, + src_stride as usize * height as usize, + ) + }; + #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts( - src.as_ptr() as *const u8, - src_stride as usize * height as usize, - ) - }; dst.par_chunks_exact_mut(dst_stride as usize) .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) .for_each(|(dst, src)| unsafe { let mut _cx = 0usize; let src_ptr = src.as_ptr() as *mut f32; - let dst_ptr = dst.as_mut_ptr(); + + let mut transient_row = vec![0f32; width as usize * channels]; for x in _cx..width as usize { let px = x * channels; @@ -48,20 +56,45 @@ fn lalphabeta_to_image( let l_y = src_ptr.add(px + 1).read_unaligned(); let l_z = src_ptr.add(px + 2).read_unaligned(); let lalphabeta = LAlphaBeta::new(l_x, l_y, l_z); - let rgb = lalphabeta.to_rgb(transfer_function); + let rgb = lalphabeta.to_linear_rgb(&XYZ_TO_SRGB_D65); - let dst = dst_ptr.add(x * channels); - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; if image_configuration.has_alpha() { let l_a = src_ptr.add(px + 3).read_unaligned(); let a_value = (l_a * 255f32).max(0f32); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = + a_value; + } + } + + for (dst, src) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r = 
src[image_configuration.get_r_channel_offset()]; + let g = src[image_configuration.get_g_channel_offset()]; + let b = src[image_configuration.get_b_channel_offset()]; + + let rgb = (Rgb::::new( + r.min(1f32).max(0f32), + g.min(1f32).max(0f32), + b.min(1f32).max(0f32), + ) * Rgb::::dup(2048f32)) + .round() + .cast::(); + + dst[image_configuration.get_r_channel_offset()] = + *lut_table.get_unchecked(rgb.r.min(2048) as usize); + dst[image_configuration.get_g_channel_offset()] = + *lut_table.get_unchecked(rgb.g.min(2048) as usize); + dst[image_configuration.get_b_channel_offset()] = + *lut_table.get_unchecked(rgb.b.min(2048) as usize); + if image_configuration.has_alpha() { + dst[image_configuration.get_a_channel_offset()] = + src[image_configuration.get_a_channel_offset()] as u8; } } }); diff --git a/src/linear_to_planar.rs b/src/linear_to_planar.rs index 90ebe1d..8de2d91 100644 --- a/src/linear_to_planar.rs +++ b/src/linear_to_planar.rs @@ -5,16 +5,11 @@ * // license that can be found in the LICENSE file. 
*/ -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::linear_to_planar::neon_linear_plane_to_gamma; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::sse_linear_plane_to_gamma; use crate::TransferFunction; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -27,103 +22,68 @@ fn linear_to_gamma_channels( height: u32, transfer_function: TransferFunction, ) { - let mut _wide_row_handler: Option< - unsafe fn(usize, *const f32, u32, *mut u8, u32, u32, TransferFunction) -> usize, - > = None; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handler = Some(neon_linear_plane_to_gamma); + let mut lut_table = vec![0u8; 2049]; + for i in 0..2049 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.) + .ceil() + .min(255.) 
as u8; } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handler = Some(sse_linear_plane_to_gamma); - } + let src_slice_safe_align = unsafe { + slice::from_raw_parts( + src.as_ptr() as *const u8, + src_stride as usize * height as usize, + ) + }; #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts( - src.as_ptr() as *const u8, - src_stride as usize * height as usize, - ) - }; dst.par_chunks_exact_mut(dst_stride as usize) .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) .for_each(|(dst, src)| unsafe { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handler { - _cx = dispatcher( - _cx, - src.as_ptr() as *const f32, - 0, - dst.as_mut_ptr(), - 0, - width, - transfer_function, - ); - } - let src_ptr = src.as_ptr() as *const f32; let dst_ptr = dst.as_mut_ptr(); for x in _cx..width as usize { let px = x; let src_slice = src_ptr.add(px); - let pixel = src_slice.read_unaligned().min(1f32).max(0f32); + let pixel = + (src_slice.read_unaligned().min(1f32).max(0f32) * 2048f32).round() as usize; let dst = dst_ptr.add(px); - let transferred = transfer_function.gamma(pixel); - let rgb8 = (transferred * 255f32).min(255f32).max(0f32) as u8; + let transferred = *lut_table.get_unchecked(pixel.min(2048)); - dst.write_unaligned(rgb8); + dst.write_unaligned(transferred); } }); } #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; - - for _ in 0..height as usize { - let mut _cx = 0usize; - - if let Some(dispatcher) = _wide_row_handler { - unsafe { - _cx = dispatcher( - _cx, - src.as_ptr(), - src_offset as u32, - dst.as_mut_ptr(), - dst_offset as u32, - width, - transfer_function, - ); - } - } + for (dst, src) in dst + .chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; - let src_ptr = unsafe { (src.as_ptr() as *const 
u8).add(src_offset) as *const f32 }; - let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) }; + let src_ptr = src.as_ptr() as *const f32; + let dst_ptr = dst.as_mut_ptr(); - for x in _cx..width as usize { - let px = x; - let src_slice = unsafe { src_ptr.add(px) }; - let pixel = unsafe { src_slice.read_unaligned() }.min(1f32).max(0f32); + for x in _cx..width as usize { + let px = x; + let src_slice = src_ptr.add(px); + let pixel = + (src_slice.read_unaligned().min(1f32).max(0f32) * 2048f32).round() as usize; - let dst = unsafe { dst_ptr.add(px) }; - let transferred = transfer_function.gamma(pixel); - let rgb8 = (transferred * 255f32).min(255f32).max(0f32) as u8; + let dst = dst_ptr.add(px); + let transferred = *lut_table.get_unchecked(pixel.min(2048)); - unsafe { - dst.write_unaligned(rgb8); + dst.write_unaligned(transferred); } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/neon/gamma_curves.rs b/src/neon/gamma_curves.rs index f5b7833..c530a23 100644 --- a/src/neon/gamma_curves.rs +++ b/src/neon/gamma_curves.rs @@ -4,8 +4,7 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. 
*/ - -use crate::gamma_curves::TransferFunction; +#![allow(dead_code)] use crate::neon::math::vpowq_n_f32; use std::arch::aarch64::*; @@ -131,31 +130,3 @@ pub unsafe fn neon_gamma2p2_from_linear(linear: float32x4_t) -> float32x4_t { pub unsafe fn neon_gamma2p8_from_linear(linear: float32x4_t) -> float32x4_t { neon_pure_gamma_function(linear, 1f32 / 2.8f32) } - -#[inline(always)] -pub unsafe fn neon_perform_linear_transfer( - transfer_function: TransferFunction, - v: float32x4_t, -) -> float32x4_t { - match transfer_function { - TransferFunction::Srgb => neon_srgb_to_linear(v), - TransferFunction::Rec709 => neon_rec709_to_linear(v), - TransferFunction::Gamma2p2 => neon_gamma2p2_to_linear(v), - TransferFunction::Gamma2p8 => neon_gamma2p8_to_linear(v), - TransferFunction::Smpte428 => neon_smpte428_to_linear(v), - } -} - -#[inline(always)] -pub unsafe fn neon_perform_gamma_transfer( - transfer_function: TransferFunction, - v: float32x4_t, -) -> float32x4_t { - match transfer_function { - TransferFunction::Srgb => neon_srgb_from_linear(v), - TransferFunction::Rec709 => neon_rec709_from_linear(v), - TransferFunction::Gamma2p2 => neon_gamma2p2_from_linear(v), - TransferFunction::Gamma2p8 => neon_gamma2p8_from_linear(v), - TransferFunction::Smpte428 => neon_smpte428_from_linear(v), - } -} diff --git a/src/neon/image_to_jzazbz.rs b/src/neon/image_to_jzazbz.rs index beedf79..42a4623 100644 --- a/src/neon/image_to_jzazbz.rs +++ b/src/neon/image_to_jzazbz.rs @@ -7,11 +7,7 @@ use crate::image::ImageConfiguration; use crate::image_to_jzazbz::JzazbzTarget; use crate::neon::math::{vcolorq_matrix_f32, vpowq_n_f32}; -use crate::neon::neon_perform_linear_transfer; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, - TransferFunction, SRGB_TO_XYZ_D65, -}; +use crate::{load_f32_and_deinterleave, SRGB_TO_XYZ_D65}; use erydanos::{vatan2q_f32, vhypotq_fast_f32, visnanq_f32, vmlafq_f32, vpowq_f32}; use std::arch::aarch64::*; @@ -32,15 
+28,8 @@ macro_rules! perceptual_quantizer { } macro_rules! triple_to_jzazbz { - ($r: expr, $g: expr, $b: expr, $transfer: expr, $target: expr, $luminance: expr + ($r: expr, $g: expr, $b: expr, $target: expr, $luminance: expr ) => {{ - let r_f = vmulq_n_f32(vcvtq_f32_u32($r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32($g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32($b), 1f32 / 255f32); - let dl_l = neon_perform_linear_transfer($transfer, r_f); - let dl_m = neon_perform_linear_transfer($transfer, g_f); - let dl_s = neon_perform_linear_transfer($transfer, b_f); - let (x0, x1, x2, x3, x4, x5, x6, x7, x8) = ( vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(0)), vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(1)), @@ -53,7 +42,7 @@ macro_rules! triple_to_jzazbz { vdupq_n_f32(*SRGB_TO_XYZ_D65.get_unchecked(2).get_unchecked(2)), ); - let (mut x, mut y, mut z) = vcolorq_matrix_f32(dl_l, dl_m, dl_s, x0, x1, x2, x3, x4, x5, x6, x7, x8); + let (mut x, mut y, mut z) = vcolorq_matrix_f32($r, $g, $b, x0, x1, x2, x3, x4, x5, x6, x7, x8); x = vmulq_n_f32(x, $luminance); y = vmulq_n_f32(y, $luminance); @@ -111,13 +100,12 @@ macro_rules! triple_to_jzazbz { #[inline(always)] pub unsafe fn neon_image_to_jzazbz( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, display_luminance: f32, - transfer_function: TransferFunction, ) -> usize { let target: JzazbzTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -126,206 +114,16 @@ pub unsafe fn neon_image_to_jzazbz {{ @@ -37,14 +36,11 @@ macro_rules! 
perceptual_quantizer_inverse { #[inline(always)] unsafe fn neon_jzazbz_gamma_vld( src: *const f32, - transfer_function: TransferFunction, target: JzazbzTarget, luminance: f32, -) -> (uint32x4_t, uint32x4_t, uint32x4_t, uint32x4_t) { - let v_scale_alpha = vdupq_n_f32(255f32); +) -> (float32x4_t, float32x4_t, float32x4_t, float32x4_t) { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (jz, mut az, mut bz, mut a_f32) = - load_f32_and_deinterleave_direct!(src, image_configuration); + let (jz, mut az, mut bz, a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); if target == JzazbzTarget::Jzczhz { let cz = az; @@ -111,32 +107,17 @@ unsafe fn neon_jzazbz_gamma_vld( let (r_l, g_l, b_l) = vcolorq_matrix_f32(x, y, z, x0, x1, x2, x3, x4, x5, x6, x7, x8); - let mut r_f32 = neon_perform_gamma_transfer(transfer_function, r_l); - let mut g_f32 = neon_perform_gamma_transfer(transfer_function, g_l); - let mut b_f32 = neon_perform_gamma_transfer(transfer_function, b_l); - r_f32 = vmulq_f32(r_f32, v_scale_alpha); - g_f32 = vmulq_f32(g_f32, v_scale_alpha); - b_f32 = vmulq_f32(b_f32, v_scale_alpha); - if image_configuration.has_alpha() { - a_f32 = vmulq_f32(a_f32, v_scale_alpha); - } - ( - vcvtaq_u32_f32(r_f32), - vcvtaq_u32_f32(g_f32), - vcvtaq_u32_f32(b_f32), - vcvtaq_u32_f32(a_f32), - ) + (r_l, g_l, b_l, a_f32) } pub unsafe fn neon_jzazbz_to_image( start_cx: usize, src: *const f32, src_offset: u32, - dst: *mut u8, + dst: *mut f32, dst_offset: u32, width: u32, display_luminance: f32, - transfer_function: TransferFunction, ) -> usize { let target: JzazbzTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -145,198 +126,37 @@ pub unsafe fn neon_jzazbz_to_image( - src_ptr_0, - transfer_function, - target, - luminance_scale, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = neon_jzazbz_gamma_vld::( - src_ptr_1, - transfer_function, 
- target, - luminance_scale, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = neon_jzazbz_gamma_vld::( - src_ptr_2, - transfer_function, - target, - luminance_scale, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = neon_jzazbz_gamma_vld::( - src_ptr_3, - transfer_function, - target, - luminance_scale, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); - let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); - let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); - - let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); - let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); - let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); - let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4q_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x3_t(b_row, g_row, r_row) - } - }; - vst3q_u8(dst_ptr, store_rows); - } - - cx += 16; - 
} - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = neon_jzazbz_gamma_vld::( - src_ptr_0, - transfer_function, - target, - luminance_scale, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = neon_jzazbz_gamma_vld::( - src_ptr_1, - transfer_function, - target, - luminance_scale, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row = vqmovn_u16(a_row01); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) - } - }; - vst3_u8(dst_ptr, store_rows); - } - - cx += 8; - } - while cx + 4 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); let src_ptr_0 = offset_src_ptr; - let (r_row0_, g_row0_, b_row0_, a_row0_) = neon_jzazbz_gamma_vld::( - src_ptr_0, - transfer_function, - target, - luminance_scale, - ); - - let zeros = vdup_n_u16(0); - - let 
r_row01 = vcombine_u16(vqmovn_u32(r_row0_), zeros); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), zeros); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), zeros); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); + let (r_row0_, g_row0_, b_row0_, a_row0_) = + neon_jzazbz_gamma_vld::(src_ptr_0, target, luminance_scale); - let dst_ptr = dst.add(dst_offset as usize + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset as usize) as *mut f32).add(cx * channels); if image_configuration.has_alpha() { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), zeros); - let a_row = vqmovn_u16(a_row01); let store_rows = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) + float32x4x4_t(r_row0_, g_row0_, b_row0_, a_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) + float32x4x4_t(b_row0_, g_row0_, r_row0_, a_row0_) } }; - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); + vst4q_f32(dst_ptr, store_rows); } else { let store_rows = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) + float32x4x3_t(r_row0_, g_row0_, b_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) + float32x4x3_t(b_row0_, g_row0_, r_row0_) } }; - let mut transient: [u8; 24] = [0; 24]; - vst3_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 3); + vst3q_f32(dst_ptr, store_rows); } cx += 4; diff --git a/src/neon/linear_to_planar.rs b/src/neon/linear_to_planar.rs deleted file mode 100644 index cf09d8e..0000000 --- a/src/neon/linear_to_planar.rs +++ /dev/null @@ -1,78 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. 
All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. - */ - -use crate::neon::neon_perform_gamma_transfer; -use crate::TransferFunction; -use std::arch::aarch64::*; - -#[inline(always)] -unsafe fn transfer_to_gamma(r: float32x4_t, transfer_function: TransferFunction) -> uint32x4_t { - vcvtaq_u32_f32(vmulq_n_f32( - neon_perform_gamma_transfer(transfer_function, r), - 255f32, - )) -} - -#[inline(always)] -unsafe fn process_set(k: float32x4x4_t, transfer_function: TransferFunction) -> uint8x16_t { - let y0 = transfer_to_gamma(k.0, transfer_function); - let y1 = transfer_to_gamma(k.1, transfer_function); - let y2 = transfer_to_gamma(k.2, transfer_function); - let y3 = transfer_to_gamma(k.3, transfer_function); - - let y_row01 = vcombine_u16(vqmovn_u32(y0), vqmovn_u32(y1)); - let y_row23 = vcombine_u16(vqmovn_u32(y2), vqmovn_u32(y3)); - - vcombine_u8(vqmovn_u16(y_row01), vqmovn_u16(y_row23)) -} - -#[inline] -pub unsafe fn neon_linear_plane_to_gamma( - start_cx: usize, - src: *const f32, - src_offset: u32, - dst: *mut u8, - dst_offset: u32, - width: u32, - transfer_function: TransferFunction, -) -> usize { - let mut cx = start_cx; - - while cx + 64 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); - - let pixel_row0 = vld1q_f32_x4(offset_src_ptr); - let pixel_row1 = vld1q_f32_x4(offset_src_ptr.add(16)); - let pixel_row2 = vld1q_f32_x4(offset_src_ptr.add(32)); - let pixel_row3 = vld1q_f32_x4(offset_src_ptr.add(48)); - - let set0 = process_set(pixel_row0, transfer_function); - let set1 = process_set(pixel_row1, transfer_function); - let set2 = process_set(pixel_row2, transfer_function); - let set3 = process_set(pixel_row3, transfer_function); - - let dst_ptr = dst.add(dst_offset as usize + cx); - - let pixel_set = uint8x16x4_t(set0, set1, set2, set3); - vst1q_u8_x4(dst_ptr, pixel_set); - - cx += 64; - } - - while cx + 16 < 
width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); - - let pixel_row = vld1q_f32_x4(offset_src_ptr); - let r_row = process_set(pixel_row, transfer_function); - let dst_ptr = dst.add(dst_offset as usize + cx); - vst1q_u8(dst_ptr, r_row); - - cx += 16; - } - - cx -} diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 22c8d9d..89eecc9 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -14,10 +14,8 @@ mod image_to_hsv; mod image_to_jzazbz; mod image_to_oklab; mod jzazbz_to_image; -pub mod linear_to_planar; mod math; mod oklab_to_image; -pub mod planar_to_linear; mod routines; mod sigmoidal; mod to_sigmoidal; @@ -28,7 +26,6 @@ mod xyza_laba_to_image; pub use colors::*; pub use from_sigmoidal::neon_from_sigmoidal_row; -pub use gamma_curves::*; pub use hsv_to_image::*; pub use image_to_hsv::*; pub use image_to_jzazbz::neon_image_to_jzazbz; diff --git a/src/neon/planar_to_linear.rs b/src/neon/planar_to_linear.rs deleted file mode 100644 index 490fdc3..0000000 --- a/src/neon/planar_to_linear.rs +++ /dev/null @@ -1,84 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */ - -use crate::gamma_curves::TransferFunction; -use crate::neon::*; -use std::arch::aarch64::*; - -#[inline(always)] -unsafe fn neon_to_linear(r: uint32x4_t, transfer_function: TransferFunction) -> float32x4_t { - let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); - neon_perform_linear_transfer(transfer_function, r_f) -} - -#[inline] -unsafe fn process_pixels(pixels: uint8x16_t, transfer_function: TransferFunction) -> float32x4x4_t { - let r_low = vmovl_u8(vget_low_u8(pixels)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - - let x_low_low = neon_to_linear(r_low_low, transfer_function); - - let r_low_high = vmovl_high_u16(r_low); - - let x_low_high = neon_to_linear(r_low_high, transfer_function); - - let r_high = vmovl_high_u8(pixels); - - let r_high_low = vmovl_u16(vget_low_u16(r_high)); - - let x_high_low = neon_to_linear(r_high_low, transfer_function); - - let r_high_high = vmovl_high_u16(r_high); - - let x_high_high = neon_to_linear(r_high_high, transfer_function); - float32x4x4_t(x_low_low, x_low_high, x_high_low, x_high_high) -} - -#[inline(always)] -pub unsafe fn neon_plane_to_linear( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut f32, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let mut cx = start_cx; - - let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - - while cx + 64 < width as usize { - let src_ptr = src.add(src_offset + cx); - let pixels_row64 = vld1q_u8_x4(src_ptr); - let storing_row0 = process_pixels(pixels_row64.0, transfer_function); - vst1q_f32_x4(dst_ptr.add(cx), storing_row0); - - let storing_row1 = process_pixels(pixels_row64.1, transfer_function); - vst1q_f32_x4(dst_ptr.add(cx + 16), storing_row1); - - let storing_row2 = process_pixels(pixels_row64.2, transfer_function); - vst1q_f32_x4(dst_ptr.add(cx + 32), storing_row2); - - let storing_row3 = process_pixels(pixels_row64.3, transfer_function); - vst1q_f32_x4(dst_ptr.add(cx + 48), storing_row3); - - 
cx += 64; - } - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx); - let pixels = vld1q_u8(src_ptr); - let storing_row = process_pixels(pixels, transfer_function); - vst1q_f32_x4(dst_ptr.add(cx), storing_row); - - cx += 16; - } - - cx -} diff --git a/src/oklab_to_image.rs b/src/oklab_to_image.rs index 35377dd..30e1210 100644 --- a/src/oklab_to_image.rs +++ b/src/oklab_to_image.rs @@ -114,16 +114,22 @@ fn oklab_to_image( .chunks_exact_mut(channels) .zip(transient_row.chunks_exact_mut(channels)) { - let rgb = (Rgb::::new(src_chunks[0], src_chunks[1], src_chunks[2]) - * Rgb::::dup(2048f32)) + let rgb = (Rgb::::new( + src_chunks[image_configuration.get_r_channel_offset()], + src_chunks[image_configuration.get_g_channel_offset()], + src_chunks[image_configuration.get_b_channel_offset()], + ) * Rgb::::dup(2048f32)) .cast::(); - dst_chunks[0] = *lut_table.get_unchecked(rgb.r as usize); - dst_chunks[1] = *lut_table.get_unchecked(rgb.g as usize); - dst_chunks[2] = *lut_table.get_unchecked(rgb.b as usize); + dst_chunks[image_configuration.get_r_channel_offset()] = + *lut_table.get_unchecked(rgb.r as usize); + dst_chunks[image_configuration.get_g_channel_offset()] = + *lut_table.get_unchecked(rgb.g as usize); + dst_chunks[image_configuration.get_b_channel_offset()] = + *lut_table.get_unchecked(rgb.b as usize); if image_configuration.has_alpha() { let a_lin = (src_chunks[4] * 255f32).round() as u8; - dst_chunks[0] = a_lin; + dst_chunks[image_configuration.get_a_channel_offset()] = a_lin; } } }); @@ -131,8 +137,10 @@ fn oklab_to_image( #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst.chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) { + for (dst, src) in dst + .chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) + { unsafe { let mut _cx = 0usize; @@ -175,16 +183,22 @@ fn oklab_to_image( .chunks_exact_mut(channels) 
.zip(transient_row.chunks_exact_mut(channels)) { - let rgb = (Rgb::::new(src_chunks[0], src_chunks[1], src_chunks[2]) - * Rgb::::dup(2048f32)) - .cast::(); + let rgb = (Rgb::::new( + src_chunks[image_configuration.get_r_channel_offset()], + src_chunks[image_configuration.get_g_channel_offset()], + src_chunks[image_configuration.get_b_channel_offset()], + ) * Rgb::::dup(2048f32)) + .cast::(); - dst_chunks[0] = *lut_table.get_unchecked(rgb.r as usize); - dst_chunks[1] = *lut_table.get_unchecked(rgb.g as usize); - dst_chunks[2] = *lut_table.get_unchecked(rgb.b as usize); + dst_chunks[image_configuration.get_r_channel_offset()] = + *lut_table.get_unchecked(rgb.r as usize); + dst_chunks[image_configuration.get_g_channel_offset()] = + *lut_table.get_unchecked(rgb.g as usize); + dst_chunks[image_configuration.get_b_channel_offset()] = + *lut_table.get_unchecked(rgb.b as usize); if image_configuration.has_alpha() { let a_lin = (src_chunks[4] * 255f32).round() as u8; - dst_chunks[0] = a_lin; + dst_chunks[image_configuration.get_a_channel_offset()] = a_lin; } } } diff --git a/src/planar_to_linear.rs b/src/planar_to_linear.rs index fe888e0..51552d6 100644 --- a/src/planar_to_linear.rs +++ b/src/planar_to_linear.rs @@ -5,16 +5,11 @@ * // license that can be found in the LICENSE file. 
*/ -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::planar_to_linear::neon_plane_to_linear; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::sse_plane_to_linear; use crate::TransferFunction; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[inline(always)] @@ -28,28 +23,20 @@ fn channels_to_linear( height: u32, transfer_function: TransferFunction, ) { - let mut _wide_row_handler: Option< - unsafe fn(usize, *const u8, usize, u32, *mut f32, usize, TransferFunction) -> usize, - > = None; - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handler = Some(sse_plane_to_linear); - } - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handler = Some(neon_plane_to_linear); + let dst_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + dst.as_mut_ptr() as *mut u8, + dst_stride as usize * height as usize, + ) + }; + + let mut lut_table = vec![0f32; 256]; + for i in 0..256 { + lut_table[i] = transfer_function.linearize(i as f32 * (1. / 255.0)); } #[cfg(feature = "rayon")] { - let dst_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - dst.as_mut_ptr() as *mut u8, - dst_stride as usize * height as usize, - ) - }; dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) .zip(src.par_chunks_exact(src_stride as usize)) @@ -59,16 +46,11 @@ fn channels_to_linear( let src_ptr = src.as_ptr(); let dst_ptr = dst.as_mut_ptr() as *mut f32; - if let Some(dispatcher) = _wide_row_handler { - _cx = dispatcher(_cx, src_ptr, 0, width, dst_ptr, 0, transfer_function); - } - for x in _cx..width as usize { let px = x; let dst = dst_ptr.add(px); let src = src_ptr.add(px); - let pixel_f = src.read_unaligned() as f32 * (1. 
/ 255.); - let transferred = transfer_function.linearize(pixel_f); + let transferred = *lut_table.get_unchecked(src.read_unaligned() as usize); dst.write_unaligned(transferred); } @@ -77,43 +59,25 @@ fn channels_to_linear( #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; - - for _ in 0..height as usize { - let mut _cx = 0usize; - - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; - let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; + for (dst, src) in dst_slice_safe_align + .chunks_exact_mut(dst_stride as usize) + .zip(src.chunks_exact(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handler { - unsafe { - _cx = dispatcher( - _cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - transfer_function, - ); - } - } + let src_ptr = src.as_ptr(); + let dst_ptr = dst.as_mut_ptr() as *mut f32; - for x in _cx..width as usize { - let px = x; - let dst = unsafe { dst_ptr.add(px) }; - let src = unsafe { src_ptr.add(px) }; - let pixel_f = unsafe { src.read_unaligned() as f32 } * (1. / 255.); - let transferred = transfer_function.linearize(pixel_f); + for x in _cx..width as usize { + let px = x; + let dst = dst_ptr.add(px); + let src = src_ptr.add(px); + let transferred = *lut_table.get_unchecked(src.read_unaligned()); - unsafe { dst.write_unaligned(transferred); } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/sse/gamma_curves.rs b/src/sse/gamma_curves.rs index ee6ea00..c5f68e7 100644 --- a/src/sse/gamma_curves.rs +++ b/src/sse/gamma_curves.rs @@ -4,7 +4,7 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. 
*/ - +#![allow(dead_code)] use crate::gamma_curves::TransferFunction; use crate::sse::*; use erydanos::_mm_pow_ps; @@ -143,29 +143,4 @@ pub unsafe fn sse_gamma2p2_from_linear(linear: __m128) -> __m128 { #[inline(always)] pub unsafe fn sse_gamma2p8_from_linear(linear: __m128) -> __m128 { sse_pure_gamma(linear, 1f32 / 2.8f32) -} - -#[inline(always)] -pub unsafe fn perform_sse_linear_transfer( - transfer_function: TransferFunction, - v: __m128, -) -> __m128 { - match transfer_function { - TransferFunction::Srgb => sse_srgb_to_linear(v), - TransferFunction::Rec709 => sse_rec709_to_linear(v), - TransferFunction::Gamma2p2 => sse_gamma2p2_to_linear(v), - TransferFunction::Gamma2p8 => sse_gamma2p8_to_linear(v), - TransferFunction::Smpte428 => sse_smpte428_to_linear(v), - } -} - -#[inline(always)] -pub unsafe fn perform_sse_gamma_transfer(transfer_function: TransferFunction, v: __m128) -> __m128 { - match transfer_function { - TransferFunction::Srgb => sse_srgb_from_linear(v), - TransferFunction::Rec709 => sse_rec709_from_linear(v), - TransferFunction::Gamma2p2 => sse_gamma2p2_from_linear(v), - TransferFunction::Gamma2p8 => sse_gamma2p8_from_linear(v), - TransferFunction::Smpte428 => sse_smpte428_from_linear(v), - } -} +} \ No newline at end of file diff --git a/src/sse/image_to_jzazbz.rs b/src/sse/image_to_jzazbz.rs index 053e896..5ec81f0 100644 --- a/src/sse/image_to_jzazbz.rs +++ b/src/sse/image_to_jzazbz.rs @@ -15,12 +15,12 @@ use erydanos::{_mm_atan2_ps, _mm_hypot_fast_ps, _mm_isnan_ps, _mm_mlaf_ps, _mm_p use crate::image::ImageConfiguration; use crate::image_to_jzazbz::JzazbzTarget; use crate::sse::{ - _mm_color_matrix_ps, _mm_pow_n_ps, _mm_select_ps, perform_sse_linear_transfer, - sse_deinterleave_rgb, sse_deinterleave_rgba, sse_interleave_ps_rgb, sse_interleave_ps_rgba, + _mm_color_matrix_ps, _mm_pow_n_ps, _mm_select_ps, sse_interleave_ps_rgb, sse_interleave_ps_rgba, }; +use crate::sse::{sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps}; use crate::{ - 
load_u8_and_deinterleave, load_u8_and_deinterleave_half, store_and_interleave_v3_direct_f32, - store_and_interleave_v4_direct_f32, TransferFunction, SRGB_TO_XYZ_D65, + load_f32_and_deinterleave, store_and_interleave_v3_direct_f32, + store_and_interleave_v4_direct_f32, SRGB_TO_XYZ_D65, }; macro_rules! perceptual_quantizer { @@ -41,16 +41,8 @@ macro_rules! perceptual_quantizer { } macro_rules! triple_to_jzazbz { - ($r: expr, $g: expr, $b: expr, $transfer: expr, $target: expr, $luminance: expr + ($r: expr, $g: expr, $b: expr,$target: expr, $luminance: expr ) => {{ - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps($r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps($g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps($b), u8_scale); - let r_linear = perform_sse_linear_transfer($transfer, r_f); - let g_linear = perform_sse_linear_transfer($transfer,g_f); - let b_linear = perform_sse_linear_transfer($transfer,b_f); - let (x0, x1, x2, x3, x4, x5, x6, x7, x8) = ( _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(0)), _mm_set1_ps(*SRGB_TO_XYZ_D65.get_unchecked(0).get_unchecked(1)), @@ -64,7 +56,7 @@ macro_rules! triple_to_jzazbz { ); let (mut x, mut y, mut z) = _mm_color_matrix_ps( - r_linear, g_linear, b_linear, x0, x1, x2, x3, x4, x5, x6, x7, x8, + $r, $g, $b, x0, x1, x2, x3, x4, x5, x6, x7, x8, ); x = _mm_mul_ps(x, $luminance); @@ -124,13 +116,12 @@ macro_rules! 
triple_to_jzazbz { #[target_feature(enable = "sse4.1")] pub unsafe fn sse_image_to_jzazbz( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, display_luminance: f32, - transfer_function: TransferFunction, ) -> usize { let target: JzazbzTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -141,197 +132,22 @@ pub unsafe fn sse_image_to_jzazbz(r_low)); - let g_low_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(g_low)); - let b_low_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = triple_to_jzazbz!( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - target, - luminance - ); - - if image_configuration.has_alpha() { - let a_low_high = _mm_mul_ps( - _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 16); - store_and_interleave_v4_direct_f32!( - ptr, x_low_high, y_low_high, z_low_high, a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3); - store_and_interleave_v3_direct_f32!(ptr, x_low_high, y_low_high, z_low_high); - } - - let r_high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(r_chan)); - let g_high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(g_chan)); - let b_high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(b_chan)); - - let r_high_low = _mm_cvtepu16_epi32(r_high); - let g_high_low = _mm_cvtepu16_epi32(g_high); - let b_high_low = _mm_cvtepu16_epi32(b_high); - - let (x_high_low, y_high_low, z_high_low) = triple_to_jzazbz!( - r_high_low, - g_high_low, - b_high_low, - transfer_function, - target, - luminance - ); - - let a_high = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(a_chan)); - - if image_configuration.has_alpha() { - let a_high_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); - let ptr = dst_ptr.add(cx * 4 + 4 * 4 * 2); - store_and_interleave_v4_direct_f32!( - ptr, x_high_low, y_high_low, z_high_low, 
a_high_low - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3 * 2); - store_and_interleave_v3_direct_f32!(ptr, x_high_low, y_high_low, z_high_low); - } - - let r_high_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(r_high)); - let g_high_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(g_high)); - let b_high_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(b_high)); - - let (x_high_high, y_high_high, z_high_high) = triple_to_jzazbz!( - r_high_high, - g_high_high, - b_high_high, - transfer_function, - target, - luminance - ); - - if image_configuration.has_alpha() { - let a_high_high = _mm_mul_ps( - _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(a_high))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4 + 4 * 4 * 3); - store_and_interleave_v4_direct_f32!( - ptr, - x_high_high, - y_high_high, - z_high_high, - a_high_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3 * 3); - store_and_interleave_v3_direct_f32!(ptr, x_high_high, y_high_high, z_high_high); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + while cx + 4 < width as usize { + let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (x_low_low, y_low_low, z_low_low) = triple_to_jzazbz!( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - target, - luminance - ); - - let a_low = _mm_cvtepu8_epi16(a_chan); - - let u8_scale = _mm_set1_ps(1f32 / 255f32); + load_f32_and_deinterleave!(src_ptr, image_configuration); + let (x_low_low, y_low_low, z_low_low) = + triple_to_jzazbz!(r_chan, g_chan, b_chan, target, luminance); if 
image_configuration.has_alpha() { - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); let ptr = dst_ptr.add(cx * 4); - store_and_interleave_v4_direct_f32!(ptr, x_low_low, y_low_low, z_low_low, a_low_low); + store_and_interleave_v4_direct_f32!(ptr, x_low_low, y_low_low, z_low_low, a_chan); } else { let ptr = dst_ptr.add(cx * 3); store_and_interleave_v3_direct_f32!(ptr, x_low_low, y_low_low, z_low_low); } - let r_low_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(r_low)); - let g_low_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(g_low)); - let b_low_high = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = triple_to_jzazbz!( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - target, - luminance - ); - - if image_configuration.has_alpha() { - let a_low_high = _mm_mul_ps( - _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 16); - store_and_interleave_v4_direct_f32!( - ptr, x_low_high, y_low_high, z_low_high, a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3); - store_and_interleave_v3_direct_f32!(ptr, x_low_high, y_low_high, z_low_high); - } - - cx += 8; + cx += 4; } cx diff --git a/src/sse/jzazbz_to_image.rs b/src/sse/jzazbz_to_image.rs index b101bee..edc745b 100644 --- a/src/sse/jzazbz_to_image.rs +++ b/src/sse/jzazbz_to_image.rs @@ -10,18 +10,18 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use erydanos::{_mm_cos_ps, _mm_isnan_ps, _mm_mlaf_ps, _mm_pow_ps, _mm_sin_ps}; - use crate::image::ImageConfiguration; use crate::image_to_jzazbz::JzazbzTarget; use crate::sse::{ - _mm_color_matrix_ps, _mm_pow_n_ps, _mm_select_ps, perform_sse_gamma_transfer, - sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps, sse_interleave_rgb, sse_interleave_rgba, + _mm_color_matrix_ps, _mm_pow_n_ps, _mm_select_ps, sse_deinterleave_rgb_ps, + sse_deinterleave_rgba_ps, }; +use 
crate::sse::{sse_interleave_ps_rgb, sse_interleave_ps_rgba}; use crate::{ - load_f32_and_deinterleave_direct, store_and_interleave_v3_half_u8, store_and_interleave_v3_u8, - store_and_interleave_v4_half_u8, store_and_interleave_v4_u8, TransferFunction, XYZ_TO_SRGB_D65, + load_f32_and_deinterleave_direct, store_and_interleave_v3_f32, store_and_interleave_v4_f32 + , XYZ_TO_SRGB_D65, }; +use erydanos::{_mm_cos_ps, _mm_isnan_ps, _mm_mlaf_ps, _mm_pow_ps, _mm_sin_ps}; macro_rules! perceptual_quantizer_inverse { ($color: expr) => {{ @@ -46,15 +46,12 @@ macro_rules! perceptual_quantizer_inverse { #[inline(always)] unsafe fn sse_jzazbz_vld( src: *const f32, - transfer_function: TransferFunction, luminance_scale: __m128, -) -> (__m128i, __m128i, __m128i, __m128i) { +) -> (__m128, __m128, __m128, __m128) { let target: JzazbzTarget = TARGET.into(); - let v_scale_alpha = _mm_set1_ps(255f32); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (jz, mut az, mut bz, mut a_f32) = - load_f32_and_deinterleave_direct!(src, image_configuration); + let (jz, mut az, mut bz, a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); if target == JzazbzTarget::Jzczhz { let cz = az; @@ -120,34 +117,7 @@ unsafe fn sse_jzazbz_vld( ); let (r_l, g_l, b_l) = _mm_color_matrix_ps(x, y, z, x0, x1, x2, x3, x4, x5, x6, x7, x8); - - let mut r_f32 = perform_sse_gamma_transfer(transfer_function, r_l); - let mut g_f32 = perform_sse_gamma_transfer(transfer_function, g_l); - let mut b_f32 = perform_sse_gamma_transfer(transfer_function, b_l); - r_f32 = _mm_mul_ps(r_f32, v_scale_alpha); - g_f32 = _mm_mul_ps(g_f32, v_scale_alpha); - b_f32 = _mm_mul_ps(b_f32, v_scale_alpha); - if image_configuration.has_alpha() { - a_f32 = _mm_mul_ps(a_f32, v_scale_alpha); - } - - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - - if image_configuration.has_alpha() { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - 
_mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_cvtps_epi32(_mm_round_ps::(a_f32)), - ) - } else { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_set1_epi32(255), - ) - } + (r_l, g_l, b_l, a_f32) } #[target_feature(enable = "sse4.1")] @@ -155,11 +125,10 @@ pub unsafe fn sse_jzazbz_to_image usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); let channels = image_configuration.get_channels_count(); @@ -167,113 +136,28 @@ pub unsafe fn sse_jzazbz_to_image( - src_ptr_0, - transfer_function, - luminance_scale, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_jzazbz_vld::( - src_ptr_1, - transfer_function, - luminance_scale, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = sse_jzazbz_vld::( - src_ptr_2, - transfer_function, - luminance_scale, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = sse_jzazbz_vld::( - src_ptr_3, - transfer_function, - luminance_scale, - ); - - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row23 = _mm_packus_epi32(r_row2_, r_row3_); - let g_row23 = _mm_packus_epi32(g_row2_, g_row3_); - let b_row23 = _mm_packus_epi32(b_row2_, b_row3_); - - let r_row = _mm_packus_epi16(r_row01, r_row23); - let g_row = _mm_packus_epi16(g_row01, g_row23); - let b_row = _mm_packus_epi16(b_row01, b_row23); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row23 = _mm_packus_epi32(a_row2_, a_row3_); - let a_row = _mm_packus_epi16(a_row01, a_row23); - store_and_interleave_v4_u8!(dst_ptr, image_configuration, r_row, g_row, b_row, a_row); - } else 
{ - store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 16; - } - - let zeros = _mm_setzero_si128(); - while cx + 8 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); let src_ptr_0 = offset_src_ptr; - let (r_row0_, g_row0_, b_row0_, a_row0_) = sse_jzazbz_vld::( - src_ptr_0, - transfer_function, - luminance_scale, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_jzazbz_vld::( - src_ptr_1, - transfer_function, - luminance_scale, - ); - - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row = _mm_packus_epi16(r_row01, zeros); - let g_row = _mm_packus_epi16(g_row01, zeros); - let b_row = _mm_packus_epi16(b_row01, zeros); + let (r_row0_, g_row0_, b_row0_, a_row0_) = + sse_jzazbz_vld::(src_ptr_0, luminance_scale); - let dst_ptr = dst.add(dst_offset as usize + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset as usize) as *mut f32).add(cx * channels); if image_configuration.has_alpha() { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row = _mm_packus_epi16(a_row01, zeros); - store_and_interleave_v4_half_u8!( + store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, - a_row + r_row0_, + g_row0_, + b_row0_, + a_row0_ ); } else { - store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); + store_and_interleave_v3_f32!(dst_ptr, image_configuration, r_row0_, g_row0_, b_row0_); } cx += 8; diff --git a/src/sse/linear_to_planar.rs b/src/sse/linear_to_planar.rs deleted file mode 100644 index fa5e615..0000000 --- a/src/sse/linear_to_planar.rs +++ /dev/null @@ -1,84 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. 
- * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. - */ - -use crate::sse::{_mm_loadu_ps_x4, _mm_storeu_si128_x4, perform_sse_gamma_transfer}; -use crate::TransferFunction; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -#[inline(always)] -unsafe fn transfer_to_gamma(r: __m128, transfer_function: TransferFunction) -> __m128i { - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - perform_sse_gamma_transfer(transfer_function, r), - _mm_set1_ps(255f32), - ))) -} - -#[inline(always)] -unsafe fn process_set( - k: (__m128, __m128, __m128, __m128), - transfer_function: TransferFunction, -) -> __m128i { - let y0 = transfer_to_gamma(k.0, transfer_function); - let y1 = transfer_to_gamma(k.1, transfer_function); - let y2 = transfer_to_gamma(k.2, transfer_function); - let y3 = transfer_to_gamma(k.3, transfer_function); - - let y_row01 = _mm_packus_epi32(y0, y1); - let y_row23 = _mm_packus_epi32(y2, y3); - - _mm_packus_epi16(y_row01, y_row23) -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn sse_linear_plane_to_gamma( - start_cx: usize, - src: *const f32, - src_offset: u32, - dst: *mut u8, - dst_offset: u32, - width: u32, - transfer_function: TransferFunction, -) -> usize { - let mut cx = start_cx; - - while cx + 64 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); - - let pixel_row0 = _mm_loadu_ps_x4(offset_src_ptr); - let pixel_row1 = _mm_loadu_ps_x4(offset_src_ptr.add(16)); - let pixel_row2 = _mm_loadu_ps_x4(offset_src_ptr.add(32)); - let pixel_row3 = _mm_loadu_ps_x4(offset_src_ptr.add(48)); - - let set0 = process_set(pixel_row0, transfer_function); - let set1 = process_set(pixel_row1, transfer_function); - let set2 = process_set(pixel_row2, transfer_function); - let set3 = process_set(pixel_row3, 
transfer_function); - - let dst_ptr = dst.add(dst_offset as usize + cx); - - _mm_storeu_si128_x4(dst_ptr, (set0, set1, set2, set3)); - - cx += 64; - } - - while cx + 16 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); - - let pixel_row = _mm_loadu_ps_x4(offset_src_ptr); - let r_row = process_set(pixel_row, transfer_function); - let dst_ptr = dst.add(dst_offset as usize + cx); - _mm_storeu_si128(dst_ptr as *mut __m128i, r_row); - - cx += 16; - } - - cx -} diff --git a/src/sse/planar_to_linear.rs b/src/sse/planar_to_linear.rs deleted file mode 100644 index 11d91b0..0000000 --- a/src/sse/planar_to_linear.rs +++ /dev/null @@ -1,92 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. - */ - -use crate::sse::{_mm_loadu_si128_x4, _mm_storeu_ps_x4, perform_sse_linear_transfer}; -use crate::TransferFunction; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -#[inline(always)] -unsafe fn sse_to_linear(r: __m128i, transfer_function: TransferFunction) -> __m128 { - let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), _mm_set1_ps(1f32 / 255f32)); - perform_sse_linear_transfer(transfer_function, r_f) -} - -#[inline] -unsafe fn process_pixels( - pixels: __m128i, - transfer_function: TransferFunction, -) -> (__m128, __m128, __m128, __m128) { - let zeros = _mm_setzero_si128(); - let r_low = _mm_unpacklo_epi8(pixels, zeros); - - let r_low_low = _mm_unpacklo_epi16(r_low, zeros); - - let x_low_low = sse_to_linear(r_low_low, transfer_function); - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - - let x_low_high = sse_to_linear(r_low_high, transfer_function); - - let r_high = _mm_unpackhi_epi8(pixels, zeros); - - let r_high_low = _mm_unpacklo_epi16(r_high, zeros); - - let x_high_low = sse_to_linear(r_high_low, 
transfer_function); - - let r_high_high = _mm_unpackhi_epi16(r_high, zeros); - - let x_high_high = sse_to_linear(r_high_high, transfer_function); - - (x_low_low, x_low_high, x_high_low, x_high_high) -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn sse_plane_to_linear( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut f32, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let mut cx = start_cx; - - let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - - while cx + 64 < width as usize { - let src_ptr = src.add(src_offset + cx); - let pixels_row64 = _mm_loadu_si128_x4(src_ptr); - let storing_row0 = process_pixels(pixels_row64.0, transfer_function); - _mm_storeu_ps_x4(dst_ptr.add(cx), storing_row0); - - let storing_row1 = process_pixels(pixels_row64.1, transfer_function); - _mm_storeu_ps_x4(dst_ptr.add(cx + 16), storing_row1); - - let storing_row2 = process_pixels(pixels_row64.2, transfer_function); - _mm_storeu_ps_x4(dst_ptr.add(cx + 32), storing_row2); - - let storing_row3 = process_pixels(pixels_row64.3, transfer_function); - _mm_storeu_ps_x4(dst_ptr.add(cx + 48), storing_row3); - - cx += 64; - } - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx); - let pixels = _mm_loadu_si128(src_ptr as *const __m128i); - let storing_row = process_pixels(pixels, transfer_function); - _mm_storeu_ps_x4(dst_ptr.add(cx), storing_row); - - cx += 16; - } - - cx -} diff --git a/src/xyz_lab_to_image.rs b/src/xyz_lab_to_image.rs index 3cd011b..3d9acfa 100644 --- a/src/xyz_lab_to_image.rs +++ b/src/xyz_lab_to_image.rs @@ -159,17 +159,35 @@ fn xyz_to_channels( src, @@ -247,8 +285,8 @@ pub fn lab_with_alpha_to_rgba( dst_stride, width, height, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -261,6 +299,8 @@ pub fn lab_with_alpha_to_rgba( /// * `dst_stride` - Bytes per row for dst data /// * `width` - Image width /// * `height` - Image height +/// * 
`matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. If you don't have specific pick `Srgb` pub fn lab_with_alpha_to_bgra( src: &[f32], src_stride: u32, @@ -268,6 +308,8 @@ pub fn lab_with_alpha_to_bgra( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::Lab as u8 }>( src, @@ -276,8 +318,8 @@ pub fn lab_with_alpha_to_bgra( dst_stride, width, height, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -290,6 +332,8 @@ pub fn lab_with_alpha_to_bgra( /// * `dst_stride` - Bytes per row for dst data /// * `width` - Image width /// * `height` - Image height +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. If you don't have specific pick `Srgb` pub fn luv_with_alpha_to_rgba( src: &[f32], src_stride: u32, @@ -297,6 +341,8 @@ pub fn luv_with_alpha_to_rgba( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::Luv as u8 }>( src, @@ -305,8 +351,8 @@ pub fn luv_with_alpha_to_rgba( dst_stride, width, height, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -321,6 +367,8 @@ pub fn luv_with_alpha_to_rgba( /// * `dst_stride` - Bytes per row for dst data /// * `width` - Image width /// * `height` - Image height +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. 
If you don't have specific pick `Srgb` pub fn luv_with_alpha_to_bgra( src: &[f32], src_stride: u32, @@ -328,6 +376,8 @@ pub fn luv_with_alpha_to_bgra( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::Lab as u8 }>( src, @@ -336,8 +386,8 @@ pub fn luv_with_alpha_to_bgra( dst_stride, width, height, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -350,6 +400,8 @@ pub fn luv_with_alpha_to_bgra( /// * `dst_stride` - Bytes per row for dst data /// * `width` - Image width /// * `height` - Image height +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. If you don't have specific pick `Srgb` pub fn xyz_with_alpha_to_rgba( src: &[f32], src_stride: u32, @@ -357,6 +409,8 @@ pub fn xyz_with_alpha_to_rgba( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::Xyz as u8 }>( src, @@ -365,8 +419,8 @@ pub fn xyz_with_alpha_to_rgba( dst_stride, width, height, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -379,6 +433,8 @@ pub fn xyz_with_alpha_to_rgba( /// * `dst_stride` - Bytes per row for dst data /// * `width` - Image width /// * `height` - Image height +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. 
If you don't have specific pick `Srgb` pub fn xyz_with_alpha_to_bgra( src: &[f32], src_stride: u32, @@ -386,6 +442,8 @@ pub fn xyz_with_alpha_to_bgra( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::Xyz as u8 }>( src, @@ -394,8 +452,8 @@ pub fn xyz_with_alpha_to_bgra( dst_stride, width, height, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -408,6 +466,8 @@ pub fn xyz_with_alpha_to_bgra( /// * `dst_stride` - Bytes per row for dst data /// * `width` - Image width /// * `height` - Image height +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. If you don't have specific pick `Srgb` pub fn lch_with_alpha_to_rgba( src: &[f32], src_stride: u32, @@ -415,6 +475,8 @@ pub fn lch_with_alpha_to_rgba( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { xyz_with_alpha_to_channels::<{ ImageConfiguration::Rgba as u8 }, { XyzTarget::Lch as u8 }>( src, @@ -423,8 +485,8 @@ pub fn lch_with_alpha_to_rgba( dst_stride, width, height, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); } @@ -437,6 +499,8 @@ pub fn lch_with_alpha_to_rgba( /// * `dst_stride` - Bytes per row for dst data /// * `width` - Image width /// * `height` - Image height +/// * `matrix` - Transformation matrix from RGB to XYZ. If you don't have specific just pick `XYZ_TO_SRGB_D65` +/// * `transfer_function` - Transfer function. 
If you don't have specific pick `Srgb` pub fn lch_with_alpha_to_bgra( src: &[f32], src_stride: u32, @@ -444,6 +508,8 @@ pub fn lch_with_alpha_to_bgra( dst_stride: u32, width: u32, height: u32, + matrix: &[[f32; 3]; 3], + transfer_function: TransferFunction, ) { xyz_with_alpha_to_channels::<{ ImageConfiguration::Bgra as u8 }, { XyzTarget::Lch as u8 }>( src, @@ -452,7 +518,7 @@ pub fn lch_with_alpha_to_bgra( dst_stride, width, height, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, + matrix, + transfer_function, ); }