diff --git a/Cargo.lock b/Cargo.lock index ccd37b4..064569e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -169,7 +169,7 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.7.0" +version = "0.7.1" dependencies = [ "erydanos", "half", diff --git a/Cargo.toml b/Cargo.toml index 756da70..a7960db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] } [package] name = "colorutils-rs" -version = "0.7.0" +version = "0.7.1" edition = "2021" description = "High performance utilities for color format handling and conversion." readme = "README.md" diff --git a/src/app/src/main.rs b/src/app/src/main.rs index c06454a..5d22e6b 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -68,14 +68,13 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0.); let src_stride = width * components as u32; let start_time = Instant::now(); - rgba_to_jzazbz( + rgba_to_lalphabeta( src_bytes, src_stride, &mut lab_store, store_stride as u32, width, height, - 200., TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); @@ -104,14 +103,13 @@ fn main() { // } let start_time = Instant::now(); - jzazbz_to_rgba( + lalphabeta_to_rgba( &lab_store, store_stride as u32, &mut dst_slice, src_stride, width, height, - 200., TransferFunction::Srgb, ); diff --git a/src/avx/image_to_oklab.rs b/src/avx/image_to_oklab.rs index 3dad138..06f3060 100644 --- a/src/avx/image_to_oklab.rs +++ b/src/avx/image_to_oklab.rs @@ -8,10 +8,7 @@ use crate::avx::routines::avx_vld_f32_and_deinterleave; use crate::avx::{_mm256_color_matrix_ps, avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; -use crate::{ - avx_store_and_interleave_v3_direct_f32, avx_store_and_interleave_v4_direct_f32 - , -}; +use crate::{avx_store_and_interleave_v3_direct_f32, avx_store_and_interleave_v4_direct_f32}; use erydanos::{_mm256_atan2_ps, _mm256_cbrt_fast_ps, _mm256_hypot_fast_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; diff --git a/src/avx/support.rs b/src/avx/support.rs index 4e785e7..cf813fc 100644 --- a/src/avx/support.rs +++ b/src/avx/support.rs @@ -452,4 +452,4 @@ pub unsafe fn avx2_div_by255(v: __m256i) -> __m256i { let multiplier = _mm256_set1_epi16(-32640); let r = _mm256_mulhi_epu16(x, multiplier); _mm256_srli_epi16::<7>(r) -} \ No newline at end of file +} diff --git a/src/avx/to_xyz_lab.rs b/src/avx/to_xyz_lab.rs index 7e83338..9fdea3e 100644 --- a/src/avx/to_xyz_lab.rs +++ b/src/avx/to_xyz_lab.rs @@ -16,10 +16,13 @@ use crate::avx::cie::{ use crate::avx::routines::avx_vld_f32_and_deinterleave; use crate::avx::*; use crate::image::ImageConfiguration; -use crate::sse::{sse_interleave_ps_rgb, sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, sse_triple_to_xyz}; +use crate::sse::{sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps}; +use crate::sse::{ + sse_interleave_ps_rgb, sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, + sse_triple_to_xyz, +}; use crate::xyz_target::XyzTarget; use crate::{avx_store_and_interleave_v3_direct_f32, load_f32_and_deinterleave}; -use crate::sse::{sse_deinterleave_rgba_ps, sse_deinterleave_rgb_ps}; #[target_feature(enable = "avx2")] pub unsafe fn avx2_image_to_xyz_lab< @@ -101,7 +104,7 @@ pub unsafe fn avx2_image_to_xyz_lab< } while cx + 4 < width as usize { - let src_ptr = ((src as * const u8).add(src_offset) as *const f32).add(cx * channels); + let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = load_f32_and_deinterleave!(src_ptr, image_configuration); diff --git a/src/image_to_jzazbz.rs b/src/image_to_jzazbz.rs index 315dc45..257e6cc 100644 --- a/src/image_to_jzazbz.rs +++ b/src/image_to_jzazbz.rs @@ -77,177 +77,89 @@ fn channels_to_jzaz( lut_table[i] = transfer_function.linearize(i as f32 * (1. / 255.0)); } + let iter; #[cfg(feature = "rayon")] { - dst_slice_safe_align + iter = dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) - .zip(src.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { - let mut _cx = 0usize; - - let dst_ptr = dst.as_mut_ptr() as *mut f32; - - let mut linearized_row = vec![0f32; width as usize * channels]; - for (linear_chunk, src_chunk) in linearized_row - .chunks_exact_mut(channels) - .zip(src.chunks_exact(channels)) - { - linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_r_channel_offset()] as usize, - ); - linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_g_channel_offset()] as usize, - ); - linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_b_channel_offset()] as usize, - ); - if image_configuration.has_alpha() { - linear_chunk[image_configuration.get_a_channel_offset()] = - src_chunk[image_configuration.get_a_channel_offset()] as f32 - * (1. / 255.0); - } - } - - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher( - _cx, - linearized_row.as_ptr(), - 0, - width, - dst.as_mut_ptr() as *mut f32, - 0, - display_luminance, - ); - } - - for x in _cx..width as usize { - let px = x * channels; - - let src = linearized_row.get_unchecked(px..); - let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); - let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); - let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); - - let xyz = Xyz::from_linear_rgb(Rgb::::new(r, g, b), &SRGB_TO_XYZ_D65); - - let dst_store = dst_ptr.add(px); - - match target { - JzazbzTarget::Jzazbz => { - let jzazbz = - Jzazbz::from_xyz_with_display_luminance(xyz, display_luminance); - - dst_store.write_unaligned(jzazbz.jz); - dst_store.add(1).write_unaligned(jzazbz.az); - dst_store.add(2).write_unaligned(jzazbz.bz); - } - JzazbzTarget::Jzczhz => { - let jzczhz = - Jzczhz::from_xyz_with_display_luminance(xyz, display_luminance); - - dst_store.write_unaligned(jzczhz.jz); - dst_store.add(1).write_unaligned(jzczhz.cz); - dst_store.add(2).write_unaligned(jzczhz.hz); - } - } - - if image_configuration.has_alpha() { - let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); - dst_store.add(3).write_unaligned(a); - } - } - }); + .zip(src.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst_slice_safe_align + iter = dst_slice_safe_align .chunks_exact_mut(dst_stride as usize) - .zip(src.chunks_exact(src_stride as usize)) - { - unsafe { - let mut _cx = 0usize; + .zip(src.chunks_exact(src_stride as usize)); + } - let dst_ptr = dst.as_mut_ptr() as *mut f32; + iter.for_each(|(dst, src)| unsafe { + let mut _cx = 0usize; - let mut linearized_row = vec![0f32; width as usize * channels]; - for (linear_chunk, src_chunk) in linearized_row - .chunks_exact_mut(channels) - .zip(src.chunks_exact(channels)) - { - linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_r_channel_offset()] as usize, - ); - linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_g_channel_offset()] as usize, - ); - linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_b_channel_offset()] as usize, - ); - if image_configuration.has_alpha() { - linear_chunk[image_configuration.get_a_channel_offset()] = - src_chunk[image_configuration.get_a_channel_offset()] as f32 - * (1. / 255.0); - } - } + let dst_ptr = dst.as_mut_ptr() as *mut f32; - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher( - _cx, - linearized_row.as_ptr(), - 0, - width, - dst.as_mut_ptr() as *mut f32, - 0, - display_luminance, - ); - } + let mut linearized_row = vec![0f32; width as usize * channels]; + for (linear_chunk, src_chunk) in linearized_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table + .get_unchecked(src_chunk[image_configuration.get_r_channel_offset()] as usize); + linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table + .get_unchecked(src_chunk[image_configuration.get_g_channel_offset()] as usize); + linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table + .get_unchecked(src_chunk[image_configuration.get_b_channel_offset()] as usize); + if image_configuration.has_alpha() { + linear_chunk[image_configuration.get_a_channel_offset()] = + src_chunk[image_configuration.get_a_channel_offset()] as f32 * (1. / 255.0); + } + } - for x in _cx..width as usize { - let px = x * channels; + if let Some(dispatcher) = _wide_row_handle { + _cx = dispatcher( + _cx, + linearized_row.as_ptr(), + 0, + width, + dst.as_mut_ptr() as *mut f32, + 0, + display_luminance, + ); + } - let src = linearized_row.get_unchecked(px..); - let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); - let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); - let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + for x in _cx..width as usize { + let px = x * channels; - let xyz = Xyz::from_linear_rgb(Rgb::::new(r, g, b), &SRGB_TO_XYZ_D65); + let src = linearized_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); - let dst_store = dst_ptr.add(px); + let xyz = Xyz::from_linear_rgb(Rgb::::new(r, g, b), &SRGB_TO_XYZ_D65); - match target { - JzazbzTarget::Jzazbz => { - let jzazbz = - Jzazbz::from_xyz_with_display_luminance(xyz, display_luminance); + let dst_store = dst_ptr.add(px); - dst_store.write_unaligned(jzazbz.jz); - dst_store.add(1).write_unaligned(jzazbz.az); - dst_store.add(2).write_unaligned(jzazbz.bz); - } - JzazbzTarget::Jzczhz => { - let jzczhz = - Jzczhz::from_xyz_with_display_luminance(xyz, display_luminance); + match target { + JzazbzTarget::Jzazbz => { + let jzazbz = Jzazbz::from_xyz_with_display_luminance(xyz, display_luminance); - dst_store.write_unaligned(jzczhz.jz); - dst_store.add(1).write_unaligned(jzczhz.cz); - dst_store.add(2).write_unaligned(jzczhz.hz); - } - } + dst_store.write_unaligned(jzazbz.jz); + dst_store.add(1).write_unaligned(jzazbz.az); + dst_store.add(2).write_unaligned(jzazbz.bz); + } + JzazbzTarget::Jzczhz => { + let jzczhz = Jzczhz::from_xyz_with_display_luminance(xyz, display_luminance); - if image_configuration.has_alpha() { - let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); - dst_store.add(3).write_unaligned(a); - } + dst_store.write_unaligned(jzczhz.jz); + dst_store.add(1).write_unaligned(jzczhz.cz); + dst_store.add(2).write_unaligned(jzczhz.hz); } } + + if image_configuration.has_alpha() { + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); + } } - } + }); } /// This function converts RGB to Jzazbz against D65 white point. This is much more effective than naive direct transformation diff --git a/src/image_to_lalphabeta.rs b/src/image_to_lalphabeta.rs index 08f1def..a0dcc3c 100644 --- a/src/image_to_lalphabeta.rs +++ b/src/image_to_lalphabeta.rs @@ -38,120 +38,66 @@ fn channels_to_lalphabeta( ) }; + let iter; + #[cfg(feature = "rayon")] { - dst_slice_safe_align + iter = dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) - .zip(src.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { - let mut _cx = 0usize; - - let mut linearized_row = vec![0f32; width as usize * channels]; - for (linear_chunk, src_chunk) in linearized_row - .chunks_exact_mut(channels) - .zip(src.chunks_exact(channels)) - { - linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_r_channel_offset()] as usize, - ); - linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_g_channel_offset()] as usize, - ); - linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_b_channel_offset()] as usize, - ); - if image_configuration.has_alpha() { - linear_chunk[image_configuration.get_a_channel_offset()] = - src_chunk[image_configuration.get_g_channel_offset()] as f32 - * (1. / 255.0); - } - } - - let dst_ptr = dst.as_mut_ptr() as *mut f32; - - for x in _cx..width as usize { - let px = x * channels; - - let src = linearized_row.get_unchecked(px..); - let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); - let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); - let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); - - let rgb = Rgb::::new(r, g, b); - let dst_store = dst_ptr.add(px); - let lalphabeta = LAlphaBeta::from_linear_rgb(rgb, &SRGB_TO_XYZ_D65); - dst_store.write_unaligned(lalphabeta.l); - dst_store.add(1).write_unaligned(lalphabeta.alpha); - dst_store.add(2).write_unaligned(lalphabeta.beta); - - if image_configuration.has_alpha() { - let a = *src.get_unchecked(image_configuration.get_g_channel_offset()); - dst_store.add(3).write_unaligned(a); - } - } - }); + .zip(src.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst_slice_safe_align + iter = dst_slice_safe_align .chunks_exact_mut(dst_stride as usize) - .zip(src.chunks_exact(src_stride as usize)) - { - unsafe { - let mut _cx = 0usize; + .zip(src.chunks_exact(src_stride as usize)); + } - let mut linearized_row = vec![0f32; width as usize * channels]; - for (linear_chunk, src_chunk) in linearized_row - .chunks_exact_mut(channels) - .zip(src.chunks_exact(channels)) - { - linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_r_channel_offset()] as usize, - ); - linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_g_channel_offset()] as usize, - ); - linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table - .get_unchecked( - src_chunk[image_configuration.get_b_channel_offset()] as usize, - ); - if image_configuration.has_alpha() { - linear_chunk[image_configuration.get_a_channel_offset()] = - src_chunk[image_configuration.get_g_channel_offset()] as f32 - * (1. / 255.0); - } + #[cfg(feature = "rayon")] + { + iter.for_each(|(dst, src)| unsafe { + let mut _cx = 0usize; + + let mut linearized_row = vec![0f32; width as usize * channels]; + for (linear_chunk, src_chunk) in linearized_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table + .get_unchecked(src_chunk[image_configuration.get_r_channel_offset()] as usize); + linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table + .get_unchecked(src_chunk[image_configuration.get_g_channel_offset()] as usize); + linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table + .get_unchecked(src_chunk[image_configuration.get_b_channel_offset()] as usize); + if image_configuration.has_alpha() { + linear_chunk[image_configuration.get_a_channel_offset()] = + src_chunk[image_configuration.get_a_channel_offset()] as f32 * (1. / 255.0); } + } - let dst_ptr = dst.as_mut_ptr() as *mut f32; + let dst_ptr = dst.as_mut_ptr() as *mut f32; - for x in _cx..width as usize { - let px = x * channels; + for x in _cx..width as usize { + let px = x * channels; - let src = linearized_row.get_unchecked(px..); - let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); - let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); - let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + let src = linearized_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); - let rgb = Rgb::::new(r, g, b); - let dst_store = dst_ptr.add(px); - let lalphabeta = LAlphaBeta::from_linear_rgb(rgb, &SRGB_TO_XYZ_D65); - dst_store.write_unaligned(lalphabeta.l); - dst_store.add(1).write_unaligned(lalphabeta.alpha); - dst_store.add(2).write_unaligned(lalphabeta.beta); + let rgb = Rgb::::new(r, g, b); + let dst_store = dst_ptr.add(px); + let lalphabeta = LAlphaBeta::from_linear_rgb(rgb, &SRGB_TO_XYZ_D65); + dst_store.write_unaligned(lalphabeta.l); + dst_store.add(1).write_unaligned(lalphabeta.alpha); + dst_store.add(2).write_unaligned(lalphabeta.beta); - if image_configuration.has_alpha() { - let a = *src.get_unchecked(image_configuration.get_g_channel_offset()); - dst_store.add(3).write_unaligned(a); - } + if image_configuration.has_alpha() { + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } } - } + }); } } diff --git a/src/image_to_linear.rs b/src/image_to_linear.rs index 71441d0..abed787 100644 --- a/src/image_to_linear.rs +++ b/src/image_to_linear.rs @@ -42,99 +42,61 @@ fn channels_to_linear( ) }; - #[cfg(not(feature = "rayon"))] - { - for (dst_row, src_row) in dst_slice_safe_align - .chunks_exact_mut(dst_stride as usize) - .zip(src.chunks_exact(src_stride as usize)) - { - unsafe { - let mut _cx = 0usize; - - let src_ptr = src_row.as_ptr(); - let dst_ptr = dst_row.as_mut_ptr() as *mut f32; - - for x in _cx..width as usize { - let px = x * channels; - let dst = dst_ptr.add(px); - let src = src_ptr.add(px); - let r = src - .add(image_configuration.get_r_channel_offset()) - .read_unaligned(); - let g = src - .add(image_configuration.get_g_channel_offset()) - .read_unaligned(); - let b = src - .add(image_configuration.get_b_channel_offset()) - .read_unaligned(); - - let rgb = Rgb::::new(r, g, b); - - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); - - if USE_ALPHA && image_configuration.has_alpha() { - let a = src - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - let a_lin = a as f32 * (1f32 / 255f32); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_lin); - } - } - } - } - } + let iter; #[cfg(feature = "rayon")] { - dst_slice_safe_align + iter = dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) - .zip(src.par_chunks_exact(src_stride as usize)) - .for_each(|(dst_row, src_row)| unsafe { - let mut _cx = 0usize; - - let src_ptr = src_row.as_ptr(); - let dst_ptr = dst_row.as_mut_ptr() as *mut f32; - - for x in _cx..width as usize { - let px = x * channels; - let dst = dst_ptr.add(px); - let src = src_ptr.add(px); - let r = src - .add(image_configuration.get_r_channel_offset()) - .read_unaligned(); - let g = src - .add(image_configuration.get_g_channel_offset()) - .read_unaligned(); - let b = src - .add(image_configuration.get_b_channel_offset()) - .read_unaligned(); - - let rgb = Rgb::::new(r, g, b); - - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); + .zip(src.par_chunks_exact(src_stride as usize)); + } - if USE_ALPHA && image_configuration.has_alpha() { - let a = src - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - let a_lin = a as f32 * (1f32 / 255f32); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_lin); - } - } - }); + #[cfg(not(feature = "rayon"))] + { + iter = dst_slice_safe_align + .chunks_exact_mut(dst_stride as usize) + .zip(src.chunks_exact(src_stride as usize)); } + + iter.for_each(|(dst_row, src_row)| unsafe { + let mut _cx = 0usize; + + let src_ptr = src_row.as_ptr(); + let dst_ptr = dst_row.as_mut_ptr() as *mut f32; + + for x in _cx..width as usize { + let px = x * channels; + let dst = dst_ptr.add(px); + let src = src_ptr.add(px); + let r = src + .add(image_configuration.get_r_channel_offset()) + .read_unaligned(); + let g = src + .add(image_configuration.get_g_channel_offset()) + .read_unaligned(); + let b = src + .add(image_configuration.get_b_channel_offset()) + .read_unaligned(); + + let rgb = Rgb::::new(r, g, b); + + dst.add(image_configuration.get_r_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); + dst.add(image_configuration.get_g_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); + dst.add(image_configuration.get_b_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); + + if USE_ALPHA && image_configuration.has_alpha() { + let a = src + .add(image_configuration.get_a_channel_offset()) + .read_unaligned(); + let a_lin = a as f32 * (1f32 / 255f32); + dst.add(image_configuration.get_a_channel_offset()) + .write_unaligned(a_lin); + } + } + }); } /// This function converts RGB to linear colorspace diff --git a/src/image_to_linear_u8.rs b/src/image_to_linear_u8.rs index f8e50ad..581b426 100644 --- a/src/image_to_linear_u8.rs +++ b/src/image_to_linear_u8.rs @@ -36,69 +36,44 @@ fn channels_to_linear( .min(255.) as u8; } + let iter; + #[cfg(feature = "rayon")] + { + iter = l_dst + .par_chunks_exact_mut(dst_stride as usize) + .zip(l_src.par_chunks_exact(src_stride as usize)); + } #[cfg(not(feature = "rayon"))] - for (dst_row, src_row) in l_dst - .chunks_exact_mut(dst_stride as usize) - .zip(l_src.chunks_exact(src_stride as usize)) { - unsafe { - let mut _cx = 0usize; - - for x in _cx..width as usize { - let px = x * channels; - let r = *src_row.get_unchecked(px + image_configuration.get_r_channel_offset()); - let g = *src_row.get_unchecked(px + image_configuration.get_g_channel_offset()); - let b = *src_row.get_unchecked(px + image_configuration.get_b_channel_offset()); - - let rgb = Rgb::::new(r, g, b); - - *dst_row.get_unchecked_mut(px + image_configuration.get_r_channel_offset()) = - *lut_table.get_unchecked(rgb.r as usize); - *dst_row.get_unchecked_mut(px + image_configuration.get_g_channel_offset()) = - *lut_table.get_unchecked(rgb.g as usize); - *dst_row.get_unchecked_mut(px + image_configuration.get_b_channel_offset()) = - *lut_table.get_unchecked(rgb.b as usize); - - if USE_ALPHA && image_configuration.has_alpha() { - let a = *src_row.get_unchecked(px + image_configuration.get_a_channel_offset()); - *dst_row.get_unchecked_mut(px + image_configuration.get_a_channel_offset()) = a; - } - } - } + iter = l_dst + .chunks_exact_mut(dst_stride as usize) + .zip(l_src.chunks_exact(src_stride as usize)); } - #[cfg(feature = "rayon")] - { - l_dst - .par_chunks_exact_mut(dst_stride as usize) - .zip(l_src.par_chunks_exact(src_stride as usize)) - .for_each(|(dst_row, src_row)| unsafe { - let mut _cx = 0usize; + iter.for_each(|(dst_row, src_row)| unsafe { + let mut _cx = 0usize; - for x in _cx..width as usize { - let px = x * channels; - let r = *src_row.get_unchecked(px + image_configuration.get_r_channel_offset()); - let g = *src_row.get_unchecked(px + image_configuration.get_g_channel_offset()); - let b = *src_row.get_unchecked(px + image_configuration.get_b_channel_offset()); + for x in _cx..width as usize { + let px = x * channels; + let r = *src_row.get_unchecked(px + image_configuration.get_r_channel_offset()); + let g = *src_row.get_unchecked(px + image_configuration.get_g_channel_offset()); + let b = *src_row.get_unchecked(px + image_configuration.get_b_channel_offset()); - let rgb = Rgb::::new(r, g, b); + let rgb = Rgb::::new(r, g, b); - *dst_row.get_unchecked_mut(px + image_configuration.get_r_channel_offset()) = - *lut_table.get_unchecked(rgb.r as usize); - *dst_row.get_unchecked_mut(px + image_configuration.get_g_channel_offset()) = - *lut_table.get_unchecked(rgb.g as usize); - *dst_row.get_unchecked_mut(px + image_configuration.get_b_channel_offset()) = - *lut_table.get_unchecked(rgb.b as usize); + *dst_row.get_unchecked_mut(px + image_configuration.get_r_channel_offset()) = + *lut_table.get_unchecked(rgb.r as usize); + *dst_row.get_unchecked_mut(px + image_configuration.get_g_channel_offset()) = + *lut_table.get_unchecked(rgb.g as usize); + *dst_row.get_unchecked_mut(px + image_configuration.get_b_channel_offset()) = + *lut_table.get_unchecked(rgb.b as usize); - if USE_ALPHA && image_configuration.has_alpha() { - let a = - *src_row.get_unchecked(px + image_configuration.get_a_channel_offset()); - *dst_row - .get_unchecked_mut(px + image_configuration.get_a_channel_offset()) = a; - } - } - }); - } + if USE_ALPHA && image_configuration.has_alpha() { + let a = *src_row.get_unchecked(px + image_configuration.get_a_channel_offset()); + *dst_row.get_unchecked_mut(px + image_configuration.get_a_channel_offset()) = a; + } + } + }); } /// This function converts RGB to Linear. This is much more effective than naive direct transformation diff --git a/src/image_to_oklab.rs b/src/image_to_oklab.rs index 90197b6..993918d 100644 --- a/src/image_to_oklab.rs +++ b/src/image_to_oklab.rs @@ -95,115 +95,66 @@ fn channels_to_oklab( ) }; + let iter; #[cfg(feature = "rayon")] { - dst_slice_safe_align - .par_chunks_exact_mut(dst_stride as usize) - .for_each(|dst| unsafe { - let mut _cx = 0usize; - - let dst_ptr = dst.as_mut_ptr() as *mut f32; - - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher(_cx, width, dst_ptr, 0) - } - - for x in _cx..width as usize { - let px = x * channels; - - let src = dst_ptr.add(px); - let r = src - .add(image_configuration.get_r_channel_offset()) - .read_unaligned(); - let g = src - .add(image_configuration.get_g_channel_offset()) - .read_unaligned(); - let b = src - .add(image_configuration.get_b_channel_offset()) - .read_unaligned(); - - let rgb = Rgb::::new(r, g, b); - let dst_store = dst_ptr.add(px); - - match target { - OklabTarget::Oklab => { - let oklab = Oklab::from_linear_rgb(rgb); - dst_store.write_unaligned(oklab.l); - dst_store.add(1).write_unaligned(oklab.a); - dst_store.add(2).write_unaligned(oklab.b); - } - OklabTarget::Oklch => { - let oklch = Oklch::from_linear_rgb(rgb); - dst_store.write_unaligned(oklch.l); - dst_store.add(1).write_unaligned(oklch.c); - dst_store.add(2).write_unaligned(oklch.h); - } - } - - if image_configuration.has_alpha() { - let a = src - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - dst_store.add(3).write_unaligned(a); - } - } - }); + iter = dst_slice_safe_align.par_chunks_exact_mut(dst_stride as usize); } #[cfg(not(feature = "rayon"))] { - for dst in dst_slice_safe_align.chunks_exact_mut(dst_stride as usize) { - unsafe { - let mut _cx = 0usize; - - let dst_ptr = dst.as_mut_ptr() as *mut f32; - - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher(_cx, width, dst_ptr, 0) - } - - for x in _cx..width as usize { - let px = x * channels; + iter = dst_slice_safe_align.chunks_exact_mut(dst_stride as usize); + } - let src = dst_ptr.add(px); - let r = src - .add(image_configuration.get_r_channel_offset()) - .read_unaligned(); - let g = src - .add(image_configuration.get_g_channel_offset()) - .read_unaligned(); - let b = src - .add(image_configuration.get_b_channel_offset()) - .read_unaligned(); + iter.for_each(|dst| unsafe { + let mut _cx = 0usize; - let rgb = Rgb::::new(r, g, b); - let dst_store = dst_ptr.add(px); + let dst_ptr = dst.as_mut_ptr() as *mut f32; - match target { - OklabTarget::Oklab => { - let oklab = Oklab::from_linear_rgb(rgb); - dst_store.write_unaligned(oklab.l); - dst_store.add(1).write_unaligned(oklab.a); - dst_store.add(2).write_unaligned(oklab.b); - } - OklabTarget::Oklch => { - let oklch = Oklch::from_linear_rgb(rgb); - dst_store.write_unaligned(oklch.l); - dst_store.add(1).write_unaligned(oklch.c); - dst_store.add(2).write_unaligned(oklch.h); - } - } + if let Some(dispatcher) = _wide_row_handle { + _cx = dispatcher(_cx, width, dst_ptr, 0) + } - if image_configuration.has_alpha() { - let a = src - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - dst_store.add(3).write_unaligned(a); - } + for x in _cx..width as usize { + let px = x * channels; + + let src = dst_ptr.add(px); + let r = src + .add(image_configuration.get_r_channel_offset()) + .read_unaligned(); + let g = src + .add(image_configuration.get_g_channel_offset()) + .read_unaligned(); + let b = src + .add(image_configuration.get_b_channel_offset()) + .read_unaligned(); + + let rgb = Rgb::::new(r, g, b); + let dst_store = dst_ptr.add(px); + + match target { + OklabTarget::Oklab => { + let oklab = Oklab::from_linear_rgb(rgb); + dst_store.write_unaligned(oklab.l); + dst_store.add(1).write_unaligned(oklab.a); + dst_store.add(2).write_unaligned(oklab.b); } + OklabTarget::Oklch => { + let oklch = Oklch::from_linear_rgb(rgb); + dst_store.write_unaligned(oklch.l); + dst_store.add(1).write_unaligned(oklch.c); + dst_store.add(2).write_unaligned(oklch.h); + } + } + + if image_configuration.has_alpha() { + let a = src + .add(image_configuration.get_a_channel_offset()) + .read_unaligned(); + dst_store.add(3).write_unaligned(a); } } - } + }); } /// This function converts RGB to Oklab against D65 white point. This is much more effective than naive direct transformation diff --git a/src/image_to_sigmoidal.rs b/src/image_to_sigmoidal.rs index af1899e..fc28c07 100644 --- a/src/image_to_sigmoidal.rs +++ b/src/image_to_sigmoidal.rs @@ -56,121 +56,70 @@ fn image_to_sigmoidal( const COLOR_SCALE: f32 = 1f32 / 255f32; + let dst_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + dst.as_mut_ptr() as *mut u8, + dst_stride as usize * height as usize, + ) + }; + + let iter; + #[cfg(feature = "rayon")] { - let dst_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - dst.as_mut_ptr() as *mut u8, - dst_stride as usize * height as usize, - ) - }; - - dst_slice_safe_align + iter = dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) - .zip(src.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { - let mut _cx = 0usize; - - let src_ptr = src.as_ptr(); - let dst_ptr = dst.as_mut_ptr() as *mut f32; - - if let Some(dispatcher) = _wide_row_handler { - _cx = dispatcher(_cx, src_ptr, width, dst_ptr); - } - - for x in _cx..width as usize { - let px = x * channels; - let src = src_ptr.add(px); - let r = src - .add(image_configuration.get_r_channel_offset()) - .read_unaligned(); - let g = src - .add(image_configuration.get_g_channel_offset()) - .read_unaligned(); - let b = src - .add(image_configuration.get_b_channel_offset()) - .read_unaligned(); - - let rgb = Rgb::::new(r, g, b); - - let writing_ptr = dst_ptr.add(px); - - let sigmoidal = rgb.to_sigmoidal(); - writing_ptr.write_unaligned(sigmoidal.sr); - writing_ptr.add(1).write_unaligned(sigmoidal.sg); - writing_ptr.add(2).write_unaligned(sigmoidal.sb); - - if image_configuration.has_alpha() { - let a = src - .add(image_configuration.get_a_channel_offset()) - .read_unaligned() as f32 - * COLOR_SCALE; - - writing_ptr.add(3).write_unaligned(a); - } - } - }); + .zip(src.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; + iter = dst_slice_safe_align + .chunks_exact_mut(dst_stride as usize) + .zip(src.chunks_exact(src_stride as usize)); + } - for _ in 0..height as usize { - let mut _cx = 0usize; + iter.for_each(|(dst, src)| unsafe { + let mut _cx = 0usize; - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; - let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; + let src_ptr = src.as_ptr(); + let dst_ptr = dst.as_mut_ptr() as *mut f32; - if let Some(dispatcher) = _wide_row_handler { - unsafe { _cx = dispatcher(_cx, src_ptr, width, dst_ptr) } - } + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher(_cx, src_ptr, width, dst_ptr); + } - for x in _cx..width as usize { - let px = x * channels; - let src = unsafe { src_ptr.add(px) }; - let r = unsafe { - src.add(image_configuration.get_r_channel_offset()) - .read_unaligned() - }; - let g = unsafe { - src.add(image_configuration.get_g_channel_offset()) - .read_unaligned() - }; - let b = unsafe { - src.add(image_configuration.get_b_channel_offset()) - .read_unaligned() - }; - - let rgb = Rgb::::new(r, g, b); - - let writing_ptr = unsafe { dst_ptr.add(px) }; - - let sigmoidal = rgb.to_sigmoidal(); - unsafe { - writing_ptr.write_unaligned(sigmoidal.sr); - writing_ptr.add(1).write_unaligned(sigmoidal.sg); - writing_ptr.add(2).write_unaligned(sigmoidal.sb); - } - - if image_configuration.has_alpha() { - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - } as f32 - * COLOR_SCALE; - - unsafe { - writing_ptr.add(3).write_unaligned(a); - } - } + for x in _cx..width as usize { + let px = x * channels; + let src = src_ptr.add(px); + let r = src + .add(image_configuration.get_r_channel_offset()) + .read_unaligned(); + let g = src + .add(image_configuration.get_g_channel_offset()) + .read_unaligned(); + let b = src + .add(image_configuration.get_b_channel_offset()) + .read_unaligned(); + + let rgb = Rgb::::new(r, g, b); + + let writing_ptr = dst_ptr.add(px); + + let sigmoidal = rgb.to_sigmoidal(); + writing_ptr.write_unaligned(sigmoidal.sr); + writing_ptr.add(1).write_unaligned(sigmoidal.sg); + writing_ptr.add(2).write_unaligned(sigmoidal.sb); + + if image_configuration.has_alpha() { + let a = src + .add(image_configuration.get_a_channel_offset()) + .read_unaligned() as f32 + * COLOR_SCALE; + + writing_ptr.add(3).write_unaligned(a); } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } - } + }); } /// This function converts RGB to Sigmoidal. This is much more effective than naive direct transformation diff --git a/src/image_to_xyz_lab.rs b/src/image_to_xyz_lab.rs index 1223d67..db18c6b 100644 --- a/src/image_to_xyz_lab.rs +++ b/src/image_to_xyz_lab.rs @@ -608,7 +608,6 @@ pub fn rgb_to_lab( ); } - /// This function converts RGBA to XYZ. This is much more effective than naive direct transformation /// /// # Arguments diff --git a/src/image_xyza_laba.rs b/src/image_xyza_laba.rs index 4b0ca84..1a89ede 100644 --- a/src/image_xyza_laba.rs +++ b/src/image_xyza_laba.rs @@ -65,189 +65,98 @@ fn channels_to_xyz_with_alpha::new(r, g, b); - let px = x * channels; - let dst_store = dst_ptr.add(px); - - let xyz = Xyz::from_linear_rgb(rgb, matrix); - - match target { - XyzTarget::Lab => { - let lab = Lab::from_xyz(xyz); - dst_store.write_unaligned(lab.l); - dst_store.add(1).write_unaligned(lab.a); - dst_store.add(2).write_unaligned(lab.b); - } - XyzTarget::Xyz => { - dst_store.write_unaligned(xyz.x); - dst_store.add(1).write_unaligned(xyz.y); - dst_store.add(2).write_unaligned(xyz.z); - } - XyzTarget::Luv => { - let luv = Luv::from_xyz(xyz); - dst_store.write_unaligned(luv.l); - dst_store.add(1).write_unaligned(luv.u); - dst_store.add(2).write_unaligned(luv.v); - } - XyzTarget::Lch => { - let luv = Luv::from_xyz(xyz); - let lch = LCh::from_luv(luv); - dst_store.write_unaligned(lch.l); - dst_store.add(1).write_unaligned(lch.c); - dst_store.add(2).write_unaligned(lch.h); - } - } - let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); - dst_store.add(3).write_unaligned(a); - } - }); + .zip(src.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst_slice_safe_align + iter = dst_slice_safe_align .chunks_exact_mut(dst_stride as usize) - .zip(src.chunks_exact(src_stride as usize)) - { - unsafe { - let mut _cx = 0usize; + .zip(src.chunks_exact(src_stride as usize)); + } - let mut transient_row = vec![0f32; width as usize * channels]; + iter.for_each(|(dst, src)| unsafe { + let mut _cx = 0usize; - for (dst_chunk, src_chunks) in transient_row - .chunks_exact_mut(channels) - .zip(src.chunks_exact(channels)) - { - dst_chunk[image_configuration.get_r_channel_offset()] = *lut_table - .get_unchecked( - src_chunks[image_configuration.get_r_channel_offset()] as usize, - ); - dst_chunk[image_configuration.get_g_channel_offset()] = *lut_table - .get_unchecked( - src_chunks[image_configuration.get_g_channel_offset()] as usize, - ); - dst_chunk[image_configuration.get_b_channel_offset()] = *lut_table - .get_unchecked( - src_chunks[image_configuration.get_b_channel_offset()] as usize, - ); - dst_chunk[image_configuration.get_a_channel_offset()] = - src_chunks[image_configuration.get_a_channel_offset()] as f32 - * (1. / 255.0); - } + let mut transient_row = vec![0f32; width as usize * channels]; - if let Some(dispatcher) = _wide_row_handler { - _cx = dispatcher( - _cx, - transient_row.as_ptr(), - 0, - width, - dst.as_mut_ptr() as *mut f32, - 0, - matrix, - ); - } + for (dst_chunk, src_chunks) in transient_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + dst_chunk[image_configuration.get_r_channel_offset()] = *lut_table + .get_unchecked(src_chunks[image_configuration.get_r_channel_offset()] as usize); + dst_chunk[image_configuration.get_g_channel_offset()] = *lut_table + .get_unchecked(src_chunks[image_configuration.get_g_channel_offset()] as usize); + dst_chunk[image_configuration.get_b_channel_offset()] = *lut_table + .get_unchecked(src_chunks[image_configuration.get_b_channel_offset()] as usize); + dst_chunk[image_configuration.get_a_channel_offset()] = + src_chunks[image_configuration.get_a_channel_offset()] as f32 * (1. / 255.0); + } - let dst_ptr = dst.as_mut_ptr() as *mut f32; + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + transient_row.as_ptr(), + 0, + width, + dst.as_mut_ptr() as *mut f32, + 0, + matrix, + ); + } + + let dst_ptr = dst.as_mut_ptr() as *mut f32; - for x in _cx..width as usize { - let px = x * channels; - let src = transient_row.get_unchecked(px..); + for x in _cx..width as usize { + let px = x * channels; + let src = transient_row.get_unchecked(px..); - let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); - let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); - let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); - let rgb = Rgb::::new(r, g, b); - let px = x * channels; - let dst_store = dst_ptr.add(px); + let rgb = Rgb::::new(r, g, b); + let px = x * channels; + let dst_store = dst_ptr.add(px); - let xyz = Xyz::from_linear_rgb(rgb, matrix); + let xyz = Xyz::from_linear_rgb(rgb, matrix); - match target { - XyzTarget::Lab => { - let lab = Lab::from_xyz(xyz); - dst_store.write_unaligned(lab.l); - dst_store.add(1).write_unaligned(lab.a); - dst_store.add(2).write_unaligned(lab.b); - } - XyzTarget::Xyz => { - dst_store.write_unaligned(xyz.x); - dst_store.add(1).write_unaligned(xyz.y); - dst_store.add(2).write_unaligned(xyz.z); - } - XyzTarget::Luv => { - let luv = Luv::from_xyz(xyz); - dst_store.write_unaligned(luv.l); - dst_store.add(1).write_unaligned(luv.u); - dst_store.add(2).write_unaligned(luv.v); - } - XyzTarget::Lch => { - let luv = Luv::from_xyz(xyz); - let lch = LCh::from_luv(luv); - dst_store.write_unaligned(lch.l); - dst_store.add(1).write_unaligned(lch.c); - dst_store.add(2).write_unaligned(lch.h); - } - } - let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); - dst_store.add(3).write_unaligned(a); + match target { + XyzTarget::Lab => { + let lab = Lab::from_xyz(xyz); + dst_store.write_unaligned(lab.l); + dst_store.add(1).write_unaligned(lab.a); + dst_store.add(2).write_unaligned(lab.b); + } + XyzTarget::Xyz => { + dst_store.write_unaligned(xyz.x); + dst_store.add(1).write_unaligned(xyz.y); + dst_store.add(2).write_unaligned(xyz.z); + } + XyzTarget::Luv => { + let luv = Luv::from_xyz(xyz); + dst_store.write_unaligned(luv.l); + dst_store.add(1).write_unaligned(luv.u); + dst_store.add(2).write_unaligned(luv.v); + } + XyzTarget::Lch => { + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); + dst_store.write_unaligned(lch.l); + dst_store.add(1).write_unaligned(lch.c); + dst_store.add(2).write_unaligned(lch.h); } } + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } - } + }); } /// This function converts RGBA to CIE L*ab. diff --git a/src/jzazbz_to_image.rs b/src/jzazbz_to_image.rs index 1a37dc1..2839dda 100644 --- a/src/jzazbz_to_image.rs +++ b/src/jzazbz_to_image.rs @@ -59,186 +59,103 @@ fn jzazbz_to_image( ) }; + let iter; + #[cfg(feature = "rayon")] { - dst.par_chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { - let channels = image_configuration.get_channels_count(); - - let mut _cx = 0usize; - - let src_ptr = src.as_ptr() as *mut f32; - - let mut transient_row = vec![0f32; width as usize * channels]; - - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher( - _cx, - src.as_ptr() as *const f32, - 0, - transient_row.as_mut_ptr(), - 0, - width, - display_luminance, - ); - } - - for x in _cx..width as usize { - let px = x * channels; - let l_x = src_ptr.add(px).read_unaligned(); - let l_y = src_ptr.add(px + 1).read_unaligned(); - let l_z = src_ptr.add(px + 2).read_unaligned(); - let rgb = match target { - JzazbzTarget::Jzazbz => { - let jzazbz = - Jzazbz::new_with_luminance(l_x, l_y, l_z, display_luminance); - jzazbz.to_linear_rgb() - } - JzazbzTarget::Jzczhz => { - let jzczhz = Jzczhz::new(l_x, l_y, l_z); - jzczhz.to_linear_rgb_with_luminance(display_luminance) - } - }; - - let dst = transient_row.get_unchecked_mut((x * channels)..); - *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; - *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; - *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; - if image_configuration.has_alpha() { - let l_a = src_ptr.add(px + 3).read_unaligned(); - *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; - } - } - - for (dst_chunk, src_chunks) in dst - .chunks_exact_mut(channels) - .zip(transient_row.chunks_exact(channels)) - { - let r_cast = (src_chunks[image_configuration.get_r_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let g_cast = (src_chunks[image_configuration.get_g_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let b_cast = (src_chunks[image_configuration.get_b_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - - dst_chunk[image_configuration.get_r_channel_offset()] = - *lut_table.get_unchecked((r_cast as usize).min(2048)); - dst_chunk[image_configuration.get_g_channel_offset()] = - *lut_table.get_unchecked((g_cast as usize).min(2048)); - dst_chunk[image_configuration.get_b_channel_offset()] = - *lut_table.get_unchecked((b_cast as usize).min(2048)); - - if image_configuration.has_alpha() { - let a_cast = (src_chunks[image_configuration.get_a_channel_offset()] * 255.) - .min(255.) - .max(0.) as u8; - dst_chunk[image_configuration.get_a_channel_offset()] = a_cast; - } - } - }); + iter = dst + .par_chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst + iter = dst .chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) - { - unsafe { - let channels = image_configuration.get_channels_count(); + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)); + } - let mut _cx = 0usize; + iter.for_each(|(dst, src)| unsafe { + let channels = image_configuration.get_channels_count(); - let src_ptr = src.as_ptr() as *mut f32; + let mut _cx = 0usize; - let mut transient_row = vec![0f32; width as usize * channels]; + let src_ptr = src.as_ptr() as *mut f32; - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher( - _cx, - src.as_ptr() as *const f32, - 0, - transient_row.as_mut_ptr(), - 0, - width, - display_luminance, - ); - } + let mut transient_row = vec![0f32; width as usize * channels]; - for x in _cx..width as usize { - let px = x * channels; - let l_x = src_ptr.add(px).read_unaligned(); - let l_y = src_ptr.add(px + 1).read_unaligned(); - let l_z = src_ptr.add(px + 2).read_unaligned(); - let rgb = match target { - JzazbzTarget::Jzazbz => { - let jzazbz = - Jzazbz::new_with_luminance(l_x, l_y, l_z, display_luminance); - jzazbz.to_linear_rgb() - } - JzazbzTarget::Jzczhz => { - let jzczhz = Jzczhz::new(l_x, l_y, l_z); - jzczhz.to_linear_rgb_with_luminance(display_luminance) - } - }; + if let Some(dispatcher) = _wide_row_handle { + _cx = dispatcher( + _cx, + src.as_ptr() as *const f32, + 0, + transient_row.as_mut_ptr(), + 0, + width, + display_luminance, + ); + } - let dst = transient_row.get_unchecked_mut((x * channels)..); - *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; - *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; - *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; - if image_configuration.has_alpha() { - let l_a = src_ptr.add(px + 3).read_unaligned(); - *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; - } + for x in _cx..width as usize { + let px = x * channels; + let l_x = src_ptr.add(px).read_unaligned(); + let l_y = src_ptr.add(px + 1).read_unaligned(); + let l_z = src_ptr.add(px + 2).read_unaligned(); + let rgb = match target { + JzazbzTarget::Jzazbz => { + let jzazbz = Jzazbz::new_with_luminance(l_x, l_y, l_z, display_luminance); + jzazbz.to_linear_rgb() } - - for (dst_chunk, src_chunks) in dst - .chunks_exact_mut(channels) - .zip(transient_row.chunks_exact(channels)) - { - let r_cast = (src_chunks[image_configuration.get_r_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let g_cast = (src_chunks[image_configuration.get_g_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let b_cast = (src_chunks[image_configuration.get_b_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - - dst_chunk[image_configuration.get_r_channel_offset()] = - *lut_table.get_unchecked((r_cast as usize).min(2048)); - dst_chunk[image_configuration.get_g_channel_offset()] = - *lut_table.get_unchecked((g_cast as usize).min(2048)); - dst_chunk[image_configuration.get_b_channel_offset()] = - *lut_table.get_unchecked((b_cast as usize).min(2048)); - - if image_configuration.has_alpha() { - let a_cast = (src_chunks[image_configuration.get_a_channel_offset()] * 255.) - .min(255.) - .max(0.) as u8; - dst_chunk[image_configuration.get_a_channel_offset()] = a_cast; - } + JzazbzTarget::Jzczhz => { + let jzczhz = Jzczhz::new(l_x, l_y, l_z); + jzczhz.to_linear_rgb_with_luminance(display_luminance) } + }; + + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + if image_configuration.has_alpha() { + let l_a = src_ptr.add(px + 3).read_unaligned(); + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; } } - } + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[image_configuration.get_r_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + let g_cast = (src_chunks[image_configuration.get_g_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + let b_cast = (src_chunks[image_configuration.get_b_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + + dst_chunk[image_configuration.get_r_channel_offset()] = + *lut_table.get_unchecked((r_cast as usize).min(2048)); + dst_chunk[image_configuration.get_g_channel_offset()] = + *lut_table.get_unchecked((g_cast as usize).min(2048)); + dst_chunk[image_configuration.get_b_channel_offset()] = + *lut_table.get_unchecked((b_cast as usize).min(2048)); + + if image_configuration.has_alpha() { + let a_cast = (src_chunks[image_configuration.get_a_channel_offset()] * 255.) + .min(255.) + .max(0.) as u8; + dst_chunk[image_configuration.get_a_channel_offset()] = a_cast; + } + } + }); } /// This function converts Jzazbz with interleaved alpha channel to RGBA. This is much more effective than naive direct transformation diff --git a/src/jzczhz.rs b/src/jzczhz.rs index ce28c9d..2d8bed5 100644 --- a/src/jzczhz.rs +++ b/src/jzczhz.rs @@ -110,10 +110,7 @@ impl Jzczhz { /// `display_luminance` - display luminance /// `transfer_function` - Transfer function to convert into linear colorspace and backwards #[inline] - pub fn to_linear_rgb_with_luminance( - &self, - display_luminance: f32, - ) -> Rgb { + pub fn to_linear_rgb_with_luminance(&self, display_luminance: f32) -> Rgb { let jzazbz = self.to_jzazbz_with_luminance(display_luminance); jzazbz.to_linear_rgb() } diff --git a/src/lalphabeta_to_image.rs b/src/lalphabeta_to_image.rs index d1f7aec..b9dc538 100644 --- a/src/lalphabeta_to_image.rs +++ b/src/lalphabeta_to_image.rs @@ -39,129 +39,76 @@ fn lalphabeta_to_image( ) }; + let iter; #[cfg(feature = "rayon")] { - dst.par_chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { - let mut _cx = 0usize; - - let src_ptr = src.as_ptr() as *mut f32; - - let mut transient_row = vec![0f32; width as usize * channels]; - - for x in _cx..width as usize { - let px = x * channels; - let l_x = src_ptr.add(px).read_unaligned(); - let l_y = src_ptr.add(px + 1).read_unaligned(); - let l_z = src_ptr.add(px + 2).read_unaligned(); - let lalphabeta = LAlphaBeta::new(l_x, l_y, l_z); - let rgb = lalphabeta.to_linear_rgb(&XYZ_TO_SRGB_D65); - - let dst = transient_row.get_unchecked_mut((x * channels)..); - *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; - *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; - *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; - if image_configuration.has_alpha() { - let l_a = src_ptr.add(px + 3).read_unaligned(); - let a_value = (l_a * 255f32).max(0f32); - *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = - a_value; - } - } - - for (dst, src) in dst - .chunks_exact_mut(channels) - .zip(transient_row.chunks_exact(channels)) - { - let r = src[image_configuration.get_r_channel_offset()]; - let g = src[image_configuration.get_g_channel_offset()]; - let b = src[image_configuration.get_b_channel_offset()]; - - let rgb = (Rgb::::new( - r.min(1f32).max(0f32), - g.min(1f32).max(0f32), - b.min(1f32).max(0f32), - ) * Rgb::::dup(2048f32)) - .round() - .cast::(); - - dst[image_configuration.get_r_channel_offset()] = - *lut_table.get_unchecked(rgb.r.min(2048) as usize); - dst[image_configuration.get_g_channel_offset()] = - *lut_table.get_unchecked(rgb.g.min(2048) as usize); - dst[image_configuration.get_b_channel_offset()] = - *lut_table.get_unchecked(rgb.b.min(2048) as usize); - if image_configuration.has_alpha() { - dst[image_configuration.get_a_channel_offset()] = - src[image_configuration.get_a_channel_offset()] as u8; - } - } - }); + iter = dst + .par_chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst + iter = dst .chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) - { - unsafe { - let mut _cx = 0usize; - - let src_ptr = src.as_ptr() as *mut f32; - - let mut transient_row = vec![0f32; width as usize * channels]; - - for x in _cx..width as usize { - let px = x * channels; - let l_x = src_ptr.add(px).read_unaligned(); - let l_y = src_ptr.add(px + 1).read_unaligned(); - let l_z = src_ptr.add(px + 2).read_unaligned(); - let lalphabeta = LAlphaBeta::new(l_x, l_y, l_z); - let rgb = lalphabeta.to_linear_rgb(&XYZ_TO_SRGB_D65); + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)); + } - let dst = transient_row.get_unchecked_mut((x * channels)..); - *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; - *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; - *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; - if image_configuration.has_alpha() { - let l_a = src_ptr.add(px + 3).read_unaligned(); - let a_value = (l_a * 255f32).max(0f32); - *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = - a_value; - } + #[cfg(feature = "rayon")] + { + iter.for_each(|(dst, src)| unsafe { + let mut _cx = 0usize; + + let src_ptr = src.as_ptr() as *mut f32; + + let mut transient_row = vec![0f32; width as usize * channels]; + + for x in _cx..width as usize { + let px = x * channels; + let l_x = src_ptr.add(px).read_unaligned(); + let l_y = src_ptr.add(px + 1).read_unaligned(); + let l_z = src_ptr.add(px + 2).read_unaligned(); + let lalphabeta = LAlphaBeta::new(l_x, l_y, l_z); + let rgb = lalphabeta.to_linear_rgb(&XYZ_TO_SRGB_D65); + + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + if image_configuration.has_alpha() { + let l_a = src_ptr.add(px + 3).read_unaligned(); + let a_value = (l_a * 255f32).max(0f32).round(); + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = a_value; } + } - for (dst, src) in dst - .chunks_exact_mut(channels) - .zip(transient_row.chunks_exact(channels)) - { - let r = src[image_configuration.get_r_channel_offset()]; - let g = src[image_configuration.get_g_channel_offset()]; - let b = src[image_configuration.get_b_channel_offset()]; - - let rgb = (Rgb::::new( - r.min(1f32).max(0f32), - g.min(1f32).max(0f32), - b.min(1f32).max(0f32), - ) * Rgb::::dup(2048f32)) - .round() - .cast::(); - - dst[image_configuration.get_r_channel_offset()] = - *lut_table.get_unchecked(rgb.r.min(2048) as usize); - dst[image_configuration.get_g_channel_offset()] = - *lut_table.get_unchecked(rgb.g.min(2048) as usize); - dst[image_configuration.get_b_channel_offset()] = - *lut_table.get_unchecked(rgb.b.min(2048) as usize); - if image_configuration.has_alpha() { - dst[image_configuration.get_a_channel_offset()] = - src[image_configuration.get_a_channel_offset()] as u8; - } + for (dst, src) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r = src[image_configuration.get_r_channel_offset()]; + let g = src[image_configuration.get_g_channel_offset()]; + let b = src[image_configuration.get_b_channel_offset()]; + + let rgb = (Rgb::::new( + r.min(1f32).max(0f32), + g.min(1f32).max(0f32), + b.min(1f32).max(0f32), + ) * Rgb::::dup(2048f32)) + .round() + .cast::(); + + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = + *lut_table.get_unchecked(rgb.r.min(2048) as usize); + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = + *lut_table.get_unchecked(rgb.g.min(2048) as usize); + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = + *lut_table.get_unchecked(rgb.b.min(2048) as usize); + if image_configuration.has_alpha() { + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = + *src.get_unchecked(image_configuration.get_a_channel_offset()) as u8; } } - } + }); } } diff --git a/src/lib.rs b/src/lib.rs index f1e52cc..3f7966d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -72,6 +72,7 @@ pub use image_to_linear_u8::*; pub use image_to_xyz_lab::bgr_to_lab; pub use image_to_xyz_lab::bgr_to_lch; pub use image_to_xyz_lab::bgr_to_luv; +pub use image_to_xyz_lab::bgr_to_xyz; pub use image_to_xyz_lab::bgra_to_laba; pub use image_to_xyz_lab::rgb_to_lab; pub use image_to_xyz_lab::rgb_to_lch; @@ -84,7 +85,6 @@ pub use image_to_xyz_lab::rgba_to_xyza; pub use image_to_xyz_lab::srgb_to_xyz; pub use image_to_xyz_lab::srgba_to_xyz; pub use image_to_xyz_lab::srgba_to_xyza; -pub use image_to_xyz_lab::bgr_to_xyz; pub use image_xyza_laba::bgra_to_lab_with_alpha; pub use image_xyza_laba::bgra_to_lch_with_alpha; pub use image_xyza_laba::bgra_to_luv_with_alpha; @@ -109,18 +109,18 @@ pub use rgba::ToRgbaF16; pub use rgba::ToRgbaF32; pub use xyb::Xyb; pub use xyz::Xyz; +pub use xyz_lab_to_image::lab_to_bgr; +pub use xyz_lab_to_image::lab_to_rgb; pub use xyz_lab_to_image::lab_to_srgb; pub use xyz_lab_to_image::laba_to_srgb; pub use xyz_lab_to_image::lch_to_bgr; pub use xyz_lab_to_image::lch_to_rgb; pub use xyz_lab_to_image::luv_to_bgr; pub use xyz_lab_to_image::luv_to_rgb; +pub use xyz_lab_to_image::xyz_to_bgr; pub use xyz_lab_to_image::xyz_to_rgb; pub use xyz_lab_to_image::xyz_to_srgb; pub use xyz_lab_to_image::xyza_to_rgba; -pub use xyz_lab_to_image::xyz_to_bgr; -pub use xyz_lab_to_image::lab_to_rgb; -pub use xyz_lab_to_image::lab_to_bgr; pub use xyz_transform::*; pub use xyza_laba_to_image::lab_with_alpha_to_bgra; pub use xyza_laba_to_image::lab_with_alpha_to_rgba; diff --git a/src/linear_to_image.rs b/src/linear_to_image.rs index 55929a4..13486ce 100644 --- a/src/linear_to_image.rs +++ b/src/linear_to_image.rs @@ -44,110 +44,66 @@ fn linear_to_gamma_channels::new( - r.min(1f32).max(0f32), - g.min(1f32).max(0f32), - b.min(1f32).max(0f32), - ) * Rgb::::dup(2048f32)) - .round() - .cast::(); - - let dst = dst_ptr.add(px); - - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.r.min(2048) as usize)); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.g.min(2048) as usize)); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.b.min(2048) as usize)); - - if USE_ALPHA && image_configuration.has_alpha() { - let a = src_slice - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - let a_lin = (a * 255f32).round() as u8; - dst.add(image_configuration.get_a_channel_offset()).write_unaligned(a_lin); - } - } - }); + iter = dst + .par_chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst + iter = dst .chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) - { - unsafe { - let mut _cx = 0usize; - - let src_ptr = src.as_ptr() as *const f32; - let dst_ptr = dst.as_mut_ptr(); - - for x in _cx..width as usize { - let px = x * channels; - let src_slice = src_ptr.add(px); - let r = src_slice - .add(image_configuration.get_r_channel_offset()) - .read_unaligned(); - let g = src_slice - .add(image_configuration.get_g_channel_offset()) - .read_unaligned(); - let b = src_slice - .add(image_configuration.get_b_channel_offset()) - .read_unaligned(); - - let rgb = (Rgb::::new( - r.min(1f32).max(0f32), - g.min(1f32).max(0f32), - b.min(1f32).max(0f32), - ) * Rgb::::dup(2048f32)) - .round() - .cast::(); - - let dst = dst_ptr.add(px); - - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.r.min(2048) as usize)); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.g.min(2048) as usize)); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(*lut_table.get_unchecked(rgb.b.min(2048) as usize)); + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)); + } - if USE_ALPHA && image_configuration.has_alpha() { - let a = src_slice - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - let a_lin = (a * 255f32).round() as u8; - dst.add(image_configuration.get_a_channel_offset()).write_unaligned(a_lin); - } - } + iter.for_each(|(dst, src)| unsafe { + let mut _cx = 0usize; + + let src_ptr = src.as_ptr() as *const f32; + let dst_ptr = dst.as_mut_ptr(); + + for x in _cx..width as usize { + let px = x * channels; + let src_slice = src_ptr.add(px); + let r = src_slice + .add(image_configuration.get_r_channel_offset()) + .read_unaligned(); + let g = src_slice + .add(image_configuration.get_g_channel_offset()) + .read_unaligned(); + let b = src_slice + .add(image_configuration.get_b_channel_offset()) + .read_unaligned(); + + let rgb = (Rgb::::new( + r.min(1f32).max(0f32), + g.min(1f32).max(0f32), + b.min(1f32).max(0f32), + ) * Rgb::::dup(2048f32)) + .round() + .cast::(); + + let dst = dst_ptr.add(px); + + dst.add(image_configuration.get_r_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.r.min(2048) as usize)); + dst.add(image_configuration.get_g_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.g.min(2048) as usize)); + dst.add(image_configuration.get_b_channel_offset()) + .write_unaligned(*lut_table.get_unchecked(rgb.b.min(2048) as usize)); + + if USE_ALPHA && image_configuration.has_alpha() { + let a = src_slice + .add(image_configuration.get_a_channel_offset()) + .read_unaligned(); + let a_lin = (a * 255f32).round() as u8; + dst.add(image_configuration.get_a_channel_offset()) + .write_unaligned(a_lin); } } - } + }); } /// This function converts Linear to RGB. This is much more effective than naive direct transformation diff --git a/src/neon/cie.rs b/src/neon/cie.rs index 9456929..20fb7b4 100644 --- a/src/neon/cie.rs +++ b/src/neon/cie.rs @@ -28,9 +28,7 @@ pub(crate) unsafe fn neon_triple_to_xyz( c8: float32x4_t, c9: float32x4_t, ) -> (float32x4_t, float32x4_t, float32x4_t) { - let (x, y, z) = vcolorq_matrix_f32( - r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9, - ); + let (x, y, z) = vcolorq_matrix_f32(r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9); (x, y, z) } diff --git a/src/neon/image_to_oklab.rs b/src/neon/image_to_oklab.rs index 9e72371..17c3448 100644 --- a/src/neon/image_to_oklab.rs +++ b/src/neon/image_to_oklab.rs @@ -6,8 +6,8 @@ */ use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; -use crate::neon::math::vcolorq_matrix_f32; use crate::load_f32_and_deinterleave; +use crate::neon::math::vcolorq_matrix_f32; use erydanos::{vatan2q_f32, vcbrtq_fast_f32, vhypotq_fast_f32}; use std::arch::aarch64::*; @@ -16,9 +16,8 @@ macro_rules! triple_to_oklab { $c0:expr, $c1:expr, $c2: expr, $c3: expr, $c4:expr, $c5: expr, $c6:expr, $c7: expr, $c8: expr, $m0: expr, $m1: expr, $m2: expr, $m3: expr, $m4: expr, $m5: expr, $m6: expr, $m7: expr, $m8: expr ) => {{ - let (l_l, l_m, l_s) = vcolorq_matrix_f32( - $r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, - ); + let (l_l, l_m, l_s) = + vcolorq_matrix_f32($r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8); let l_ = vcbrtq_fast_f32(l_l); let m_ = vcbrtq_fast_f32(l_m); diff --git a/src/neon/oklab_to_image.rs b/src/neon/oklab_to_image.rs index ca94daf..5ff6067 100644 --- a/src/neon/oklab_to_image.rs +++ b/src/neon/oklab_to_image.rs @@ -10,8 +10,8 @@ use erydanos::{vcosq_f32, vsinq_f32}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; -use crate::neon::math::vcolorq_matrix_f32; use crate::load_f32_and_deinterleave_direct; +use crate::neon::math::vcolorq_matrix_f32; #[inline(always)] unsafe fn neon_oklab_gamma_vld( @@ -95,30 +95,11 @@ pub unsafe fn neon_oklab_to_image( - v_src_ptr, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, + v_src_ptr, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, c5, c6, c7, c8, ); let in_place_ptr = diff --git a/src/neon/to_xyz_lab.rs b/src/neon/to_xyz_lab.rs index 4ad96bd..28eb1ff 100644 --- a/src/neon/to_xyz_lab.rs +++ b/src/neon/to_xyz_lab.rs @@ -50,23 +50,12 @@ pub unsafe fn neon_channels_to_xyz_or_lab< let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; while cx + 4 < width as usize { - let src_ptr = ((src as * const u8).add(src_offset) as *const f32).add(cx * channels); + let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = load_f32_and_deinterleave!(src_ptr, image_configuration); let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_chan, - g_chan, - b_chan, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, + r_chan, g_chan, b_chan, cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, ); match target { @@ -103,6 +92,5 @@ pub unsafe fn neon_channels_to_xyz_or_lab< cx += 4; } - cx } diff --git a/src/neon/to_xyza_laba.rs b/src/neon/to_xyza_laba.rs index 0542683..134b99b 100644 --- a/src/neon/to_xyza_laba.rs +++ b/src/neon/to_xyza_laba.rs @@ -6,11 +6,11 @@ */ use crate::image::ImageConfiguration; +use crate::load_f32_and_deinterleave; use crate::neon::cie::{ neon_triple_to_lab, neon_triple_to_lch, neon_triple_to_luv, neon_triple_to_xyz, }; use crate::xyz_target::XyzTarget; -use crate::load_f32_and_deinterleave; use std::arch::aarch64::*; #[inline(always)] diff --git a/src/neon/xyza_laba_to_image.rs b/src/neon/xyza_laba_to_image.rs index ed829ea..c412639 100644 --- a/src/neon/xyza_laba_to_image.rs +++ b/src/neon/xyza_laba_to_image.rs @@ -94,19 +94,10 @@ pub unsafe fn neon_xyza_to_image( - src_ptr_0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let dst_ptr = ((dst as *mut u8).add(dst_offset) as* mut f32).add(cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); let store_rows = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { diff --git a/src/oklab_to_image.rs b/src/oklab_to_image.rs index 77318ab..6712742 100644 --- a/src/oklab_to_image.rs +++ b/src/oklab_to_image.rs @@ -68,156 +68,89 @@ fn oklab_to_image( ) }; + let iter; #[cfg(feature = "rayon")] { - dst.par_chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { - let mut _cx = 0usize; - - let mut transient_row = vec![0f32; width as usize * channels]; - - let src_ptr = src.as_ptr() as *mut f32; - - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher(_cx, src_ptr, 0, transient_row.as_mut_ptr(), 0, width) - } - - for x in _cx..width as usize { - let px = x * channels; - let source_p = src_ptr.add(px); - let l_x = source_p.read_unaligned(); - let l_y = source_p.add(1).read_unaligned(); - let l_z = source_p.add(2).read_unaligned(); - let rgb = match target { - OklabTarget::Oklab => { - let oklab = Oklab::new(l_x, l_y, l_z); - oklab.to_linear_rgb() - } - OklabTarget::Oklch => { - let oklch = Oklch::new(l_x, l_y, l_z); - oklch.to_linear_rgb() - } - }; - - let v_dst = transient_row.get_unchecked_mut((x * channels)..); - *v_dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; - *v_dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; - *v_dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; - if image_configuration.has_alpha() { - let l_a = source_p.add(3).read_unaligned(); - *v_dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; - } - } - - for (dst_chunks, src_chunks) in dst - .chunks_exact_mut(channels) - .zip(transient_row.chunks_exact_mut(channels)) - { - let rgb = (Rgb::::new( - src_chunks[image_configuration.get_r_channel_offset()] - .max(0.) - .min(1.), - src_chunks[image_configuration.get_g_channel_offset()] - .max(0.) - .min(1.), - src_chunks[image_configuration.get_b_channel_offset()] - .max(0.) - .min(1.), - ) * Rgb::::dup(2048f32)) - .round() - .cast::(); - - dst_chunks[image_configuration.get_r_channel_offset()] = - *lut_table.get_unchecked((rgb.r as usize).min(2048)); - dst_chunks[image_configuration.get_g_channel_offset()] = - *lut_table.get_unchecked((rgb.g as usize).min(2048)); - dst_chunks[image_configuration.get_b_channel_offset()] = - *lut_table.get_unchecked((rgb.b as usize).min(2048)); - if image_configuration.has_alpha() { - let a_lin = (src_chunks[3] * 255f32).round() as u8; - dst_chunks[image_configuration.get_a_channel_offset()] = a_lin; - } - } - }); + iter = dst + .par_chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)); } #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst + iter = dst .chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) - { - unsafe { - let mut _cx = 0usize; + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)); + } - let mut transient_row = vec![0f32; width as usize * channels]; + iter.for_each(|(dst, src)| unsafe { + let mut _cx = 0usize; - let src_ptr = src.as_ptr() as *mut f32; + let mut transient_row = vec![0f32; width as usize * channels]; - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher(_cx, src_ptr, 0, transient_row.as_mut_ptr(), 0, width) - } + let src_ptr = src.as_ptr() as *mut f32; - for x in _cx..width as usize { - let px = x * channels; - let source_p = src_ptr.add(px); - let l_x = source_p.read_unaligned(); - let l_y = source_p.add(1).read_unaligned(); - let l_z = source_p.add(2).read_unaligned(); - let rgb = match target { - OklabTarget::Oklab => { - let oklab = Oklab::new(l_x, l_y, l_z); - oklab.to_linear_rgb() - } - OklabTarget::Oklch => { - let oklch = Oklch::new(l_x, l_y, l_z); - oklch.to_linear_rgb() - } - }; + if let Some(dispatcher) = _wide_row_handle { + _cx = dispatcher(_cx, src_ptr, 0, transient_row.as_mut_ptr(), 0, width) + } - let v_dst = transient_row.get_unchecked_mut((x * channels)..); - *v_dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; - *v_dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; - *v_dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; - if image_configuration.has_alpha() { - let l_a = source_p.add(3).read_unaligned(); - *v_dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; - } + for x in _cx..width as usize { + let px = x * channels; + let source_p = src_ptr.add(px); + let l_x = source_p.read_unaligned(); + let l_y = source_p.add(1).read_unaligned(); + let l_z = source_p.add(2).read_unaligned(); + let rgb = match target { + OklabTarget::Oklab => { + let oklab = Oklab::new(l_x, l_y, l_z); + oklab.to_linear_rgb() } - - for (dst_chunks, src_chunks) in dst - .chunks_exact_mut(channels) - .zip(transient_row.chunks_exact_mut(channels)) - { - let rgb = (Rgb::new( - src_chunks[image_configuration.get_r_channel_offset()] - .max(0.) - .min(1.), - src_chunks[image_configuration.get_g_channel_offset()] - .max(0.) - .min(1.), - src_chunks[image_configuration.get_b_channel_offset()] - .max(0.) - .min(1.), - ) * Rgb::dup(2048f32)) - .round() - .cast::(); - - dst_chunks[image_configuration.get_r_channel_offset()] = - *lut_table.get_unchecked((rgb.r as usize).min(2048)); - dst_chunks[image_configuration.get_g_channel_offset()] = - *lut_table.get_unchecked((rgb.g as usize).min(2048)); - dst_chunks[image_configuration.get_b_channel_offset()] = - *lut_table.get_unchecked((rgb.b as usize).min(2048)); - if image_configuration.has_alpha() { - let a_lin = (src_chunks[3] * 255f32).round() as u8; - dst_chunks[image_configuration.get_a_channel_offset()] = a_lin; - } + OklabTarget::Oklch => { + let oklch = Oklch::new(l_x, l_y, l_z); + oklch.to_linear_rgb() } + }; + + let v_dst = transient_row.get_unchecked_mut((x * channels)..); + *v_dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *v_dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *v_dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + if image_configuration.has_alpha() { + let l_a = source_p.add(3).read_unaligned(); + *v_dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; } } - } + + for (dst_chunks, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact_mut(channels)) + { + let rgb = (Rgb::::new( + src_chunks[image_configuration.get_r_channel_offset()] + .max(0.) + .min(1.), + src_chunks[image_configuration.get_g_channel_offset()] + .max(0.) + .min(1.), + src_chunks[image_configuration.get_b_channel_offset()] + .max(0.) + .min(1.), + ) * Rgb::::dup(2048f32)) + .round() + .cast::(); + + dst_chunks[image_configuration.get_r_channel_offset()] = + *lut_table.get_unchecked((rgb.r as usize).min(2048)); + dst_chunks[image_configuration.get_g_channel_offset()] = + *lut_table.get_unchecked((rgb.g as usize).min(2048)); + dst_chunks[image_configuration.get_b_channel_offset()] = + *lut_table.get_unchecked((rgb.b as usize).min(2048)); + if image_configuration.has_alpha() { + let a_lin = (src_chunks[3] * 255f32).round() as u8; + dst_chunks[image_configuration.get_a_channel_offset()] = a_lin; + } + } + }); } /// This function converts Oklab with interleaved alpha channel to RGBA. This is much more effective than naive direct transformation diff --git a/src/planar_to_linear.rs b/src/planar_to_linear.rs index 388f85b..3fd79ed 100644 --- a/src/planar_to_linear.rs +++ b/src/planar_to_linear.rs @@ -35,51 +35,38 @@ fn channels_to_linear( lut_table[i] = transfer_function.linearize(i as f32 * (1. / 255.0)); } + let iter; #[cfg(feature = "rayon")] { - dst_slice_safe_align + iter = dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) - .zip(src.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { - let mut _cx = 0usize; - - let src_ptr = src.as_ptr(); - let dst_ptr = dst.as_mut_ptr() as *mut f32; - - for x in _cx..width as usize { - let px = x; - let dst = dst_ptr.add(px); - let src = src_ptr.add(px); - let transferred = *lut_table.get_unchecked(src.read_unaligned() as usize); - - dst.write_unaligned(transferred); - } - }); + .zip(src.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst_slice_safe_align + iter = dst_slice_safe_align .chunks_exact_mut(dst_stride as usize) - .zip(src.chunks_exact(src_stride as usize)) - { - unsafe { - let mut _cx = 0usize; + .zip(src.chunks_exact(src_stride as usize)); + } + + dst_slice_safe_align + .par_chunks_exact_mut(dst_stride as usize) + .zip(src.par_chunks_exact(src_stride as usize)) + .for_each(|(dst, src)| unsafe { + let mut _cx = 0usize; - let src_ptr = src.as_ptr(); - let dst_ptr = dst.as_mut_ptr() as *mut f32; + let src_ptr = src.as_ptr(); + let dst_ptr = dst.as_mut_ptr() as *mut f32; - for x in _cx..width as usize { - let px = x; - let dst = dst_ptr.add(px); - let src = src_ptr.add(px); - let transferred = *lut_table.get_unchecked(src.read_unaligned() as usize); + for x in _cx..width as usize { + let px = x; + let dst = dst_ptr.add(px); + let src = src_ptr.add(px); + let transferred = *lut_table.get_unchecked(src.read_unaligned() as usize); - dst.write_unaligned(transferred); - } + dst.write_unaligned(transferred); } - } - } + }); } /// This function converts Plane to Linear. This is much more effective than naive direct transformation diff --git a/src/sigmoidal_to_image.rs b/src/sigmoidal_to_image.rs index 7ef23b1..7033feb 100644 --- a/src/sigmoidal_to_image.rs +++ b/src/sigmoidal_to_image.rs @@ -53,113 +53,69 @@ fn sigmoidal_to_image( _wide_row_handler = Some(neon_from_sigmoidal_row::); } + let src_slice_safe_align = unsafe { + slice::from_raw_parts( + src.as_ptr() as *const u8, + src_stride as usize * height as usize, + ) + }; + + let iter; + #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts( - src.as_ptr() as *const u8, - src_stride as usize * height as usize, - ) - }; - dst.par_chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { - let mut _cx = 0usize; - - let src_ptr = src.as_ptr() as *const f32; - let dst_ptr = dst.as_mut_ptr(); - - if let Some(dispatcher) = _wide_row_handler { - _cx = dispatcher(_cx, src_ptr, dst_ptr, width); - } - - for x in _cx..width as usize { - let px = x * channels; - let reading_ptr = src_ptr.add(px); - let sr = reading_ptr.read_unaligned(); - let sg = reading_ptr.add(1).read_unaligned(); - let sb = reading_ptr.add(2).read_unaligned(); - - let sigmoidal = Sigmoidal::new(sr, sg, sb); - let rgb: Rgb = sigmoidal.into(); - - let hx = x * channels; - - let dst = dst_ptr.add(hx); - - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); - - if image_configuration.has_alpha() { - let a = (reading_ptr.add(3).read_unaligned() * 255f32) - .max(0f32) - .round() - .min(255f32); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a as u8); - } - } - }); + iter = dst + .par_chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; + iter = dst + .chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)); + } - for _ in 0..height as usize { - let mut _cx = 0usize; + iter.for_each(|(dst, src)| unsafe { + let mut _cx = 0usize; - let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *const f32 }; - let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) }; + let src_ptr = src.as_ptr() as *const f32; + let dst_ptr = dst.as_mut_ptr(); - if let Some(dispatcher) = _wide_row_handler { - unsafe { - _cx = dispatcher(_cx, src_ptr, dst_ptr, width); - } - } + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher(_cx, src_ptr, dst_ptr, width); + } - for x in _cx..width as usize { - let px = x * channels; - let reading_ptr = unsafe { src_ptr.add(px) }; - let sr = unsafe { reading_ptr.read_unaligned() }; - let sg = unsafe { reading_ptr.add(1).read_unaligned() }; - let sb = unsafe { reading_ptr.add(2).read_unaligned() }; - - let sigmoidal = Sigmoidal::new(sr, sg, sb); - let rgb: Rgb = sigmoidal.into(); - - let hx = x * channels; - - let dst = unsafe { dst_ptr.add(hx) }; - - unsafe { - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); - - if image_configuration.has_alpha() { - let a = (reading_ptr.add(3).read_unaligned() * 255f32) - .max(0f32) - .round() - .min(255f32); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a as u8); - } - } + for x in _cx..width as usize { + let px = x * channels; + let reading_ptr = src_ptr.add(px); + let sr = reading_ptr.read_unaligned(); + let sg = reading_ptr.add(1).read_unaligned(); + let sb = reading_ptr.add(2).read_unaligned(); + + let sigmoidal = Sigmoidal::new(sr, sg, sb); + let rgb: Rgb = sigmoidal.into(); + + let hx = x * channels; + + let dst = dst_ptr.add(hx); + + dst.add(image_configuration.get_r_channel_offset()) + .write_unaligned(rgb.r); + dst.add(image_configuration.get_g_channel_offset()) + .write_unaligned(rgb.g); + dst.add(image_configuration.get_b_channel_offset()) + .write_unaligned(rgb.b); + + if image_configuration.has_alpha() { + let a = (reading_ptr.add(3).read_unaligned() * 255f32) + .max(0f32) + .round() + .min(255f32); + dst.add(image_configuration.get_a_channel_offset()) + .write_unaligned(a as u8); } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } - } + }); } /// This function converts Sigmoid to RGB. This is much more effective than naive direct transformation diff --git a/src/sse/cie.rs b/src/sse/cie.rs index ff2f349..d62e30e 100644 --- a/src/sse/cie.rs +++ b/src/sse/cie.rs @@ -9,9 +9,7 @@ use crate::luv::{ LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y, LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, LUV_WHITE_V_PRIME, }; -use crate::sse::{ - _mm_color_matrix_ps, _mm_cube_ps, _mm_prefer_fma_ps, _mm_select_ps, -}; +use crate::sse::{_mm_color_matrix_ps, _mm_cube_ps, _mm_prefer_fma_ps, _mm_select_ps}; use erydanos::{_mm_atan2_ps, _mm_cbrt_fast_ps, _mm_cos_ps, _mm_hypot_ps, _mm_sin_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; @@ -33,18 +31,12 @@ pub unsafe fn sse_triple_to_xyz( c8: __m128, c9: __m128, ) -> (__m128, __m128, __m128) { - let (x, y, z) = _mm_color_matrix_ps( - r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9, - ); + let (x, y, z) = _mm_color_matrix_ps(r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9); (x, y, z) } #[inline(always)] -pub unsafe fn sse_triple_to_luv( - x: __m128, - y: __m128, - z: __m128, -) -> (__m128, __m128, __m128) { +pub unsafe fn sse_triple_to_luv(x: __m128, y: __m128, z: __m128) -> (__m128, __m128, __m128) { let zeros = _mm_setzero_ps(); let den = _mm_prefer_fma_ps( _mm_prefer_fma_ps(x, z, _mm_set1_ps(3f32)), @@ -70,11 +62,7 @@ pub unsafe fn sse_triple_to_luv( } #[inline(always)] -pub unsafe fn sse_triple_to_lab( - x: __m128, - y: __m128, - z: __m128, -) -> (__m128, __m128, __m128) { +pub unsafe fn sse_triple_to_lab(x: __m128, y: __m128, z: __m128) -> (__m128, __m128, __m128) { let x = _mm_mul_ps(x, _mm_set1_ps(100f32 / 95.047f32)); let z = _mm_mul_ps(z, _mm_set1_ps(100f32 / 108.883f32)); let cbrt_x = _mm_cbrt_fast_ps(x); @@ -96,11 +84,7 @@ pub unsafe fn sse_triple_to_lab( } #[inline(always)] -pub unsafe fn sse_triple_to_lch( - x: __m128, - y: __m128, - z: __m128, -) -> (__m128, __m128, __m128) { +pub unsafe fn sse_triple_to_lch(x: __m128, y: __m128, z: __m128) -> (__m128, __m128, __m128) { let (luv_l, luv_u, luv_v) = sse_triple_to_luv(x, y, z); let lch_c = _mm_hypot_ps(luv_u, luv_v); let lch_h = _mm_atan2_ps(luv_v, luv_u); diff --git a/src/sse/gamma_curves.rs b/src/sse/gamma_curves.rs index 76449c9..67e8ee7 100644 --- a/src/sse/gamma_curves.rs +++ b/src/sse/gamma_curves.rs @@ -142,4 +142,4 @@ pub unsafe fn sse_gamma2p2_from_linear(linear: __m128) -> __m128 { #[inline(always)] pub unsafe fn sse_gamma2p8_from_linear(linear: __m128) -> __m128 { sse_pure_gamma(linear, 1f32 / 2.8f32) -} \ No newline at end of file +} diff --git a/src/sse/image_to_oklab.rs b/src/sse/image_to_oklab.rs index 58befe8..5e62a6d 100644 --- a/src/sse/image_to_oklab.rs +++ b/src/sse/image_to_oklab.rs @@ -13,20 +13,19 @@ use std::arch::x86_64::*; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; -use crate::sse::{ - _mm_color_matrix_ps, sse_interleave_ps_rgb, - sse_interleave_ps_rgba, +use crate::sse::{_mm_color_matrix_ps, sse_interleave_ps_rgb, sse_interleave_ps_rgba}; +use crate::{ + load_f32_and_deinterleave, store_and_interleave_v3_direct_f32, + store_and_interleave_v4_direct_f32, }; -use crate::{load_f32_and_deinterleave, store_and_interleave_v3_direct_f32, store_and_interleave_v4_direct_f32}; macro_rules! triple_to_oklab { ($r: expr, $g: expr, $b: expr, $target: expr, $c0:expr, $c1:expr, $c2: expr, $c3: expr, $c4:expr, $c5: expr, $c6:expr, $c7: expr, $c8: expr, $m0: expr, $m1: expr, $m2: expr, $m3: expr, $m4: expr, $m5: expr, $m6: expr, $m7: expr, $m8: expr ) => {{ - let (l_l, l_m, l_s) = _mm_color_matrix_ps( - $r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, - ); + let (l_l, l_m, l_s) = + _mm_color_matrix_ps($r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8); let l_ = _mm_cbrt_fast_ps(l_l); let m_ = _mm_cbrt_fast_ps(l_m); @@ -90,28 +89,8 @@ pub unsafe fn sse_image_to_oklab { - let lab = Lab::new(l_x, l_y, l_z); - lab.to_linear_rgb(matrix) - } - XyzTarget::Xyz => { - let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_linear_rgb(matrix) - } - XyzTarget::Luv => { - let luv = Luv::new(l_x, l_y, l_z); - luv.to_linear_rgb(matrix) - } - XyzTarget::Lch => { - let lch = LCh::new(l_x, l_y, l_z); - lch.to_linear_rgb(matrix) - } - }; - - let l_a = src_ptr.add(px + 3).read_unaligned(); - let dst = transient_row.get_unchecked_mut((x * channels)..); - *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; - *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; - *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; - *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; - } - - for (dst_chunk, src_chunks) in dst - .chunks_exact_mut(channels) - .zip(transient_row.chunks_exact(channels)) - { - let r_cast = (src_chunks[image_configuration.get_r_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let g_cast = (src_chunks[image_configuration.get_g_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let b_cast = (src_chunks[image_configuration.get_b_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let a_cast = (src_chunks[image_configuration.get_a_channel_offset()] * 255.) - .min(255.) - .max(0.) as u8; - - dst_chunk[image_configuration.get_r_channel_offset()] = - *lut_table.get_unchecked(r_cast as usize); - dst_chunk[image_configuration.get_g_channel_offset()] = - *lut_table.get_unchecked(g_cast as usize); - dst_chunk[image_configuration.get_b_channel_offset()] = - *lut_table.get_unchecked(b_cast as usize); - dst_chunk[image_configuration.get_a_channel_offset()] = a_cast; - } - }); + iter = dst + .par_chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)); } - #[cfg(not(feature = "rayon"))] { - for (dst, src) in dst + iter = dst .chunks_exact_mut(dst_stride as usize) - .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) - { - unsafe { - let channels = image_configuration.get_channels_count(); - - let mut _cx = 0usize; + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)); + } - let mut transient_row = vec![0f32; width as usize * channels]; + iter.for_each(|(dst, src)| unsafe { + let channels = image_configuration.get_channels_count(); - if let Some(dispatcher) = _wide_row_handler { - _cx = dispatcher( - _cx, - src.as_ptr() as *const f32, - 0, - transient_row.as_mut_ptr(), - 0, - width, - matrix, - ) - } + let mut _cx = 0usize; - let src_ptr = src.as_ptr() as *mut f32; + let mut transient_row = vec![0f32; width as usize * channels]; - for x in _cx..width as usize { - let px = x * 4; - let l_x = src_ptr.add(px).read_unaligned(); - let l_y = src_ptr.add(px + 1).read_unaligned(); - let l_z = src_ptr.add(px + 2).read_unaligned(); - let rgb = match source { - XyzTarget::Lab => { - let lab = Lab::new(l_x, l_y, l_z); - lab.to_linear_rgb(matrix) - } - XyzTarget::Xyz => { - let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_linear_rgb(matrix) - } - XyzTarget::Luv => { - let luv = Luv::new(l_x, l_y, l_z); - luv.to_linear_rgb(matrix) - } - XyzTarget::Lch => { - let lch = LCh::new(l_x, l_y, l_z); - lch.to_linear_rgb(matrix) - } - }; + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + src.as_ptr() as *const f32, + 0, + transient_row.as_mut_ptr(), + 0, + width, + matrix, + ) + } - let l_a = src_ptr.add(px + 3).read_unaligned(); - let dst = transient_row.get_unchecked_mut((x * channels)..); - *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; - *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; - *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; - *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; + let src_ptr = src.as_ptr() as *mut f32; + + for x in _cx..width as usize { + let px = x * 4; + let l_x = src_ptr.add(px).read_unaligned(); + let l_y = src_ptr.add(px + 1).read_unaligned(); + let l_z = src_ptr.add(px + 2).read_unaligned(); + let rgb = match source { + XyzTarget::Lab => { + let lab = Lab::new(l_x, l_y, l_z); + lab.to_linear_rgb(matrix) } - - for (dst_chunk, src_chunks) in dst - .chunks_exact_mut(channels) - .zip(transient_row.chunks_exact(channels)) - { - let r_cast = (src_chunks[image_configuration.get_r_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let g_cast = (src_chunks[image_configuration.get_g_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let b_cast = (src_chunks[image_configuration.get_b_channel_offset()] - .min(1.) - .max(0.) - * 2048f32) - .round(); - let a_cast = (src_chunks[image_configuration.get_a_channel_offset()] * 255.) - .min(255.) - .max(0.) as u8; - - dst_chunk[image_configuration.get_r_channel_offset()] = - *lut_table.get_unchecked(r_cast as usize); - dst_chunk[image_configuration.get_g_channel_offset()] = - *lut_table.get_unchecked(g_cast as usize); - dst_chunk[image_configuration.get_b_channel_offset()] = - *lut_table.get_unchecked(b_cast as usize); - dst_chunk[image_configuration.get_a_channel_offset()] = a_cast; + XyzTarget::Xyz => { + let xyz = Xyz::new(l_x, l_y, l_z); + xyz.to_linear_rgb(matrix) } - } + XyzTarget::Luv => { + let luv = Luv::new(l_x, l_y, l_z); + luv.to_linear_rgb(matrix) + } + XyzTarget::Lch => { + let lch = LCh::new(l_x, l_y, l_z); + lch.to_linear_rgb(matrix) + } + }; + + let l_a = src_ptr.add(px + 3).read_unaligned(); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; } - } + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[image_configuration.get_r_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + let g_cast = (src_chunks[image_configuration.get_g_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + let b_cast = (src_chunks[image_configuration.get_b_channel_offset()] + .min(1.) + .max(0.) + * 2048f32) + .round(); + let a_cast = (src_chunks[image_configuration.get_a_channel_offset()] * 255.) + .min(255.) + .max(0.) as u8; + + dst_chunk[image_configuration.get_r_channel_offset()] = + *lut_table.get_unchecked(r_cast as usize); + dst_chunk[image_configuration.get_g_channel_offset()] = + *lut_table.get_unchecked(g_cast as usize); + dst_chunk[image_configuration.get_b_channel_offset()] = + *lut_table.get_unchecked(b_cast as usize); + dst_chunk[image_configuration.get_a_channel_offset()] = a_cast; + } + }); } /// This function converts LAB with interleaved alpha channel to RGBA. This is much more effective than naive direct transformation