diff --git a/src/app/src/main.rs b/src/app/src/main.rs
index 45d6510..bb51a19 100644
--- a/src/app/src/main.rs
+++ b/src/app/src/main.rs
@@ -1,7 +1,7 @@
 use std::time::Instant;
 
-use image::{EncodableLayout, GenericImageView};
 use image::io::Reader as ImageReader;
+use image::{EncodableLayout, GenericImageView};
 
 use colorutils_rs::*;
 
@@ -23,7 +23,7 @@ fn main() {
     println!("HSL {:?}", hsl);
     println!("Back RGB {:?}", hsl.to_rgb8());
 
-    let img = ImageReader::open("./assets/beach_horizon.jpg")
+    let img = ImageReader::open("./assets/asset.jpg")
         .unwrap()
         .decode()
         .unwrap();
@@ -100,7 +100,7 @@ fn main() {
         src_stride,
         width,
         height,
-        TransferFunction::Gamma2p8
+        TransferFunction::Gamma2p8,
     );
 
     let elapsed_time = start_time.elapsed();
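Note: `TransferFunction::Gamma2p8` in the hunk above is taken here to denote a pure 2.8 power curve. A minimal scalar sketch of that assumption (the helper names are illustrative only, not crate API):

```rust
// Assumed semantics of TransferFunction::Gamma2p8: a plain x^2.8 power curve.
// Hypothetical helpers, shown only to illustrate the transfer pair.
fn gamma2p8_to_linear(encoded: f32) -> f32 {
    // decode: gamma-encoded [0, 1] -> linear light
    encoded.max(0.0).powf(2.8)
}

fn gamma2p8_from_linear(linear: f32) -> f32 {
    // encode: linear light [0, 1] -> gamma-encoded
    linear.max(0.0).powf(1.0 / 2.8)
}
```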
diff --git a/src/avx/avx_color.rs b/src/avx/color.rs
similarity index 100%
rename from src/avx/avx_color.rs
rename to src/avx/color.rs
diff --git a/src/avx/avx_gamma_curves.rs b/src/avx/gamma_curves.rs
similarity index 99%
rename from src/avx/avx_gamma_curves.rs
rename to src/avx/gamma_curves.rs
index 143ac66..4fddce8 100644
--- a/src/avx/avx_gamma_curves.rs
+++ b/src/avx/gamma_curves.rs
@@ -1,4 +1,4 @@
-use crate::avx::avx_math::*;
+use crate::avx::math::*;
 #[allow(unused_imports)]
 use crate::gamma_curves::TransferFunction;
 #[cfg(target_arch = "x86")]
diff --git a/src/avx/linear_to_image.rs b/src/avx/linear_to_image.rs
new file mode 100644
index 0000000..ced82a5
--- /dev/null
+++ b/src/avx/linear_to_image.rs
@@ -0,0 +1,153 @@
+use crate::avx::gamma_curves::get_avx_gamma_transfer;
+use crate::avx::{
+    avx2_deinterleave_rgb_ps, avx2_deinterleave_rgba_ps, avx2_interleave_rgb,
+    avx2_interleave_rgba_epi8, avx2_pack_s32, avx2_pack_u16,
+};
+use crate::image::ImageConfiguration;
+use crate::TransferFunction;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+#[inline(always)]
+unsafe fn gamma_vld<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
+    src: *const f32,
+    transfer_function: TransferFunction,
+) -> (__m256i, __m256i, __m256i, __m256i) {
+    let d_alpha = _mm256_set1_ps(1f32);
+    let transfer = get_avx_gamma_transfer(transfer_function);
+    let v_scale_alpha = _mm256_set1_ps(255f32);
+    let (mut r_f32, mut g_f32, mut b_f32, mut a_f32);
+    let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
+
+    let row0 = _mm256_loadu_ps(src);
+    let row1 = _mm256_loadu_ps(src.add(8));
+    let row2 = _mm256_loadu_ps(src.add(16));
+
+    match image_configuration {
+        ImageConfiguration::Rgba | ImageConfiguration::Bgra => {
+            let row3 = _mm256_loadu_ps(src.add(24));
+            let (v0, v1, v2, v3) = avx2_deinterleave_rgba_ps(row0, row1, row2, row3);
+            if image_configuration == ImageConfiguration::Rgba {
+                r_f32 = v0;
+                g_f32 = v1;
+                b_f32 = v2;
+            } else {
+                r_f32 = v2;
+                g_f32 = v1;
+                b_f32 = v0;
+            }
+            a_f32 = v3;
+        }
+        ImageConfiguration::Bgr | ImageConfiguration::Rgb => {
+            let rgb_pixels = avx2_deinterleave_rgb_ps(row0, row1, row2);
+            if image_configuration == ImageConfiguration::Rgb {
+                r_f32 = rgb_pixels.0;
+                g_f32 = rgb_pixels.1;
+                b_f32 = rgb_pixels.2;
+            } else {
+                r_f32 = rgb_pixels.2;
+                g_f32 = rgb_pixels.1;
+                b_f32 = rgb_pixels.0;
+            }
+            a_f32 = d_alpha;
+        }
+    }
+
+    let zeros = _mm256_setzero_ps();
+    r_f32 = _mm256_max_ps(_mm256_min_ps(r_f32, d_alpha), zeros);
+    g_f32 = _mm256_max_ps(_mm256_min_ps(g_f32, d_alpha), zeros);
+    b_f32 = _mm256_max_ps(_mm256_min_ps(b_f32, d_alpha), zeros);
+
+    r_f32 = transfer(r_f32);
+    g_f32 = transfer(g_f32);
+    b_f32 = transfer(b_f32);
+    r_f32 = _mm256_mul_ps(r_f32, v_scale_alpha);
+    g_f32 = _mm256_mul_ps(g_f32, v_scale_alpha);
+    b_f32 = _mm256_mul_ps(b_f32, v_scale_alpha);
+    if USE_ALPHA {
+        a_f32 = _mm256_mul_ps(a_f32, v_scale_alpha);
+    }
+    (
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)),
+        _mm256_cvtps_epi32(_mm256_round_ps::<0>(a_f32)),
+    )
+}
+
+#[inline(always)]
+pub unsafe fn avx_linear_to_gamma<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
+    start_cx: usize,
+    src: *const f32,
+    src_offset: u32,
+    dst: *mut u8,
+    dst_offset: u32,
+    width: u32,
+    transfer_function: TransferFunction,
+) -> usize {
+    let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
+    let channels = image_configuration.get_channels_count();
+    let mut cx = start_cx;
+
+    while cx + 32 < width as usize {
+        let offset_src_ptr =
+            ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels);
+
+        let src_ptr_0 = offset_src_ptr;
+
+        let (r_row0_, g_row0_, b_row0_, a_row0_) =
+            gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_0, transfer_function);
+
+        let src_ptr_1 = offset_src_ptr.add(8 * channels);
+
+        let (r_row1_, g_row1_, b_row1_, a_row1_) =
+            gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_1, transfer_function);
+
+        let src_ptr_2 = offset_src_ptr.add(8 * 2 * channels);
+
+        let (r_row2_, g_row2_, b_row2_, a_row2_) =
+            gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_2, transfer_function);
+
+        let src_ptr_3 = offset_src_ptr.add(8 * 3 * channels);
+
+        let (r_row3_, g_row3_, b_row3_, a_row3_) =
+            gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_3, transfer_function);
+
+        let r_row01 = avx2_pack_s32(r_row0_, r_row1_);
+        let g_row01 = avx2_pack_s32(g_row0_, g_row1_);
+        let b_row01 = avx2_pack_s32(b_row0_, b_row1_);
+
+        let r_row23 = avx2_pack_s32(r_row2_, r_row3_);
+        let g_row23 = avx2_pack_s32(g_row2_, g_row3_);
+        let b_row23 = avx2_pack_s32(b_row2_, b_row3_);
+
+        let r_row = avx2_pack_u16(r_row01, r_row23);
+        let g_row = avx2_pack_u16(g_row01, g_row23);
+        let b_row = avx2_pack_u16(b_row01, b_row23);
+
+        let dst_ptr = dst.add(dst_offset as usize + cx * channels);
+
+        if USE_ALPHA {
+            let a_row01 = avx2_pack_s32(a_row0_, a_row1_);
+            let a_row23 = avx2_pack_s32(a_row2_, a_row3_);
+            let a_row = avx2_pack_u16(a_row01, a_row23);
+            let (rgba0, rgba1, rgba2, rgba3) =
+                avx2_interleave_rgba_epi8(r_row, g_row, b_row, a_row);
+            _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0);
+            _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1);
+            _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2);
+            _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3);
+        } else {
+            let (rgb0, rgb1, rgb2) = avx2_interleave_rgb(r_row, g_row, b_row);
+            _mm256_storeu_si256(dst_ptr as *mut __m256i, rgb0);
+            _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgb1);
+            _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgb2);
+        }
+
+        cx += 32;
+    }
+
+    cx
+}
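`gamma_vld` above handles eight pixels per channel register: clamp the linear value to [0, 1], apply the gamma transfer, scale to 0..255, and round before the i32 conversion. A scalar model of one lane, assuming an sRGB-style curve (the real one is whatever `get_avx_gamma_transfer` selects; tie-handling of `f32::round` differs from the vector nearest-even mode only at exact .5):

```rust
// Scalar model of one lane of gamma_vld: clamp -> transfer -> scale -> round.
fn linear_lane_to_u8(linear: f32, transfer: fn(f32) -> f32) -> u8 {
    let clamped = linear.clamp(0.0, 1.0); // mirrors the _mm256_min_ps/_mm256_max_ps pair
    (transfer(clamped) * 255.0).round() as u8 // mirrors _mm256_round_ps + _mm256_cvtps_epi32
}

// Assumed example transfer: the sRGB OETF (linear -> gamma-encoded).
fn srgb_from_linear(c: f32) -> f32 {
    if c <= 0.0031308 {
        12.92 * c
    } else {
        1.055 * c.powf(1.0 / 2.4) - 0.055
    }
}
```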
diff --git a/src/avx/avx_math.rs b/src/avx/math.rs
similarity index 100%
rename from src/avx/avx_math.rs
rename to src/avx/math.rs
diff --git a/src/avx/mod.rs b/src/avx/mod.rs
index 6f9b286..3736abc 100644
--- a/src/avx/mod.rs
+++ b/src/avx/mod.rs
@@ -5,30 +5,22 @@
  * // license that can be found in the LICENSE file.
  */
 
-mod avx2_to_xyz_lab;
-
-mod avx2_utils;
-
-mod avx_color;
-
-mod avx_gamma_curves;
-
-mod avx_math;
-
-mod avx_support;
-
-mod avx_xyz_lab_to_image;
-
-mod avx_xyza_laba_to_image;
-
-pub use avx2_to_xyz_lab::*;
-
-pub use avx2_utils::*;
-
-pub use avx_math::*;
-
-pub use avx_support::*;
-
-pub use avx_xyz_lab_to_image::*;
-
-pub use avx_xyza_laba_to_image::*;
+mod to_xyz_lab;
+mod utils;
+mod color;
+mod gamma_curves;
+mod math;
+mod support;
+mod xyz_lab_to_image;
+mod linear_to_image;
+mod xyza_laba_to_image;
+mod to_linear;
+
+pub use linear_to_image::avx_linear_to_gamma;
+pub use math::*;
+pub use support::*;
+pub use to_xyz_lab::*;
+pub use utils::*;
+pub use xyz_lab_to_image::*;
+pub use xyza_laba_to_image::*;
+pub use to_linear::avx_channels_to_linear;
\ No newline at end of file
diff --git a/src/avx/avx_support.rs b/src/avx/support.rs
similarity index 100%
rename from src/avx/avx_support.rs
rename to src/avx/support.rs
diff --git a/src/avx/to_linear.rs b/src/avx/to_linear.rs
new file mode 100644
index 0000000..484edf6
--- /dev/null
+++ b/src/avx/to_linear.rs
@@ -0,0 +1,207 @@
+use crate::avx::gamma_curves::get_avx2_linear_transfer;
+use crate::avx::{
+    avx2_deinterleave_rgb_epi8, avx2_deinterleave_rgba_epi8, avx2_interleave_rgb_ps,
+    avx2_interleave_rgba_ps,
+};
+use crate::gamma_curves::TransferFunction;
+use crate::image::ImageConfiguration;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+#[inline(always)]
+unsafe fn triple_to_linear(
+    r: __m256i,
+    g: __m256i,
+    b: __m256i,
+    transfer: &unsafe fn(__m256) -> __m256,
+) -> (__m256, __m256, __m256) {
+    let u8_scale = _mm256_set1_ps(1f32 / 255f32);
+    let r_f = _mm256_mul_ps(_mm256_cvtepi32_ps(r), u8_scale);
+    let g_f = _mm256_mul_ps(_mm256_cvtepi32_ps(g), u8_scale);
+    let b_f = _mm256_mul_ps(_mm256_cvtepi32_ps(b), u8_scale);
+    let r_linear = transfer(r_f);
+    let g_linear = transfer(g_f);
+    let b_linear = transfer(b_f);
+    (r_linear, g_linear, b_linear)
+}
+
+#[inline(always)]
+pub unsafe fn avx_channels_to_linear<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
+    start_cx: usize,
+    src: *const u8,
+    src_offset: usize,
+    width: u32,
+    dst: *mut f32,
+    dst_offset: usize,
+    transfer_function: TransferFunction,
+) -> usize {
+    let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
+    let channels = image_configuration.get_channels_count();
+    let mut cx = start_cx;
+
+    let transfer = get_avx2_linear_transfer(transfer_function);
+
+    let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32;
+
+    while cx + 32 < width as usize {
+        let (r_chan, g_chan, b_chan, a_chan);
+        let src_ptr = src.add(src_offset + cx * channels);
+        let row1 = _mm256_loadu_si256(src_ptr as *const __m256i);
+        let row2 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i);
+        let row3 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i);
+        match image_configuration {
+            ImageConfiguration::Rgb | ImageConfiguration::Bgr => {
+                let (c1, c2, c3) = avx2_deinterleave_rgb_epi8(row1, row2, row3);
+                if image_configuration == ImageConfiguration::Rgb {
+                    r_chan = c1;
+                    g_chan = c2;
+                    b_chan = c3;
+                } else {
+                    r_chan = c3;
+                    g_chan = c2;
+                    b_chan = c1;
+                }
+                a_chan = _mm256_set1_epi8(-128);
+            }
+            ImageConfiguration::Rgba | ImageConfiguration::Bgra => {
+                let row4 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i);
+                let (c1, c2, c3, c4) = avx2_deinterleave_rgba_epi8(row1, row2, row3, row4);
+                if image_configuration == ImageConfiguration::Rgba {
+                    r_chan = c1;
+                    g_chan = c2;
+                    b_chan = c3;
+                    a_chan = c4;
+                } else {
+                    r_chan = c3;
+                    g_chan = c2;
+                    b_chan = c1;
+                    a_chan = c4;
+                }
+            }
+        }
+
+        let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan));
+        let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan));
+        let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan));
+
+        let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low));
+        let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low));
+        let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low));
+
+        let (x_low_low, y_low_low, z_low_low) =
+            triple_to_linear(r_low_low, g_low_low, b_low_low, &transfer);
+
+        let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan));
+
+        let u8_scale = _mm256_set1_ps(1f32 / 255f32);
+
+        if USE_ALPHA {
+            let a_low_low = _mm256_mul_ps(
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))),
+                u8_scale,
+            );
+
+            let (v0, v1, v2, v3) =
+                avx2_interleave_rgba_ps(x_low_low, y_low_low, z_low_low, a_low_low);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4), v0);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 8), v1);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 16), v2);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 24), v3);
+        } else {
+            let (v0, v1, v2) = avx2_interleave_rgb_ps(x_low_low, y_low_low, z_low_low);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3), v0);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 8), v1);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 16), v2);
+        }
+
+        let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low));
+        let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low));
+        let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low));
+
+        let (x_low_high, y_low_high, z_low_high) =
+            triple_to_linear(r_low_high, g_low_high, b_low_high, &transfer);
+
+        if USE_ALPHA {
+            let a_low_high = _mm256_mul_ps(
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(a_low))),
+                u8_scale,
+            );
+
+            let (v0, v1, v2, v3) =
+                avx2_interleave_rgba_ps(x_low_high, y_low_high, z_low_high, a_low_high);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 32), v0);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 32 + 8), v1);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 32 + 16), v2);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 32 + 24), v3);
+        } else {
+            let (v0, v1, v2) = avx2_interleave_rgb_ps(x_low_high, y_low_high, z_low_high);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 24), v0);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 24 + 8), v1);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 24 + 16), v2);
+        }
+
+        let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan));
+        let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan));
+        let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan));
+
+        let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high));
+        let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high));
+        let b_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_high));
+
+        let (x_high_low, y_high_low, z_high_low) =
+            triple_to_linear(r_high_low, g_high_low, b_high_low, &transfer);
+
+        let a_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(a_chan));
+
+        if USE_ALPHA {
+            let a_high_low = _mm256_mul_ps(
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_high))),
+                u8_scale,
+            );
+
+            let (v0, v1, v2, v3) =
+                avx2_interleave_rgba_ps(x_high_low, y_high_low, z_high_low, a_high_low);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 64), v0);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 64 + 8), v1);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 64 + 16), v2);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 64 + 24), v3);
+        } else {
+            let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_low, y_high_low, z_high_low);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 48), v0);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 48 + 8), v1);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 48 + 16), v2);
+        }
+
+        let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high));
+        let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high));
+        let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high));
+
+        let (x_high_high, y_high_high, z_high_high) =
+            triple_to_linear(r_high_high, g_high_high, b_high_high, &transfer);
+
+        if USE_ALPHA {
+            let a_high_high = _mm256_mul_ps(
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(a_high))),
+                u8_scale,
+            );
+
+            let (v0, v1, v2, v3) =
+                avx2_interleave_rgba_ps(x_high_high, y_high_high, z_high_high, a_high_high);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 96), v0);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 96 + 8), v1);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 96 + 16), v2);
+            _mm256_storeu_ps(dst_ptr.add(cx * 4 + 96 + 24), v3);
+        } else {
+            let (v0, v1, v2) = avx2_interleave_rgb_ps(x_high_high, y_high_high, z_high_high);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 24 * 3), v0);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 24 * 3 + 8), v1);
+            _mm256_storeu_ps(dst_ptr.add(cx * 3 + 24 * 3 + 16), v2);
+        }
+
+        cx += 32;
+    }
+
+    cx
+}
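`avx_channels_to_linear` widens 32 interleaved u8 pixels through the `cvtepu8`/`cvtepu16` ladder, normalizes by 1/255, and decodes in four groups of eight. Per channel lane it is equivalent to this scalar sketch (the sRGB EOTF is an assumed example; the actual curve comes from `get_avx2_linear_transfer`):

```rust
// Scalar model of one channel lane of avx_channels_to_linear:
// widen u8 -> f32, normalize to [0, 1], then decode to linear light.
fn u8_lane_to_linear(v: u8, transfer: fn(f32) -> f32) -> f32 {
    transfer(v as f32 * (1.0 / 255.0))
}

// Assumed example transfer: the sRGB EOTF (gamma-encoded -> linear).
fn srgb_to_linear(c: f32) -> f32 {
    if c <= 0.04045 {
        c / 12.92
    } else {
        ((c + 0.055) / 1.055).powf(2.4)
    }
}
```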
diff --git a/src/image_to_linear.rs b/src/image_to_linear.rs
index 2fad1cc..3a8b889 100644
--- a/src/image_to_linear.rs
+++ b/src/image_to_linear.rs
@@ -1,5 +1,8 @@
-use std::slice;
-
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    target_feature = "avx2"
+))]
+use crate::avx::avx_channels_to_linear;
 use crate::gamma_curves::TransferFunction;
 use crate::image::ImageConfiguration;
 #[cfg(all(
@@ -13,6 +16,7 @@ use crate::neon::neon_channels_to_linear;
 ))]
 use crate::sse::*;
 use crate::Rgb;
+use std::slice;
 
 #[inline(always)]
 fn channels_to_linear<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
@@ -52,9 +56,41 @@ fn channels_to_linear<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
         _has_sse = true;
     }
 
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "avx2"
+    ))]
+    let mut _has_avx2 = false;
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "avx2"
+    ))]
+    if is_x86_feature_detected!("avx2") {
+        _has_avx2 = true;
+    }
+
     for _ in 0..height as usize {
         let mut _cx = 0usize;
 
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "avx2"
+        ))]
+        unsafe {
+            if _has_avx2 {
+                _cx = avx_channels_to_linear::<CHANNELS_CONFIGURATION, USE_ALPHA>(
+                    _cx,
+                    src.as_ptr(),
+                    src_offset,
+                    width,
+                    dst.as_mut_ptr(),
+                    dst_offset,
+                    transfer_function,
+                )
+            }
+        }
+
     #[cfg(all(
         any(target_arch = "x86_64", target_arch = "x86"),
         target_feature = "sse4.1"
diff --git a/src/linear_to_image.rs b/src/linear_to_image.rs
index 9e43290..53c341e 100644
--- a/src/linear_to_image.rs
+++ b/src/linear_to_image.rs
@@ -1,5 +1,8 @@
-use std::slice;
-
+#[cfg(all(
+    any(target_arch = "x86_64", target_arch = "x86"),
+    target_feature = "avx2"
+))]
+use crate::avx::avx_linear_to_gamma;
 use crate::gamma_curves::TransferFunction;
 use crate::image::ImageConfiguration;
 #[cfg(all(
@@ -13,6 +16,7 @@ use crate::neon::neon_linear_to_gamma;
 ))]
 use crate::sse::sse_linear_to_gamma;
 use crate::Rgb;
+use std::slice;
 
 #[inline(always)]
 fn linear_to_gamma_channels<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
@@ -52,9 +56,41 @@ fn linear_to_gamma_channels<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
         _has_sse = true;
     }
 
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "avx2"
+    ))]
+    let mut _has_avx2 = false;
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "avx2"
+    ))]
+    if is_x86_feature_detected!("avx2") {
+        _has_avx2 = true;
+    }
+
     for _ in 0..height as usize {
         let mut _cx = 0usize;
 
+        #[cfg(all(
+            any(target_arch = "x86_64", target_arch = "x86"),
+            target_feature = "avx2"
+        ))]
+        unsafe {
+            if _has_avx2 {
+                _cx = avx_linear_to_gamma::<CHANNELS_CONFIGURATION, USE_ALPHA>(
+                    _cx,
+                    src.as_ptr(),
+                    src_offset as u32,
+                    dst.as_mut_ptr(),
+                    dst_offset as u32,
+                    width,
+                    transfer_function,
+                )
+            }
+        }
+
     #[cfg(all(
         any(target_arch = "x86_64", target_arch = "x86"),
         target_feature = "sse4.1"
@@ -113,7 +149,11 @@ fn linear_to_gamma_channels<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
-            let rgb = Rgb::<f32>::new(r, g, b);
+            let rgb = Rgb::<f32>::new(
+                r.min(1f32).max(0f32),
+                g.min(1f32).max(0f32),
+                b.min(1f32).max(0f32),
+            );
 
             unsafe {
                 *dst_slice.get_unchecked_mut(px) = (transfer(rgb.r) * 255f32) as u8;
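Both dispatch sites above follow the same pattern: a compile-time `#[cfg]` gate keeps AVX2 code out of builds without the target feature, and a runtime `is_x86_feature_detected!` check guards execution; each kernel returns the index of the first unprocessed pixel, so the next kernel or the scalar tail resumes from `_cx`. A condensed sketch with hypothetical kernel names:

```rust
// Condensed model of the dispatch added in image_to_linear.rs and
// linear_to_image.rs. avx2_kernel / sse41_kernel are hypothetical
// stand-ins for the real SIMD row kernels.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
fn process_row(width: usize) {
    let mut _cx = 0usize; // first pixel not yet handled by a SIMD kernel
    if is_x86_feature_detected!("avx2") {
        // _cx = avx2_kernel(_cx, /* ptrs, offsets, */ width); // 32 px per iteration
    }
    if is_x86_feature_detected!("sse4.1") {
        // _cx = sse41_kernel(_cx, /* ptrs, offsets, */ width); // picks up from _cx
    }
    for _px in _cx..width {
        // scalar tail: convert the pixels the vector kernels left over
    }
}
```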
diff --git a/src/neon/mod.rs b/src/neon/mod.rs
index a5d7bd0..2fca0d5 100644
--- a/src/neon/mod.rs
+++ b/src/neon/mod.rs
@@ -1,29 +1,29 @@
-mod from_sigmoidal;
 mod colors;
+mod from_sigmoidal;
 mod gamma_curves;
 mod hsv_to_image;
 mod image_to_hsv;
 mod linear_to_image;
 mod math;
+mod sigmoidal;
 mod to_linear;
 mod to_linear_u8;
+mod to_sigmoidal;
 mod to_xyz_lab;
 mod to_xyza_laba;
 mod xyz_lab_to_image;
 mod xyza_laba_to_image;
-mod sigmoidal;
-mod to_sigmoidal;
 
-pub use from_sigmoidal::neon_from_sigmoidal_row;
 pub use colors::*;
+pub use from_sigmoidal::neon_from_sigmoidal_row;
 pub use gamma_curves::*;
 pub use hsv_to_image::*;
 pub use image_to_hsv::*;
 pub use linear_to_image::*;
 pub use to_linear::*;
 pub use to_linear_u8::*;
+pub use to_sigmoidal::neon_image_to_sigmoidal;
 pub use to_xyz_lab::*;
 pub use to_xyza_laba::*;
 pub use xyz_lab_to_image::*;
 pub use xyza_laba_to_image::*;
-pub use to_sigmoidal::neon_image_to_sigmoidal;
diff --git a/src/sse/image_to_linear_u8.rs b/src/sse/image_to_linear_u8.rs
index 2e6608d..e506b64 100644
--- a/src/sse/image_to_linear_u8.rs
+++ b/src/sse/image_to_linear_u8.rs
@@ -1,11 +1,6 @@
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 pub mod sse_image_to_linear_unsigned {
-    #[allow(unused_imports)]
-    use crate::gamma_curves::TransferFunction;
-    #[allow(unused_imports)]
     use crate::image::ImageConfiguration;
-    #[allow(unused_imports)]
-    use crate::image_to_xyz_lab::XyzTarget;
     use crate::sse::*;
     #[cfg(target_arch = "x86")]
     use std::arch::x86::*;
diff --git a/src/sse/linear_to_image.rs b/src/sse/linear_to_image.rs
index c248392..01beb23 100644
--- a/src/sse/linear_to_image.rs
+++ b/src/sse/linear_to_image.rs
@@ -1,15 +1,11 @@
-#[allow(unused_imports)]
 use crate::image::ImageConfiguration;
-#[allow(unused_imports)]
 use crate::sse::*;
-#[allow(unused_imports)]
 use crate::TransferFunction;
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 unsafe fn sse_gamma_vld<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
     src: *const f32,
@@ -55,6 +51,11 @@ unsafe fn sse_gamma_vld<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
         }
     }
 
+    let zeros = _mm_setzero_ps();
+    r_f32 = _mm_max_ps(_mm_min_ps(r_f32, d_alpha), zeros);
+    g_f32 = _mm_max_ps(_mm_min_ps(g_f32, d_alpha), zeros);
+    b_f32 = _mm_max_ps(_mm_min_ps(b_f32, d_alpha), zeros);
+
     r_f32 = transfer(r_f32);
     g_f32 = transfer(g_f32);
     b_f32 = transfer(b_f32);
@@ -64,15 +65,16 @@ unsafe fn sse_gamma_vld<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
     if USE_ALPHA {
         a_f32 = _mm_mul_ps(a_f32, v_scale_alpha);
     }
+    const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+
     (
-        _mm_cvtps_epi32(r_f32),
-        _mm_cvtps_epi32(g_f32),
-        _mm_cvtps_epi32(b_f32),
-        _mm_cvtps_epi32(a_f32),
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(r_f32)),
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(g_f32)),
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(b_f32)),
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(a_f32)),
     )
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub unsafe fn sse_linear_to_gamma<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
     start_cx: usize,
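The `ROUNDING_FLAGS` change above makes the float-to-int rounding explicit: `_mm_cvtps_epi32` rounds according to the current MXCSR mode (nearest-even by default), so wrapping the input in `_mm_round_ps::<_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC>` pins the behavior regardless of the FP environment, and the new clamp keeps out-of-range linear values from producing wild integers. A scalar analogue of the clamp-and-round step (transfer omitted; `f32::round` ties away from zero, unlike the nearest-even SSE mode, so results differ only at exact .5 ties):

```rust
// Scalar analogue of the clamped, explicitly rounded quantization in sse_gamma_vld.
fn quantize_to_u8(c: f32) -> u8 {
    // clamp to [0, 1] (the new _mm_min_ps/_mm_max_ps lines), then
    // scale, round to nearest, and narrow to a byte
    (c.clamp(0.0, 1.0) * 255.0).round() as u8
}
```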
diff --git a/src/sse/math.rs b/src/sse/math.rs
index 9866c4d..1b51179 100644
--- a/src/sse/math.rs
+++ b/src/sse/math.rs
@@ -150,16 +150,12 @@ pub unsafe fn _mm_exp_ps(x: __m128) -> __m128 {
     return poly;
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_pow_ps(x: __m128, n: __m128) -> __m128 {
     _mm_exp_ps(_mm_mul_ps(n, _mm_log_ps(x)))
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
-#[allow(dead_code)]
 pub unsafe fn _mm_pow_n_ps(x: __m128, n: f32) -> __m128 {
     _mm_exp_ps(_mm_mul_ps(_mm_set1_ps(n), _mm_log_ps(x)))
 }
diff --git a/src/sse/to_linear.rs b/src/sse/to_linear.rs
index 8ca9592..628d299 100644
--- a/src/sse/to_linear.rs
+++ b/src/sse/to_linear.rs
@@ -1,18 +1,11 @@
-#[allow(unused_imports)]
 use crate::gamma_curves::TransferFunction;
-#[allow(unused_imports)]
 use crate::image::ImageConfiguration;
-#[allow(unused_imports)]
-use crate::image_to_xyz_lab::XyzTarget;
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-#[allow(unused_imports)]
 use crate::sse::*;
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 unsafe fn sse_triple_to_linear(
     r: __m128i,
@@ -30,7 +23,6 @@ unsafe fn sse_triple_to_linear(
     (r_linear, g_linear, b_linear)
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub unsafe fn sse_channels_to_linear<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
     start_cx: usize,
diff --git a/src/sse/to_xyz_lab.rs b/src/sse/to_xyz_lab.rs
index ab0e598..72e1064 100644
--- a/src/sse/to_xyz_lab.rs
+++ b/src/sse/to_xyz_lab.rs
@@ -1,9 +1,7 @@
 use crate::gamma_curves::TransferFunction;
 use crate::image::ImageConfiguration;
-#[allow(unused_imports)]
 use crate::image_to_xyz_lab::XyzTarget;
 use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y};
-#[allow(unused_imports)]
 use crate::sse::*;
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
@@ -40,7 +38,6 @@ pub(crate) unsafe fn sse_triple_to_xyz(
     (x, y, z)
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub(crate) unsafe fn sse_triple_to_luv(
     x: __m128,
@@ -71,7 +68,6 @@ pub(crate) unsafe fn sse_triple_to_luv(
     (l, u, v)
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub(crate) unsafe fn sse_triple_to_lab(
     x: __m128,
@@ -99,7 +95,6 @@ pub(crate) unsafe fn sse_triple_to_lab(
     (l, a, b)
 }
 
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 pub unsafe fn sse_channels_to_xyz_or_lab<
     const CHANNELS_CONFIGURATION: u8,