Skip to content

Commit

Permalink
AVX2 fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Jun 15, 2024
1 parent baba914 commit e84a311
Show file tree
Hide file tree
Showing 10 changed files with 71 additions and 123 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] }

[package]
name = "colorutils-rs"
version = "0.4.0"
version = "0.4.1"
edition = "2021"
description = "High performance utilities for color format handling and conversion."
readme = "README.md"
Expand Down
83 changes: 8 additions & 75 deletions src/app/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use colorutils_rs::*;
use image::io::Reader as ImageReader;
use image::{EncodableLayout, GenericImageView};
use std::time::Instant;

use image::{EncodableLayout, GenericImageView};
use image::io::Reader as ImageReader;

use colorutils_rs::*;

#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[allow(dead_code)]
Expand All @@ -12,27 +14,6 @@ pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
}

fn main() {
// #[cfg(target_arch = "x86_64")]
// unsafe {
// println!("HAS fma: {}", is_x86_feature_detected!("fma"));
// let mut dst: [f32; 4] = [0f32; 4];
// // let src = _mm_setr_ps(0.119973198f32, 0.0428578928f32, 0.225254923f32, 27f32);
// let src = _mm_setr_ps(0.0428578928f32, 0.0428578928f32, 0.0428578928f32, 27f32);
// let rgb = Rgb::<u8>::new(0, 0, 0);
// let xyz = Xyz::from_srgb(&rgb);
// let ln = _mm_cbrt_ps(src);
// println!("X: {}, Y: {}, Z: {}", 0.119973198f32.cbrt(), 0.0428578928f32.cbrt(), 0.225254923f32.cbrt());
// _mm_storeu_ps(dst.as_mut_ptr() as *mut f32, ln);
// println!("{:?}", dst);
// }
// #[cfg(target_arch = "aarch64")]
// unsafe {
// let m = vdupq_n_f32(27f32);
// let cbrt = vcbrtq_f32_ulp2(m);
// let l = vgetq_lane_f32::<0>(cbrt);
// println!("Cbrt {}", l);
// }

let r = 140;
let g = 164;
let b = 177;
Expand All @@ -42,55 +23,7 @@ fn main() {
println!("HSL {:?}", hsl);
println!("Back RGB {:?}", hsl.to_rgb8());

// unsafe {
// let (h, s, l) = neon_rgb_to_hsl(
// vdupq_n_u32(r as u32),
// vdupq_n_u32(g as u32),
// vdupq_n_u32(b as u32),
// vdupq_n_f32(1f32),
// );
// println!(
// "NEON HSL {}, {}, {}",
// vgetq_lane_f32::<0>(h),
// vgetq_lane_f32::<0>(s),
// vgetq_lane_f32::<0>(l)
// );
// let (r1, g1, b1) = neon_hsl_to_rgb(h, s, l, vdupq_n_f32(1f32));
//
// println!(
// "NEON HSL -> RGB {}, {}, {}",
// vgetq_lane_u32::<0>(r1),
// vgetq_lane_u32::<0>(g1),
// vgetq_lane_u32::<0>(b1)
// );
// }
//
// unsafe {
// let (h, s, v) = neon_rgb_to_hsv(
// vdupq_n_u32(r as u32),
// vdupq_n_u32(g as u32),
// vdupq_n_u32(b as u32),
// vdupq_n_f32(1f32),
// );
// let hsv = rgb.to_hsv();
// println!("HSV {:?}", hsv);
// println!("HSV->RBB {:?}", hsv.to_rgb8());
// println!(
// "NEON HSV {}, {}, {}",
// vgetq_lane_f32::<0>(h),
// vgetq_lane_f32::<0>(s),
// vgetq_lane_f32::<0>(v)
// );
// let (r1, g1, b1) = neon_hsv_to_rgb(h, s, v, vdupq_n_f32(1f32));
// println!(
// "NEON RGB {}, {}, {}",
// vgetq_lane_u32::<0>(r1),
// vgetq_lane_u32::<0>(g1),
// vgetq_lane_u32::<0>(b1)
// );
// }

let img = ImageReader::open("./assets/asset.jpg")
let img = ImageReader::open("./assets/asset_middle.jpg")
.unwrap()
.decode()
.unwrap();
Expand Down Expand Up @@ -125,7 +58,7 @@ fn main() {
lab_store.resize(width as usize * components * height as usize, 0f32);
let src_stride = width * components as u32;
let start_time = Instant::now();
rgb_to_sigmoidal(
rgb_to_lab(
src_bytes,
src_stride,
&mut lab_store,
Expand Down Expand Up @@ -159,7 +92,7 @@ fn main() {
// }

let start_time = Instant::now();
sigmoidal_to_rgb(
lab_to_srgb(
&lab_store,
store_stride as u32,
&mut dst_slice,
Expand Down
4 changes: 2 additions & 2 deletions src/avx/avx2_to_xyz_lab.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab<
let row3 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i);
match image_configuration {
ImageConfiguration::Rgb | ImageConfiguration::Bgr => {
let (c1, c2, c3) = avx2_deinterleave_rgb(row1, row2, row3);
let (c1, c2, c3) = avx2_deinterleave_rgb_epi8(row1, row2, row3);
if image_configuration == ImageConfiguration::Rgb {
r_chan = c1;
g_chan = c2;
Expand All @@ -177,7 +177,7 @@ pub(crate) unsafe fn avx2_channels_to_xyz_or_lab<
}
ImageConfiguration::Rgba | ImageConfiguration::Bgra => {
let row4 = _mm256_loadu_si256(src_ptr.add(64 + 32) as *const __m256i);
let (c1, c2, c3, c4) = avx2_deinterleave_rgba(row1, row2, row3, row4);
let (c1, c2, c3, c4) = avx2_deinterleave_rgba_epi8(row1, row2, row3, row4);
if image_configuration == ImageConfiguration::Rgba {
r_chan = c1;
g_chan = c2;
Expand Down
6 changes: 1 addition & 5 deletions src/avx/avx_gamma_curves.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(0.0030412825601275209f32);
let mask = _mm256_cmp_ps::<_CMP_GE_OS>(linear, low_cut_off);
Expand All @@ -29,7 +28,6 @@ pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 {

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(12.92f32 * 0.0030412825601275209f32);
let mask = _mm256_cmp_ps::<_CMP_GE_OS>(gamma, low_cut_off);
Expand All @@ -48,7 +46,6 @@ pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 {

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(0.018053968510807f32);
let mask = _mm256_cmp_ps::<_CMP_GE_OS>(linear, low_cut_off);
Expand All @@ -69,7 +66,6 @@ pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 {

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn avx2_rec709_to_linear(linear: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(4.5f32 * 0.018053968510807f32);
let mask = _mm256_cmp_ps::<_CMP_GE_OS>(linear, low_cut_off);
Expand All @@ -92,7 +88,7 @@ pub unsafe fn get_avx_gamma_transfer(
transfer_function: TransferFunction,
) -> unsafe fn(__m256) -> __m256 {
match transfer_function {
TransferFunction::Srgb => avx2_srgb_to_linear,
TransferFunction::Srgb => avx2_srgb_from_linear,
TransferFunction::Rec709 => avx2_rec709_from_linear,
}
}
5 changes: 0 additions & 5 deletions src/avx/avx_math.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,26 @@ use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_cube_ps(x: __m256) -> __m256 {
_mm256_mul_ps(_mm256_mul_ps(x, x), x)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(not(target_feature = "fma"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_prefer_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
return _mm256_add_ps(_mm256_mul_ps(b, c), a);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(target_feature = "fma")]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_prefer_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
return _mm256_fmadd_ps(b, c, a);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
unsafe fn _mm256_taylorpoly_ps(
x: __m256,
poly0: __m256,
Expand Down Expand Up @@ -56,7 +52,6 @@ unsafe fn _mm256_taylorpoly_ps(

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 {
let const_ln127 = _mm256_set1_epi32(127); // 127
let const_ln2 = _mm256_set1_ps(std::f32::consts::LN_2); // ln(2)
Expand Down
64 changes: 42 additions & 22 deletions src/avx/avx_support.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,28 @@ pub unsafe fn avx2_interleave_rgb(
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn avx2_deinterleave_rgb(
pub unsafe fn avx2_deinterleave_rgb_epi32(
bgr0: __m256i,
bgr1: __m256i,
bgr2: __m256i,
) -> (__m256i, __m256i, __m256i) {
let s02_low = _mm256_permute2x128_si256::<32>(bgr0, bgr2);
let s02_high = _mm256_permute2x128_si256::<49>(bgr0, bgr2);

let b0 = _mm256_blend_epi32::<0x92>(_mm256_blend_epi32::<0x24>(s02_low, s02_high), bgr1);
let g0 = _mm256_blend_epi32::<0x24>(_mm256_blend_epi32::<0x92>(s02_high, s02_low), bgr1);
let r0 = _mm256_blend_epi32::<0x92>(_mm256_blend_epi32::<0x24>(bgr1, s02_low), s02_high);

let b0 = _mm256_shuffle_epi32::<0x6c>(b0);
let g0 = _mm256_shuffle_epi32::<0xb1>(g0);
let r0 = _mm256_shuffle_epi32::<0xc6>(r0);
(b0, g0, r0)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn avx2_deinterleave_rgb_epi8(
rgb0: __m256i,
rgb1: __m256i,
rgb2: __m256i,
Expand Down Expand Up @@ -220,7 +241,7 @@ pub unsafe fn avx2_deinterleave_rgb_ps(
rgb1: __m256,
rgb2: __m256,
) -> (__m256, __m256, __m256) {
let (r, g, b) = avx2_deinterleave_rgb(
let (r, g, b) = avx2_deinterleave_rgb_epi32(
_mm256_castps_si256(rgb0),
_mm256_castps_si256(rgb1),
_mm256_castps_si256(rgb2),
Expand All @@ -243,7 +264,7 @@ pub unsafe fn avx2_reshuffle_odd(v: __m256i) -> __m256i {
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn avx2_deinterleave_rgba(
pub unsafe fn avx2_deinterleave_rgba_epi8(
rgba0: __m256i,
rgba1: __m256i,
rgba2: __m256i,
Expand Down Expand Up @@ -295,28 +316,27 @@ pub unsafe fn avx2_store_u8_rgb(ptr: *mut u8, r: __m256i, g: __m256i, b: __m256i

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn avx2_interleave_rgba_epi8(
r: __m256i,
g: __m256i,
b: __m256i,
a: __m256i,
b: __m256i,
c: __m256i,
d: __m256i,
) -> (__m256i, __m256i, __m256i, __m256i) {
let bg0 = _mm256_unpacklo_epi8(r, g);
let bg1 = _mm256_unpackhi_epi8(r, g);
let ra0 = _mm256_unpacklo_epi8(b, a);
let ra1 = _mm256_unpackhi_epi8(b, a);

let rgba0_ = _mm256_unpacklo_epi16(bg0, ra0);
let rgba1_ = _mm256_unpackhi_epi16(bg0, ra0);
let rgba2_ = _mm256_unpacklo_epi16(bg1, ra1);
let rgba3_ = _mm256_unpackhi_epi16(bg1, ra1);

let rgba0 = _mm256_permute2x128_si256::<32>(rgba0_, rgba1_);
let rgba2 = _mm256_permute2x128_si256::<49>(rgba0_, rgba1_);
let rgba1 = _mm256_permute2x128_si256::<32>(rgba2_, rgba3_);
let rgba3 = _mm256_permute2x128_si256::<49>(rgba2_, rgba3_);
(rgba0, rgba1, rgba2, rgba3)
let bg0 = _mm256_unpacklo_epi8(a, b);
let bg1 = _mm256_unpackhi_epi8(a, b);
let ra0 = _mm256_unpacklo_epi8(c, d);
let ra1 = _mm256_unpackhi_epi8(c, d);

let bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
let bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
let bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
let bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);

let bgra0 = _mm256_permute2x128_si256::<32>(bgra0_, bgra1_);
let bgra2 = _mm256_permute2x128_si256::<49>(bgra0_, bgra1_);
let bgra1 = _mm256_permute2x128_si256::<32>(bgra2_, bgra3_);
let bgra3 = _mm256_permute2x128_si256::<49>(bgra2_, bgra3_);
(bgra0, bgra1, bgra2, bgra3)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
Expand Down
6 changes: 3 additions & 3 deletions src/avx/avx_xyz_lab_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ pub unsafe fn avx_xyz_to_channels<
c9,
);

let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS);
let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS);

let (r_row1_, g_row1_, b_row1_) =
avx_xyz_lab_vld::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
Expand All @@ -155,7 +155,7 @@ pub unsafe fn avx_xyz_to_channels<
c9,
);

let src_ptr_2 = offset_src_ptr.add(4 * 2 * CHANNELS);
let src_ptr_2 = offset_src_ptr.add(8 * 2 * CHANNELS);

let (r_row2_, g_row2_, b_row2_) =
avx_xyz_lab_vld::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
Expand All @@ -172,7 +172,7 @@ pub unsafe fn avx_xyz_to_channels<
c9,
);

let src_ptr_3 = offset_src_ptr.add(4 * 3 * CHANNELS);
let src_ptr_3 = offset_src_ptr.add(8 * 3 * CHANNELS);

let (r_row3_, g_row3_, b_row3_) =
avx_xyz_lab_vld::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
Expand Down
Loading

0 comments on commit e84a311

Please sign in to comment.