Skip to content

Commit

Permalink
Fix discovered AVX sigmoidal bug
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Dec 28, 2024
1 parent 91a4fe6 commit ff40c57
Show file tree
Hide file tree
Showing 10 changed files with 16 additions and 236 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] }

[package]
name = "colorutils-rs"
version = "0.7.3"
version = "0.7.4"
edition = "2021"
description = "High performance utilities for color format handling and conversion."
readme = "README.md"
Expand Down
3 changes: 1 addition & 2 deletions src/avx/oklab_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,7 @@ pub unsafe fn avx_oklab_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET:
);

while cx + 8 < width as usize {
let offset_src_ptr =
((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels);
let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels);

let src_ptr_0 = offset_src_ptr;

Expand Down
86 changes: 0 additions & 86 deletions src/avx/routines.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,92 +58,6 @@ pub(crate) unsafe fn avx_vld_u8_and_deinterleave<const CHANNELS_CONFIGURATION: u
(r_chan, g_chan, b_chan, a_chan)
}

/// Loads 64 bytes of interleaved 8-bit pixel data starting at `ptr` and splits
/// them into one vector per channel.
///
/// Only the first two input rows of the deinterleave helper are read from
/// memory; the remaining row(s) are passed as all-zero vectors.
///
/// The returned tuple is laid out as `(r, g, b, a)`. For the `Bgr` / `Bgra`
/// layouts the first and third deinterleaved channels are swapped before being
/// returned. For the 3-channel layouts the alpha vector is not read from
/// memory: every byte lane is set to `-128` as `i8` (bit pattern `0x80`).
///
/// # Safety
///
/// Two unaligned 32-byte loads are issued, at `ptr` and at `ptr + 32`, for
/// every layout, so the caller must guarantee that 64 bytes are readable from
/// `ptr`.
// NOTE(review): presumably the caller must also ensure AVX2 is available on the
// running CPU — confirm against the call sites.
#[inline(always)]
pub(crate) unsafe fn avx_vld_u8_and_deinterleave_half<const CHANNELS_CONFIGURATION: u8>(
    ptr: *const u8,
) -> (__m256i, __m256i, __m256i, __m256i) {
    let layout: ImageConfiguration = CHANNELS_CONFIGURATION.into();

    let zeroed = _mm256_setzero_si256();
    let lo = _mm256_loadu_si256(ptr.cast::<__m256i>());
    let hi = _mm256_loadu_si256(ptr.add(32).cast::<__m256i>());

    match layout {
        ImageConfiguration::Rgb | ImageConfiguration::Bgr => {
            let (first, second, third) = avx2_deinterleave_rgb_epi8(lo, hi, zeroed);
            // No alpha in the source: synthesize a constant alpha vector.
            let alpha = _mm256_set1_epi8(-128);
            if matches!(layout, ImageConfiguration::Rgb) {
                (first, second, third, alpha)
            } else {
                // BGR: swap the outer channels so the result is R, G, B ordered.
                (third, second, first, alpha)
            }
        }
        ImageConfiguration::Rgba | ImageConfiguration::Bgra => {
            let (first, second, third, alpha) =
                avx2_deinterleave_rgba_epi8(lo, hi, zeroed, zeroed);
            if matches!(layout, ImageConfiguration::Rgba) {
                (first, second, third, alpha)
            } else {
                // BGRA: swap the outer colour channels, alpha stays last.
                (third, second, first, alpha)
            }
        }
    }
}

/// Loads 32 bytes of interleaved 8-bit pixel data starting at `ptr` and splits
/// them into one vector per channel.
///
/// Only the first input row of the deinterleave helper is read from memory;
/// every other row is passed as an all-zero vector.
///
/// The returned tuple is laid out as `(r, g, b, a)`. For the `Bgr` / `Bgra`
/// layouts the first and third deinterleaved channels are swapped before being
/// returned. For the 3-channel layouts the alpha vector is not read from
/// memory: every byte lane is set to `-128` as `i8` (bit pattern `0x80`).
///
/// # Safety
///
/// A single unaligned 32-byte load is issued at `ptr` for every layout, so the
/// caller must guarantee that 32 bytes are readable from `ptr`.
// NOTE(review): presumably the caller must also ensure AVX2 is available on the
// running CPU — confirm against the call sites.
#[inline(always)]
pub(crate) unsafe fn avx_vld_u8_and_deinterleave_quarter<const CHANNELS_CONFIGURATION: u8>(
    ptr: *const u8,
) -> (__m256i, __m256i, __m256i, __m256i) {
    let layout: ImageConfiguration = CHANNELS_CONFIGURATION.into();

    let zeroed = _mm256_setzero_si256();
    let packed = _mm256_loadu_si256(ptr.cast::<__m256i>());

    match layout {
        ImageConfiguration::Rgb | ImageConfiguration::Bgr => {
            let (first, second, third) = avx2_deinterleave_rgb_epi8(packed, zeroed, zeroed);
            // No alpha in the source: synthesize a constant alpha vector.
            let alpha = _mm256_set1_epi8(-128);
            if matches!(layout, ImageConfiguration::Rgb) {
                (first, second, third, alpha)
            } else {
                // BGR: swap the outer channels so the result is R, G, B ordered.
                (third, second, first, alpha)
            }
        }
        ImageConfiguration::Rgba | ImageConfiguration::Bgra => {
            let (first, second, third, alpha) =
                avx2_deinterleave_rgba_epi8(packed, zeroed, zeroed, zeroed);
            if matches!(layout, ImageConfiguration::Rgba) {
                (first, second, third, alpha)
            } else {
                // BGRA: swap the outer colour channels, alpha stays last.
                (third, second, first, alpha)
            }
        }
    }
}

#[inline(always)]
pub(crate) unsafe fn avx_vld_f32_and_deinterleave<const CHANNELS_CONFIGURATION: u8>(
ptr: *const f32,
Expand Down
134 changes: 1 addition & 133 deletions src/avx/to_sigmoidal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,7 @@ use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::avx::routines::{
avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half,
avx_vld_u8_and_deinterleave_quarter,
};
use crate::avx::routines::avx_vld_u8_and_deinterleave;
use crate::avx::sigmoidal::avx_rgb_to_sigmoidal;
use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps};
use crate::image::ImageConfiguration;
Expand Down Expand Up @@ -192,134 +189,5 @@ pub unsafe fn avx_image_to_sigmoidal_row<
cx += 32;
}

while cx + 16 < width as usize {
let src_ptr = src.add(cx * channels);
let (r_chan, g_chan, b_chan, a_chan) =
avx_vld_u8_and_deinterleave_half::<CHANNELS_CONFIGURATION>(src_ptr);

let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan));
let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan));
let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan));
let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan));

let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low));
let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low));
let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low));

let (x_low_low, y_low_low, z_low_low) =
avx_rgb_to_sigmoidal(r_low_low, g_low_low, b_low_low);

let u8_scale = _mm256_set1_ps(1f32 / 255f32);

if USE_ALPHA {
let a_low_low = _mm256_mul_ps(
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))),
u8_scale,
);

let ptr = dst_ptr.add(cx * channels);
avx_store_and_interleave_v4_f32!(
ptr,
image_configuration,
x_low_low,
y_low_low,
z_low_low,
a_low_low
);
} else {
let ptr = dst_ptr.add(cx * channels);
avx_store_and_interleave_v3_f32!(
ptr,
image_configuration,
x_low_low,
y_low_low,
z_low_low
);
}

let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low));
let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low));
let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low));

let (x_low_high, y_low_high, z_low_high) =
avx_rgb_to_sigmoidal(r_low_high, g_low_high, b_low_high);

if USE_ALPHA {
let a_low_high = _mm256_mul_ps(
_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))),
u8_scale,
);

let ptr = dst_ptr.add(cx * channels + 8 * channels);
avx_store_and_interleave_v4_f32!(
ptr,
image_configuration,
x_low_high,
y_low_high,
z_low_high,
a_low_high
);
} else {
let ptr = dst_ptr.add(cx * channels + 8 * channels);
avx_store_and_interleave_v3_f32!(
ptr,
image_configuration,
x_low_high,
y_low_high,
z_low_high
);
}

cx += 16;
}

while cx + 8 < width as usize {
let src_ptr = src.add(cx * channels);
let (r_chan, g_chan, b_chan, a_chan) =
avx_vld_u8_and_deinterleave_quarter::<CHANNELS_CONFIGURATION>(src_ptr);

let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan));
let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan));
let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan));
let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan));

let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low));
let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low));
let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low));

let (x_low_low, y_low_low, z_low_low) =
avx_rgb_to_sigmoidal(r_low_low, g_low_low, b_low_low);

let u8_scale = _mm256_set1_ps(1f32 / 255f32);

if USE_ALPHA {
let a_low_low = _mm256_mul_ps(
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))),
u8_scale,
);

let ptr = dst_ptr.add(cx * channels);
avx_store_and_interleave_v4_f32!(
ptr,
image_configuration,
x_low_low,
y_low_low,
z_low_low,
a_low_low
);
} else {
let ptr = dst_ptr.add(cx * channels);
avx_store_and_interleave_v3_f32!(
ptr,
image_configuration,
x_low_low,
y_low_low,
z_low_low
);
}

cx += 8;
}

cx
}
11 changes: 5 additions & 6 deletions src/gamma_curves.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,14 @@ pub fn hlg_to_linear(gamma: f32) -> f32 {
if gamma < 0.0 {
return 0.0;
}
let linear;
if gamma <= 0.5 {
linear = f32::powf((gamma * gamma) * (1.0 / 3.0), 1.2);
let linear = if gamma <= 0.5 {
f32::powf((gamma * gamma) * (1.0 / 3.0), 1.2)
} else {
linear = f32::powf(
f32::powf(
(f32::exp((gamma - 0.55991073) / 0.17883277) + 0.28466892) / 12.0,
1.2,
);
}
)
};
// Scale so that SDR white is 1.0 (extended SDR).
linear * HLG_WHITE_NITS / SDR_WHITE_NITS
}
Expand Down
4 changes: 2 additions & 2 deletions src/image_to_lalphabeta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ fn channels_to_lalphabeta<const CHANNELS_CONFIGURATION: u8>(
let channels = image_configuration.get_channels_count();

let mut lut_table = vec![0f32; 256];
for i in 0..256 {
lut_table[i] = transfer_function.linearize(i as f32 * (1. / 255.0));
for (i, element) in lut_table.iter_mut().enumerate() {
*element = transfer_function.linearize(i as f32 * (1. / 255.0));
}

let dst_slice_safe_align = unsafe {
Expand Down
2 changes: 1 addition & 1 deletion src/oklab.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* // Use of this source code is governed by a BSD-style
* // license that can be found in the LICENSE file.
*/
#[allow(clippy::excessive_precision)]
#![allow(clippy::excessive_precision)]
use crate::utils::mlaf;
use crate::{EuclideanDistance, Rgb, TaxicabDistance, TransferFunction};
use num_traits::Pow;
Expand Down
4 changes: 2 additions & 2 deletions src/xyz_lab_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ fn xyz_to_channels<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, cons
};

let mut lut_table = vec![0u8; 2049];
for i in 0..2049 {
lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.).min(255.) as u8;
for (i, element) in lut_table.iter_mut().enumerate() {
*element = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.).min(255.) as u8;
}

#[cfg(feature = "rayon")]
Expand Down
4 changes: 2 additions & 2 deletions src/xyza_laba_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ fn xyz_with_alpha_to_channels<const CHANNELS_CONFIGURATION: u8, const TARGET: u8
}

let mut lut_table = vec![0u8; 2049];
for i in 0..2049 {
lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.).min(255.) as u8;
for (i, element) in lut_table.iter_mut().enumerate() {
*element = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.).min(255.) as u8;
}

let src_slice_safe_align = unsafe {
Expand Down

0 comments on commit ff40c57

Please sign in to comment.