Skip to content

Commit

Permalink
Improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Aug 1, 2024
1 parent 304088b commit de01f24
Show file tree
Hide file tree
Showing 20 changed files with 863 additions and 137 deletions.
8 changes: 3 additions & 5 deletions src/app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,13 @@ fn main() {
lab_store.resize(width as usize * components * height as usize, 0f32);
let src_stride = width * components as u32;
let start_time = Instant::now();
rgb_to_oklch(
rgb_to_lab(
src_bytes,
src_stride,
&mut lab_store,
store_stride as u32,
width,
height,
TransferFunction::Srgb,
height
);
let elapsed_time = start_time.elapsed();
// Print the elapsed time in milliseconds
Expand Down Expand Up @@ -101,14 +100,13 @@ fn main() {
// }

let start_time = Instant::now();
oklch_to_rgb(
lab_to_srgb(
&lab_store,
store_stride as u32,
&mut dst_slice,
src_stride,
width,
height,
TransferFunction::Srgb,
);

let elapsed_time = start_time.elapsed();
Expand Down
32 changes: 15 additions & 17 deletions src/avx/image_to_oklab.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,6 @@ pub unsafe fn avx_image_to_oklab<const CHANNELS_CONFIGURATION: u8, const TARGET:
_mm256_set1_ps(-0.8086757660f32),
);

let zeros = _mm256_setzero_si256();

while cx + 32 < width as usize {
let src_ptr = src.add(src_offset + cx * channels);
let (r_chan, g_chan, b_chan, a_chan) =
Expand Down Expand Up @@ -157,9 +155,9 @@ pub unsafe fn avx_image_to_oklab<const CHANNELS_CONFIGURATION: u8, const TARGET:
avx_store_and_interleave_v3_direct_f32!(ptr, x_low_low, y_low_low, z_low_low);
}

let r_low_high = _mm256_unpackhi_epi16(r_low, zeros);
let g_low_high = _mm256_unpackhi_epi16(g_low, zeros);
let b_low_high = _mm256_unpackhi_epi16(b_low, zeros);
let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low));
let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low));
let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low));

let (x_low_high, y_low_high, z_low_high) = triple_to_oklab!(
r_low_high, g_low_high, b_low_high, &transfer, target, x0, x1, x2, x3, x4, x5, x6, x7,
Expand All @@ -181,9 +179,9 @@ pub unsafe fn avx_image_to_oklab<const CHANNELS_CONFIGURATION: u8, const TARGET:
avx_store_and_interleave_v3_direct_f32!(ptr, x_low_high, y_low_high, z_low_high);
}

let r_high = _mm256_unpackhi_epi8(r_chan, zeros);
let g_high = _mm256_unpackhi_epi8(g_chan, zeros);
let b_high = _mm256_unpackhi_epi8(b_chan, zeros);
let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan));
let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan));
let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan));

let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high));
let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high));
Expand All @@ -194,7 +192,7 @@ pub unsafe fn avx_image_to_oklab<const CHANNELS_CONFIGURATION: u8, const TARGET:
x8, c0, c1, c2, c3, c4, c5, c6, c7, c8, m0, m1, m2, m3, m4, m5, m6, m7, m8
);

let a_high = _mm256_unpackhi_epi8(a_chan, zeros);
let a_high = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan));

if image_configuration.has_alpha() {
let a_high_low = _mm256_mul_ps(
Expand All @@ -210,9 +208,9 @@ pub unsafe fn avx_image_to_oklab<const CHANNELS_CONFIGURATION: u8, const TARGET:
avx_store_and_interleave_v3_direct_f32!(ptr, x_high_low, y_high_low, z_high_low);
}

let r_high_high = _mm256_unpackhi_epi16(r_high, zeros);
let g_high_high = _mm256_unpackhi_epi16(g_high, zeros);
let b_high_high = _mm256_unpackhi_epi16(b_high, zeros);
let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high));
let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high));
let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high));

let (x_high_high, y_high_high, z_high_high) = triple_to_oklab!(
r_high_high,
Expand Down Expand Up @@ -251,7 +249,7 @@ pub unsafe fn avx_image_to_oklab<const CHANNELS_CONFIGURATION: u8, const TARGET:

if image_configuration.has_alpha() {
let a_high_high = _mm256_mul_ps(
_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_high, zeros)),
_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(a_high))),
u8_scale,
);
let ptr = dst_ptr.add(cx * 4 + 8 * 4 * 3);
Expand Down Expand Up @@ -306,9 +304,9 @@ pub unsafe fn avx_image_to_oklab<const CHANNELS_CONFIGURATION: u8, const TARGET:
avx_store_and_interleave_v3_direct_f32!(ptr, x_low_low, y_low_low, z_low_low);
}

let r_low_high = _mm256_unpackhi_epi16(r_low, zeros);
let g_low_high = _mm256_unpackhi_epi16(g_low, zeros);
let b_low_high = _mm256_unpackhi_epi16(b_low, zeros);
let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low));
let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low));
let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low));

let (x_low_high, y_low_high, z_low_high) = triple_to_oklab!(
r_low_high, g_low_high, b_low_high, &transfer, target, x0, x1, x2, x3, x4, x5, x6, x7,
Expand All @@ -317,7 +315,7 @@ pub unsafe fn avx_image_to_oklab<const CHANNELS_CONFIGURATION: u8, const TARGET:

if image_configuration.has_alpha() {
let a_low_high = _mm256_mul_ps(
_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_low, zeros)),
_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))),
u8_scale,
);

Expand Down
49 changes: 48 additions & 1 deletion src/avx/linear_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ use crate::avx::gamma_curves::get_avx_gamma_transfer;
use crate::avx::routines::avx_vld_f32_and_deinterleave;
use crate::avx::{avx2_interleave_rgb, avx2_interleave_rgba_epi8, avx2_pack_s32, avx2_pack_u16};
use crate::image::ImageConfiguration;
use crate::{avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_u8, TransferFunction};
use crate::{
avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_u8,
avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_u8, TransferFunction,
};

#[inline(always)]
unsafe fn gamma_vld<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool>(
Expand Down Expand Up @@ -113,5 +116,49 @@ pub unsafe fn avx_linear_to_gamma<const CHANNELS_CONFIGURATION: u8, const USE_AL
cx += 32;
}

let zeros = _mm256_setzero_si256();

while cx + 16 < width as usize {
let offset_src_ptr =
((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels);

let src_ptr_0 = offset_src_ptr;

let (r_row0_, g_row0_, b_row0_, a_row0_) =
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_0, transfer_function);

let src_ptr_1 = offset_src_ptr.add(8 * channels);

let (r_row1_, g_row1_, b_row1_, a_row1_) =
gamma_vld::<CHANNELS_CONFIGURATION, USE_ALPHA>(src_ptr_1, transfer_function);

let r_row01 = avx2_pack_s32(r_row0_, r_row1_);
let g_row01 = avx2_pack_s32(g_row0_, g_row1_);
let b_row01 = avx2_pack_s32(b_row0_, b_row1_);

let r_row = avx2_pack_u16(r_row01, zeros);
let g_row = avx2_pack_u16(g_row01, zeros);
let b_row = avx2_pack_u16(b_row01, zeros);

let dst_ptr = dst.add(dst_offset as usize + cx * channels);

if USE_ALPHA {
let a_row01 = avx2_pack_s32(a_row0_, a_row1_);
let a_row = avx2_pack_u16(a_row01, zeros);
avx_store_and_interleave_v4_half_u8!(
dst_ptr,
image_configuration,
r_row,
g_row,
b_row,
a_row
);
} else {
avx_store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row);
}

cx += 16;
}

cx
}
3 changes: 0 additions & 3 deletions src/avx/math.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,16 @@ pub unsafe fn _mm256_cube_ps(x: __m256) -> __m256 {
}

/// Thin wrapper that forwards `x` and `n` unchanged to `_mm256_pow_fast_ps`.
///
/// Both arguments are 256-bit packed `f32` vectors (`__m256`).
/// NOTE(review): judging by the names this computes `x` raised to `n` lane by lane;
/// accuracy and handling of special inputs are whatever `_mm256_pow_fast_ps`
/// provides — confirm against that helper.
///
/// # Safety
/// NOTE(review): presumably the caller must ensure the CPU supports the AVX
/// instructions used by `_mm256_pow_fast_ps` — confirm.
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_pow_ps(x: __m256, n: __m256) -> __m256 {
_mm256_pow_fast_ps(x, n)
}

/// Scalar-exponent variant of the power helper: broadcasts `n` into every lane
/// with `_mm256_set1_ps` and passes it, together with `x`, to `_mm256_pow_fast_ps`.
///
/// NOTE(review): presumably yields `x` raised to `n` in each lane; the numerical
/// behaviour is defined by `_mm256_pow_fast_ps` — confirm there.
///
/// # Safety
/// NOTE(review): presumably requires AVX support on the running CPU — confirm.
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_pow_n_ps(x: __m256, n: f32) -> __m256 {
// Splat the scalar exponent across all lanes so the vector routine can be reused.
_mm256_pow_fast_ps(x, _mm256_set1_ps(n))
}

/// Adapter around `_mm256_prefer_fma_ps` that takes its operands in the
/// opposite order: a call `_mm256_fmaf_ps(a, b, c)` becomes
/// `_mm256_prefer_fma_ps(c, b, a)`.
///
/// NOTE(review): the name suggests a fused multiply-add; which operand is the
/// addend and which are the factors depends on `_mm256_prefer_fma_ps`'s
/// parameter convention — verify before relying on a particular formula.
///
/// # Safety
/// NOTE(review): presumably requires the CPU features used by
/// `_mm256_prefer_fma_ps` — confirm.
#[inline(always)]
#[allow(dead_code)]
pub(crate) unsafe fn _mm256_fmaf_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
// Arguments are deliberately passed in reverse order.
_mm256_prefer_fma_ps(c, b, a)
}
Expand Down
2 changes: 2 additions & 0 deletions src/avx/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ mod to_xyz_lab;
mod utils;
mod xyz_lab_to_image;
mod xyza_laba_to_image;
mod oklab_to_image;

pub use from_sigmoidal::avx_from_sigmoidal_row;
pub use image_to_oklab::avx_image_to_oklab;
Expand All @@ -32,3 +33,4 @@ pub use to_xyz_lab::*;
pub use utils::*;
pub use xyz_lab_to_image::*;
pub use xyza_laba_to_image::*;
pub use oklab_to_image::avx_oklab_to_image;
Loading

0 comments on commit de01f24

Please sign in to comment.