Skip to content

Commit

Permalink
Some improvements
Browse files — browse the repository at this point in the history
  • Loading branch information
awxkee committed Aug 1, 2024
1 parent 5d23f6e commit 40ac385
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 403 deletions.
4 changes: 2 additions & 2 deletions src/avx/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ mod gamma_curves;
mod image_to_oklab;
mod linear_to_image;
mod math;
mod oklab_to_image;
mod routines;
mod sigmoidal;
mod support;
Expand All @@ -20,17 +21,16 @@ mod to_xyz_lab;
mod utils;
mod xyz_lab_to_image;
mod xyza_laba_to_image;
mod oklab_to_image;

pub use from_sigmoidal::avx_from_sigmoidal_row;
pub use image_to_oklab::avx_image_to_oklab;
pub use linear_to_image::avx_linear_to_gamma;
pub use math::*;
pub use oklab_to_image::avx_oklab_to_image;
pub use support::*;
pub use to_linear::avx_channels_to_linear;
pub use to_sigmoidal::avx_image_to_sigmoidal_row;
pub use to_xyz_lab::*;
pub use utils::*;
pub use xyz_lab_to_image::*;
pub use xyza_laba_to_image::*;
pub use oklab_to_image::avx_oklab_to_image;
256 changes: 41 additions & 215 deletions src/avx/oklab_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,20 @@ use std::arch::x86_64::*;

use erydanos::{_mm256_cos_ps, _mm256_sin_ps};

use crate::{avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8, avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction, XYZ_TO_SRGB_D65};
use crate::avx::{_mm256_color_matrix_ps, _mm256_cube_ps, avx2_pack_u16, avx2_pack_u32, avx2_interleave_rgba_epi8, avx2_interleave_rgb};
use crate::avx::gamma_curves::get_avx_gamma_transfer;
use crate::avx::routines::avx_vld_f32_and_deinterleave_direct;
use crate::avx::{
_mm256_color_matrix_ps, _mm256_cube_ps, avx2_interleave_rgb, avx2_interleave_rgba_epi8,
avx2_pack_u16, avx2_pack_u32,
};
use crate::image::ImageConfiguration;
use crate::image_to_oklab::OklabTarget;
use crate::{
avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8,
avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8,
avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction,
XYZ_TO_SRGB_D65,
};

#[inline(always)]
unsafe fn avx_oklab_vld<const CHANNELS_CONFIGURATION: u8>(
Expand Down Expand Up @@ -54,7 +62,8 @@ unsafe fn avx_oklab_vld<const CHANNELS_CONFIGURATION: u8>(
let v_scale_alpha = _mm256_set1_ps(255f32);
let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();

let (l, mut a, mut b, mut a_f32) = avx_vld_f32_and_deinterleave_direct::<CHANNELS_CONFIGURATION>(src);
let (l, mut a, mut b, mut a_f32) =
avx_vld_f32_and_deinterleave_direct::<CHANNELS_CONFIGURATION>(src);

if oklab_target == OklabTarget::OKLCH {
let a0 = _mm256_mul_ps(a, _mm256_cos_ps(b));
Expand Down Expand Up @@ -164,141 +173,29 @@ pub unsafe fn avx_oklab_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET:
let src_ptr_0 = offset_src_ptr;

let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_oklab_vld::<CHANNELS_CONFIGURATION>(
src_ptr_0,
&transfer,
target,
m0,
m1,
m2,
m3,
m4,
m5,
m6,
m7,
m8,
c0,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
x0,
x1,
x2,
x3,
x4,
x5,
x6,
x7,
x8,
src_ptr_0, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4,
c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8,
);

let src_ptr_1 = offset_src_ptr.add(8 * channels);

let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::<CHANNELS_CONFIGURATION>(
src_ptr_1,
&transfer,
target,
m0,
m1,
m2,
m3,
m4,
m5,
m6,
m7,
m8,
c0,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
x0,
x1,
x2,
x3,
x4,
x5,
x6,
x7,
x8,
src_ptr_1, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4,
c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8,
);

let src_ptr_2 = offset_src_ptr.add(8 * 2 * channels);

let (r_row2_, g_row2_, b_row2_, a_row2_) = avx_oklab_vld::<CHANNELS_CONFIGURATION>(
src_ptr_2,
&transfer,
target,
m0,
m1,
m2,
m3,
m4,
m5,
m6,
m7,
m8,
c0,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
x0,
x1,
x2,
x3,
x4,
x5,
x6,
x7,
x8,
src_ptr_2, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4,
c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8,
);

let src_ptr_3 = offset_src_ptr.add(8 * 3 * channels);

let (r_row3_, g_row3_, b_row3_, a_row3_) = avx_oklab_vld::<CHANNELS_CONFIGURATION>(
src_ptr_3,
&transfer,
target,
m0,
m1,
m2,
m3,
m4,
m5,
m6,
m7,
m8,
c0,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
x0,
x1,
x2,
x3,
x4,
x5,
x6,
x7,
x8,
src_ptr_3, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4,
c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8,
);

let r_row01 = avx2_pack_u32(r_row0_, r_row1_);
Expand All @@ -319,7 +216,14 @@ pub unsafe fn avx_oklab_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET:
let a_row01 = avx2_pack_u32(a_row0_, a_row1_);
let a_row23 = avx2_pack_u32(a_row2_, a_row3_);
let a_row = avx2_pack_u16(a_row01, a_row23);
avx_store_and_interleave_v4_u8!(dst_ptr, image_configuration, r_row, g_row, b_row, a_row);
avx_store_and_interleave_v4_u8!(
dst_ptr,
image_configuration,
r_row,
g_row,
b_row,
a_row
);
} else {
avx_store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row);
}
Expand All @@ -334,71 +238,15 @@ pub unsafe fn avx_oklab_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET:
let src_ptr_0 = offset_src_ptr;

let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_oklab_vld::<CHANNELS_CONFIGURATION>(
src_ptr_0,
&transfer,
target,
m0,
m1,
m2,
m3,
m4,
m5,
m6,
m7,
m8,
c0,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
x0,
x1,
x2,
x3,
x4,
x5,
x6,
x7,
x8,
src_ptr_0, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4,
c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8,
);

let src_ptr_1 = offset_src_ptr.add(8 * channels);

let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::<CHANNELS_CONFIGURATION>(
src_ptr_1,
&transfer,
target,
m0,
m1,
m2,
m3,
m4,
m5,
m6,
m7,
m8,
c0,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
x0,
x1,
x2,
x3,
x4,
x5,
x6,
x7,
x8,
src_ptr_1, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4,
c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8,
);

let r_row01 = avx2_pack_u32(r_row0_, r_row1_);
Expand Down Expand Up @@ -436,36 +284,8 @@ pub unsafe fn avx_oklab_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET:
let src_ptr_0 = offset_src_ptr;

let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_oklab_vld::<CHANNELS_CONFIGURATION>(
src_ptr_0,
&transfer,
target,
m0,
m1,
m2,
m3,
m4,
m5,
m6,
m7,
m8,
c0,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
x0,
x1,
x2,
x3,
x4,
x5,
x6,
x7,
x8,
src_ptr_0, &transfer, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4,
c5, c6, c7, c8, x0, x1, x2, x3, x4, x5, x6, x7, x8,
);

let r_row01 = avx2_pack_u32(r_row0_, zeros);
Expand All @@ -490,7 +310,13 @@ pub unsafe fn avx_oklab_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET:
a_row
);
} else {
avx_store_and_interleave_v3_quarter_u8!(dst_ptr, image_configuration, r_row, g_row, b_row);
avx_store_and_interleave_v3_quarter_u8!(
dst_ptr,
image_configuration,
r_row,
g_row,
b_row
);
}

cx += 8;
Expand Down
4 changes: 2 additions & 2 deletions src/avx/xyza_laba_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ use std::arch::x86_64::*;
use crate::avx::cie::{avx_lab_to_xyz, avx_lch_to_xyz, avx_luv_to_xyz};
use crate::avx::gamma_curves::get_avx_gamma_transfer;
use crate::avx::{
_mm256_color_matrix_ps, avx2_deinterleave_rgba_ps, avx2_interleave_rgba_epi8, avx2_pack_u32,
avx2_pack_u16,
_mm256_color_matrix_ps, avx2_deinterleave_rgba_ps, avx2_interleave_rgba_epi8, avx2_pack_u16,
avx2_pack_u32,
};
use crate::image::ImageConfiguration;
use crate::xyz_target::XyzTarget;
Expand Down
3 changes: 2 additions & 1 deletion src/neon/jzazbz_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ unsafe fn neon_jzazbz_gamma_vld<const CHANNELS_CONFIGURATION: u8>(
let transfer = get_neon_gamma_transfer(transfer_function);
let v_scale_alpha = vdupq_n_f32(255f32);
let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
let (jz, mut az, mut bz, mut a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration);
let (jz, mut az, mut bz, mut a_f32) =
load_f32_and_deinterleave_direct!(src, image_configuration);

if target == JzazbzTarget::JZCZHZ {
let cz = az;
Expand Down
2 changes: 1 addition & 1 deletion src/neon/oklab_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ use std::arch::aarch64::*;

use erydanos::{vcosq_f32, vsinq_f32};

use crate::{load_f32_and_deinterleave_direct, TransferFunction, XYZ_TO_SRGB_D65};
use crate::image::ImageConfiguration;
use crate::image_to_oklab::OklabTarget;
use crate::neon::get_neon_gamma_transfer;
use crate::neon::math::vcolorq_matrix_f32;
use crate::{load_f32_and_deinterleave_direct, TransferFunction, XYZ_TO_SRGB_D65};

#[inline(always)]
unsafe fn neon_oklab_gamma_vld<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
Expand Down
Loading

0 comments on commit 40ac385

Please sign in to comment.