Skip to content

Commit

Permalink
Some improvements
Browse files — browse the repository at this point in the history
  • Loading branch information
awxkee committed Aug 1, 2024
1 parent de01f24 commit 5d23f6e
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 145 deletions.
8 changes: 5 additions & 3 deletions src/app/src/main.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -66,13 +66,14 @@ fn main() {
lab_store.resize(width as usize * components * height as usize, 0f32);
let src_stride = width * components as u32;
let start_time = Instant::now();
rgb_to_lab(
rgb_to_oklab(
src_bytes,
src_stride,
&mut lab_store,
store_stride as u32,
width,
height
height,
TransferFunction::Srgb,
);
let elapsed_time = start_time.elapsed();
// Print the elapsed time in milliseconds
Expand Down Expand Up @@ -100,13 +101,14 @@ fn main() {
// }

let start_time = Instant::now();
lab_to_srgb(
oklab_to_rgb(
&lab_store,
store_stride as u32,
&mut dst_slice,
src_stride,
width,
height,
TransferFunction::Srgb,
);

let elapsed_time = start_time.elapsed();
Expand Down
4 changes: 2 additions & 2 deletions src/neon/cie.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -64,8 +64,8 @@ pub(crate) unsafe fn neon_triple_to_luv(
);
let u_prime = vdivq_f32(vmulq_n_f32(x, 4f32), den);
let v_prime = vdivq_f32(vmulq_n_f32(y, 9f32), den);
let sub_u_prime = vsubq_f32(u_prime, vdupq_n_f32(crate::luv::LUV_WHITE_U_PRIME));
let sub_v_prime = vsubq_f32(v_prime, vdupq_n_f32(crate::luv::LUV_WHITE_V_PRIME));
let sub_u_prime = vsubq_f32(u_prime, vdupq_n_f32(LUV_WHITE_U_PRIME));
let sub_v_prime = vsubq_f32(v_prime, vdupq_n_f32(LUV_WHITE_V_PRIME));
let l13 = vmulq_n_f32(l, 13f32);
let u = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_u_prime));
let v = vbslq_f32(nan_mask, zeros, vmulq_f32(l13, sub_v_prime));
Expand Down
1 change: 1 addition & 0 deletions src/neon/image_to_hsv.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::neon::{neon_rgb_to_hsl, neon_rgb_to_hsv};
use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half};
use std::arch::aarch64::*;

#[allow(dead_code)]
#[inline(always)]
pub unsafe fn neon_channels_to_hsv<
const CHANNELS_CONFIGURATION: u8,
Expand Down
10 changes: 6 additions & 4 deletions src/neon/oklab_to_image.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,13 +4,15 @@
* // Use of this source code is governed by a BSD-style
* // license that can be found in the LICENSE file.
*/
use std::arch::aarch64::*;

use erydanos::{vcosq_f32, vsinq_f32};

use crate::{load_f32_and_deinterleave_direct, TransferFunction, XYZ_TO_SRGB_D65};
use crate::image::ImageConfiguration;
use crate::image_to_oklab::OklabTarget;
use crate::neon::get_neon_gamma_transfer;
use crate::neon::math::vcolorq_matrix_f32;
use crate::{load_f32_and_deinterleave, TransferFunction, XYZ_TO_SRGB_D65};
use erydanos::{vcosq_f32, vsinq_f32};
use std::arch::aarch64::*;

#[inline(always)]
unsafe fn neon_oklab_gamma_vld<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
Expand Down Expand Up @@ -48,7 +50,7 @@ unsafe fn neon_oklab_gamma_vld<const CHANNELS_CONFIGURATION: u8, const TARGET: u
let transfer = get_neon_gamma_transfer(transfer_function);
let v_scale_alpha = vdupq_n_f32(255f32);
let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
let (l, mut a, mut b, mut a_f32) = load_f32_and_deinterleave!(src, image_configuration);
let (l, mut a, mut b, mut a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration);

if target == OklabTarget::OKLCH {
let a0 = vmulq_f32(a, vcosq_f32(b));
Expand Down
77 changes: 9 additions & 68 deletions src/neon/xyz_lab_to_image.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,7 +20,7 @@ pub(crate) unsafe fn neon_xyz_lab_vld<
const TARGET: u8,
>(
src: *const f32,
transfer_function: TransferFunction,
transfer: &unsafe fn(float32x4_t) -> float32x4_t,
c1: float32x4_t,
c2: float32x4_t,
c3: float32x4_t,
Expand All @@ -32,7 +32,6 @@ pub(crate) unsafe fn neon_xyz_lab_vld<
c9: float32x4_t,
) -> (uint32x4_t, uint32x4_t, uint32x4_t) {
let target: XyzTarget = TARGET.into();
let transfer = get_neon_gamma_transfer(transfer_function);
let v_scale_color = vdupq_n_f32(255f32);
let lab_pixel = vld3q_f32(src);
let (mut r_f32, mut g_f32, mut b_f32) = (lab_pixel.0, lab_pixel.1, lab_pixel.2);
Expand Down Expand Up @@ -121,6 +120,8 @@ pub unsafe fn neon_xyz_to_channels<
let c8 = vdupq_n_f32(*matrix.get_unchecked(2).get_unchecked(1));
let c9 = vdupq_n_f32(*matrix.get_unchecked(2).get_unchecked(2));

let transfer = get_neon_gamma_transfer(transfer_function);

let src_channels = 3usize;

while cx + 16 < width as usize {
Expand All @@ -131,68 +132,28 @@ pub unsafe fn neon_xyz_to_channels<

let (r_row0_, g_row0_, b_row0_) =
neon_xyz_lab_vld::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
src_ptr_0,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_0, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let src_ptr_1 = offset_src_ptr.add(4 * src_channels);

let (r_row1_, g_row1_, b_row1_) =
neon_xyz_lab_vld::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
src_ptr_1,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_1, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let src_ptr_2 = offset_src_ptr.add(4 * 2 * src_channels);

let (r_row2_, g_row2_, b_row2_) =
neon_xyz_lab_vld::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
src_ptr_2,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_2, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let src_ptr_3 = offset_src_ptr.add(4 * 3 * src_channels);

let (r_row3_, g_row3_, b_row3_) =
neon_xyz_lab_vld::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
src_ptr_3,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_3, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_));
Expand Down Expand Up @@ -258,34 +219,14 @@ pub unsafe fn neon_xyz_to_channels<

let (r_row0_, g_row0_, b_row0_) =
neon_xyz_lab_vld::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
src_ptr_0,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_0, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let src_ptr_1 = offset_src_ptr.add(4 * src_channels);

let (r_row1_, g_row1_, b_row1_) =
neon_xyz_lab_vld::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
src_ptr_1,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_1, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_));
Expand Down
76 changes: 8 additions & 68 deletions src/neon/xyza_laba_to_image.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,7 +16,7 @@ use std::arch::aarch64::*;
#[inline(always)]
pub(crate) unsafe fn neon_xyza_lab_vld<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
src: *const f32,
transfer_function: TransferFunction,
transfer: &unsafe fn(float32x4_t) -> float32x4_t,
c1: float32x4_t,
c2: float32x4_t,
c3: float32x4_t,
Expand All @@ -28,7 +28,6 @@ pub(crate) unsafe fn neon_xyza_lab_vld<const CHANNELS_CONFIGURATION: u8, const T
c9: float32x4_t,
) -> (uint32x4_t, uint32x4_t, uint32x4_t, uint32x4_t) {
let target: XyzTarget = TARGET.into();
let transfer = get_neon_gamma_transfer(transfer_function);
let v_scale_color = vdupq_n_f32(255f32);
let lab_pixel = vld4q_f32(src);
let (mut r_f32, mut g_f32, mut b_f32) = (lab_pixel.0, lab_pixel.1, lab_pixel.2);
Expand Down Expand Up @@ -92,6 +91,7 @@ pub unsafe fn neon_xyza_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET:
matrix: &[[f32; 3]; 3],
transfer_function: TransferFunction,
) -> usize {
let transfer = get_neon_gamma_transfer(transfer_function);
let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
if !image_configuration.has_alpha() {
panic!("Alpha may be set only on images with alpha");
Expand Down Expand Up @@ -120,68 +120,28 @@ pub unsafe fn neon_xyza_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET:

let (r_row0_, g_row0_, b_row0_, a_row0_) =
neon_xyza_lab_vld::<CHANNELS_CONFIGURATION, TARGET>(
src_ptr_0,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_0, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS);

let (r_row1_, g_row1_, b_row1_, a_row1_) =
neon_xyza_lab_vld::<CHANNELS_CONFIGURATION, TARGET>(
src_ptr_1,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_1, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let src_ptr_2 = offset_src_ptr.add(4 * 2 * CHANNELS);

let (r_row2_, g_row2_, b_row2_, a_row2_) =
neon_xyza_lab_vld::<CHANNELS_CONFIGURATION, TARGET>(
src_ptr_2,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_2, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let src_ptr_3 = offset_src_ptr.add(4 * 3 * CHANNELS);

let (r_row3_, g_row3_, b_row3_, a_row3_) =
neon_xyza_lab_vld::<CHANNELS_CONFIGURATION, TARGET>(
src_ptr_3,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_3, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_));
Expand Down Expand Up @@ -221,34 +181,14 @@ pub unsafe fn neon_xyza_to_image<const CHANNELS_CONFIGURATION: u8, const TARGET:

let (r_row0_, g_row0_, b_row0_, a_row0_) =
neon_xyza_lab_vld::<CHANNELS_CONFIGURATION, TARGET>(
src_ptr_0,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_0, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS);

let (r_row1_, g_row1_, b_row1_, a_row1_) =
neon_xyza_lab_vld::<CHANNELS_CONFIGURATION, TARGET>(
src_ptr_1,
transfer_function,
c1,
c2,
c3,
c4,
c5,
c6,
c7,
c8,
c9,
src_ptr_1, &transfer, c1, c2, c3, c4, c5, c6, c7, c8, c9,
);

let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_));
Expand Down

0 comments on commit 5d23f6e

Please sign in to comment.