Skip to content

Commit

Permalink
NEON improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Aug 3, 2024
1 parent ec13222 commit 360cb74
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 4 deletions.
56 changes: 56 additions & 0 deletions src/neon/jzazbz_to_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,5 +288,61 @@ pub unsafe fn neon_jzazbz_to_image<const CHANNELS_CONFIGURATION: u8, const TARGE
cx += 8;
}

// Tail loop: converts 4 pixels per iteration for as long as more than 4 pixels remain.
// The bound is strict (`cx + 4 < width`), so a final group of exactly 4 pixels is
// NOT handled here.
// NOTE(review): presumably the caller finishes the remainder starting from the returned
// `cx` — confirm before relaxing the bound to `<=`.
while cx + 4 < width as usize {
// `src_offset` is applied in bytes (the pointer is cast to `*const u8` first); the
// per-pixel step `cx * channels` is then applied in `f32` elements.
let offset_src_ptr =
((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels);

let src_ptr_0 = offset_src_ptr;

// The helper returns four vectors bound here as r/g/b/a; they are consumed below by
// `vqmovn_u32`, i.e. as 4 x u32 lanes.
// NOTE(review): presumably it loads 4 pixels and applies the gamma transfer — confirm
// against the definition of `neon_jzazbz_gamma_vld`.
let (r_row0_, g_row0_, b_row0_, a_row0_) = neon_jzazbz_gamma_vld::<CHANNELS_CONFIGURATION>(
src_ptr_0,
transfer_function,
target,
luminance_scale,
);

// Only 4 lanes carry data, so the upper half of each 8-lane u16 vector is zero padding.
let zeros = vdup_n_u16(0);

// Saturating narrow u32 -> u16, padded with zeros up to a full 128-bit vector.
let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), zeros);
let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), zeros);
let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), zeros);

// Saturating narrow u16 -> u8; lanes 0..4 hold the pixels, lanes 4..8 are zero.
let r_row = vqmovn_u16(r_row01);
let g_row = vqmovn_u16(g_row01);
let b_row = vqmovn_u16(b_row01);

let dst_ptr = dst.add(dst_offset as usize + cx * channels);

if image_configuration.has_alpha() {
let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), zeros);
let a_row = vqmovn_u16(a_row01);
// Channel order in memory: R first for Rgb/Rgba, B first for Bgr/Bgra.
let store_rows = match image_configuration {
ImageConfiguration::Rgb | ImageConfiguration::Rgba => {
uint8x8x4_t(r_row, g_row, b_row, a_row)
}
ImageConfiguration::Bgra | ImageConfiguration::Bgr => {
uint8x8x4_t(b_row, g_row, r_row, a_row)
}
};
// `vst4_u8` always writes 8 interleaved pixels (32 bytes). Store into a stack
// buffer and copy only the first 4 pixels (4 * 4 bytes) so `dst` is not written
// past the pixels handled in this iteration.
let mut transient: [u8; 32] = [0; 32];
vst4_u8(transient.as_mut_ptr(), store_rows);
std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4);
} else {
// Channel order in memory: R first for Rgb/Rgba, B first for Bgr/Bgra.
let store_rows = match image_configuration {
ImageConfiguration::Rgb | ImageConfiguration::Rgba => {
uint8x8x3_t(r_row, g_row, b_row)
}
ImageConfiguration::Bgra | ImageConfiguration::Bgr => {
uint8x8x3_t(b_row, g_row, r_row)
}
};
// `vst3_u8` always writes 8 interleaved pixels (24 bytes); only the first
// 4 pixels (4 * 3 bytes) are copied out to `dst`.
let mut transient: [u8; 24] = [0; 24];
vst3_u8(transient.as_mut_ptr(), store_rows);
std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 3);
}

cx += 4;
}

cx
}
53 changes: 50 additions & 3 deletions src/neon/to_linear_u8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@

use crate::image::ImageConfiguration;
use crate::neon::{get_neon_gamma_transfer, get_neon_linear_transfer};
use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half, TransferFunction};
use crate::{
load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter,
TransferFunction,
};
use std::arch::aarch64::*;

#[inline(always)]
Expand Down Expand Up @@ -156,16 +159,60 @@ pub unsafe fn neon_channels_to_linear_u8<

let b_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), vmovn_u32(z_low_high)));

let dst = dst_ptr.add(cx * channels);

if USE_ALPHA {
let v_4 = uint8x8x4_t(r_u_norm, g_u_norm, b_u_norm, vget_low_u8(a_chan));
vst4_u8(dst_ptr.add(cx * channels), v_4);
vst4_u8(dst, v_4);
} else {
let v_4 = uint8x8x3_t(r_u_norm, g_u_norm, b_u_norm);
vst3_u8(dst_ptr.add(cx * channels), v_4);
vst3_u8(dst, v_4);
}

cx += 8;
}

// Tail loop: processes 4 pixels per iteration for as long as more than 4 pixels remain.
// The bound is strict (`cx + 4 < width`), so a final group of exactly 4 pixels is
// NOT handled here.
// NOTE(review): presumably the caller finishes the remainder starting from the returned
// `cx` — confirm before relaxing the bound to `<=`.
while cx + 4 < width as usize {
let src_ptr = src.add(src_offset + cx * channels);

// NOTE(review): the macro is defined elsewhere; presumably it loads 4 interleaved
// pixels and returns per-channel u8 vectors already ordered as R, G, B, A for the
// given `image_configuration` — confirm.
let (r_chan, g_chan, b_chan, a_chan) =
load_u8_and_deinterleave_quarter!(src_ptr, image_configuration);

// Widen u8 -> u16 (low 8 lanes of each channel).
let r_low = vmovl_u8(vget_low_u8(r_chan));
let g_low = vmovl_u8(vget_low_u8(g_chan));
let b_low = vmovl_u8(vget_low_u8(b_chan));

// Widen u16 -> u32; only the low 4 lanes (the 4 pixels of this iteration) are kept.
let r_low_low = vmovl_u16(vget_low_u16(r_low));
let g_low_low = vmovl_u16(vget_low_u16(g_low));
let b_low_low = vmovl_u16(vget_low_u16(b_low));

let (x_low_low, y_low_low, z_low_low) =
neon_triple_to_linear_u8(r_low_low, g_low_low, b_low_low, &transfer);

// Only 4 lanes carry data, so the upper half of each 8-lane u16 vector is zero padding.
let zeros = vdup_n_u16(0);

// Narrow u32 -> u16 (`vmovn_u32`, truncating) and then u16 -> u8 (`vqmovn_u16`,
// saturating); lanes 4..8 of each result are zero.
let r_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(x_low_low), zeros));

let g_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(y_low_low), zeros));

let b_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), zeros));

let dst = dst_ptr.add(cx * channels);

// The store order is always (r, g, b[, a]) as produced above; this loop does not
// reorder channels by `image_configuration` at store time.
if USE_ALPHA {
// Alpha is passed through unchanged from the loaded channel.
let v_4 = uint8x8x4_t(r_u_norm, g_u_norm, b_u_norm, vget_low_u8(a_chan));
// `vst4_u8` always writes 8 interleaved pixels (32 bytes). Store into a stack
// buffer and copy only the first 4 pixels (4 * 4 bytes) so `dst` is not written
// past the pixels handled in this iteration.
let mut transient: [u8; 32] = [0; 32];
vst4_u8(transient.as_mut_ptr(), v_4);
std::ptr::copy_nonoverlapping(transient.as_ptr(), dst, 4 * 4);
} else {
let v_3 = uint8x8x3_t(r_u_norm, g_u_norm, b_u_norm);
// `vst3_u8` always writes 8 interleaved pixels (24 bytes); only the first
// 4 pixels (4 * 3 bytes) are copied out to `dst`.
let mut transient: [u8; 24] = [0; 24];
vst3_u8(transient.as_mut_ptr(), v_3);
std::ptr::copy_nonoverlapping(transient.as_ptr(), dst, 4 * 3);
}

cx += 4;
}

cx
}
46 changes: 45 additions & 1 deletion src/neon/to_sigmoidal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

use crate::image::ImageConfiguration;
use crate::neon::sigmoidal::neon_rgb_to_sigmoidal;
use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half};
use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter};
use std::arch::aarch64::*;

#[inline(always)]
Expand Down Expand Up @@ -244,5 +244,49 @@ pub unsafe fn neon_image_to_sigmoidal<const CHANNELS_CONFIGURATION: u8, const US
cx += 8;
}

// Tail loop: converts 4 pixels per iteration for as long as more than 4 pixels remain.
// The bound is strict (`cx + 4 < width`), so a final group of exactly 4 pixels is
// NOT handled here.
// NOTE(review): presumably the caller finishes the remainder starting from the returned
// `cx` — confirm before relaxing the bound to `<=`.
while cx + 4 < width as usize {
let src_ptr = src.add(cx * channels);
// NOTE(review): the macro is defined elsewhere; presumably it loads 4 interleaved
// pixels and returns per-channel u8 vectors — confirm.
let (r_chan, g_chan, b_chan, a_chan) =
load_u8_and_deinterleave_quarter!(src_ptr, image_configuration);

// Widen u8 -> u16 (low 8 lanes of each channel).
let r_low = vmovl_u8(vget_low_u8(r_chan));
let g_low = vmovl_u8(vget_low_u8(g_chan));
let b_low = vmovl_u8(vget_low_u8(b_chan));

// Widen u16 -> u32; only the low 4 lanes (the 4 pixels of this iteration) are kept.
let r_low_low = vmovl_u16(vget_low_u16(r_low));
let g_low_low = vmovl_u16(vget_low_u16(g_low));
let b_low_low = vmovl_u16(vget_low_u16(b_low));

let (x_low_low, y_low_low, z_low_low) =
neon_rgb_to_sigmoidal(r_low_low, g_low_low, b_low_low);

// Alpha is widened unconditionally, but the value is only read in the USE_ALPHA branch.
let a_low = vmovl_u8(vget_low_u8(a_chan));
if USE_ALPHA {
// Convert alpha to f32 and scale by 1/255.
let a_low_low =
vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32);
// Channel order in memory: x first for Rgb/Rgba, z first for Bgr/Bgra.
let store_rows = match image_configuration {
ImageConfiguration::Rgb | ImageConfiguration::Rgba => {
float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low)
}
ImageConfiguration::Bgra | ImageConfiguration::Bgr => {
float32x4x4_t(z_low_low, y_low_low, x_low_low, a_low_low)
}
};
// `vst4q_f32` writes exactly 4 interleaved pixels (16 floats), matching the
// 4 pixels handled in this iteration, so it stores straight to the destination.
vst4q_f32(dst_ptr.add(cx * channels), store_rows);
} else {
// Channel order in memory: x first for Rgb/Rgba, z first for Bgr/Bgra.
let store_rows = match image_configuration {
ImageConfiguration::Rgb | ImageConfiguration::Rgba => {
float32x4x3_t(x_low_low, y_low_low, z_low_low)
}
ImageConfiguration::Bgra | ImageConfiguration::Bgr => {
float32x4x3_t(z_low_low, y_low_low, x_low_low)
}
};
// `vst3q_f32` writes exactly 4 interleaved pixels (12 floats) straight to the
// destination.
vst3q_f32(dst_ptr.add(cx * channels), store_rows);
}

cx += 4;
}

cx
}

0 comments on commit 360cb74

Please sign in to comment.