diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 6275de5..ad4c5b5 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -1,3 +1,4 @@ +use std::arch::aarch64::{vdupq_n_f32, vgetq_lane_f32}; use std::time::Instant; use image::io::Reader as ImageReader; @@ -14,6 +15,13 @@ pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 { } fn main() { + unsafe { + let y = vdupq_n_f32(0f32); + let x = vdupq_n_f32(0f32); + let v = vatan2q_f32(y, x); + let val = vgetq_lane_f32::<0>(v); + print!("{}", val); + } let r = 140; let g = 164; let b = 177; @@ -23,7 +31,7 @@ fn main() { println!("HSL {:?}", hsl); println!("Back RGB {:?}", hsl.to_rgb8()); - let img = ImageReader::open("./assets/beach_horizon.jpg") + let img = ImageReader::open("./assets/horse.png") .unwrap() .decode() .unwrap(); @@ -34,7 +42,7 @@ fn main() { let mut src_bytes = img.as_bytes(); let width = dimensions.0; let height = dimensions.1; - let components = 3; + let components = 4; // // let mut dst_rgba = vec![]; // dst_rgba.resize(4usize * width as usize * height as usize, 0u8); @@ -58,14 +66,13 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0f32); let src_stride = width * components as u32; let start_time = Instant::now(); - rgb_to_linear( + rgba_to_lch_with_alpha( src_bytes, src_stride, &mut lab_store, store_stride as u32, width, height, - TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -93,14 +100,13 @@ fn main() { // } let start_time = Instant::now(); - linear_to_rgb( + lch_with_alpha_to_rgba( &lab_store, store_stride as u32, &mut dst_slice, src_stride, width, height, - TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); diff --git a/src/avx/gamma_curves.rs b/src/avx/gamma_curves.rs index 114ea61..ae96e4e 100644 --- a/src/avx/gamma_curves.rs +++ b/src/avx/gamma_curves.rs @@ -88,9 +88,13 @@ pub unsafe fn avx2_rec709_to_linear(gamma: __m256) -> __m256 { #[inline(always)] pub unsafe fn avx2_pure_gamma(x: __m256, value: f32) -> __m256 { - let x = _mm256_max_ps(x, _mm256_setzero_ps()); - let x = _mm256_min_ps(x, _mm256_set1_ps(1f32)); - _mm256_pow_n_ps(x, value) + let zeros = _mm256_setzero_ps(); + let ones = _mm256_set1_ps(1f32); + let zero_mask = _mm256_cmp_ps::<_CMP_LE_OS>(x, zeros); + let ones_mask = _mm256_cmp_ps::<_CMP_GE_OS>(x, ones); + let mut rs = _mm256_pow_n_ps(x, value); + rs = crate::avx::math::_mm256_select_ps(zero_mask, zeros, rs); + crate::avx::math::_mm256_select_ps(ones_mask, ones, rs) } #[inline(always)] diff --git a/src/avx/math.rs b/src/avx/math.rs index 6d8d471..05046f9 100644 --- a/src/avx/math.rs +++ b/src/avx/math.rs @@ -1,3 +1,4 @@ +use crate::sse::{_mm_mulsign_ps, _mm_select_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -47,7 +48,9 @@ unsafe fn _mm256_taylorpoly_ps( } #[inline(always)] -pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 { +pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 { + let zeros = _mm256_setzero_ps(); + let nan_mask = _mm256_cmp_ps::<_CMP_LE_OS>(v, zeros); let const_ln127 = _mm256_set1_epi32(127); // 127 let const_ln2 = _mm256_set1_ps(std::f32::consts::LN_2); // ln(2) @@ -71,6 +74,13 @@ pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 { ); poly = _mm256_prefer_fma_ps(poly, _mm256_cvtepi32_ps(m), const_ln2); + + if HANDLE_NAN { + poly = _mm256_select_ps(nan_mask, _mm256_set1_ps(-f32::INFINITY), poly); + } else { + poly = _mm256_select_ps(nan_mask, zeros, poly); + } + poly } @@ -576,7 +586,15 @@ unsafe fn _mm256_atan2q_ps_impl(y: __m256, x: __m256) -> __m256 { #[inline(always)] pub unsafe fn _mm256_atan2_ps(y: __m256, x: __m256) -> __m256 { let r = _mm256_atan2q_ps_impl(_mm256_abs_ps(y), x); - let r = _mm256_mulsign_ps(r, x); + let mut r = _mm256_mulsign_ps(r, x); + let zeros = _mm256_setzero_ps(); + let y_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(y, zeros); + r = _mm256_select_ps( + _mm256_cmp_ps::<_CMP_EQ_OS>(x, zeros), + _mm256_set1_ps(std::f32::consts::FRAC_PI_2), + r, + ); + r = _mm256_select_ps(y_zero_mask, zeros, r); _mm256_mulsign_ps(r, y) } diff --git a/src/hsl.rs b/src/hsl.rs index 0f36b3b..baf4d98 100644 --- a/src/hsl.rs +++ b/src/hsl.rs @@ -1,6 +1,5 @@ use crate::rgb::Rgb; -#[allow(dead_code)] #[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] pub struct Hsl { pub h: f32, @@ -9,7 +8,6 @@ pub struct Hsl { } impl Hsl { - #[allow(dead_code)] pub fn new(h: u16, s: u16, l: u16) -> Hsl { Hsl { h: h as f32, @@ -22,12 +20,10 @@ impl Hsl { Hsl { h, s, l } } - #[allow(dead_code)] pub fn from_rgb(rgb: &Rgb) -> Hsl { rgb2hsl(rgb.r, rgb.g, rgb.b) } - #[allow(dead_code)] pub fn to_rgb8(&self) -> Rgb { let c = (1f32 - (2f32 * self.l - 1f32).abs()) * self.s; let x = c * (1f32 - ((self.h / 60f32) % 2f32 - 1f32).abs()); @@ -54,22 +50,18 @@ impl Hsl { } } - #[allow(dead_code)] pub fn to_rgb(&self) -> Rgb { self.to_rgb8() } - #[allow(dead_code)] pub fn get_saturation(&self) -> u16 { ((self.s * 100f32) as u16).min(100u16) } - #[allow(dead_code)] pub fn get_lightness(&self) -> u16 { ((self.l * 100f32) as u16).min(100u16) } - #[allow(dead_code)] pub fn get_hue(&self) -> u16 { (self.h as u16).min(360) } diff --git a/src/hsv.rs b/src/hsv.rs index 3357d90..79b1e17 100644 --- a/src/hsv.rs +++ b/src/hsv.rs @@ -11,7 +11,6 @@ static HSV_U8_SCALE: f32 = 1f32 / 255f32; static HSV_PERCENTAGE_SCALE: f32 = 1f32 / 100f32; impl Hsv { - #[allow(dead_code)] pub fn new(h: u16, s: u16, l: u16) -> Hsv { Hsv { h: h as f32, @@ -20,12 +19,10 @@ impl Hsv { } } - #[allow(dead_code)] pub fn from_components(h: f32, s: f32, v: f32) -> Hsv { Hsv { h, s, v } } - #[allow(dead_code)] pub fn from(rgb: &Rgb) -> Hsv { let (h, s, v) = rgb_to_hsv( rgb.r as f32 * HSV_U8_SCALE, @@ -35,7 +32,6 @@ impl Hsv { return Hsv { h, s, v }; } - #[allow(dead_code)] pub fn to_rgb8(&self) -> Rgb { let (rf, gf, bf) = hsv_to_rgb(self.h, self.s, self.v); return Rgb { @@ -45,32 +41,26 @@ impl Hsv { }; } - #[allow(dead_code)] pub fn get_hue(&self) -> f32 { self.h } - #[allow(dead_code)] pub fn get_saturation(&self) -> f32 { self.s } - #[allow(dead_code)] pub fn get_value(&self) -> f32 { self.v } - #[allow(dead_code)] pub fn get_hue_p(&self) -> u16 { self.h.max(0f32).min(360f32) as u16 } - #[allow(dead_code)] pub fn get_saturation_p(&self) -> u16 { (self.s * 100f32).max(0f32).min(100f32) as u16 } - #[allow(dead_code)] pub fn get_value_p(&self) -> u16 { (self.v * 100f32).max(0f32).min(100f32) as u16 } diff --git a/src/lib.rs b/src/lib.rs index 99ecd51..6d1e0de 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -109,6 +109,7 @@ pub use xyza_laba_to_image::xyz_with_alpha_to_rgba; pub use image_to_sigmoidal::bgra_to_sigmoidal; pub use image_to_sigmoidal::rgb_to_sigmoidal; pub use image_to_sigmoidal::rgba_to_sigmoidal; +pub use neon::*; pub use rgb_expand::*; pub use sigmoidal::Sigmoidal; pub use sigmoidal_to_image::sigmoidal_to_bgra; diff --git a/src/neon/gamma_curves.rs b/src/neon/gamma_curves.rs index 3f2d0d1..1b02668 100644 --- a/src/neon/gamma_curves.rs +++ b/src/neon/gamma_curves.rs @@ -78,9 +78,13 @@ pub unsafe fn neon_rec709_to_linear(gamma: float32x4_t) -> float32x4_t { #[inline(always)] pub unsafe fn neon_pure_gamma_function(gamma: float32x4_t, gamma_constant: f32) -> float32x4_t { - let gamma = vmaxq_f32(gamma, vdupq_n_f32(0f32)); - let gamma = vminq_f32(gamma, vdupq_n_f32(1f32)); - vpowq_n_f32(gamma, gamma_constant) + let zero_mask = vclezq_f32(gamma); + let ones = vdupq_n_f32(1f32); + let zeros = vdupq_n_f32(0f32); + let ones_mask = vcgeq_f32(gamma, ones); + let mut rs = vpowq_n_f32(gamma, gamma_constant); + rs = vbslq_f32(zero_mask, zeros, rs); + vbslq_f32(ones_mask, ones, rs) } #[inline(always)] diff --git a/src/neon/math.rs b/src/neon/math.rs index 7c69f48..7b4a868 100644 --- a/src/neon/math.rs +++ b/src/neon/math.rs @@ -213,7 +213,8 @@ pub unsafe fn vexpq_f32_ulp1(d: float32x4_t) -> float32x4_t { } #[inline(always)] -pub unsafe fn vlogq_f32(x: float32x4_t) -> float32x4_t { +pub unsafe fn vlogq_f32(x: float32x4_t) -> float32x4_t { + let nan_mask = vclezq_f32(x); let const_ln127 = vdupq_n_s32(127); // 127 let const_ln2 = vdupq_n_f32(std::f32::consts::LN_2); // ln(2) @@ -240,6 +241,12 @@ pub unsafe fn vlogq_f32(x: float32x4_t) -> float32x4_t { // Reconstruct poly = prefer_vfmaq_f32(poly, vcvtq_f32_s32(m), const_ln2); + if HANDLE_NAN { + poly = vbslq_f32(nan_mask, vdupq_n_f32(-f32::INFINITY), poly); + } else { + poly = vbslq_f32(nan_mask, vdupq_n_f32(0f32), poly); + } + return poly; } @@ -295,7 +302,7 @@ pub unsafe fn vlogq_f32_ulp35(d: float32x4_t) -> float32x4_t { #[inline(always)] pub unsafe fn vpowq_f32(val: float32x4_t, n: float32x4_t) -> float32x4_t { - return vexpq_f32(vmulq_f32(n, vlogq_f32(val))); + return vexpq_f32(vmulq_f32(n, vlogq_f32::(val))); } #[inline(always)] @@ -552,10 +559,26 @@ unsafe fn vatan2q_f32_impl(y: float32x4_t, x: float32x4_t) -> float32x4_t { t } +#[inline(always)] +pub unsafe fn visnegq_f32(x: float32x4_t) -> uint32x4_t { + vcltzq_f32(x) +} + #[inline(always)] pub unsafe fn vatan2q_f32(y: float32x4_t, x: float32x4_t) -> float32x4_t { let r = vatan2q_f32_impl(vabsq_f32(y), x); - let r = vmulsignq_f32(r, x); + let mut r = vmulsignq_f32(r, x); + let y_zero_mask = vceqzq_f32(y); + r = vbslq_f32(vceqzq_f32(x), vdupq_n_f32(std::f32::consts::FRAC_PI_2), r); + r = vbslq_f32( + y_zero_mask, + vbslq_f32( + visnegq_f32(x), + vdupq_n_f32(std::f32::consts::PI), + vdupq_n_f32(0f32), + ), + r, + ); vmulsignq_f32(r, y) } diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 78d1949..8ac98ab 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -21,6 +21,7 @@ pub use gamma_curves::*; pub use hsv_to_image::*; pub use image_to_hsv::*; pub use linear_to_image::*; +pub use math::vatan2q_f32; pub use to_linear::*; pub use to_linear_u8::*; pub use to_sigmoidal::neon_image_to_sigmoidal; diff --git a/src/neon/sigmoidal.rs b/src/neon/sigmoidal.rs index dcb012f..e1af8aa 100644 --- a/src/neon/sigmoidal.rs +++ b/src/neon/sigmoidal.rs @@ -19,7 +19,7 @@ pub(crate) unsafe fn neon_sigmoidal_to_color(x: float32x4_t) -> float32x4_t { let k = vmulq_f32(x, vrecpeq_f32(den)); let zeros = vdupq_n_f32(0f32); let zero_mask_2 = vcleq_f32(k, zeros); - let ln = vlogq_f32(k); + let ln = vlogq_f32::(k); let rs = vbslq_f32(vandq_u32(zero_mask_1, zero_mask_2), zeros, ln); return rs; } diff --git a/src/sse/gamma_curves.rs b/src/sse/gamma_curves.rs index 42347c8..92196fd 100644 --- a/src/sse/gamma_curves.rs +++ b/src/sse/gamma_curves.rs @@ -87,9 +87,13 @@ pub unsafe fn sse_rec709_to_linear(gamma: __m128) -> __m128 { #[inline(always)] pub unsafe fn sse_pure_gamma(gamma: __m128, value: f32) -> __m128 { - let gamma = _mm_max_ps(gamma, _mm_setzero_ps()); - let gamma = _mm_min_ps(gamma, _mm_set1_ps(1f32)); - _mm_pow_n_ps(gamma, value) + let zeros = _mm_setzero_ps(); + let zero_mask = _mm_cmple_ps(gamma, zeros); + let ones = _mm_set1_ps(1f32); + let ones_mask = _mm_cmpge_ps(gamma, ones); + let mut rs = _mm_pow_n_ps(gamma, value); + rs = _mm_select_ps(zero_mask, zeros, rs); + _mm_select_ps(ones_mask, ones, rs) } #[inline(always)] diff --git a/src/sse/math.rs b/src/sse/math.rs index 8048cf9..8074028 100644 --- a/src/sse/math.rs +++ b/src/sse/math.rs @@ -43,7 +43,9 @@ unsafe fn _mm_taylorpoly_ps( } #[inline(always)] -pub unsafe fn _mm_log_ps(v: __m128) -> __m128 { +pub unsafe fn _mm_log_ps(v: __m128) -> __m128 { + let zeros = _mm_setzero_ps(); + let nan_mask = _mm_cmple_ps(v, zeros); let const_ln127 = _mm_set1_epi32(127); // 127 let const_ln2 = _mm_set1_ps(std::f32::consts::LN_2); // ln(2) @@ -64,6 +66,13 @@ pub unsafe fn _mm_log_ps(v: __m128) -> __m128 { ); poly = _mm_prefer_fma_ps(poly, _mm_cvtepi32_ps(m), const_ln2); + + if HANDLE_NAN { + poly = _mm_select_ps(nan_mask, _mm_set1_ps(-f32::INFINITY), poly); + } else { + poly = _mm_select_ps(nan_mask, zeros, poly); + } + poly } @@ -550,7 +559,15 @@ unsafe fn _mm_atan2q_ps_impl(y: __m128, x: __m128) -> __m128 { #[inline(always)] pub unsafe fn _mm_atan2_ps(y: __m128, x: __m128) -> __m128 { let r = _mm_atan2q_ps_impl(_mm_abs_ps(y), x); - let r = _mm_mulsign_ps(r, x); + let mut r = _mm_mulsign_ps(r, x); + let zeros = _mm_setzero_ps(); + let y_zero_mask = _mm_cmpeq_ps(y, zeros); + r = _mm_select_ps( + _mm_cmpeq_ps(x, zeros), + _mm_set1_ps(std::f32::consts::FRAC_PI_2), + r, + ); + r = _mm_select_ps(y_zero_mask, zeros, r); _mm_mulsign_ps(r, y) }