diff --git a/Cargo.lock b/Cargo.lock index eb76259..513b58f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -163,8 +163,9 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.4.10" +version = "0.4.11" dependencies = [ + "erydanos", "half", ] @@ -220,6 +221,15 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "erydanos" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b0354c3359e57ded8b5f8a120273cb1da304630399da59afa14182404573d6f" +dependencies = [ + "num-traits", +] + [[package]] name = "exr" version = "1.72.0" diff --git a/Cargo.toml b/Cargo.toml index 253f089..e8f5432 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] } [package] name = "colorutils-rs" -version = "0.4.10" +version = "0.4.11" edition = "2021" description = "High performance utilities for color format handling and conversion." 
readme = "README.md" @@ -16,6 +16,7 @@ repository = "https://github.com/awxkee/colorutils-rs" exclude = ["*.jpg"] [dependencies] +erydanos = "0.1.0" half = "2.4.1" [features] diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 39868fb..9bbdfae 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -23,7 +23,7 @@ fn main() { println!("HSL {:?}", hsl); println!("Back RGB {:?}", hsl.to_rgb8()); - let img = ImageReader::open("./assets/test_image_2.png") + let img = ImageReader::open("./assets/beach_horizon.jpg") .unwrap() .decode() .unwrap(); @@ -34,7 +34,7 @@ fn main() { let mut src_bytes = img.as_bytes(); let width = dimensions.0; let height = dimensions.1; - let components = 4; + let components = 3; // // let mut dst_rgba = vec![]; // dst_rgba.resize(4usize * width as usize * height as usize, 0u8); @@ -58,14 +58,13 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0f32); let src_stride = width * components as u32; let start_time = Instant::now(); - rgba_to_linear( + rgb_to_lab( src_bytes, src_stride, &mut lab_store, store_stride as u32, width, height, - TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -93,14 +92,13 @@ fn main() { // } let start_time = Instant::now(); - linear_to_rgba( + lab_to_srgb( &lab_store, store_stride as u32, &mut dst_slice, src_stride, width, height, - TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); diff --git a/src/gamma_curves.rs b/src/gamma_curves.rs index 7e7b626..e61171b 100644 --- a/src/gamma_curves.rs +++ b/src/gamma_curves.rs @@ -1,4 +1,5 @@ #[inline(always)] +/// Linear transfer function for sRGB pub fn srgb_to_linear(gamma: f32) -> f32 { return if gamma < 0f32 { 0f32 @@ -12,6 +13,7 @@ pub fn srgb_to_linear(gamma: f32) -> f32 { } #[inline(always)] +/// Gamma transfer function for sRGB pub fn srgb_from_linear(linear: f32) -> f32 { return if linear < 0.0f32 { 0.0f32 @@ -25,6 +27,7 @@ pub fn 
srgb_from_linear(linear: f32) -> f32 { } #[inline(always)] +/// Linear transfer function for Rec.709 pub fn rec709_to_linear(gamma: f32) -> f32 { return if gamma < 0.0f32 { 0.0f32 @@ -38,6 +41,7 @@ pub fn rec709_to_linear(gamma: f32) -> f32 { } #[inline(always)] +/// Gamma transfer function for Rec.709 pub fn rec709_from_linear(linear: f32) -> f32 { return if linear < 0.0f32 { 0.0f32 @@ -51,26 +55,43 @@ pub fn rec709_from_linear(linear: f32) -> f32 { } #[inline(always)] +/// Pure gamma transfer function with an arbitrary exponent, clamped to [0, 1] +pub fn pure_gamma_function(x: f32, gamma: f32) -> f32 { + if x <= 0f32 { + return 0f32; + } else if x >= 1f32 { + return 1f32; + } else { + return x.powf(gamma); + } +} + +#[inline(always)] +/// Pure gamma transfer function for gamma 2.2 pub fn gamma2p2_from_linear(linear: f32) -> f32 { - linear.powf(1f32 / 2.2f32) + pure_gamma_function(linear, 1f32 / 2.2f32) } #[inline(always)] +/// Linear transfer function for gamma 2.2 pub fn gamma2p2_to_linear(gamma: f32) -> f32 { - gamma.powf(2.2f32) + pure_gamma_function(gamma, 2.2f32) } #[inline(always)] +/// Pure gamma transfer function for gamma 2.8 pub fn gamma2p8_from_linear(linear: f32) -> f32 { - linear.powf(1f32 / 2.8f32) + pure_gamma_function(linear, 1f32 / 2.8f32) } #[inline(always)] +/// Linear transfer function for gamma 2.8 pub fn gamma2p8_to_linear(gamma: f32) -> f32 { - gamma.powf(2.8f32) + pure_gamma_function(gamma, 2.8f32) } #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +/// Declares transfer function for transfer components into a linear colorspace and its inverse pub enum TransferFunction { /// sRGB Transfer function Srgb, diff --git a/src/hsl.rs b/src/hsl.rs index baf4d98..d831822 100644 --- a/src/hsl.rs +++ b/src/hsl.rs @@ -1,9 +1,13 @@ use crate::rgb::Rgb; #[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] +/// Represents *HSL* (hue, saturation, lightness) colorspace, H ∈ [0, 360f32], s ∈ [0f32, 1f32], v ∈ [0f32, 1f32] pub struct Hsl { + /// Hue H ∈ [0, 360f32] pub h: f32,
+ /// Saturation s ∈ [0, 1f32] pub s: f32, + /// Lightness v ∈ [0, 1f32] pub l: f32, } diff --git a/src/hsv.rs b/src/hsv.rs index 79b1e17..c10a948 100644 --- a/src/hsv.rs +++ b/src/hsv.rs @@ -1,9 +1,13 @@ use crate::rgb::Rgb; #[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] +/// Represents *HSV* (hue, saturation, value) colorspace, H ∈ [0, 360f32], s ∈ [0f32, 1f32], v ∈ [0f32, 1f32] pub struct Hsv { + /// Hue H ∈ [0, 360f32] pub h: f32, + /// Saturation s ∈ [0, 1f32] pub s: f32, + /// Value v ∈ [0, 1f32] pub v: f32, } diff --git a/src/lab.rs b/src/lab.rs index 1b69cb1..f1dcf99 100644 --- a/src/lab.rs +++ b/src/lab.rs @@ -1,11 +1,14 @@ use crate::rgb::Rgb; use crate::xyz::Xyz; -/// A CIELAB color. +/// Represents CIELAB color space. #[derive(Copy, Clone, Debug, Default, PartialOrd, PartialEq)] pub struct Lab { + /// `l`: lightness component (0 to 100) pub l: f32, + /// `a`: green (negative) and red (positive) component. pub a: f32, + /// `b`: blue (negative) and yellow (positive) component pub b: f32, } diff --git a/src/lib.rs b/src/lib.rs index 99ecd51..7cba515 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,12 +19,14 @@ mod image_xyza_laba; mod lab; mod linear_to_image; mod linear_to_image_u8; +pub mod linear_to_planar; mod luv; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] mod neon; +pub mod planar_to_linear; mod rgb; mod rgb_expand; mod rgba; diff --git a/src/linear_to_planar.rs b/src/linear_to_planar.rs new file mode 100644 index 0000000..ba90d4e --- /dev/null +++ b/src/linear_to_planar.rs @@ -0,0 +1,92 @@ +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +use crate::neon::linear_to_planar::neon_linear_plane_to_gamma; +use crate::TransferFunction; + +#[inline(always)] +fn linear_to_gamma_channels( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + let mut src_offset = 0usize; 
+ let mut dst_offset = 0usize; + + let transfer = transfer_function.get_gamma_function(); + + for _ in 0..height as usize { + let mut _cx = 0usize; + + #[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" + ))] + unsafe { + _cx = neon_linear_plane_to_gamma( + _cx, + src.as_ptr(), + src_offset as u32, + dst.as_mut_ptr(), + dst_offset as u32, + width, + transfer_function, + ); + } + + let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *const f32 }; + let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) }; + + for x in _cx..width as usize { + let px = x; + let src_slice = unsafe { src_ptr.add(px) }; + let pixel = unsafe { src_slice.read_unaligned() }.min(1f32).max(0f32); + + let dst = unsafe { dst_ptr.add(px) }; + let transferred = transfer(pixel); + let rgb8 = (transferred * 255f32).min(255f32).max(0f32) as u8; + + unsafe { + dst.write_unaligned(rgb8); + } + } + + src_offset += src_stride as usize; + dst_offset += dst_stride as usize; + } +} + +/// This function converts Linear to Plane. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains Linear plane data +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive Gamma plane data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from gamma to linear space. If you don't have specific pick `Srgb` +pub fn linear_to_plane( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + linear_to_gamma_channels( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} diff --git a/src/luv.rs b/src/luv.rs index 62bb2a9..f89e093 100644 --- a/src/luv.rs +++ b/src/luv.rs @@ -1,5 +1,5 @@ //! # Luv -/// Struct representing a color in CIALuv, a.k.a. 
L\*u\*v\*, color space +/// Struct representing a color in CIE LUV, a.k.a. L\*u\*v\*, color space #[derive(Debug, Copy, Clone, Default, PartialOrd)] pub struct Luv { /// The L\* value (achromatic luminance) of the colour in 0–100 range. @@ -22,7 +22,7 @@ pub struct Luv { pub v: f32, } -/// Struct representing a color in cylindrical CIELCh(uv) color space +/// Representing a color in cylindrical CIE LCh(uv) color space #[derive(Debug, Copy, Clone, Default, PartialOrd)] pub struct LCh { /// The L\* value (achromatic luminance) of the colour in 0–100 range. diff --git a/src/neon/cie.rs b/src/neon/cie.rs index 94cd620..4630546 100644 --- a/src/neon/cie.rs +++ b/src/neon/cie.rs @@ -2,10 +2,12 @@ use crate::luv::{ LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y, LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, LUV_WHITE_V_PRIME, }; -use crate::neon::math::{ - prefer_vfmaq_f32, vatan2q_f32, vcbrtq_f32, vcolorq_matrix_f32, vcosq_f32, vcubeq_f32, - vhypotq_f32, vsinq_f32, -}; +use crate::neon::math::{prefer_vfmaq_f32, vcolorq_matrix_f32, vcubeq_f32}; +use erydanos::neon::atan2f::vatan2q_f32; +use erydanos::neon::cbrtf::vcbrtq_f32; +use erydanos::neon::cosf::vcosq_f32; +use erydanos::neon::hypotf::vhypotq_fast_f32; +use erydanos::neon::sinf::vsinq_f32; use std::arch::aarch64::*; #[inline(always)] @@ -100,7 +102,7 @@ pub(crate) unsafe fn neon_triple_to_lch( z: float32x4_t, ) -> (float32x4_t, float32x4_t, float32x4_t) { let (luv_l, luv_u, luv_v) = neon_triple_to_luv(x, y, z); - let lch_c = vhypotq_f32(luv_u, luv_v); + let lch_c = vhypotq_fast_f32(luv_u, luv_v); let lch_h = vatan2q_f32(luv_v, luv_u); (luv_l, lch_c, lch_h) } diff --git a/src/neon/linear_to_planar.rs b/src/neon/linear_to_planar.rs new file mode 100644 index 0000000..911c891 --- /dev/null +++ b/src/neon/linear_to_planar.rs @@ -0,0 +1,79 @@ +use std::arch::aarch64::*; + +use crate::neon::get_neon_gamma_transfer; +use crate::TransferFunction; + +#[inline(always)] +unsafe fn transfer_to_gamma( + r: float32x4_t, + 
transfer: &unsafe fn(float32x4_t) -> float32x4_t, +) -> uint32x4_t { + let r_f = vcvtaq_u32_f32(vmulq_n_f32(transfer(r), 255f32)); + r_f +} + +#[inline(always)] +unsafe fn process_set( + k: float32x4x4_t, + function: &unsafe fn(float32x4_t) -> float32x4_t, +) -> uint8x16_t { + let y0 = transfer_to_gamma(k.0, &function); + let y1 = transfer_to_gamma(k.1, &function); + let y2 = transfer_to_gamma(k.2, &function); + let y3 = transfer_to_gamma(k.3, &function); + + let y_row01 = vcombine_u16(vqmovn_u32(y0), vqmovn_u32(y1)); + let y_row23 = vcombine_u16(vqmovn_u32(y2), vqmovn_u32(y3)); + + let r_row = vcombine_u8(vqmovn_u16(y_row01), vqmovn_u16(y_row23)); + r_row +} + +#[inline] +pub unsafe fn neon_linear_plane_to_gamma( + start_cx: usize, + src: *const f32, + src_offset: u32, + dst: *mut u8, + dst_offset: u32, + width: u32, + transfer_function: TransferFunction, +) -> usize { + let mut cx = start_cx; + + let function = get_neon_gamma_transfer(transfer_function); + + while cx + 64 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); + + let pixel_row0 = vld1q_f32_x4(offset_src_ptr); + let pixel_row1 = vld1q_f32_x4(offset_src_ptr.add(16)); + let pixel_row2 = vld1q_f32_x4(offset_src_ptr.add(32)); + let pixel_row3 = vld1q_f32_x4(offset_src_ptr.add(48)); + + let set0 = process_set(pixel_row0, &function); + let set1 = process_set(pixel_row1, &function); + let set2 = process_set(pixel_row2, &function); + let set3 = process_set(pixel_row3, &function); + + let dst_ptr = dst.add(dst_offset as usize + cx); + + let pixel_set = uint8x16x4_t(set0, set1, set2, set3); + vst1q_u8_x4(dst_ptr, pixel_set); + + cx += 64; + } + + while cx + 16 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); + + let pixel_row = vld1q_f32_x4(offset_src_ptr); + let r_row = process_set(pixel_row, &function); + let dst_ptr = dst.add(dst_offset as usize + cx); + vst1q_u8(dst_ptr, r_row); + + cx
+= 16; + } + + cx +} diff --git a/src/neon/math.rs b/src/neon/math.rs index 7b4a868..2852e44 100644 --- a/src/neon/math.rs +++ b/src/neon/math.rs @@ -1,5 +1,7 @@ use std::arch::aarch64::*; +use erydanos::neon::powf::vpowq_fast_f32; + #[inline(always)] #[allow(dead_code)] pub(crate) unsafe fn vfmodq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { @@ -301,13 +303,13 @@ pub unsafe fn vlogq_f32_ulp35(d: float32x4_t) -> float32x4_t { } #[inline(always)] -pub unsafe fn vpowq_f32(val: float32x4_t, n: float32x4_t) -> float32x4_t { - return vexpq_f32(vmulq_f32(n, vlogq_f32::(val))); +pub unsafe fn vpowjq_f32(val: float32x4_t, n: float32x4_t) -> float32x4_t { + vpowq_fast_f32(val, n) } #[inline(always)] pub unsafe fn vpowq_n_f32(t: float32x4_t, power: f32) -> float32x4_t { - return vpowq_f32(t, vdupq_n_f32(power)); + return vpowjq_f32(t, vdupq_n_f32(power)); } #[inline(always)] @@ -359,109 +361,6 @@ pub(crate) unsafe fn vmlafq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) prefer_vfmaq_f32(c, b, a) } -#[inline(always)] -pub unsafe fn vcbrtq_f32(d: float32x4_t) -> float32x4_t { - vcbrtq_f32_ulp2::(d) -} - -#[inline(always)] -/// Precise version of Cube Root with ULP 2 -pub unsafe fn vcbrtq_f32_ulp2(x: float32x4_t) -> float32x4_t { - let x1p24 = vreinterpretq_f32_u32(vdupq_n_u32(0x4b800000)); // 0x1p24f === 2 ^ 24 - - let mut ui = vreinterpretq_u32_f32(x); - let hx = vandq_u32(ui, vdupq_n_u32(0x7fffffff)); - - let nan_mask = vcgeq_u32(hx, vdupq_n_u32(0x7f800000)); - let is_zero_mask = vceqzq_u32(hx); - - let lo_mask = vcltq_u32(hx, vdupq_n_u32(0x00800000)); - let hi_ui_f = vreinterpretq_u32_f32(vmulq_f32(x, x1p24)); - let mut lo_hx = vandq_u32(hi_ui_f, vdupq_n_u32(0x7fffffff)); - lo_hx = vaddq_u32( - vcvtq_u32_f32(vmulq_n_f32(vcvtq_f32_u32(lo_hx), 1f32 / 3f32)), - vdupq_n_u32(642849266), - ); - let hi_hx = vaddq_u32( - vcvtq_u32_f32(vmulq_n_f32(vcvtq_f32_u32(hx), 1f32 / 3f32)), - vdupq_n_u32(709958130), - ); - let hx = vbslq_u32(lo_mask, lo_hx, hi_hx); - - ui = 
vbslq_u32(lo_mask, hi_ui_f, ui); - ui = vandq_u32(ui, vdupq_n_u32(0x80000000)); - ui = vorrq_u32(ui, hx); - - let mut t = vreinterpretq_f32_u32(ui); - let mut r = vmulq_f32(vmulq_f32(t, t), t); - - let sum_x = vaddq_f32(x, x); - - t = vmulq_f32( - vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)), - t, - ); - - r = vmulq_f32(vmulq_f32(t, t), t); - t = vmulq_f32( - vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)), - t, - ); - if HANDLE_NAN { - t = vbslq_f32(nan_mask, vdupq_n_f32(f32::NAN), t); - t = vbslq_f32(is_zero_mask, vdupq_n_f32(0f32), t); - } - t -} - -#[inline(always)] -#[allow(dead_code)] -/// Precise version of Cube Root with ULP 3.5 -pub unsafe fn vcbrtq_f32_ulp35(d: float32x4_t) -> float32x4_t { - let mut q = vdupq_n_f32(1f32); - let e = vaddq_s32(vilogbk_vi2_vf(vabsq_f32(d)), vdupq_n_s32(1)); - let mut d = vldexp2q_f32(d, vnegq_s32(e)); - - let t = vaddq_f32(vcvtq_f32_s32(e), vdupq_n_f32(6144f32)); - let qu = vcvtq_s32_f32(vmulq_n_f32(t, 1.0f32 / 3.0f32)); - let re = vcvtq_s32_f32(vsubq_f32(t, vmulq_n_f32(vcvtq_f32_s32(qu), 3f32))); - - q = vbslq_f32( - vceqq_s32(re, vdupq_n_s32(1)), - vdupq_n_f32(1.2599210498948731647672106f32), - q, - ); - q = vbslq_f32( - vceqq_s32(re, vdupq_n_s32(2)), - vdupq_n_f32(1.5874010519681994747517056f32), - q, - ); - q = vldexp2q_f32(q, vsubq_s32(qu, vdupq_n_s32(2048))); - - q = vmulsignq_f32(q, d); - d = vabsq_f32(d); - - let mut x = vdupq_n_f32(-0.601564466953277587890625f32); - x = vmlafq_f32(x, d, vdupq_n_f32(2.8208892345428466796875f32)); - x = vmlafq_f32(x, d, vdupq_n_f32(-5.532182216644287109375f32)); - x = vmlafq_f32(x, d, vdupq_n_f32(5.898262500762939453125f32)); - x = vmlafq_f32(x, d, vdupq_n_f32(-3.8095417022705078125f32)); - x = vmlafq_f32(x, d, vdupq_n_f32(2.2241256237030029296875f32)); - - let mut y = vmulq_f32(vmulq_f32(d, x), x); - y = vmulq_f32( - vsubq_f32( - y, - vmulq_f32( - vmulq_n_f32(y, 2.0f32 / 3.0f32), - vmlafq_f32(y, x, vdupq_n_f32(-1.0f32)), - ), - ), - q, - ); - return y; 
-} - #[inline(always)] pub unsafe fn vcolorq_matrix_f32( r: float32x4_t, @@ -482,187 +381,6 @@ pub unsafe fn vcolorq_matrix_f32( let new_b = prefer_vfmaq_f32(prefer_vfmaq_f32(vmulq_f32(g, c8), b, c9), r, c7); (new_r, new_g, new_b) } - -#[inline(always)] -pub unsafe fn vhypotq_f32(x: float32x4_t, y: float32x4_t) -> float32x4_t { - let xp2 = vmulq_f32(x, x); - let yp2 = vmulq_f32(y, y); - let z = vaddq_f32(xp2, yp2); - return vsqrtq_f32(z); -} - -#[inline(always)] -pub unsafe fn vpoly4q_f32( - x: float32x4_t, - x2: float32x4_t, - c3: float32x4_t, - c2: float32x4_t, - c1: float32x4_t, - c0: float32x4_t, -) -> float32x4_t { - vmlafq_f32(x2, vmlafq_f32(x, c3, c2), vmlafq_f32(x, c1, c0)) -} - -#[inline(always)] -pub unsafe fn vpoly8q_f32( - x: float32x4_t, - x2: float32x4_t, - x4: float32x4_t, - c7: float32x4_t, - c6: float32x4_t, - c5: float32x4_t, - c4: float32x4_t, - c3: float32x4_t, - c2: float32x4_t, - c1: float32x4_t, - c0: float32x4_t, -) -> float32x4_t { - vmlafq_f32( - x4, - vpoly4q_f32(x, x2, c7, c6, c5, c4), - vpoly4q_f32(x, x2, c3, c2, c1, c0), - ) -} - -#[inline(always)] -unsafe fn vatan2q_f32_impl(y: float32x4_t, x: float32x4_t) -> float32x4_t { - let q = vbslq_s32(vcltzq_f32(x), vdupq_n_s32(-2), vdupq_n_s32(0)); - let x = vabsq_f32(x); - let is_y_more_than_x = vcgtq_f32(y, x); - let t = vbslq_f32(is_y_more_than_x, x, vdupq_n_f32(0f32)); - let x = vbslq_f32(is_y_more_than_x, y, x); - let y = vbslq_f32(is_y_more_than_x, vnegq_f32(t), y); - let q = vbslq_s32(is_y_more_than_x, vaddq_s32(q, vdupq_n_s32(1)), q); - let s = vdivq_f32(y, x); - let t = vmulq_f32(s, s); - let t2 = vmulq_f32(t, t); - let t4 = vmulq_f32(t2, t2); - let poly = vpoly8q_f32( - t, - t2, - t4, - vdupq_n_f32(0.00282363896258175373077393f32), - vdupq_n_f32(-0.0159569028764963150024414f32), - vdupq_n_f32(0.0425049886107444763183594f32), - vdupq_n_f32(-0.0748900920152664184570312f32), - vdupq_n_f32(0.106347933411598205566406f32), - vdupq_n_f32(-0.142027363181114196777344f32), - 
vdupq_n_f32(0.199926957488059997558594f32), - vdupq_n_f32(-0.333331018686294555664062f32), - ); - let t = prefer_vfmaq_f32(s, vmulq_f32(poly, t), s); - let t = prefer_vfmaq_f32( - t, - vcvtq_f32_s32(q), - vdupq_n_f32(std::f32::consts::FRAC_PI_2), - ); - t -} - -#[inline(always)] -pub unsafe fn visnegq_f32(x: float32x4_t) -> uint32x4_t { - vcltzq_f32(x) -} - -#[inline(always)] -pub unsafe fn vatan2q_f32(y: float32x4_t, x: float32x4_t) -> float32x4_t { - let r = vatan2q_f32_impl(vabsq_f32(y), x); - let mut r = vmulsignq_f32(r, x); - let y_zero_mask = vceqzq_f32(y); - r = vbslq_f32(vceqzq_f32(x), vdupq_n_f32(std::f32::consts::FRAC_PI_2), r); - r = vbslq_f32( - y_zero_mask, - vbslq_f32( - visnegq_f32(x), - vdupq_n_f32(std::f32::consts::PI), - vdupq_n_f32(0f32), - ), - r, - ); - vmulsignq_f32(r, y) -} - -#[inline(always)] -pub unsafe fn vsinq_f32(val: float32x4_t) -> float32x4_t { - let pi_v = vdupq_n_f32(std::f32::consts::PI); - let pio2_v = vdupq_n_f32(std::f32::consts::FRAC_PI_2); - let ipi_v = vdupq_n_f32(std::f32::consts::FRAC_1_PI); - - //Find positive or negative - let c_v = vabsq_s32(vcvtq_s32_f32(vmulq_f32(val, ipi_v))); - let sign_v = vcleq_f32(val, vdupq_n_f32(0f32)); - let odd_v = vandq_u32(vreinterpretq_u32_s32(c_v), vdupq_n_u32(1)); - - let neg_v = veorq_u32(odd_v, sign_v); - - //Modulus a - (n * int(a*(1/n))) - let mut ma = vsubq_f32(vabsq_f32(val), vmulq_f32(pi_v, vcvtq_f32_s32(c_v))); - let reb_v = vcgeq_f32(ma, pio2_v); - - //Rebase a between 0 and pi/2 - ma = vbslq_f32(reb_v, vsubq_f32(pi_v, ma), ma); - - //Taylor series - let ma2 = vmulq_f32(ma, ma); - - //2nd elem: x^3 / 3! - let mut elem = vmulq_f32(vmulq_f32(ma, ma2), vdupq_n_f32(0.166666666666f32)); - let mut res = vsubq_f32(ma, elem); - - //3rd elem: x^5 / 5! 
- elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(0.05f32)); - res = vaddq_f32(res, elem); - - //4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val) - elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(0.023809523810f32)); - res = vsubq_f32(res, elem); - - //5th elem: x^9 / 9! - elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(0.013888888889f32)); - res = vaddq_f32(res, elem); - - //Change of sign - let neg_v = vshlq_n_u32::<31>(neg_v); - res = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(res), neg_v)); - return res; -} - -#[inline(always)] -pub unsafe fn vcosq_f32(d: float32x4_t) -> float32x4_t { - let mut q = vcvtq_s32_f32(vsubq_f32( - vmulq_f32(d, vdupq_n_f32(std::f32::consts::FRAC_1_PI)), - vdupq_n_f32(0.5f32), - )); - - q = vaddq_s32(vaddq_s32(q, q), vdupq_n_s32(1)); - - let mut u = vcvtq_f32_s32(q); - let mut d = vmlafq_f32(u, vdupq_n_f32(-0.78515625f32 * 2f32), d); - d = vmlafq_f32(u, vdupq_n_f32(-0.00024187564849853515625f32 * 2f32), d); - d = vmlafq_f32(u, vdupq_n_f32(-3.7747668102383613586e-08f32 * 2f32), d); - d = vmlafq_f32(u, vdupq_n_f32(-1.2816720341285448015e-12f32 * 2f32), d); - - let s = vmulq_f32(d, d); - - d = vreinterpretq_f32_u32(veorq_u32( - vandq_u32( - vceqq_s32(vandq_s32(q, vdupq_n_s32(2)), vdupq_n_s32(0)), - vreinterpretq_u32_f32(vdupq_n_f32(-0.0f32)), - ), - vreinterpretq_u32_f32(d), - )); - - u = vdupq_n_f32(2.6083159809786593541503e-06f32); - u = vmlafq_f32(u, s, vdupq_n_f32(-0.0001981069071916863322258f32)); - u = vmlafq_f32(u, s, vdupq_n_f32(0.00833307858556509017944336f32)); - u = vmlafq_f32(u, s, vdupq_n_f32(-0.166666597127914428710938f32)); - - u = vmlafq_f32(s, vmulq_f32(u, d), d); - - u = vreinterpretq_f32_u32(vorrq_u32(vispinfq_f32(d), vreinterpretq_u32_f32(u))); - return u; -} - #[inline(always)] pub(crate) unsafe fn vcubeq_f32(x: float32x4_t) -> float32x4_t { vmulq_f32(vmulq_f32(x, x), x) diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 78d1949..0e9dc03 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs 
@@ -5,7 +5,9 @@ mod gamma_curves; mod hsv_to_image; mod image_to_hsv; mod linear_to_image; +pub mod linear_to_planar; mod math; +pub mod planar_to_linear; mod sigmoidal; mod to_linear; mod to_linear_u8; diff --git a/src/neon/planar_to_linear.rs b/src/neon/planar_to_linear.rs new file mode 100644 index 0000000..6b71d8d --- /dev/null +++ b/src/neon/planar_to_linear.rs @@ -0,0 +1,86 @@ +use crate::gamma_curves::TransferFunction; +use crate::neon::*; +use std::arch::aarch64::*; + +#[inline(always)] +unsafe fn neon_to_linear( + r: uint32x4_t, + transfer: &unsafe fn(float32x4_t) -> float32x4_t, +) -> float32x4_t { + let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); + transfer(r_f) +} + +#[inline] +unsafe fn process_pixels( + pixels: uint8x16_t, + transfer: &unsafe fn(float32x4_t) -> float32x4_t, +) -> float32x4x4_t { + let r_low = vmovl_u8(vget_low_u8(pixels)); + + let r_low_low = vmovl_u16(vget_low_u16(r_low)); + + let x_low_low = neon_to_linear(r_low_low, &transfer); + + let r_low_high = vmovl_high_u16(r_low); + + let x_low_high = neon_to_linear(r_low_high, &transfer); + + let r_high = vmovl_high_u8(pixels); + + let r_high_low = vmovl_u16(vget_low_u16(r_high)); + + let x_high_low = neon_to_linear(r_high_low, &transfer); + + let r_high_high = vmovl_high_u16(r_high); + + let x_high_high = neon_to_linear(r_high_high, &transfer); + + let storing_row = float32x4x4_t(x_low_low, x_low_high, x_high_low, x_high_high); + storing_row +} + +#[inline(always)] +pub unsafe fn neon_plane_to_linear( + start_cx: usize, + src: *const u8, + src_offset: usize, + width: u32, + dst: *mut f32, + dst_offset: usize, + transfer_function: TransferFunction, +) -> usize { + let mut cx = start_cx; + let transfer = get_neon_linear_transfer(transfer_function); + + let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; + + while cx + 64 < width as usize { + let src_ptr = src.add(src_offset + cx); + let pixels_row64 = vld1q_u8_x4(src_ptr); + let storing_row0 = process_pixels(pixels_row64.0, 
&transfer); + vst1q_f32_x4(dst_ptr.add(cx), storing_row0); + + let storing_row1 = process_pixels(pixels_row64.1, &transfer); + vst1q_f32_x4(dst_ptr.add(cx + 16), storing_row1); + + let storing_row2 = process_pixels(pixels_row64.2, &transfer); + vst1q_f32_x4(dst_ptr.add(cx + 32), storing_row2); + + let storing_row3 = process_pixels(pixels_row64.3, &transfer); + vst1q_f32_x4(dst_ptr.add(cx + 48), storing_row3); + + cx += 64; + } + + while cx + 16 < width as usize { + let src_ptr = src.add(src_offset + cx); + let pixels = vld1q_u8(src_ptr); + let storing_row = process_pixels(pixels, &transfer); + vst1q_f32_x4(dst_ptr.add(cx), storing_row); + + cx += 16; + } + + cx +} diff --git a/src/planar_to_linear.rs b/src/planar_to_linear.rs new file mode 100644 index 0000000..7bf4672 --- /dev/null +++ b/src/planar_to_linear.rs @@ -0,0 +1,89 @@ +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +use crate::neon::planar_to_linear::neon_plane_to_linear; +use crate::TransferFunction; + +#[inline(always)] +fn channels_to_linear( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + let mut src_offset = 0usize; + let mut dst_offset = 0usize; + + let transfer = transfer_function.get_linearize_function(); + for _ in 0..height as usize { + let mut _cx = 0usize; + + let src_ptr = unsafe { src.as_ptr().add(src_offset) }; + let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; + + #[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" + ))] + unsafe { + _cx = neon_plane_to_linear( + _cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + transfer_function, + ) + } + + for x in _cx..width as usize { + let px = x; + let dst = unsafe { dst_ptr.add(px) }; + let src = unsafe { src_ptr.add(px) }; + let pixel_f = unsafe { src.read_unaligned() as f32 } * (1. 
/ 255.); + let transferred = transfer(pixel_f); + + unsafe { + dst.write_unaligned(transferred); + } + } + + src_offset += src_stride as usize; + dst_offset += dst_stride as usize; + } +} + +/// This function converts Plane to Linear. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGB data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive linear data +/// * `dst_stride` - Bytes per row for dst data +/// * `transfer_function` - Transfer function from gamma to linear space. If you don't have specific pick `Srgb` +pub fn plane_to_linear( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + channels_to_linear( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} diff --git a/src/rgb.rs b/src/rgb.rs index 3c46237..16e0770 100644 --- a/src/rgb.rs +++ b/src/rgb.rs @@ -4,9 +4,13 @@ use crate::luv::Luv; use crate::{Hsl, LCh, Sigmoidal}; #[derive(Debug, PartialOrd, PartialEq, Clone, Copy)] +/// Represents any RGB values, Rgb, Rgb etc. pub struct Rgb { + /// Red component pub r: T, + /// Green component pub g: T, + /// Blue component pub b: T, } diff --git a/src/rgba.rs b/src/rgba.rs index d180e4b..f17abc6 100644 --- a/src/rgba.rs +++ b/src/rgba.rs @@ -2,10 +2,15 @@ use crate::rgb::Rgb; use half::f16; #[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] +/// Represents any RGBA values, Rgba, Rgba etc. 
pub struct Rgba { + /// Red component pub r: T, + /// Green component pub g: T, + /// Blue component pub b: T, + /// Alpha component pub a: T, } @@ -131,6 +136,8 @@ impl ToRgbaF16 for Rgba { } } +#[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] +/// Represents RGB 565 color in one u16 pub struct Rgb565 { pub rgb565: u16, } @@ -199,6 +206,8 @@ impl ToRgb565 for Rgba { } } +#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +/// Represents RGBA 1010102 in one u32 store pub struct Rgba1010102 { pub rgba: u32, } diff --git a/src/xyz_transform.rs b/src/xyz_transform.rs index 09d3245..9b95285 100644 --- a/src/xyz_transform.rs +++ b/src/xyz_transform.rs @@ -1,25 +1,25 @@ -/// sRGB to XYZ, D65 White point +/// sRGB to XYZ transformation matrix, D65 White point pub const SRGB_TO_XYZ_D65: [[f32; 3]; 3] = [ [0.4124564f32, 0.3575761f32, 0.1804375f32], [0.2126729f32, 0.7151522f32, 0.0721750f32], [0.0193339f32, 0.1191920f32, 0.9503041f32], ]; -/// XYZ to sRGB, D65 White point +/// XYZ to sRGB transformation matrix, D65 White point pub const XYZ_TO_SRGB_D65: [[f32; 3]; 3] = [ [3.2404542f32, -1.5371385f32, -0.4985314f32], [-0.9692660f32, 1.8760108f32, 0.0415560f32], [0.0556434f32, -0.2040259f32, 1.0572252f32], ]; -/// sRGB to XYZ, D50 White point +/// sRGB to XYZ transformation matrix, D50 White point pub const SRGB_TO_XYZ_D50: [[f32; 3]; 3] = [ [0.4360747f32, 0.3850649f32, 0.1430804f32], [0.2225045f32, 0.7168786f32, 0.0606169f32], [0.0139322f32, 0.0971045f32, 0.7141733f32], ]; -/// XYZ to sRGB, D50 White point +/// XYZ to sRGB transformation matrix, D50 White point pub const XYZ_TO_SRGB_D50: [[f32; 3]; 3] = [ [3.1338561f32, -1.6168667f32, -0.4906146f32], [-0.9787684f32, 1.9161415f32, 0.0334540f32],