diff --git a/Cargo.lock b/Cargo.lock index eb76259..513b58f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -163,8 +163,9 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.4.10" +version = "0.4.11" dependencies = [ + "erydanos", "half", ] @@ -220,6 +221,15 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "erydanos" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b0354c3359e57ded8b5f8a120273cb1da304630399da59afa14182404573d6f" +dependencies = [ + "num-traits", +] + [[package]] name = "exr" version = "1.72.0" diff --git a/Cargo.toml b/Cargo.toml index 253f089..e8f5432 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] } [package] name = "colorutils-rs" -version = "0.4.10" +version = "0.4.11" edition = "2021" description = "High performance utilities for color format handling and conversion." 
readme = "README.md" @@ -16,6 +16,7 @@ repository = "https://github.com/awxkee/colorutils-rs" exclude = ["*.jpg"] [dependencies] +erydanos = "0.1.0" half = "2.4.1" [features] diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 39868fb..9bbdfae 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -23,7 +23,7 @@ fn main() { println!("HSL {:?}", hsl); println!("Back RGB {:?}", hsl.to_rgb8()); - let img = ImageReader::open("./assets/test_image_2.png") + let img = ImageReader::open("./assets/beach_horizon.jpg") .unwrap() .decode() .unwrap(); @@ -34,7 +34,7 @@ fn main() { let mut src_bytes = img.as_bytes(); let width = dimensions.0; let height = dimensions.1; - let components = 4; + let components = 3; // // let mut dst_rgba = vec![]; // dst_rgba.resize(4usize * width as usize * height as usize, 0u8); @@ -58,14 +58,13 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0f32); let src_stride = width * components as u32; let start_time = Instant::now(); - rgba_to_linear( + rgb_to_lab( src_bytes, src_stride, &mut lab_store, store_stride as u32, width, height, - TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -93,14 +92,13 @@ fn main() { // } let start_time = Instant::now(); - linear_to_rgba( + lab_to_srgb( &lab_store, store_stride as u32, &mut dst_slice, src_stride, width, height, - TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); diff --git a/src/gamma_curves.rs b/src/gamma_curves.rs index 7e7b626..e61171b 100644 --- a/src/gamma_curves.rs +++ b/src/gamma_curves.rs @@ -1,4 +1,5 @@ #[inline(always)] +/// Linear transfer function for sRGB pub fn srgb_to_linear(gamma: f32) -> f32 { return if gamma < 0f32 { 0f32 @@ -12,6 +13,7 @@ pub fn srgb_to_linear(gamma: f32) -> f32 { } #[inline(always)] +/// Gamma transfer function for sRGB pub fn srgb_from_linear(linear: f32) -> f32 { return if linear < 0.0f32 { 0.0f32 @@ -25,6 +27,7 @@ pub fn 
srgb_from_linear(linear: f32) -> f32 { } #[inline(always)] +/// Linear transfer function for Rec.709 pub fn rec709_to_linear(gamma: f32) -> f32 { return if gamma < 0.0f32 { 0.0f32 @@ -38,6 +41,7 @@ pub fn rec709_to_linear(gamma: f32) -> f32 { } #[inline(always)] +/// Gamma transfer function for Rec.709 pub fn rec709_from_linear(linear: f32) -> f32 { return if linear < 0.0f32 { 0.0f32 @@ -51,26 +55,43 @@ pub fn rec709_from_linear(linear: f32) -> f32 { } #[inline(always)] +/// Pure gamma transfer function with an arbitrary exponent, clamped to [0, 1] +pub fn pure_gamma_function(x: f32, gamma: f32) -> f32 { + if x <= 0f32 { + return 0f32; + } else if x >= 1f32 { + return 1f32; + } else { + return x.powf(gamma); + } +} + +#[inline(always)] +/// Pure gamma transfer function for gamma 2.2 pub fn gamma2p2_from_linear(linear: f32) -> f32 { - linear.powf(1f32 / 2.2f32) + pure_gamma_function(linear, 1f32 / 2.2f32) } #[inline(always)] +/// Linear transfer function for gamma 2.2 pub fn gamma2p2_to_linear(gamma: f32) -> f32 { - gamma.powf(2.2f32) + pure_gamma_function(gamma, 2.2f32) } #[inline(always)] +/// Pure gamma transfer function for gamma 2.8 pub fn gamma2p8_from_linear(linear: f32) -> f32 { - linear.powf(1f32 / 2.8f32) + pure_gamma_function(linear, 1f32 / 2.8f32) } #[inline(always)] +/// Linear transfer function for gamma 2.8 pub fn gamma2p8_to_linear(gamma: f32) -> f32 { - gamma.powf(2.8f32) + pure_gamma_function(gamma, 2.8f32) } #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +/// Declares transfer function for transfer components into a linear colorspace and its inverse pub enum TransferFunction { /// sRGB Transfer function Srgb, diff --git a/src/hsl.rs b/src/hsl.rs index baf4d98..d831822 100644 --- a/src/hsl.rs +++ b/src/hsl.rs @@ -1,9 +1,13 @@ use crate::rgb::Rgb; #[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] +/// Represents *HSL* (hue, saturation, lightness) colorspace, H ∈ [0, 360f32], s ∈ [0f32, 1f32], v ∈ [0f32, 1f32] pub struct Hsl { + /// Hue H ∈ [0, 360f32] pub h: f32,
+ /// Saturation s ∈ [0, 1f32] pub s: f32, + /// Lightness v ∈ [0, 1f32] pub l: f32, } diff --git a/src/hsv.rs b/src/hsv.rs index 79b1e17..c10a948 100644 --- a/src/hsv.rs +++ b/src/hsv.rs @@ -1,9 +1,13 @@ use crate::rgb::Rgb; #[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] +/// Represents *HSV* (hue, saturation, value) colorspace, H ∈ [0, 360f32], s ∈ [0f32, 1f32], v ∈ [0f32, 1f32] pub struct Hsv { + /// Hue H ∈ [0, 360f32] pub h: f32, + /// Saturation s ∈ [0, 1f32] pub s: f32, + /// Value v ∈ [0, 1f32] pub v: f32, } diff --git a/src/lab.rs b/src/lab.rs index 1b69cb1..f1dcf99 100644 --- a/src/lab.rs +++ b/src/lab.rs @@ -1,11 +1,14 @@ use crate::rgb::Rgb; use crate::xyz::Xyz; -/// A CIELAB color. +/// Represents CIELAB color space. #[derive(Copy, Clone, Debug, Default, PartialOrd, PartialEq)] pub struct Lab { + /// `l`: lightness component (0 to 100) pub l: f32, + /// `a`: green (negative) and red (positive) component. pub a: f32, + /// `b`: blue (negative) and yellow (positive) component pub b: f32, } diff --git a/src/lib.rs b/src/lib.rs index 99ecd51..7cba515 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,12 +19,14 @@ mod image_xyza_laba; mod lab; mod linear_to_image; mod linear_to_image_u8; +pub mod linear_to_planar; mod luv; #[cfg(all( any(target_arch = "aarch64", target_arch = "arm"), target_feature = "neon" ))] mod neon; +pub mod planar_to_linear; mod rgb; mod rgb_expand; mod rgba; diff --git a/src/linear_to_planar.rs b/src/linear_to_planar.rs new file mode 100644 index 0000000..ba90d4e --- /dev/null +++ b/src/linear_to_planar.rs @@ -0,0 +1,92 @@ +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +use crate::neon::linear_to_planar::neon_linear_plane_to_gamma; +use crate::TransferFunction; + +#[inline(always)] +fn linear_to_gamma_channels( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + let mut src_offset = 0usize; 
+ let mut dst_offset = 0usize; + + let transfer = transfer_function.get_gamma_function(); + + for _ in 0..height as usize { + let mut _cx = 0usize; + + #[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" + ))] + unsafe { + _cx = neon_linear_plane_to_gamma( + _cx, + src.as_ptr(), + src_offset as u32, + dst.as_mut_ptr(), + dst_offset as u32, + width, + transfer_function, + ); + } + + let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *const f32 }; + let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) }; + + for x in _cx..width as usize { + let px = x; + let src_slice = unsafe { src_ptr.add(px) }; + let pixel = unsafe { src_slice.read_unaligned() }.min(1f32).max(0f32); + + let dst = unsafe { dst_ptr.add(px) }; + let transferred = transfer(pixel); + let rgb8 = (transferred * 255f32).min(255f32).max(0f32) as u8; + + unsafe { + dst.write_unaligned(rgb8); + } + } + + src_offset += src_stride as usize; + dst_offset += dst_stride as usize; + } +} + +/// This function converts Linear to Plane. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains Linear plane data +/// * `src_stride` - Bytes per row for src data. +/// * `dst` - A mutable slice to receive Gamma plane data +/// * `dst_stride` - Bytes per row for dst data +/// * `width` - Image width +/// * `height` - Image height +/// * `transfer_function` - Transfer function from gamma to linear space. If you don't have specific pick `Srgb` +pub fn linear_to_plane( + src: &[f32], + src_stride: u32, + dst: &mut [u8], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + linear_to_gamma_channels( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} diff --git a/src/luv.rs b/src/luv.rs index 62bb2a9..f89e093 100644 --- a/src/luv.rs +++ b/src/luv.rs @@ -1,5 +1,5 @@ //! # Luv -/// Struct representing a color in CIALuv, a.k.a. 
L\*u\*v\*, color space +/// Struct representing a color in CIE LUV, a.k.a. L\*u\*v\*, color space #[derive(Debug, Copy, Clone, Default, PartialOrd)] pub struct Luv { /// The L\* value (achromatic luminance) of the colour in 0–100 range. @@ -22,7 +22,7 @@ pub struct Luv { pub v: f32, } -/// Struct representing a color in cylindrical CIELCh(uv) color space +/// Representing a color in cylindrical CIE LCh(uv) color space #[derive(Debug, Copy, Clone, Default, PartialOrd)] pub struct LCh { /// The L\* value (achromatic luminance) of the colour in 0–100 range. diff --git a/src/neon/cie.rs b/src/neon/cie.rs index 94cd620..4630546 100644 --- a/src/neon/cie.rs +++ b/src/neon/cie.rs @@ -2,10 +2,12 @@ use crate::luv::{ LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y, LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, LUV_WHITE_V_PRIME, }; -use crate::neon::math::{ - prefer_vfmaq_f32, vatan2q_f32, vcbrtq_f32, vcolorq_matrix_f32, vcosq_f32, vcubeq_f32, - vhypotq_f32, vsinq_f32, -}; +use crate::neon::math::{prefer_vfmaq_f32, vcolorq_matrix_f32, vcubeq_f32}; +use erydanos::neon::atan2f::vatan2q_f32; +use erydanos::neon::cbrtf::vcbrtq_f32; +use erydanos::neon::cosf::vcosq_f32; +use erydanos::neon::hypotf::vhypotq_fast_f32; +use erydanos::neon::sinf::vsinq_f32; use std::arch::aarch64::*; #[inline(always)] @@ -100,7 +102,7 @@ pub(crate) unsafe fn neon_triple_to_lch( z: float32x4_t, ) -> (float32x4_t, float32x4_t, float32x4_t) { let (luv_l, luv_u, luv_v) = neon_triple_to_luv(x, y, z); - let lch_c = vhypotq_f32(luv_u, luv_v); + let lch_c = vhypotq_fast_f32(luv_u, luv_v); let lch_h = vatan2q_f32(luv_v, luv_u); (luv_l, lch_c, lch_h) } diff --git a/src/neon/linear_to_planar.rs b/src/neon/linear_to_planar.rs new file mode 100644 index 0000000..911c891 --- /dev/null +++ b/src/neon/linear_to_planar.rs @@ -0,0 +1,79 @@ +use std::arch::aarch64::*; + +use crate::neon::get_neon_gamma_transfer; +use crate::TransferFunction; + +#[inline(always)] +unsafe fn transfer_to_gamma( + r: float32x4_t, + 
transfer: &unsafe fn(float32x4_t) -> float32x4_t, +) -> uint32x4_t { + let r_f = vcvtaq_u32_f32(vmulq_n_f32(transfer(r), 255f32)); + r_f +} + +#[inline(always)] +unsafe fn process_set( + k: float32x4x4_t, + function: &unsafe fn(float32x4_t) -> float32x4_t, +) -> uint8x16_t { + let y0 = transfer_to_gamma(k.0, &function); + let y1 = transfer_to_gamma(k.1, &function); + let y2 = transfer_to_gamma(k.2, &function); + let y3 = transfer_to_gamma(k.3, &function); + + let y_row01 = vcombine_u16(vqmovn_u32(y0), vqmovn_u32(y1)); + let y_row23 = vcombine_u16(vqmovn_u32(y2), vqmovn_u32(y3)); + + let r_row = vcombine_u8(vqmovn_u16(y_row01), vqmovn_u16(y_row23)); + r_row +} + +#[inline] +pub unsafe fn neon_linear_plane_to_gamma( + start_cx: usize, + src: *const f32, + src_offset: u32, + dst: *mut u8, + dst_offset: u32, + width: u32, + transfer_function: TransferFunction, +) -> usize { + let mut cx = start_cx; + + let function = get_neon_gamma_transfer(transfer_function); + + while cx + 64 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); + + let pixel_row0 = vld1q_f32_x4(offset_src_ptr); + let pixel_row1 = vld1q_f32_x4(offset_src_ptr.add(16)); + let pixel_row2 = vld1q_f32_x4(offset_src_ptr.add(32)); + let pixel_row3 = vld1q_f32_x4(offset_src_ptr.add(48)); + + let set0 = process_set(pixel_row0, &function); + let set1 = process_set(pixel_row1, &function); + let set2 = process_set(pixel_row2, &function); + let set3 = process_set(pixel_row3, &function); + + let dst_ptr = dst.add(dst_offset as usize + cx); + + let pixel_set = uint8x16x4_t(set0, set1, set2, set3); + vst1q_u8_x4(dst_ptr, pixel_set); + + cx += 64; + } + + while cx + 16 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx); + + let pixel_row = vld1q_f32_x4(offset_src_ptr); + let r_row = process_set(pixel_row, &function); + let dst_ptr = dst.add(dst_offset as usize + cx); + vst1q_u8(dst_ptr, r_row); + + cx
+= 16; + } + + cx +} diff --git a/src/neon/math.rs b/src/neon/math.rs index 7b4a868..2852e44 100644 --- a/src/neon/math.rs +++ b/src/neon/math.rs @@ -1,5 +1,7 @@ use std::arch::aarch64::*; +use erydanos::neon::powf::vpowq_fast_f32; + #[inline(always)] #[allow(dead_code)] pub(crate) unsafe fn vfmodq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { @@ -301,13 +303,13 @@ pub unsafe fn vlogq_f32_ulp35(d: float32x4_t) -> float32x4_t { } #[inline(always)] -pub unsafe fn vpowq_f32(val: float32x4_t, n: float32x4_t) -> float32x4_t { - return vexpq_f32(vmulq_f32(n, vlogq_f32::(val))); +pub unsafe fn vpowjq_f32(val: float32x4_t, n: float32x4_t) -> float32x4_t { + vpowq_fast_f32(val, n) } #[inline(always)] pub unsafe fn vpowq_n_f32(t: float32x4_t, power: f32) -> float32x4_t { - return vpowq_f32(t, vdupq_n_f32(power)); + return vpowjq_f32(t, vdupq_n_f32(power)); } #[inline(always)] @@ -359,109 +361,6 @@ pub(crate) unsafe fn vmlafq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) prefer_vfmaq_f32(c, b, a) } -#[inline(always)] -pub unsafe fn vcbrtq_f32(d: float32x4_t) -> float32x4_t { - vcbrtq_f32_ulp2::(d) -} - -#[inline(always)] -/// Precise version of Cube Root with ULP 2 -pub unsafe fn vcbrtq_f32_ulp2(x: float32x4_t) -> float32x4_t { - let x1p24 = vreinterpretq_f32_u32(vdupq_n_u32(0x4b800000)); // 0x1p24f === 2 ^ 24 - - let mut ui = vreinterpretq_u32_f32(x); - let hx = vandq_u32(ui, vdupq_n_u32(0x7fffffff)); - - let nan_mask = vcgeq_u32(hx, vdupq_n_u32(0x7f800000)); - let is_zero_mask = vceqzq_u32(hx); - - let lo_mask = vcltq_u32(hx, vdupq_n_u32(0x00800000)); - let hi_ui_f = vreinterpretq_u32_f32(vmulq_f32(x, x1p24)); - let mut lo_hx = vandq_u32(hi_ui_f, vdupq_n_u32(0x7fffffff)); - lo_hx = vaddq_u32( - vcvtq_u32_f32(vmulq_n_f32(vcvtq_f32_u32(lo_hx), 1f32 / 3f32)), - vdupq_n_u32(642849266), - ); - let hi_hx = vaddq_u32( - vcvtq_u32_f32(vmulq_n_f32(vcvtq_f32_u32(hx), 1f32 / 3f32)), - vdupq_n_u32(709958130), - ); - let hx = vbslq_u32(lo_mask, lo_hx, hi_hx); - - ui = 
vbslq_u32(lo_mask, hi_ui_f, ui); - ui = vandq_u32(ui, vdupq_n_u32(0x80000000)); - ui = vorrq_u32(ui, hx); - - let mut t = vreinterpretq_f32_u32(ui); - let mut r = vmulq_f32(vmulq_f32(t, t), t); - - let sum_x = vaddq_f32(x, x); - - t = vmulq_f32( - vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)), - t, - ); - - r = vmulq_f32(vmulq_f32(t, t), t); - t = vmulq_f32( - vdivq_f32(vaddq_f32(sum_x, r), vaddq_f32(vaddq_f32(r, r), x)), - t, - ); - if HANDLE_NAN { - t = vbslq_f32(nan_mask, vdupq_n_f32(f32::NAN), t); - t = vbslq_f32(is_zero_mask, vdupq_n_f32(0f32), t); - } - t -} - -#[inline(always)] -#[allow(dead_code)] -/// Precise version of Cube Root with ULP 3.5 -pub unsafe fn vcbrtq_f32_ulp35(d: float32x4_t) -> float32x4_t { - let mut q = vdupq_n_f32(1f32); - let e = vaddq_s32(vilogbk_vi2_vf(vabsq_f32(d)), vdupq_n_s32(1)); - let mut d = vldexp2q_f32(d, vnegq_s32(e)); - - let t = vaddq_f32(vcvtq_f32_s32(e), vdupq_n_f32(6144f32)); - let qu = vcvtq_s32_f32(vmulq_n_f32(t, 1.0f32 / 3.0f32)); - let re = vcvtq_s32_f32(vsubq_f32(t, vmulq_n_f32(vcvtq_f32_s32(qu), 3f32))); - - q = vbslq_f32( - vceqq_s32(re, vdupq_n_s32(1)), - vdupq_n_f32(1.2599210498948731647672106f32), - q, - ); - q = vbslq_f32( - vceqq_s32(re, vdupq_n_s32(2)), - vdupq_n_f32(1.5874010519681994747517056f32), - q, - ); - q = vldexp2q_f32(q, vsubq_s32(qu, vdupq_n_s32(2048))); - - q = vmulsignq_f32(q, d); - d = vabsq_f32(d); - - let mut x = vdupq_n_f32(-0.601564466953277587890625f32); - x = vmlafq_f32(x, d, vdupq_n_f32(2.8208892345428466796875f32)); - x = vmlafq_f32(x, d, vdupq_n_f32(-5.532182216644287109375f32)); - x = vmlafq_f32(x, d, vdupq_n_f32(5.898262500762939453125f32)); - x = vmlafq_f32(x, d, vdupq_n_f32(-3.8095417022705078125f32)); - x = vmlafq_f32(x, d, vdupq_n_f32(2.2241256237030029296875f32)); - - let mut y = vmulq_f32(vmulq_f32(d, x), x); - y = vmulq_f32( - vsubq_f32( - y, - vmulq_f32( - vmulq_n_f32(y, 2.0f32 / 3.0f32), - vmlafq_f32(y, x, vdupq_n_f32(-1.0f32)), - ), - ), - q, - ); - return y; 
-} - #[inline(always)] pub unsafe fn vcolorq_matrix_f32( r: float32x4_t, @@ -482,187 +381,6 @@ pub unsafe fn vcolorq_matrix_f32( let new_b = prefer_vfmaq_f32(prefer_vfmaq_f32(vmulq_f32(g, c8), b, c9), r, c7); (new_r, new_g, new_b) } - -#[inline(always)] -pub unsafe fn vhypotq_f32(x: float32x4_t, y: float32x4_t) -> float32x4_t { - let xp2 = vmulq_f32(x, x); - let yp2 = vmulq_f32(y, y); - let z = vaddq_f32(xp2, yp2); - return vsqrtq_f32(z); -} - -#[inline(always)] -pub unsafe fn vpoly4q_f32( - x: float32x4_t, - x2: float32x4_t, - c3: float32x4_t, - c2: float32x4_t, - c1: float32x4_t, - c0: float32x4_t, -) -> float32x4_t { - vmlafq_f32(x2, vmlafq_f32(x, c3, c2), vmlafq_f32(x, c1, c0)) -} - -#[inline(always)] -pub unsafe fn vpoly8q_f32( - x: float32x4_t, - x2: float32x4_t, - x4: float32x4_t, - c7: float32x4_t, - c6: float32x4_t, - c5: float32x4_t, - c4: float32x4_t, - c3: float32x4_t, - c2: float32x4_t, - c1: float32x4_t, - c0: float32x4_t, -) -> float32x4_t { - vmlafq_f32( - x4, - vpoly4q_f32(x, x2, c7, c6, c5, c4), - vpoly4q_f32(x, x2, c3, c2, c1, c0), - ) -} - -#[inline(always)] -unsafe fn vatan2q_f32_impl(y: float32x4_t, x: float32x4_t) -> float32x4_t { - let q = vbslq_s32(vcltzq_f32(x), vdupq_n_s32(-2), vdupq_n_s32(0)); - let x = vabsq_f32(x); - let is_y_more_than_x = vcgtq_f32(y, x); - let t = vbslq_f32(is_y_more_than_x, x, vdupq_n_f32(0f32)); - let x = vbslq_f32(is_y_more_than_x, y, x); - let y = vbslq_f32(is_y_more_than_x, vnegq_f32(t), y); - let q = vbslq_s32(is_y_more_than_x, vaddq_s32(q, vdupq_n_s32(1)), q); - let s = vdivq_f32(y, x); - let t = vmulq_f32(s, s); - let t2 = vmulq_f32(t, t); - let t4 = vmulq_f32(t2, t2); - let poly = vpoly8q_f32( - t, - t2, - t4, - vdupq_n_f32(0.00282363896258175373077393f32), - vdupq_n_f32(-0.0159569028764963150024414f32), - vdupq_n_f32(0.0425049886107444763183594f32), - vdupq_n_f32(-0.0748900920152664184570312f32), - vdupq_n_f32(0.106347933411598205566406f32), - vdupq_n_f32(-0.142027363181114196777344f32), - 
vdupq_n_f32(0.199926957488059997558594f32), - vdupq_n_f32(-0.333331018686294555664062f32), - ); - let t = prefer_vfmaq_f32(s, vmulq_f32(poly, t), s); - let t = prefer_vfmaq_f32( - t, - vcvtq_f32_s32(q), - vdupq_n_f32(std::f32::consts::FRAC_PI_2), - ); - t -} - -#[inline(always)] -pub unsafe fn visnegq_f32(x: float32x4_t) -> uint32x4_t { - vcltzq_f32(x) -} - -#[inline(always)] -pub unsafe fn vatan2q_f32(y: float32x4_t, x: float32x4_t) -> float32x4_t { - let r = vatan2q_f32_impl(vabsq_f32(y), x); - let mut r = vmulsignq_f32(r, x); - let y_zero_mask = vceqzq_f32(y); - r = vbslq_f32(vceqzq_f32(x), vdupq_n_f32(std::f32::consts::FRAC_PI_2), r); - r = vbslq_f32( - y_zero_mask, - vbslq_f32( - visnegq_f32(x), - vdupq_n_f32(std::f32::consts::PI), - vdupq_n_f32(0f32), - ), - r, - ); - vmulsignq_f32(r, y) -} - -#[inline(always)] -pub unsafe fn vsinq_f32(val: float32x4_t) -> float32x4_t { - let pi_v = vdupq_n_f32(std::f32::consts::PI); - let pio2_v = vdupq_n_f32(std::f32::consts::FRAC_PI_2); - let ipi_v = vdupq_n_f32(std::f32::consts::FRAC_1_PI); - - //Find positive or negative - let c_v = vabsq_s32(vcvtq_s32_f32(vmulq_f32(val, ipi_v))); - let sign_v = vcleq_f32(val, vdupq_n_f32(0f32)); - let odd_v = vandq_u32(vreinterpretq_u32_s32(c_v), vdupq_n_u32(1)); - - let neg_v = veorq_u32(odd_v, sign_v); - - //Modulus a - (n * int(a*(1/n))) - let mut ma = vsubq_f32(vabsq_f32(val), vmulq_f32(pi_v, vcvtq_f32_s32(c_v))); - let reb_v = vcgeq_f32(ma, pio2_v); - - //Rebase a between 0 and pi/2 - ma = vbslq_f32(reb_v, vsubq_f32(pi_v, ma), ma); - - //Taylor series - let ma2 = vmulq_f32(ma, ma); - - //2nd elem: x^3 / 3! - let mut elem = vmulq_f32(vmulq_f32(ma, ma2), vdupq_n_f32(0.166666666666f32)); - let mut res = vsubq_f32(ma, elem); - - //3rd elem: x^5 / 5! 
- elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(0.05f32)); - res = vaddq_f32(res, elem); - - //4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val) - elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(0.023809523810f32)); - res = vsubq_f32(res, elem); - - //5th elem: x^9 / 9! - elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(0.013888888889f32)); - res = vaddq_f32(res, elem); - - //Change of sign - let neg_v = vshlq_n_u32::<31>(neg_v); - res = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(res), neg_v)); - return res; -} - -#[inline(always)] -pub unsafe fn vcosq_f32(d: float32x4_t) -> float32x4_t { - let mut q = vcvtq_s32_f32(vsubq_f32( - vmulq_f32(d, vdupq_n_f32(std::f32::consts::FRAC_1_PI)), - vdupq_n_f32(0.5f32), - )); - - q = vaddq_s32(vaddq_s32(q, q), vdupq_n_s32(1)); - - let mut u = vcvtq_f32_s32(q); - let mut d = vmlafq_f32(u, vdupq_n_f32(-0.78515625f32 * 2f32), d); - d = vmlafq_f32(u, vdupq_n_f32(-0.00024187564849853515625f32 * 2f32), d); - d = vmlafq_f32(u, vdupq_n_f32(-3.7747668102383613586e-08f32 * 2f32), d); - d = vmlafq_f32(u, vdupq_n_f32(-1.2816720341285448015e-12f32 * 2f32), d); - - let s = vmulq_f32(d, d); - - d = vreinterpretq_f32_u32(veorq_u32( - vandq_u32( - vceqq_s32(vandq_s32(q, vdupq_n_s32(2)), vdupq_n_s32(0)), - vreinterpretq_u32_f32(vdupq_n_f32(-0.0f32)), - ), - vreinterpretq_u32_f32(d), - )); - - u = vdupq_n_f32(2.6083159809786593541503e-06f32); - u = vmlafq_f32(u, s, vdupq_n_f32(-0.0001981069071916863322258f32)); - u = vmlafq_f32(u, s, vdupq_n_f32(0.00833307858556509017944336f32)); - u = vmlafq_f32(u, s, vdupq_n_f32(-0.166666597127914428710938f32)); - - u = vmlafq_f32(s, vmulq_f32(u, d), d); - - u = vreinterpretq_f32_u32(vorrq_u32(vispinfq_f32(d), vreinterpretq_u32_f32(u))); - return u; -} - #[inline(always)] pub(crate) unsafe fn vcubeq_f32(x: float32x4_t) -> float32x4_t { vmulq_f32(vmulq_f32(x, x), x) diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 78d1949..0e9dc03 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs 
@@ -5,7 +5,9 @@ mod gamma_curves; mod hsv_to_image; mod image_to_hsv; mod linear_to_image; +pub mod linear_to_planar; mod math; +pub mod planar_to_linear; mod sigmoidal; mod to_linear; mod to_linear_u8; diff --git a/src/neon/planar_to_linear.rs b/src/neon/planar_to_linear.rs new file mode 100644 index 0000000..6b71d8d --- /dev/null +++ b/src/neon/planar_to_linear.rs @@ -0,0 +1,86 @@ +use crate::gamma_curves::TransferFunction; +use crate::neon::*; +use std::arch::aarch64::*; + +#[inline(always)] +unsafe fn neon_to_linear( + r: uint32x4_t, + transfer: &unsafe fn(float32x4_t) -> float32x4_t, +) -> float32x4_t { + let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); + transfer(r_f) +} + +#[inline] +unsafe fn process_pixels( + pixels: uint8x16_t, + transfer: &unsafe fn(float32x4_t) -> float32x4_t, +) -> float32x4x4_t { + let r_low = vmovl_u8(vget_low_u8(pixels)); + + let r_low_low = vmovl_u16(vget_low_u16(r_low)); + + let x_low_low = neon_to_linear(r_low_low, &transfer); + + let r_low_high = vmovl_high_u16(r_low); + + let x_low_high = neon_to_linear(r_low_high, &transfer); + + let r_high = vmovl_high_u8(pixels); + + let r_high_low = vmovl_u16(vget_low_u16(r_high)); + + let x_high_low = neon_to_linear(r_high_low, &transfer); + + let r_high_high = vmovl_high_u16(r_high); + + let x_high_high = neon_to_linear(r_high_high, &transfer); + + let storing_row = float32x4x4_t(x_low_low, x_low_high, x_high_low, x_high_high); + storing_row +} + +#[inline(always)] +pub unsafe fn neon_plane_to_linear( + start_cx: usize, + src: *const u8, + src_offset: usize, + width: u32, + dst: *mut f32, + dst_offset: usize, + transfer_function: TransferFunction, +) -> usize { + let mut cx = start_cx; + let transfer = get_neon_linear_transfer(transfer_function); + + let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; + + while cx + 64 < width as usize { + let src_ptr = src.add(src_offset + cx); + let pixels_row64 = vld1q_u8_x4(src_ptr); + let storing_row0 = process_pixels(pixels_row64.0, 
&transfer); + vst1q_f32_x4(dst_ptr.add(cx), storing_row0); + + let storing_row1 = process_pixels(pixels_row64.1, &transfer); + vst1q_f32_x4(dst_ptr.add(cx + 16), storing_row1); + + let storing_row2 = process_pixels(pixels_row64.2, &transfer); + vst1q_f32_x4(dst_ptr.add(cx + 32), storing_row2); + + let storing_row3 = process_pixels(pixels_row64.3, &transfer); + vst1q_f32_x4(dst_ptr.add(cx + 48), storing_row3); + + cx += 64; + } + + while cx + 16 < width as usize { + let src_ptr = src.add(src_offset + cx); + let pixels = vld1q_u8(src_ptr); + let storing_row = process_pixels(pixels, &transfer); + vst1q_f32_x4(dst_ptr.add(cx), storing_row); + + cx += 16; + } + + cx +} diff --git a/src/planar_to_linear.rs b/src/planar_to_linear.rs new file mode 100644 index 0000000..7bf4672 --- /dev/null +++ b/src/planar_to_linear.rs @@ -0,0 +1,89 @@ +#[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" +))] +use crate::neon::planar_to_linear::neon_plane_to_linear; +use crate::TransferFunction; + +#[inline(always)] +fn channels_to_linear( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + let mut src_offset = 0usize; + let mut dst_offset = 0usize; + + let transfer = transfer_function.get_linearize_function(); + for _ in 0..height as usize { + let mut _cx = 0usize; + + let src_ptr = unsafe { src.as_ptr().add(src_offset) }; + let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; + + #[cfg(all( + any(target_arch = "aarch64", target_arch = "arm"), + target_feature = "neon" + ))] + unsafe { + _cx = neon_plane_to_linear( + _cx, + src.as_ptr(), + src_offset, + width, + dst.as_mut_ptr(), + dst_offset, + transfer_function, + ) + } + + for x in _cx..width as usize { + let px = x; + let dst = unsafe { dst_ptr.add(px) }; + let src = unsafe { src_ptr.add(px) }; + let pixel_f = unsafe { src.read_unaligned() as f32 } * (1. 
/ 255.); + let transferred = transfer(pixel_f); + + unsafe { + dst.write_unaligned(transferred); + } + } + + src_offset += src_stride as usize; + dst_offset += dst_stride as usize; + } +} + +/// This function converts Plane to Linear. This is much more effective than naive direct transformation +/// +/// # Arguments +/// * `src` - A slice contains RGB data +/// * `src_stride` - Bytes per row for src data. +/// * `width` - Image width +/// * `height` - Image height +/// * `dst` - A mutable slice to receive linear data +/// * `dst_stride` - Bytes per row for dst data +/// * `transfer_function` - Transfer function from gamma to linear space. If you don't have specific pick `Srgb` +pub fn plane_to_linear( + src: &[u8], + src_stride: u32, + dst: &mut [f32], + dst_stride: u32, + width: u32, + height: u32, + transfer_function: TransferFunction, +) { + channels_to_linear( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); +} diff --git a/src/rgb.rs b/src/rgb.rs index 3c46237..16e0770 100644 --- a/src/rgb.rs +++ b/src/rgb.rs @@ -4,9 +4,13 @@ use crate::luv::Luv; use crate::{Hsl, LCh, Sigmoidal}; #[derive(Debug, PartialOrd, PartialEq, Clone, Copy)] +/// Represents any RGB values, Rgb, Rgb etc. pub struct Rgb { + /// Red component pub r: T, + /// Green component pub g: T, + /// Blue component pub b: T, } diff --git a/src/rgba.rs b/src/rgba.rs index d180e4b..f17abc6 100644 --- a/src/rgba.rs +++ b/src/rgba.rs @@ -2,10 +2,15 @@ use crate::rgb::Rgb; use half::f16; #[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] +/// Represents any RGBA values, Rgba, Rgba etc. 
pub struct Rgba { + /// Red component pub r: T, + /// Green component pub g: T, + /// Blue component pub b: T, + /// Alpha component pub a: T, } @@ -131,6 +136,8 @@ impl ToRgbaF16 for Rgba { } } +#[derive(Debug, Copy, Clone, PartialOrd, PartialEq)] +/// Represents RGB 565 color in one u16 pub struct Rgb565 { pub rgb565: u16, } @@ -199,6 +206,8 @@ impl ToRgb565 for Rgba { } } +#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +/// Represents RGBA 1010102 in one u32 store pub struct Rgba1010102 { pub rgba: u32, } diff --git a/src/xyz_transform.rs b/src/xyz_transform.rs index 09d3245..9b95285 100644 --- a/src/xyz_transform.rs +++ b/src/xyz_transform.rs @@ -1,25 +1,25 @@ -/// sRGB to XYZ, D65 White point +/// sRGB to XYZ transformation matrix, D65 White point pub const SRGB_TO_XYZ_D65: [[f32; 3]; 3] = [ [0.4124564f32, 0.3575761f32, 0.1804375f32], [0.2126729f32, 0.7151522f32, 0.0721750f32], [0.0193339f32, 0.1191920f32, 0.9503041f32], ]; -/// XYZ to sRGB, D65 White point +/// XYZ to sRGB transformation matrix, D65 White point pub const XYZ_TO_SRGB_D65: [[f32; 3]; 3] = [ [3.2404542f32, -1.5371385f32, -0.4985314f32], [-0.9692660f32, 1.8760108f32, 0.0415560f32], [0.0556434f32, -0.2040259f32, 1.0572252f32], ]; -/// sRGB to XYZ, D50 White point +/// sRGB to XYZ transformation matrix, D50 White point pub const SRGB_TO_XYZ_D50: [[f32; 3]; 3] = [ [0.4360747f32, 0.3850649f32, 0.1430804f32], [0.2225045f32, 0.7168786f32, 0.0606169f32], [0.0139322f32, 0.0971045f32, 0.7141733f32], ]; -/// XYZ to sRGB, D50 White point +/// XYZ to sRGB transformation matrix, D50 White point pub const XYZ_TO_SRGB_D50: [[f32; 3]; 3] = [ [3.1338561f32, -1.6168667f32, -0.4906146f32], [-0.9787684f32, 1.9161415f32, 0.0334540f32],