From f3ee41068187c9fa19d6dcb9a870742ede148e6f Mon Sep 17 00:00:00 2001
From: awxkee
Date: Thu, 10 Oct 2024 21:36:03 +0100
Subject: [PATCH] Major rework with speed improvements

---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 src/app/src/main.rs | 10 +-
 src/avx/cie.rs | 21 +-
 src/avx/image_to_oklab.rs | 392 ++------------------------------
 src/avx/linear_to_image.rs | 156 -------------
 src/avx/mod.rs | 4 -
 src/avx/oklab_to_image.rs | 312 ++-----------------------------
 src/avx/support.rs | 16 +-
 src/avx/to_linear.rs | 340 ---------------------------
 src/avx/to_xyz_lab.rs | 404 +++------------------------------
 src/avx/xyz_lab_to_image.rs | 278 ++++------------------
 src/avx/xyza_laba_to_image.rs | 217 ++++--------------
 src/image_to_linear.rs | 66 +-----
 src/image_to_linear_u8.rs | 64 +-----
 src/image_to_oklab.rs | 157 ++++++-------
 src/image_to_xyz_lab.rs | 359 ++++++++++++++++++-----------
 src/image_xyza_laba.rs | 197 ++++++++--------
 src/lab.rs | 9 +-
 src/linear_to_image.rs | 161 +++++--------
 src/linear_to_image_u8.rs | 66 +-----
 src/luv.rs | 39 +++-
 src/neon/cie.rs | 18 +-
 src/neon/image_to_oklab.rs | 320 +------------------------
 src/neon/linear_to_image.rs | 223 ------------------
 src/neon/mod.rs | 6 -
 src/neon/oklab_to_image.rs | 315 ++-----------------------------
 src/neon/to_linear.rs | 312 ------------------------
 src/neon/to_linear_u8.rs | 256 ---------------------
 src/neon/to_xyz_lab.rs | 376 +-----------------------------
 src/neon/to_xyza_laba.rs | 358 +----------------------------
 src/neon/xyz_lab_to_image.rs | 298 +-----------------------
 src/neon/xyza_laba_to_image.rs | 213 +---------------
 src/oklab.rs | 16 +-
 src/oklab_to_image.rs | 182 ++++++++-------
 src/oklch.rs | 19 +-
 src/rgb.rs | 19 ++
 src/sse/cie.rs | 34 +--
 src/sse/image_to_linear_u8.rs | 237 -------------------
 src/sse/image_to_oklab.rs | 296 ++----------------------
 src/sse/linear_to_image.rs | 167 --------------
 src/sse/mod.rs | 10 +-
 src/sse/oklab_to_image.rs | 263 ++------------------
 src/sse/to_linear.rs | 264 ---------------------
 src/sse/to_xyz_lab.rs | 327 +------------------------
 src/sse/to_xyza_laba.rs | 327 +------------------------
 src/sse/xyz_lab_to_image.rs | 272 ++--------------------
 src/sse/xyza_laba_to_image.rs | 187 ++------------
 src/xyz.rs | 2 +-
 src/xyz_lab_to_image.rs | 324 +++++++++++++++++---------
 src/xyza_laba_to_image.rs | 191 ++++++++--------
 51 files changed, 1311 insertions(+), 7793 deletions(-)
 delete mode 100644 src/avx/linear_to_image.rs
 delete mode 100644 src/avx/to_linear.rs
 delete mode 100644 src/neon/linear_to_image.rs
 delete mode 100644 src/neon/to_linear.rs
 delete mode 100644 src/neon/to_linear_u8.rs
 delete mode 100644 src/sse/image_to_linear_u8.rs
 delete mode 100644 src/sse/linear_to_image.rs
 delete mode 100644 src/sse/to_linear.rs

diff --git a/Cargo.lock b/Cargo.lock
index a9c1c58..9a7e897 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -169,7 +169,7 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
 
 [[package]]
 name = "colorutils-rs"
-version = "0.6.1"
+version = "0.7.0"
 dependencies = [
  "erydanos",
  "half",
diff --git a/Cargo.toml b/Cargo.toml
index b3e9b15..756da70 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@ workspace = { members = ["src/app"] }
 
 [package]
 name = "colorutils-rs"
-version = "0.6.1"
+version = "0.7.0"
 edition = "2021"
 description = "High performance utilities for color format handling and conversion."
readme = "README.md" diff --git a/src/app/src/main.rs b/src/app/src/main.rs index 1fff3d1..0b52aab 100644 --- a/src/app/src/main.rs +++ b/src/app/src/main.rs @@ -33,7 +33,7 @@ fn main() { // let restored = lalphabeta.to_rgb(TransferFunction::Srgb); // println!("Restored RGB {:?}", restored); - let img = ImageReader::open("./assets/asset.jpg") + let img = ImageReader::open("./assets/op_fhd.jpg") .unwrap() .decode() .unwrap(); @@ -41,7 +41,7 @@ fn main() { println!("dimensions {:?}", img.dimensions()); println!("{:?}", img.color()); - // let img = img.to_rgba8(); + let img = img.to_rgb8(); let mut src_bytes = img.as_bytes(); let width = dimensions.0; let height = dimensions.1; @@ -68,13 +68,14 @@ fn main() { lab_store.resize(width as usize * components * height as usize, 0.); let src_stride = width * components as u32; let start_time = Instant::now(); - rgb_to_oklch( + rgb_to_jzazbz( src_bytes, src_stride, &mut lab_store, store_stride as u32, width, height, + 200., TransferFunction::Srgb, ); let elapsed_time = start_time.elapsed(); @@ -103,13 +104,14 @@ fn main() { // } let start_time = Instant::now(); - oklch_to_rgb( + jzazbz_to_rgb( &lab_store, store_stride as u32, &mut dst_slice, src_stride, width, height, + 200., TransferFunction::Srgb, ); diff --git a/src/avx/cie.rs b/src/avx/cie.rs index 0b61fa0..2977a36 100644 --- a/src/avx/cie.rs +++ b/src/avx/cie.rs @@ -6,13 +6,11 @@ */ use crate::avx::_mm256_cube_ps; -use crate::avx::gamma_curves::perform_avx2_linear_transfer; use crate::avx::math::*; use crate::luv::{ LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y, LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, LUV_WHITE_V_PRIME, }; -use crate::TransferFunction; use erydanos::{ _mm256_atan2_ps, _mm256_cbrt_ps, _mm256_cos_ps, _mm256_hypot_ps, _mm256_prefer_fma_ps, _mm256_select_ps, _mm256_sin_ps, @@ -103,9 +101,9 @@ pub(crate) unsafe fn avx_lch_to_xyz(l: __m256, c: __m256, h: __m256) -> (__m256, #[inline(always)] pub(crate) unsafe fn avx2_triple_to_xyz( - r: __m256i, - g: __m256i, - b: __m256i, + r: __m256, + g: __m256, + b: __m256, c1: __m256, c2: __m256, c3: __m256, @@ -115,19 +113,8 @@ pub(crate) unsafe fn avx2_triple_to_xyz( c7: __m256, c8: __m256, c9: __m256, - transfer_function: TransferFunction, ) -> (__m256, __m256, __m256) { - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - let r_f = _mm256_mul_ps(_mm256_cvtepi32_ps(r), u8_scale); - let g_f = _mm256_mul_ps(_mm256_cvtepi32_ps(g), u8_scale); - let b_f = _mm256_mul_ps(_mm256_cvtepi32_ps(b), u8_scale); - let r_linear = perform_avx2_linear_transfer(transfer_function, r_f); - let g_linear = perform_avx2_linear_transfer(transfer_function, g_f); - let b_linear = perform_avx2_linear_transfer(transfer_function, b_f); - - let (x, y, z) = _mm256_color_matrix_ps( - r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, - ); + let (x, y, z) = _mm256_color_matrix_ps(r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9); (x, y, z) } diff --git a/src/avx/image_to_oklab.rs b/src/avx/image_to_oklab.rs index a3c59d9..3dad138 100644 --- a/src/avx/image_to_oklab.rs +++ b/src/avx/image_to_oklab.rs @@ -4,17 +4,13 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. 
*/ -use crate::avx::gamma_curves::perform_avx2_linear_transfer; -use crate::avx::routines::{ - avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half, - avx_vld_u8_and_deinterleave_quarter, -}; +use crate::avx::routines::avx_vld_f32_and_deinterleave; use crate::avx::{_mm256_color_matrix_ps, avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::{ - avx_store_and_interleave_v3_direct_f32, avx_store_and_interleave_v4_direct_f32, - TransferFunction, + avx_store_and_interleave_v3_direct_f32, avx_store_and_interleave_v4_direct_f32 + , }; use erydanos::{_mm256_atan2_ps, _mm256_cbrt_fast_ps, _mm256_hypot_fast_ps}; #[cfg(target_arch = "x86")] @@ -23,22 +19,12 @@ use std::arch::x86::*; use std::arch::x86_64::*; macro_rules! triple_to_oklab { - ($r: expr, $g: expr, $b: expr, $transfer: expr, $target: expr, + ($r: expr, $g: expr, $b: expr, $target: expr, $c0:expr, $c1:expr, $c2: expr, $c3: expr, $c4:expr, $c5: expr, $c6:expr, $c7: expr, $c8: expr, $m0: expr, $m1: expr, $m2: expr, $m3: expr, $m4: expr, $m5: expr, $m6: expr, $m7: expr, $m8: expr ) => {{ - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - let r_f = _mm256_mul_ps(_mm256_cvtepi32_ps($r), u8_scale); - let g_f = _mm256_mul_ps(_mm256_cvtepi32_ps($g), u8_scale); - let b_f = _mm256_mul_ps(_mm256_cvtepi32_ps($b), u8_scale); - - let r_linear = perform_avx2_linear_transfer($transfer, r_f); - let g_linear = perform_avx2_linear_transfer($transfer, g_f); - let b_linear = perform_avx2_linear_transfer($transfer, b_f); - - let (l_l, l_m, l_s) = _mm256_color_matrix_ps( - r_linear, g_linear, b_linear, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, - ); + let (l_l, l_m, l_s) = + _mm256_color_matrix_ps($r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8); let l_ = _mm256_cbrt_fast_ps(l_l); let m_ = _mm256_cbrt_fast_ps(l_m); @@ -61,12 +47,9 @@ macro_rules! 
triple_to_oklab { #[target_feature(enable = "avx2")] pub unsafe fn avx_image_to_oklab( start_cx: usize, - src: *const u8, - src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -99,369 +82,26 @@ pub unsafe fn avx_image_to_oklab(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = triple_to_oklab!( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if image_configuration.has_alpha() { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_low, y_low_low, z_low_low, a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_low_low, y_low_low, z_low_low); - } - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = triple_to_oklab!( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - if image_configuration.has_alpha() { - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 32); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_high, y_low_high, z_low_high, a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 8 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_low_high, y_low_high, z_low_high); - } - - let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan)); - let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); - let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); - - let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); - let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); - let b_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_high)); - - let (x_high_low, y_high_low, z_high_low) = triple_to_oklab!( - r_high_low, - g_high_low, - b_high_low, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - let a_high = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - if image_configuration.has_alpha() { - let a_high_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_high))), - u8_scale, - ); - let ptr = 
dst_ptr.add(cx * 4 + 8 * 4 * 2); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_high_low, y_high_low, z_high_low, a_high_low - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 8 * 3 * 2); - avx_store_and_interleave_v3_direct_f32!(ptr, x_high_low, y_high_low, z_high_low); - } - - let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high)); - let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); - let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); - - let (x_high_high, y_high_high, z_high_high) = triple_to_oklab!( - r_high_high, - g_high_high, - b_high_high, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - if image_configuration.has_alpha() { - let a_high_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(a_high))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4 + 8 * 4 * 3); - avx_store_and_interleave_v4_direct_f32!( - ptr, - x_high_high, - y_high_high, - z_high_high, - a_high_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 8 * 3 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_high_high, y_high_high, z_high_high); - } - - cx += 32; - } - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_half::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = triple_to_oklab!( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if image_configuration.has_alpha() { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_low, y_low_low, z_low_low, a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_low_low, y_low_low, z_low_low); - } - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = triple_to_oklab!( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 - ); - - if image_configuration.has_alpha() { - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 32); - avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_high, y_low_high, z_low_high, a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 8 * 3); - 
avx_store_and_interleave_v3_direct_f32!(ptr, x_low_high, y_low_high, z_low_high); - } - - cx += 16; - } - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + let in_place_ptr = dst_ptr.add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_quarter::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); + avx_vld_f32_and_deinterleave::(in_place_ptr); let (x_low_low, y_low_low, z_low_low) = triple_to_oklab!( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - target, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8 + r_chan, g_chan, b_chan, target, c0, c1, c2, c3, c4, c5, c6, c7, c8, m0, m1, m2, m3, m4, + m5, m6, m7, m8 ); - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - if image_configuration.has_alpha() { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4); avx_store_and_interleave_v4_direct_f32!( - ptr, x_low_low, y_low_low, z_low_low, a_low_low + in_place_ptr, + x_low_low, + y_low_low, + z_low_low, + a_chan ); } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(ptr, x_low_low, y_low_low, z_low_low); + avx_store_and_interleave_v3_direct_f32!(in_place_ptr, x_low_low, y_low_low, z_low_low); } cx += 8; diff --git a/src/avx/linear_to_image.rs b/src/avx/linear_to_image.rs deleted file mode 100644 index b4fa328..0000000 --- a/src/avx/linear_to_image.rs +++ /dev/null @@ -1,156 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */ - -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -use crate::avx::gamma_curves::perform_avx_gamma_transfer; -use crate::avx::routines::avx_vld_f32_and_deinterleave; -use crate::avx::{ - _mm256_packus_four_epi32, avx2_interleave_rgb, avx2_interleave_rgba_epi8, avx2_pack_u16, - avx2_pack_u32, -}; -use crate::image::ImageConfiguration; -use crate::{ - avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_u8, - avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_u8, TransferFunction, -}; - -#[inline(always)] -unsafe fn gamma_vld( - src: *const f32, - transfer_function: TransferFunction, -) -> (__m256i, __m256i, __m256i, __m256i) { - let v_scale_alpha = _mm256_set1_ps(255f32); - let (mut r_f32, mut g_f32, mut b_f32, mut a_f32) = - avx_vld_f32_and_deinterleave::(src); - r_f32 = perform_avx_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_avx_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_avx_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm256_mul_ps(r_f32, v_scale_alpha); - g_f32 = _mm256_mul_ps(g_f32, v_scale_alpha); - b_f32 = _mm256_mul_ps(b_f32, v_scale_alpha); - if USE_ALPHA { - a_f32 = _mm256_mul_ps(a_f32, v_scale_alpha); - } - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(a_f32)), - ) -} - -#[target_feature(enable = "avx2")] -pub unsafe fn avx_linear_to_gamma( - start_cx: usize, - src: *const f32, - src_offset: u32, - dst: *mut u8, - dst_offset: u32, - width: u32, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - while cx + 32 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(8 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - gamma_vld::(src_ptr_1, transfer_function); - - let src_ptr_2 = offset_src_ptr.add(8 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - gamma_vld::(src_ptr_2, transfer_function); - - let src_ptr_3 = offset_src_ptr.add(8 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - gamma_vld::(src_ptr_3, transfer_function); - - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, r_row2_, r_row3_); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, g_row2_, g_row3_); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, b_row2_, b_row3_); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, a_row2_, a_row3_); - avx_store_and_interleave_v4_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - avx_store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 32; - } - - let zeros = _mm256_setzero_si256(); - - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = 
offset_src_ptr.add(8 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - gamma_vld::(src_ptr_1, transfer_function); - - let r_row01 = avx2_pack_u32(r_row0_, r_row1_); - let g_row01 = avx2_pack_u32(g_row0_, g_row1_); - let b_row01 = avx2_pack_u32(b_row0_, b_row1_); - - let r_row = avx2_pack_u16(r_row01, zeros); - let g_row = avx2_pack_u16(g_row01, zeros); - let b_row = avx2_pack_u16(b_row01, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = avx2_pack_u32(a_row0_, a_row1_); - let a_row = avx2_pack_u16(a_row01, zeros); - avx_store_and_interleave_v4_half_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - avx_store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 16; - } - - cx -} diff --git a/src/avx/mod.rs b/src/avx/mod.rs index 8d56940..919d88c 100644 --- a/src/avx/mod.rs +++ b/src/avx/mod.rs @@ -9,13 +9,11 @@ mod cie; mod from_sigmoidal; mod gamma_curves; mod image_to_oklab; -mod linear_to_image; mod math; mod oklab_to_image; mod routines; mod sigmoidal; mod support; -mod to_linear; mod to_sigmoidal; mod to_xyz_lab; mod utils; @@ -24,11 +22,9 @@ mod xyza_laba_to_image; pub use from_sigmoidal::avx_from_sigmoidal_row; pub use image_to_oklab::avx_image_to_oklab; -pub use linear_to_image::avx_linear_to_gamma; pub use math::*; pub use oklab_to_image::avx_oklab_to_image; pub use support::*; -pub use to_linear::avx_channels_to_linear; pub use to_sigmoidal::avx_image_to_sigmoidal_row; pub use to_xyz_lab::*; pub use utils::*; diff --git a/src/avx/oklab_to_image.rs b/src/avx/oklab_to_image.rs index b40d587..c590277 100644 --- a/src/avx/oklab_to_image.rs +++ b/src/avx/oklab_to_image.rs @@ -11,24 +11,16 @@ use std::arch::x86_64::*; use erydanos::{_mm256_cos_ps, _mm256_sin_ps}; -use crate::avx::gamma_curves::perform_avx_gamma_transfer; use crate::avx::routines::avx_vld_f32_and_deinterleave_direct; -use crate::avx::{ - _mm256_color_matrix_ps, _mm256_cube_ps, _mm256_packus_four_epi32, avx2_interleave_rgb, - avx2_interleave_rgba_epi8, -}; +use crate::avx::{_mm256_color_matrix_ps, _mm256_cube_ps}; +use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; -use crate::{ - avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8, - avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, - avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction, -}; +use crate::{avx_store_and_interleave_v3_f32, avx_store_and_interleave_v4_f32}; #[inline(always)] unsafe fn avx_oklab_vld( src: *const f32, - transfer_function: TransferFunction, oklab_target: OklabTarget, m0: __m256, m1: __m256, @@ -48,11 +40,8 @@ unsafe fn avx_oklab_vld( c6: __m256, c7: __m256, c8: __m256, -) -> (__m256i, __m256i, __m256i, __m256i) { - let v_scale_alpha = _mm256_set1_ps(255f32); - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - - let (l, mut a, mut b, mut a_f32) = +) -> (__m256, __m256, __m256, __m256) { + let (l, mut a, mut b, a_f32) = avx_vld_f32_and_deinterleave_direct::(src); if oklab_target == OklabTarget::Oklch { @@ -70,44 +59,17 @@ unsafe fn avx_oklab_vld( l_s = _mm256_cube_ps(l_s); let (r_l, g_l, b_l) = _mm256_color_matrix_ps(l_l, l_m, l_s, c0, c1, c2, c3, c4, c5, c6, c7, c8); - - let mut r_f32 = perform_avx_gamma_transfer(transfer_function, r_l); - let mut g_f32 = perform_avx_gamma_transfer(transfer_function, 
g_l); - let mut b_f32 = perform_avx_gamma_transfer(transfer_function, b_l); - - r_f32 = _mm256_mul_ps(r_f32, v_scale_alpha); - g_f32 = _mm256_mul_ps(g_f32, v_scale_alpha); - b_f32 = _mm256_mul_ps(b_f32, v_scale_alpha); - if image_configuration.has_alpha() { - a_f32 = _mm256_mul_ps(a_f32, v_scale_alpha); - } - - if image_configuration.has_alpha() { - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(b_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(a_f32)), - ) - } else { - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(b_f32)), - _mm256_set1_epi32(255), - ) - } + (r_l, g_l, b_l, a_f32) } #[target_feature(enable = "avx2")] pub unsafe fn avx_oklab_to_image( start_cx: usize, src: *const f32, - src_offset: u32, - dst: *mut u8, + src_offset: usize, + dst: *mut f32, dst_offset: u32, width: u32, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -138,218 +100,6 @@ pub unsafe fn avx_oklab_to_image( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::( - src_ptr_1, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_2 = offset_src_ptr.add(8 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = avx_oklab_vld::( - src_ptr_2, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_3 = offset_src_ptr.add(8 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = avx_oklab_vld::( - src_ptr_3, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, r_row2_, r_row3_); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, g_row2_, g_row3_); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, b_row2_, b_row3_); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, a_row2_, a_row3_); - avx_store_and_interleave_v4_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - avx_store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 32; - } - - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_oklab_vld::( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_oklab_vld::( - src_ptr_1, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let 
r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, zeros, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, zeros, zeros); - avx_store_and_interleave_v4_half_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - avx_store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 16; - } - while cx + 8 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); @@ -357,52 +107,28 @@ pub unsafe fn avx_oklab_to_image( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, + src_ptr_0, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, c5, c6, c7, c8, ); - let r_row = _mm256_packus_four_epi32(r_row0_, zeros, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, zeros, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, zeros, zeros, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset as usize) as *mut f32).add(cx * channels); if image_configuration.has_alpha() { - let a_row = _mm256_packus_four_epi32(a_row0_, zeros, zeros, zeros); - avx_store_and_interleave_v4_quarter_u8!( + avx_store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, - a_row + r_row0_, + g_row0_, + b_row0_, + a_row0_ ); } else { - avx_store_and_interleave_v3_quarter_u8!( + avx_store_and_interleave_v3_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row + r_row0_, + g_row0_, + b_row0_ ); } diff --git a/src/avx/support.rs b/src/avx/support.rs index 8e31b4e..4e785e7 100644 --- a/src/avx/support.rs +++ b/src/avx/support.rs @@ -452,18 +452,4 @@ pub unsafe fn avx2_div_by255(v: __m256i) -> __m256i { let multiplier = _mm256_set1_epi16(-32640); let r = _mm256_mulhi_epu16(x, multiplier); _mm256_srli_epi16::<7>(r) -} - -#[inline(always)] -pub unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i { - let packed = _mm256_packus_epi16(s_1, s_2); - const MASK: i32 = shuffle(3, 1, 2, 0); - _mm256_permute4x64_epi64::(packed) -} - -#[inline(always)] -pub unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i { - let packed = _mm256_packus_epi32(s_1, s_2); - const MASK: i32 = shuffle(3, 1, 2, 0); - _mm256_permute4x64_epi64::(packed) -} +} \ No newline at end of file diff --git a/src/avx/to_linear.rs b/src/avx/to_linear.rs deleted file mode 100644 index 7b23b2d..0000000 --- a/src/avx/to_linear.rs +++ /dev/null @@ -1,340 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */ - -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -use crate::avx::gamma_curves::perform_avx2_linear_transfer; -use crate::avx::routines::{ - avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half, - avx_vld_u8_and_deinterleave_quarter, -}; -use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; -use crate::gamma_curves::TransferFunction; -use crate::image::ImageConfiguration; -use crate::{avx_store_and_interleave_v3_f32, avx_store_and_interleave_v4_f32}; - -#[inline(always)] -unsafe fn triple_to_linear( - r: __m256i, - g: __m256i, - b: __m256i, - transfer_function: TransferFunction, -) -> (__m256, __m256, __m256) { - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - let r_f = _mm256_mul_ps(_mm256_cvtepi32_ps(r), u8_scale); - let g_f = _mm256_mul_ps(_mm256_cvtepi32_ps(g), u8_scale); - let b_f = _mm256_mul_ps(_mm256_cvtepi32_ps(b), u8_scale); - let r_linear = perform_avx2_linear_transfer(transfer_function, r_f); - let g_linear = perform_avx2_linear_transfer(transfer_function, g_f); - let b_linear = perform_avx2_linear_transfer(transfer_function, b_f); - (r_linear, g_linear, b_linear) -} - -#[target_feature(enable = "avx2")] -pub unsafe fn avx_channels_to_linear( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut f32, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - - while cx + 32 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low - ); - } - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = - triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 32); - avx_store_and_interleave_v4_f32!( - ptr, - 
image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 24); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high - ); - } - - let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan)); - let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); - let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); - - let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); - let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); - let b_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_high)); - - let (x_high_low, y_high_low, z_high_low) = - triple_to_linear(r_high_low, g_high_low, b_high_low, transfer_function); - - let a_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(a_chan)); - - if USE_ALPHA { - let a_high_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_high))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 64); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_high_low, - y_high_low, - z_high_low, - a_high_low - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 48); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_high_low, - y_high_low, - z_high_low - ); - } - - let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high)); - let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); - let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); - - let (x_high_high, y_high_high, z_high_high) = - triple_to_linear(r_high_high, g_high_high, b_high_high, transfer_function); - - if USE_ALPHA { - let a_high_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_high))), - u8_scale, - ); - let ptr = dst_ptr.add(cx * 4 + 96); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_high_high, - y_high_high, - z_high_high, - a_high_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 24 * 3); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_high_high, - y_high_high, - z_high_high - ); - } - - cx += 32; - } - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_half::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low 
- ); - } - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (x_low_high, y_low_high, z_low_high) = - triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4 + 32); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 24); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high - ); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_quarter::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - let ptr = dst_ptr.add(cx * 4); - avx_store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low - ); - } - cx += 8; - } - - cx -} diff --git a/src/avx/to_xyz_lab.rs b/src/avx/to_xyz_lab.rs index c715145..7e83338 100644 --- a/src/avx/to_xyz_lab.rs +++ b/src/avx/to_xyz_lab.rs @@ -13,12 +13,13 @@ use std::arch::x86_64::*; use crate::avx::cie::{ avx2_triple_to_lab, avx2_triple_to_luv, avx2_triple_to_xyz, avx_triple_to_lch, }; -use crate::avx::routines::{avx_vld_u8_and_deinterleave, avx_vld_u8_and_deinterleave_half}; +use crate::avx::routines::avx_vld_f32_and_deinterleave; use crate::avx::*; -use crate::avx_store_and_interleave_v3_direct_f32; -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; +use crate::sse::{sse_interleave_ps_rgb, sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, sse_triple_to_xyz}; use crate::xyz_target::XyzTarget; +use crate::{avx_store_and_interleave_v3_direct_f32, load_f32_and_deinterleave}; +use crate::sse::{sse_deinterleave_rgba_ps, sse_deinterleave_rgb_ps}; #[target_feature(enable = "avx2")] pub unsafe fn avx2_image_to_xyz_lab< @@ -27,7 +28,7 @@ pub unsafe fn avx2_image_to_xyz_lab< const TARGET: u8, >( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, @@ -35,7 +36,6 @@ pub unsafe fn avx2_image_to_xyz_lab< a_linearized: *mut f32, a_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { if USE_ALPHA && 
a_linearized.is_null() { panic!("Null alpha channel with requirements of linearized alpha if not supported"); @@ -57,267 +57,13 @@ pub unsafe fn avx2_image_to_xyz_lab< let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - while cx + 32 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = avx2_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let write_dst_ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(write_dst_ptr, x_low_low, y_low_low, z_low_low); - - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = avx2_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let ptr2 = write_dst_ptr.add(8 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr2, x_low_high, y_low_high, z_low_high); - - let r_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(r_chan)); - let g_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(g_chan)); - let b_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(b_chan)); - - let r_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_high)); - let g_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_high)); - let b_high_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_high)); - - let (mut x_high_low, mut y_high_low, mut z_high_low) = avx2_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - 
z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let ptr3 = write_dst_ptr.add(8 * 3 * 2); - avx_store_and_interleave_v3_direct_f32!(ptr3, x_high_low, y_high_low, z_high_low); - - let r_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_high)); - let g_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_high)); - let b_high_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_high)); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = avx2_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, u, v) = avx_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - } - - let ptr4 = write_dst_ptr.add(8 * 3 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr4, x_high_high, y_high_high, z_high_high); - - if USE_ALPHA { - let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx), a_low_low); - - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx + 8), a_low_high); - - let a_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(a_chan)); - - let a_high_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_high))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx + 8 * 2), a_high_low); - - let a_high_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_high))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx + 8 * 3), a_high_high); - } - - cx += 32; - } - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + while cx + 8 < width as usize { + let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_half::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); + avx_vld_f32_and_deinterleave::(src_ptr); let (mut x_low_low, mut y_low_low, mut z_low_low) = avx2_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - 
cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, + r_chan, g_chan, b_chan, cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, ); match target { @@ -345,145 +91,69 @@ pub unsafe fn avx2_image_to_xyz_lab< let write_dst_ptr = dst_ptr.add(cx * 3); avx_store_and_interleave_v3_direct_f32!(write_dst_ptr, x_low_low, y_low_low, z_low_low); - let r_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(r_low)); - let g_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(g_low)); - let b_low_high = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(b_low)); - let (mut x_low_high, mut y_low_high, mut z_low_high) = avx2_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let ptr2 = write_dst_ptr.add(8 * 3); - avx_store_and_interleave_v3_direct_f32!(ptr2, x_low_high, y_low_high, z_low_high); - if USE_ALPHA { let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx), a_low_low); - - let a_low_high = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx + 8), a_low_high); + _mm256_storeu_ps(a_ptr.add(cx), a_chan); } - cx += 16; + cx += 8; } - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + while cx + 4 < width as usize { + let src_ptr = ((src as * const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - avx_vld_u8_and_deinterleave_half::(src_ptr); - - let r_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(r_chan)); - let g_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(g_chan)); - let b_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b_chan)); - - let r_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(r_low)); - let g_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(g_low)); - let b_low_low = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = avx2_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, + load_f32_and_deinterleave!(src_ptr, image_configuration); + + let (mut x_low_low, mut y_low_low, mut z_low_low) = sse_triple_to_xyz( + r_chan, + g_chan, + b_chan, + _mm256_castps256_ps128(cq1), + _mm256_castps256_ps128(cq2), + _mm256_castps256_ps128(cq3), + _mm256_castps256_ps128(cq4), + _mm256_castps256_ps128(cq5), + _mm256_castps256_ps128(cq6), + _mm256_castps256_ps128(cq7), + _mm256_castps256_ps128(cq8), + _mm256_castps256_ps128(cq9), ); match target { XyzTarget::Lab => { - let (l, a, b) = avx2_triple_to_lab(x_low_low, y_low_low, z_low_low); + let (l, a, b) = 
sse_triple_to_lab(x_low_low, y_low_low, z_low_low); x_low_low = l; y_low_low = a; z_low_low = b; } XyzTarget::Xyz => {} XyzTarget::Luv => { - let (l, u, v) = avx2_triple_to_luv(x_low_low, y_low_low, z_low_low); + let (l, u, v) = sse_triple_to_luv(x_low_low, y_low_low, z_low_low); x_low_low = l; y_low_low = u; z_low_low = v; } XyzTarget::Lch => { - let (l, c, h) = avx_triple_to_lch(x_low_low, y_low_low, z_low_low); + let (l, c, h) = sse_triple_to_lch(x_low_low, y_low_low, z_low_low); x_low_low = l; y_low_low = c; z_low_low = h; } } - let write_dst_ptr = dst_ptr.add(cx * 3); - avx_store_and_interleave_v3_direct_f32!(write_dst_ptr, x_low_low, y_low_low, z_low_low); + let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_low, y_low_low, z_low_low); + _mm_storeu_ps(dst_ptr.add(cx * 3), v0); + _mm_storeu_ps(dst_ptr.add(cx * 3 + 4), v1); + _mm_storeu_ps(dst_ptr.add(cx * 3 + 8), v2); if USE_ALPHA { let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - let a_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a_chan)); - - let u8_scale = _mm256_set1_ps(1f32 / 255f32); - - let a_low_low = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(a_low))), - u8_scale, - ); - - _mm256_storeu_ps(a_ptr.add(cx), a_low_low); + _mm_storeu_ps(a_ptr.add(cx), a_chan); } - cx += 8; + cx += 4; } cx diff --git a/src/avx/xyz_lab_to_image.rs b/src/avx/xyz_lab_to_image.rs index 78a6844..95e3a6a 100644 --- a/src/avx/xyz_lab_to_image.rs +++ b/src/avx/xyz_lab_to_image.rs @@ -11,17 +11,15 @@ use std::arch::x86::*; use std::arch::x86_64::*; use crate::avx::cie::{avx_lab_to_xyz, avx_lch_to_xyz, avx_luv_to_xyz}; -use crate::avx::gamma_curves::perform_avx_gamma_transfer; -use crate::avx::{ - _mm256_color_matrix_ps, _mm256_packus_four_epi32, avx2_deinterleave_rgb_ps, - avx2_interleave_rgb, avx2_interleave_rgba_epi8, -}; +use crate::avx::{_mm256_color_matrix_ps, avx2_deinterleave_rgb_ps}; +use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; use crate::image::ImageConfiguration; +use crate::sse::sse_xyz_lab_vld; +use crate::sse::{sse_interleave_ps_rgb, sse_interleave_ps_rgba}; use crate::xyz_target::XyzTarget; use crate::{ - avx_store_and_interleave_v3_half_u8, avx_store_and_interleave_v3_quarter_u8, - avx_store_and_interleave_v3_u8, avx_store_and_interleave_v4_half_u8, - avx_store_and_interleave_v4_quarter_u8, avx_store_and_interleave_v4_u8, TransferFunction, + avx_store_and_interleave_v3_f32, avx_store_and_interleave_v4_f32, store_and_interleave_v3_f32, + store_and_interleave_v4_f32, }; #[inline(always)] @@ -31,7 +29,6 @@ unsafe fn avx_xyz_lab_vld< const TARGET: u8, >( src: *const f32, - transfer_function: TransferFunction, c1: __m256, c2: __m256, c3: __m256, @@ -41,9 +38,8 @@ unsafe fn avx_xyz_lab_vld< c7: __m256, c8: __m256, c9: __m256, -) -> (__m256i, __m256i, __m256i) { +) -> (__m256, __m256, __m256) { let target: XyzTarget = TARGET.into(); - let v_scale_color = _mm256_set1_ps(255f32); let lab_pixel_0 = _mm256_loadu_ps(src); let lab_pixel_1 = _mm256_loadu_ps(src.add(8)); let lab_pixel_2 = _mm256_loadu_ps(src.add(16)); @@ -75,21 +71,7 @@ unsafe fn avx_xyz_lab_vld< let (linear_r, linear_g, linear_b) = _mm256_color_matrix_ps(r_f32, g_f32, b_f32, c1, c2, c3, c4, c5, c6, c7, c8, c9); - r_f32 = linear_r; - g_f32 = linear_g; - b_f32 = linear_b; - - r_f32 = perform_avx_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_avx_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_avx_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm256_mul_ps(r_f32, v_scale_color); - 
g_f32 = _mm256_mul_ps(g_f32, v_scale_color); - b_f32 = _mm256_mul_ps(b_f32, v_scale_color); - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)), - ) + (linear_r, linear_g, linear_b) } #[target_feature(enable = "avx2")] @@ -103,11 +85,10 @@ pub unsafe fn avx_xyz_to_channels< src_offset: usize, a_channel: *const f32, a_offset: usize, - dst: *mut u8, + dst: *mut f32, dst_offset: usize, width: u32, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if USE_ALPHA && !image_configuration.has_alpha() { @@ -130,256 +111,81 @@ pub unsafe fn avx_xyz_to_channels< const CHANNELS: usize = 3usize; - let color_rescale = _mm256_set1_ps(255f32); - - while cx + 32 < width as usize { + while cx + 8 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); let src_ptr_0 = offset_src_ptr; let (r_row0_, g_row0_, b_row0_) = avx_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_) = - avx_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(8 * 2 * CHANNELS); - - let (r_row2_, g_row2_, b_row2_) = - avx_xyz_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(8 * 3 * CHANNELS); - - let (r_row3_, g_row3_, b_row3_) = - avx_xyz_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, r_row2_, r_row3_); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, g_row2_, g_row3_); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, b_row2_, b_row3_); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); if USE_ALPHA { let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm256_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_0_f, - color_rescale, - ))); - - let a_low_1_f = _mm256_loadu_ps(offset_a_src_ptr.add(8)); - let a_row1_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_1_f, - color_rescale, - ))); - - let a_low_2_f = _mm256_loadu_ps(offset_a_src_ptr.add(16)); - let a_row2_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_2_f, - color_rescale, - ))); - - let a_low_3_f = _mm256_loadu_ps(offset_a_src_ptr.add(24)); - let a_row3_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_3_f, - color_rescale, - ))); + let a_row = _mm256_loadu_ps(offset_a_src_ptr); - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, a_row2_, a_row3_); - avx_store_and_interleave_v4_u8!( + avx_store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, + r_row0_, + g_row0_, + b_row0_, a_row ); } else { - avx_store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 32; - } - - let zeros = _mm256_setzero_si256(); - - while cx + 16 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset) as *const 
f32).add(cx * CHANNELS); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_) = - avx_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_) = - avx_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, zeros, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - if USE_ALPHA { - let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm256_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_0_f, - color_rescale, - ))); - - let a_low_1_f = _mm256_loadu_ps(offset_a_src_ptr.add(8)); - let a_row1_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_1_f, - color_rescale, - ))); - - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, zeros, zeros); - avx_store_and_interleave_v4_half_u8!( + avx_store_and_interleave_v3_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, - a_row + r_row0_, + g_row0_, + b_row0_ ); - } else { - avx_store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); } - cx += 16; + cx += 8; } - while cx + 8 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); + while cx + 4 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let src_ptr_0 = offset_src_ptr; let (r_row0_, g_row0_, b_row0_) = - avx_xyz_lab_vld::( + sse_xyz_lab_vld::( src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + _mm256_castps256_ps128(c1), + _mm256_castps256_ps128(c2), + _mm256_castps256_ps128(c3), + _mm256_castps256_ps128(c4), + _mm256_castps256_ps128(c5), + _mm256_castps256_ps128(c6), + _mm256_castps256_ps128(c7), + _mm256_castps256_ps128(c8), + _mm256_castps256_ps128(c9), ); - let r_row = _mm256_packus_four_epi32(r_row0_, zeros, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, zeros, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, zeros, zeros, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); if USE_ALPHA { let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm256_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm256_cvtps_epi32(_mm256_round_ps::<0>(_mm256_mul_ps( - a_low_0_f, - color_rescale, - ))); + let a_row = _mm_loadu_ps(offset_a_src_ptr); - let a_row = _mm256_packus_four_epi32(a_row0_, zeros, zeros, zeros); - avx_store_and_interleave_v4_quarter_u8!( + store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, + r_row0_, + g_row0_, + b_row0_, a_row ); } else { - avx_store_and_interleave_v3_quarter_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row - ); + store_and_interleave_v3_f32!(dst_ptr, image_configuration, r_row0_, g_row0_, b_row0_); } - cx += 8; + cx += 4; } cx diff --git a/src/avx/xyza_laba_to_image.rs b/src/avx/xyza_laba_to_image.rs index f4cdbce..e83ec89 100644 --- a/src/avx/xyza_laba_to_image.rs +++ b/src/avx/xyza_laba_to_image.rs @@ -5,26 
+5,22 @@ * // license that can be found in the LICENSE file. */ -use crate::avx::{_mm256_packus_four_epi32, avx2_interleave_rgb}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; use crate::avx::cie::{avx_lab_to_xyz, avx_lch_to_xyz, avx_luv_to_xyz}; -use crate::avx::gamma_curves::perform_avx_gamma_transfer; -use crate::avx::{_mm256_color_matrix_ps, avx2_deinterleave_rgba_ps, avx2_interleave_rgba_epi8}; +use crate::avx::{_mm256_color_matrix_ps, avx2_deinterleave_rgba_ps}; +use crate::avx::{avx2_interleave_rgb_ps, avx2_interleave_rgba_ps}; +use crate::avx_store_and_interleave_v4_f32; use crate::image::ImageConfiguration; +use crate::sse::{sse_interleave_ps_rgba, sse_xyza_lab_vld}; use crate::xyz_target::XyzTarget; -use crate::{ - avx_store_and_interleave_v4_half_u8, avx_store_and_interleave_v4_quarter_u8, - avx_store_and_interleave_v4_u8, TransferFunction, -}; #[inline(always)] unsafe fn avx_xyza_lab_vld( src: *const f32, - transfer_function: TransferFunction, c1: __m256, c2: __m256, c3: __m256, @@ -34,9 +30,8 @@ unsafe fn avx_xyza_lab_vld( c7: __m256, c8: __m256, c9: __m256, -) -> (__m256i, __m256i, __m256i, __m256i) { +) -> (__m256, __m256, __m256, __m256) { let target: XyzTarget = TARGET.into(); - let v_scale_color = _mm256_set1_ps(255f32); let pixel_0 = _mm256_loadu_ps(src); let pixel_1 = _mm256_loadu_ps(src.add(8)); let pixel_2 = _mm256_loadu_ps(src.add(16)); @@ -69,23 +64,7 @@ unsafe fn avx_xyza_lab_vld( let (linear_r, linear_g, linear_b) = _mm256_color_matrix_ps(r_f32, g_f32, b_f32, c1, c2, c3, c4, c5, c6, c7, c8, c9); - r_f32 = linear_r; - g_f32 = linear_g; - b_f32 = linear_b; - - r_f32 = perform_avx_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_avx_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_avx_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm256_mul_ps(r_f32, v_scale_color); - g_f32 = _mm256_mul_ps(g_f32, v_scale_color); - b_f32 = _mm256_mul_ps(b_f32, v_scale_color); - let a_f32 = _mm256_mul_ps(a_f32, v_scale_color); - ( - _mm256_cvtps_epi32(_mm256_round_ps::<0>(r_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(g_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(b_f32)), - _mm256_cvtps_epi32(_mm256_round_ps::<0>(a_f32)), - ) + (linear_r, linear_g, linear_b, a_f32) } #[target_feature(enable = "sse4.1")] @@ -93,11 +72,10 @@ pub unsafe fn avx_xyza_to_image usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if !image_configuration.has_alpha() { @@ -120,177 +98,64 @@ pub unsafe fn avx_xyza_to_image( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(8 * 2 * CHANNELS); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = avx_xyza_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(8 * 3 * CHANNELS); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = avx_xyza_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, r_row2_, r_row3_); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, g_row2_, g_row3_); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, b_row2_, b_row3_); - let 
a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, a_row2_, a_row3_); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - avx_store_and_interleave_v4_u8!(dst_ptr, image_configuration, r_row, g_row, b_row, a_row); - - cx += 32; - } - - while cx + 16 < width as usize { + while cx + 8 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); let src_ptr_0 = offset_src_ptr; let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_xyza_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(8 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = avx_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let r_row = _mm256_packus_four_epi32(r_row0_, r_row1_, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, g_row1_, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, b_row1_, zeros, zeros); - let a_row = _mm256_packus_four_epi32(a_row0_, a_row1_, zeros, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); - avx_store_and_interleave_v4_half_u8!( + avx_store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, - a_row + r_row0_, + g_row0_, + b_row0_, + a_row0_ ); - cx += 16; + cx += 8; } - while cx + 8 < width as usize { + while cx + 4 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); let src_ptr_0 = offset_src_ptr; - let (r_row0_, g_row0_, b_row0_, a_row0_) = avx_xyza_lab_vld::( + let (r_row0_, g_row0_, b_row0_, a_row0_) = sse_xyza_lab_vld::( src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + _mm256_castps256_ps128(c1), + _mm256_castps256_ps128(c2), + _mm256_castps256_ps128(c3), + _mm256_castps256_ps128(c4), + _mm256_castps256_ps128(c5), + _mm256_castps256_ps128(c6), + _mm256_castps256_ps128(c7), + _mm256_castps256_ps128(c8), + _mm256_castps256_ps128(c9), ); - let r_row = _mm256_packus_four_epi32(r_row0_, zeros, zeros, zeros); - let g_row = _mm256_packus_four_epi32(g_row0_, zeros, zeros, zeros); - let b_row = _mm256_packus_four_epi32(b_row0_, zeros, zeros, zeros); - let a_row = _mm256_packus_four_epi32(a_row0_, zeros, zeros, zeros); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); - let dst_ptr = dst.add(dst_offset + cx * channels); + let (rgba0, rgba1, rgba2, rgba3) = match image_configuration { + ImageConfiguration::Rgb | ImageConfiguration::Rgba => { + sse_interleave_ps_rgba(r_row0_, g_row0_, b_row0_, a_row0_) + } + ImageConfiguration::Bgra | ImageConfiguration::Bgr => { + sse_interleave_ps_rgba(b_row0_, g_row0_, r_row0_, a_row0_) + } + }; - avx_store_and_interleave_v4_quarter_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); + _mm_storeu_ps(dst_ptr, rgba0); + _mm_storeu_ps(dst_ptr.add(4), rgba1); + _mm_storeu_ps(dst_ptr.add(8), rgba2); + _mm_storeu_ps(dst_ptr.add(12), rgba3); - cx += 8; + cx += 4; } cx diff --git a/src/image_to_linear.rs b/src/image_to_linear.rs index 62d19ec..0e9c7bc 100644 --- a/src/image_to_linear.rs +++ b/src/image_to_linear.rs @@ -4,14 +4,8 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. 
*/ -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::avx::avx_channels_to_linear; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::neon_channels_to_linear; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::*; use crate::Rgb; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -37,23 +31,9 @@ fn channels_to_linear( let channels = image_configuration.get_channels_count(); - let mut _wide_row_handle: Option< - unsafe fn(usize, *const u8, usize, u32, *mut f32, usize, TransferFunction) -> usize, - > = None; - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handle = Some(sse_channels_to_linear::); - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("avx2") { - _wide_row_handle = Some(avx_channels_to_linear::); - } - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handle = Some(neon_channels_to_linear::); + let mut lut_table = vec![0f32; 256]; + for i in 0..256 { + lut_table[i] = transfer_function.linearize(i as f32 * (1. / 255.0)); } #[cfg(not(feature = "rayon"))] @@ -63,20 +43,6 @@ fn channels_to_linear( for _ in 0..height as usize { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handle { - unsafe { - _cx = dispatcher( - _cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - transfer_function, - ) - } - } - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; @@ -98,14 +64,13 @@ fn channels_to_linear( }; let rgb = Rgb::::new(r, g, b); - let rgb_f32 = rgb.to_rgb_f32(); unsafe { - dst.write_unaligned(transfer_function.linearize(rgb_f32.r)); + dst.write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); dst.add(1) - .write_unaligned(transfer_function.linearize(rgb_f32.g)); + .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); dst.add(2) - .write_unaligned(transfer_function.linearize(rgb_f32.b)); + .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); } if USE_ALPHA && image_configuration.has_alpha() { @@ -139,18 +104,6 @@ fn channels_to_linear( .for_each(|(dst_row, src_row)| unsafe { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher( - _cx, - src_row.as_ptr(), - 0, - width, - dst_row.as_mut_ptr() as *mut f32, - 0, - transfer_function, - ) - } - let src_ptr = src_row.as_ptr(); let dst_ptr = dst_row.as_mut_ptr() as *mut f32; @@ -169,13 +122,12 @@ fn channels_to_linear( .read_unaligned(); let rgb = Rgb::::new(r, g, b); - let rgb_f32 = rgb.to_rgb_f32(); - dst.write_unaligned(transfer_function.linearize(rgb_f32.r)); + dst.write_unaligned(*lut_table.get_unchecked(rgb.r as usize)); dst.add(1) - .write_unaligned(transfer_function.linearize(rgb_f32.g)); + .write_unaligned(*lut_table.get_unchecked(rgb.g as usize)); dst.add(2) - .write_unaligned(transfer_function.linearize(rgb_f32.b)); + .write_unaligned(*lut_table.get_unchecked(rgb.b as usize)); if USE_ALPHA && image_configuration.has_alpha() { let a = src diff --git a/src/image_to_linear_u8.rs b/src/image_to_linear_u8.rs index 19e3502..84b41ee 100644 --- a/src/image_to_linear_u8.rs +++ b/src/image_to_linear_u8.rs @@ -6,10 +6,6 @@ */ use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; 
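The image_to_linear.rs hunk above replaces the per-row SIMD transfer kernels with a 256-entry lookup table: a u8 channel takes only 256 values, so the transfer function is evaluated once per value instead of once per pixel. A minimal self-contained sketch of the idea, with `srgb_linearize` standing in for `TransferFunction::linearize` (not the crate's actual API):

```rust
/// Stand-in for `TransferFunction::linearize` (sRGB EOTF), illustration only.
fn srgb_linearize(v: f32) -> f32 {
    if v <= 0.04045 {
        v / 12.92
    } else {
        ((v + 0.055) / 1.055).powf(2.4)
    }
}

/// Built once per image; a table load then replaces per-pixel `powf` math.
fn build_linear_lut() -> [f32; 256] {
    let mut lut = [0f32; 256];
    for (i, slot) in lut.iter_mut().enumerate() {
        *slot = srgb_linearize(i as f32 * (1.0 / 255.0));
    }
    lut
}

fn main() {
    let lut = build_linear_lut();
    let px = [12u8, 128, 250];
    // One indexed load per channel instead of a branch and a powf per channel.
    let linear: Vec<f32> = px.iter().map(|&c| lut[c as usize]).collect();
    println!("{linear:?}");
}
```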
-#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::*; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::sse_image_to_linear_unsigned::sse_channels_to_linear_u8; use crate::Rgb; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -33,20 +29,11 @@ fn channels_to_linear( let channels = image_configuration.get_channels_count(); - let mut _wide_row_handler: Option< - unsafe fn(usize, *const u8, usize, u32, *mut u8, usize, TransferFunction) -> usize, - > = None; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handler = - Some(neon_channels_to_linear_u8::); - } - - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handler = - Some(sse_channels_to_linear_u8::); + let mut lut_table = vec![0u8; 256]; + for i in 0..256 { + lut_table[i] = (transfer_function.linearize(i as f32 * (1. / 255.0)) * 255.) + .ceil() + .min(255.) as u8; } #[cfg(not(feature = "rayon"))] @@ -56,20 +43,6 @@ fn channels_to_linear( { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handler { - unsafe { - _cx = dispatcher( - _cx, - src_row.as_ptr(), - 0, - width, - dst_row.as_mut_ptr(), - 0, - transfer_function, - ) - } - } - for x in _cx..width as usize { let px = x * channels; let r = @@ -85,9 +58,9 @@ fn channels_to_linear( let rgb = rgb_f32.to_u8(); unsafe { - *dst_row.get_unchecked_mut(px) = rgb.r; - *dst_row.get_unchecked_mut(px + 1) = rgb.g; - *dst_row.get_unchecked_mut(px + 2) = rgb.b; + *dst_row.get_unchecked_mut(px) = *lut_table.get_unchecked(rgb.r as usize); + *dst_row.get_unchecked_mut(px + 1) = *lut_table.get_unchecked(rgb.g as usize); + *dst_row.get_unchecked_mut(px + 2) = *lut_table.get_unchecked(rgb.b as usize); } if USE_ALPHA && image_configuration.has_alpha() { @@ -109,18 +82,6 @@ fn channels_to_linear( .for_each(|(dst_row, src_row)| unsafe { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handler { - _cx = dispatcher( - _cx, - src_row.as_ptr(), - 0, - width, - dst_row.as_mut_ptr(), - 0, - transfer_function, - ) - } - for x in _cx..width as usize { let px = x * channels; let r = *src_row.get_unchecked(px + image_configuration.get_r_channel_offset()); @@ -128,13 +89,10 @@ fn channels_to_linear( let b = *src_row.get_unchecked(px + image_configuration.get_b_channel_offset()); let rgb = Rgb::::new(r, g, b); - let mut rgb_f32 = rgb.to_rgb_f32(); - rgb_f32 = rgb_f32.linearize(transfer_function); - let rgb = rgb_f32.to_u8(); - *dst_row.get_unchecked_mut(px) = rgb.r; - *dst_row.get_unchecked_mut(px + 1) = rgb.g; - *dst_row.get_unchecked_mut(px + 2) = rgb.b; + *dst_row.get_unchecked_mut(px) = *lut_table.get_unchecked(rgb.r as usize); + *dst_row.get_unchecked_mut(px + 1) = *lut_table.get_unchecked(rgb.g as usize); + *dst_row.get_unchecked_mut(px + 2) = *lut_table.get_unchecked(rgb.b as usize); if USE_ALPHA && image_configuration.has_alpha() { let a = diff --git a/src/image_to_oklab.rs b/src/image_to_oklab.rs index 25ceb67..90197b6 100644 --- a/src/image_to_oklab.rs +++ b/src/image_to_oklab.rs @@ -12,12 +12,13 @@ use crate::neon::neon_image_to_oklab; use crate::oklch::Oklch; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::sse_image_to_oklab; -use crate::{Oklab, Rgb, TransferFunction}; +use crate::{ + bgr_to_linear, bgra_to_linear, rgb_to_linear, rgba_to_linear, Oklab, Rgb, TransferFunction, +}; #[cfg(feature = "rayon")] -use rayon::iter::{IndexedParallelIterator, ParallelIterator}; 
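The image_to_linear_u8.rs hunk above applies the same trick end to end in u8: linearization and re-quantization collapse into a single u8 -> u8 table, with ceil-then-clamp keeping every entry in range. A hedged sketch, where the `linearize` closure stands in for `TransferFunction::linearize`:

```rust
// One 256-entry map replaces linearize + float round-trip per pixel; the
// `.ceil().min(255.0)` mirrors the table construction in the patch.
fn build_u8_lut(linearize: impl Fn(f32) -> f32) -> [u8; 256] {
    let mut lut = [0u8; 256];
    for (i, slot) in lut.iter_mut().enumerate() {
        *slot = (linearize(i as f32 * (1.0 / 255.0)) * 255.0).ceil().min(255.0) as u8;
    }
    lut
}

fn main() {
    // Toy power-law transfer purely for demonstration.
    let lut = build_u8_lut(|v| v.powf(2.2));
    assert_eq!(lut[255], 255); // 1.0 maps exactly to full scale
    println!("mid-grey linearizes to {}", lut[128]);
}
```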
-#[cfg(feature = "rayon")] -use rayon::prelude::{ParallelSlice, ParallelSliceMut}; +use rayon::iter::ParallelIterator; #[cfg(feature = "rayon")] +use rayon::prelude::ParallelSliceMut; use std::slice; #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] @@ -53,9 +54,24 @@ fn channels_to_oklab( let channels = image_configuration.get_channels_count(); - let mut _wide_row_handle: Option< - unsafe fn(usize, *const u8, usize, u32, *mut f32, usize, TransferFunction) -> usize, - > = None; + let callee = match image_configuration { + ImageConfiguration::Rgb => rgb_to_linear, + ImageConfiguration::Rgba => rgba_to_linear, + ImageConfiguration::Bgra => bgra_to_linear, + ImageConfiguration::Bgr => bgr_to_linear, + }; + + callee( + src, + src_stride, + dst, + dst_stride, + width, + height, + transfer_function, + ); + + let mut _wide_row_handle: Option usize> = None; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { @@ -72,32 +88,30 @@ fn channels_to_oklab( _wide_row_handle = Some(avx_image_to_oklab::); } + let dst_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + dst.as_mut_ptr() as *mut u8, + dst_stride as usize * height as usize, + ) + }; + #[cfg(feature = "rayon")] { - let dst_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - dst.as_mut_ptr() as *mut u8, - dst_stride as usize * height as usize, - ) - }; - dst_slice_safe_align .par_chunks_exact_mut(dst_stride as usize) - .zip(src.par_chunks_exact(src_stride as usize)) - .for_each(|(dst, src)| unsafe { + .for_each(|dst| unsafe { let mut _cx = 0usize; - let src_ptr = src.as_ptr(); let dst_ptr = dst.as_mut_ptr() as *mut f32; if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher(_cx, src.as_ptr(), 0, width, dst_ptr, 0, transfer_function) + _cx = dispatcher(_cx, width, dst_ptr, 0) } for x in _cx..width as usize { let px = x * channels; - let src = src_ptr.add(px); + let src = dst_ptr.add(px); let r = src .add(image_configuration.get_r_channel_offset()) .read_unaligned(); @@ -108,18 +122,18 @@ fn channels_to_oklab( .add(image_configuration.get_b_channel_offset()) .read_unaligned(); - let rgb = Rgb::::new(r, g, b); + let rgb = Rgb::::new(r, g, b); let dst_store = dst_ptr.add(px); match target { OklabTarget::Oklab => { - let oklab = Oklab::from_rgb(rgb, transfer_function); + let oklab = Oklab::from_linear_rgb(rgb); dst_store.write_unaligned(oklab.l); dst_store.add(1).write_unaligned(oklab.a); dst_store.add(2).write_unaligned(oklab.b); } OklabTarget::Oklch => { - let oklch = Oklch::from_rgb(rgb, transfer_function); + let oklch = Oklch::from_linear_rgb(rgb); dst_store.write_unaligned(oklch.l); dst_store.add(1).write_unaligned(oklch.c); dst_store.add(2).write_unaligned(oklch.h); @@ -130,8 +144,7 @@ fn channels_to_oklab( let a = src .add(image_configuration.get_a_channel_offset()) .read_unaligned(); - let a_lin = a as f32 * (1f32 / 255f32); - dst_store.add(3).write_unaligned(a_lin); + dst_store.add(3).write_unaligned(a); } } }); @@ -139,82 +152,56 @@ fn channels_to_oklab( #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; - - for _ in 0..height as usize { - let mut _cx = 0usize; - - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; - let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; - - if let Some(dispatcher) = _wide_row_handle { - unsafe { - _cx = dispatcher( - _cx, - src.as_ptr(), - src_offset, - width, - dst.as_mut_ptr(), - dst_offset, - transfer_function, - ) + for dst in dst_slice_safe_align.chunks_exact_mut(dst_stride as usize) { + 
unsafe { + let mut _cx = 0usize; + + let dst_ptr = dst.as_mut_ptr() as *mut f32; + + if let Some(dispatcher) = _wide_row_handle { + _cx = dispatcher(_cx, width, dst_ptr, 0) } - } - for x in _cx..width as usize { - let px = x * channels; - - let src = unsafe { src_ptr.add(px) }; - let r = unsafe { - src.add(image_configuration.get_r_channel_offset()) - .read_unaligned() - }; - let g = unsafe { - src.add(image_configuration.get_g_channel_offset()) - .read_unaligned() - }; - let b = unsafe { - src.add(image_configuration.get_b_channel_offset()) - .read_unaligned() - }; - - let rgb = Rgb::::new(r, g, b); - let dst_store = unsafe { dst_ptr.add(px) }; - - match target { - OklabTarget::Oklab => { - let oklab = Oklab::from_rgb(rgb, transfer_function); - unsafe { + for x in _cx..width as usize { + let px = x * channels; + + let src = dst_ptr.add(px); + let r = src + .add(image_configuration.get_r_channel_offset()) + .read_unaligned(); + let g = src + .add(image_configuration.get_g_channel_offset()) + .read_unaligned(); + let b = src + .add(image_configuration.get_b_channel_offset()) + .read_unaligned(); + + let rgb = Rgb::::new(r, g, b); + let dst_store = dst_ptr.add(px); + + match target { + OklabTarget::Oklab => { + let oklab = Oklab::from_linear_rgb(rgb); dst_store.write_unaligned(oklab.l); dst_store.add(1).write_unaligned(oklab.a); dst_store.add(2).write_unaligned(oklab.b); } - } - OklabTarget::Oklch => { - let oklch = Oklch::from_rgb(rgb, transfer_function); - unsafe { + OklabTarget::Oklch => { + let oklch = Oklch::from_linear_rgb(rgb); dst_store.write_unaligned(oklch.l); dst_store.add(1).write_unaligned(oklch.c); dst_store.add(2).write_unaligned(oklch.h); } } - } - if image_configuration.has_alpha() { - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = a as f32 * (1f32 / 255f32); - unsafe { - dst_store.add(3).write_unaligned(a_lin); + if image_configuration.has_alpha() { + let a = src + .add(image_configuration.get_a_channel_offset()) + .read_unaligned(); + dst_store.add(3).write_unaligned(a); } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/image_to_xyz_lab.rs b/src/image_to_xyz_lab.rs index 39d0368..5617d10 100644 --- a/src/image_to_xyz_lab.rs +++ b/src/image_to_xyz_lab.rs @@ -13,12 +13,11 @@ use crate::neon::neon_channels_to_xyz_or_lab; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::sse_channels_to_xyz_or_lab; use crate::xyz_target::XyzTarget; -use crate::{Rgb, Xyz, SRGB_TO_XYZ_D65}; +use crate::{LCh, Lab, Luv, Rgb, Xyz, SRGB_TO_XYZ_D65}; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -45,7 +44,7 @@ fn channels_to_xyz usize, > = None; @@ -75,15 +73,20 @@ fn channels_to_xyz); } + let mut lut_table = vec![0f32; 256]; + for i in 0..256 { + lut_table[i] = transfer_function.linearize(i as f32 * (1. 
/ 255.0)); + } + + let dst_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + dst.as_mut_ptr() as *mut u8, + dst_stride as usize * height as usize, + ) + }; + #[cfg(feature = "rayon")] { - let dst_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - dst.as_mut_ptr() as *mut u8, - dst_stride as usize * height as usize, - ) - }; - if USE_ALPHA { let a_slice_safe_align = unsafe { slice::from_raw_parts_mut( @@ -99,10 +102,22 @@ fn channels_to_xyz::new(r, g, b); + let src = transient_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); let ptr = dst_ptr.add(x * 3); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + match target { XyzTarget::Lab => { - let lab = rgb.to_lab(); + let lab = Lab::from_xyz(xyz); ptr.write_unaligned(lab.l); ptr.add(1).write_unaligned(lab.a); ptr.add(2).write_unaligned(lab.b); } XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); ptr.write_unaligned(xyz.x); ptr.add(1).write_unaligned(xyz.y); ptr.add(2).write_unaligned(xyz.z); } XyzTarget::Luv => { - let luv = rgb.to_luv(); + let luv = Luv::from_xyz(xyz); ptr.write_unaligned(luv.l); ptr.add(1).write_unaligned(luv.u); ptr.add(2).write_unaligned(luv.v); } XyzTarget::Lch => { - let lch = rgb.to_lch(); + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); ptr.write_unaligned(lch.l); ptr.add(1).write_unaligned(lch.c); ptr.add(2).write_unaligned(lch.h); @@ -160,12 +170,9 @@ fn channels_to_xyz::new(r, g, b); + let src = transient_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); let ptr = dst_ptr.add(x * 3); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + match target { XyzTarget::Lab => { - let lab = rgb.to_lab(); + let lab = Lab::from_xyz(xyz); ptr.write_unaligned(lab.l); ptr.add(1).write_unaligned(lab.a); ptr.add(2).write_unaligned(lab.b); } XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); ptr.write_unaligned(xyz.x); ptr.add(1).write_unaligned(xyz.y); ptr.add(2).write_unaligned(xyz.z); } XyzTarget::Luv => { - let luv = rgb.to_luv(); + let luv = Luv::from_xyz(xyz); ptr.write_unaligned(luv.l); ptr.add(1).write_unaligned(luv.u); ptr.add(2).write_unaligned(luv.v); } XyzTarget::Lch => { - let lch = rgb.to_lch(); + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); ptr.write_unaligned(lch.l); ptr.add(1).write_unaligned(lch.c); ptr.add(2).write_unaligned(lch.h); @@ -242,103 +255,173 @@ fn channels_to_xyz::new(r, g, b); - let ptr = unsafe { dst_ptr.add(x * 3) }; - match target { - XyzTarget::Lab => { - let lab = rgb.to_lab(); - unsafe { - ptr.write_unaligned(lab.l); - ptr.add(1).write_unaligned(lab.a); - ptr.add(2).write_unaligned(lab.b); - } + let mut transient_row = vec![0f32; width as usize * channels]; + + for (dst_chunk, src_chunks) in transient_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + dst_chunk[0] = *lut_table.get_unchecked(src_chunks[0] as usize); + dst_chunk[1] = *lut_table.get_unchecked(src_chunks[1] as usize); + dst_chunk[2] = *lut_table.get_unchecked(src_chunks[2] as usize); + dst_chunk[3] = src_chunks[3] as f32 * (1. 
/ 255.0); } - XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); - unsafe { - ptr.write_unaligned(xyz.x); - ptr.add(1).write_unaligned(xyz.y); - ptr.add(2).write_unaligned(xyz.z); - } + + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + transient_row.as_ptr(), + 0, + width, + dst.as_mut_ptr() as *mut f32, + 0, + a_channel.as_mut_ptr() as *mut f32, + 0, + matrix, + ); } - XyzTarget::Luv => { - let luv = rgb.to_luv(); - unsafe { - ptr.write_unaligned(luv.l); - ptr.add(1).write_unaligned(luv.u); - ptr.add(2).write_unaligned(luv.v); + + let dst_ptr = dst.as_mut_ptr().add(0) as *mut f32; + + for x in _cx..width as usize { + let px = x * channels; + let src = transient_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); + let ptr = dst_ptr.add(x * 3); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + + match target { + XyzTarget::Lab => { + let lab = Lab::from_xyz(xyz); + ptr.write_unaligned(lab.l); + ptr.add(1).write_unaligned(lab.a); + ptr.add(2).write_unaligned(lab.b); + } + XyzTarget::Xyz => { + ptr.write_unaligned(xyz.x); + ptr.add(1).write_unaligned(xyz.y); + ptr.add(2).write_unaligned(xyz.z); + } + XyzTarget::Luv => { + let luv = Luv::from_xyz(xyz); + ptr.write_unaligned(luv.l); + ptr.add(1).write_unaligned(luv.u); + ptr.add(2).write_unaligned(luv.v); + } + XyzTarget::Lch => { + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); + ptr.write_unaligned(lch.l); + ptr.add(1).write_unaligned(lch.c); + ptr.add(2).write_unaligned(lch.h); + } } - } - XyzTarget::Lch => { - let lch = rgb.to_lch(); - unsafe { - ptr.write_unaligned(lch.l); - ptr.add(1).write_unaligned(lch.c); - ptr.add(2).write_unaligned(lch.h); + + if USE_ALPHA && image_configuration.has_alpha() { + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + let a_ptr = a_channel.as_mut_ptr() as *mut f32; + a_ptr.add(x).write_unaligned(a); } } } + } + } else { + for (dst, src) in dst_slice_safe_align + .chunks_exact_mut(dst_stride as usize) + .zip(src.chunks_exact(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; + + let mut transient_row = vec![0f32; width as usize * channels]; + + for (dst_chunk, src_chunks) in transient_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + dst_chunk[0] = *lut_table.get_unchecked(src_chunks[0] as usize); + dst_chunk[1] = *lut_table.get_unchecked(src_chunks[1] as usize); + dst_chunk[2] = *lut_table.get_unchecked(src_chunks[2] as usize); + } + + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + transient_row.as_ptr(), + 0, + width, + dst.as_mut_ptr() as *mut f32, + 0, + std::ptr::null_mut(), + 0, + matrix, + ); + } + + let dst_ptr = dst.as_mut_ptr().add(0) as *mut f32; + + for x in _cx..width as usize { + let px = x * channels; + let src = transient_row.get_unchecked(px..); + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); + let ptr = dst_ptr.add(x * 3); - if USE_ALPHA && image_configuration.has_alpha() { - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = a as f32 * (1f32 / 
255f32); - let a_ptr = - unsafe { (a_channel.as_mut_ptr() as *mut u8).add(a_offset) as *mut f32 }; - unsafe { - a_ptr.add(x).write_unaligned(a_lin); + let xyz = Xyz::from_linear_rgb(rgb, matrix); + + match target { + XyzTarget::Lab => { + let lab = Lab::from_xyz(xyz); + ptr.write_unaligned(lab.l); + ptr.add(1).write_unaligned(lab.a); + ptr.add(2).write_unaligned(lab.b); + } + XyzTarget::Xyz => { + ptr.write_unaligned(xyz.x); + ptr.add(1).write_unaligned(xyz.y); + ptr.add(2).write_unaligned(xyz.z); + } + XyzTarget::Luv => { + let luv = Luv::from_xyz(xyz); + ptr.write_unaligned(luv.l); + ptr.add(1).write_unaligned(luv.u); + ptr.add(2).write_unaligned(luv.v); + } + XyzTarget::Lch => { + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); + ptr.write_unaligned(lch.l); + ptr.add(1).write_unaligned(lch.c); + ptr.add(2).write_unaligned(lch.h); + } + } } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; - a_offset += a_stride as usize; } } } diff --git a/src/image_xyza_laba.rs b/src/image_xyza_laba.rs index 1e97acb..4d6f051 100644 --- a/src/image_xyza_laba.rs +++ b/src/image_xyza_laba.rs @@ -11,12 +11,11 @@ use crate::neon::neon_channels_to_xyza_or_laba; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::sse_channels_to_xyza_laba; use crate::xyz_target::XyzTarget; -use crate::{Rgb, TransferFunction, Xyz, SRGB_TO_XYZ_D65}; +use crate::{LCh, Lab, Luv, Rgb, TransferFunction, Xyz, SRGB_TO_XYZ_D65}; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -37,16 +36,7 @@ fn channels_to_xyz_with_alpha usize, + unsafe fn(usize, *const f32, usize, u32, *mut f32, usize, &[[f32; 3]; 3]) -> usize, > = None; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -63,180 +53,177 @@ fn channels_to_xyz_with_alpha::new(r, g, b); + let src = transient_row.get_unchecked(px..); + + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); let px = x * channels; let dst_store = dst_ptr.add(px); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + match target { XyzTarget::Lab => { - let lab = rgb.to_lab(); + let lab = Lab::from_xyz(xyz); dst_store.write_unaligned(lab.l); dst_store.add(1).write_unaligned(lab.a); dst_store.add(2).write_unaligned(lab.b); } XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); dst_store.write_unaligned(xyz.x); dst_store.add(1).write_unaligned(xyz.y); dst_store.add(2).write_unaligned(xyz.z); } XyzTarget::Luv => { - let luv = rgb.to_luv(); + let luv = Luv::from_xyz(xyz); dst_store.write_unaligned(luv.l); dst_store.add(1).write_unaligned(luv.u); dst_store.add(2).write_unaligned(luv.v); } XyzTarget::Lch => { - let lch = rgb.to_lch(); + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); dst_store.write_unaligned(lch.l); dst_store.add(1).write_unaligned(lch.c); dst_store.add(2).write_unaligned(lch.h); } } - - let a = src - .add(image_configuration.get_a_channel_offset()) - .read_unaligned(); - let a_lin = a as f32 * (1f32 / 255f32); - dst_store.add(3).write_unaligned(a_lin); + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } }); } #[cfg(not(feature = 
"rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; + for (dst, src) in dst_slice_safe_align + .chunks_exact_mut(dst_stride as usize) + .zip(src.chunks_exact(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; - for _ in 0..height as usize { - let mut _cx = 0usize; + let mut transient_row = vec![0f32; width as usize * channels]; - if let Some(dispatcher) = _wide_row_handler { - unsafe { + for (dst_chunk, src_chunks) in transient_row + .chunks_exact_mut(channels) + .zip(src.chunks_exact(channels)) + { + dst_chunk[0] = *lut_table.get_unchecked(src_chunks[0] as usize); + dst_chunk[1] = *lut_table.get_unchecked(src_chunks[1] as usize); + dst_chunk[2] = *lut_table.get_unchecked(src_chunks[2] as usize); + dst_chunk[3] = src_chunks[3] as f32 * (1. / 255.0); + } + + if let Some(dispatcher) = _wide_row_handler { _cx = dispatcher( _cx, - src.as_ptr(), - src_offset, + transient_row.as_ptr(), + 0, width, - dst.as_mut_ptr(), - dst_offset, + dst.as_mut_ptr() as *mut f32, + 0, matrix, - transfer_function, ); } - } - let src_ptr = unsafe { src.as_ptr().add(src_offset) }; - let dst_ptr = unsafe { (dst.as_mut_ptr() as *mut u8).add(dst_offset) as *mut f32 }; - - for x in _cx..width as usize { - let px = x * channels; - let src = unsafe { src_ptr.add(px) }; - let r = unsafe { - src.add(image_configuration.get_r_channel_offset()) - .read_unaligned() - }; - let g = unsafe { - src.add(image_configuration.get_g_channel_offset()) - .read_unaligned() - }; - let b = unsafe { - src.add(image_configuration.get_b_channel_offset()) - .read_unaligned() - }; - - let rgb = Rgb::::new(r, g, b); - let px = x * channels; - let dst_store = unsafe { dst_ptr.add(px) }; - match target { - XyzTarget::Lab => { - let lab = rgb.to_lab(); - unsafe { + let dst_ptr = dst.as_mut_ptr() as *mut f32; + + for x in _cx..width as usize { + let px = x * channels; + let src = transient_row.get_unchecked(px..); + + let r = *src.get_unchecked(image_configuration.get_r_channel_offset()); + let g = *src.get_unchecked(image_configuration.get_g_channel_offset()); + let b = *src.get_unchecked(image_configuration.get_b_channel_offset()); + + let rgb = Rgb::::new(r, g, b); + let px = x * channels; + let dst_store = dst_ptr.add(px); + + let xyz = Xyz::from_linear_rgb(rgb, matrix); + + match target { + XyzTarget::Lab => { + let lab = Lab::from_xyz(xyz); dst_store.write_unaligned(lab.l); dst_store.add(1).write_unaligned(lab.a); dst_store.add(2).write_unaligned(lab.b); } - } - XyzTarget::Xyz => { - let xyz = Xyz::from_rgb(rgb, matrix, transfer_function); - unsafe { + XyzTarget::Xyz => { dst_store.write_unaligned(xyz.x); dst_store.add(1).write_unaligned(xyz.y); dst_store.add(2).write_unaligned(xyz.z); } - } - XyzTarget::Luv => { - let luv = rgb.to_luv(); - unsafe { + XyzTarget::Luv => { + let luv = Luv::from_xyz(xyz); dst_store.write_unaligned(luv.l); dst_store.add(1).write_unaligned(luv.u); dst_store.add(2).write_unaligned(luv.v); } - } - XyzTarget::Lch => { - let lch = rgb.to_lch(); - unsafe { + XyzTarget::Lch => { + let luv = Luv::from_xyz(xyz); + let lch = LCh::from_luv(luv); dst_store.write_unaligned(lch.l); dst_store.add(1).write_unaligned(lch.c); dst_store.add(2).write_unaligned(lch.h); } } - } - - let a = unsafe { - src.add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = a as f32 * (1f32 / 255f32); - unsafe { - dst_store.add(3).write_unaligned(a_lin); + let a = *src.get_unchecked(image_configuration.get_a_channel_offset()); + dst_store.add(3).write_unaligned(a); } } - - src_offset += 
src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/lab.rs b/src/lab.rs index 5bd9b32..587a32a 100644 --- a/src/lab.rs +++ b/src/lab.rs @@ -85,7 +85,7 @@ impl Lab { } impl Lab { - /// Converts CIE Lab into CIE XYZ + /// Converts CIE [Lab] into CIE [Xyz] #[inline] pub fn to_xyz(&self) -> Xyz { let y = (self.l + 16.0) / 116.0; @@ -125,6 +125,13 @@ impl Lab { Xyz::new(xyz.x, xyz.y, xyz.z).to_srgb() } + /// Converts CIE [Lab] into linear [Rgb] + #[inline] + pub fn to_linear_rgb(&self, matrix: &[[f32; 3]; 3]) -> Rgb { + let xyz = self.to_xyz(); + xyz.to_linear_rgb(matrix) + } + /// Converts CIE Lab into Rgb #[inline] pub fn to_rgb(&self) -> Rgb { diff --git a/src/linear_to_image.rs b/src/linear_to_image.rs index e74a35d..0a76bba 100644 --- a/src/linear_to_image.rs +++ b/src/linear_to_image.rs @@ -4,20 +4,13 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. */ -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::avx::avx_linear_to_gamma; use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::neon_linear_to_gamma; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::sse_linear_to_gamma; use crate::Rgb; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -35,52 +28,29 @@ fn linear_to_gamma_channels usize, - > = None; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handle = Some(neon_linear_to_gamma::); - } - let channels = image_configuration.get_channels_count(); - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handle = Some(sse_linear_to_gamma::); + let mut lut_table = vec![0u8; 2049]; + for i in 0..2049 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.) + .ceil() + .min(255.) 
as u8; } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("avx2") { - _wide_row_handle = Some(avx_linear_to_gamma::); - } + let src_slice_safe_align = unsafe { + slice::from_raw_parts( + src.as_ptr() as *const u8, + src_stride as usize * height as usize, + ) + }; #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts( - src.as_ptr() as *const u8, - src_stride as usize * height as usize, - ) - }; dst.par_chunks_exact_mut(dst_stride as usize) .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) .for_each(|(dst, src)| unsafe { let mut _cx = 0usize; - if let Some(dispatcher) = _wide_row_handle { - _cx = dispatcher( - _cx, - src.as_ptr() as *const f32, - 0, - dst.as_mut_ptr(), - 0, - width, - transfer_function, - ); - } - let src_ptr = src.as_ptr() as *const f32; let dst_ptr = dst.as_mut_ptr(); @@ -97,19 +67,21 @@ fn linear_to_gamma_channels::new( + let rgb = (Rgb::::new( r.min(1f32).max(0f32), g.min(1f32).max(0f32), b.min(1f32).max(0f32), - ); + ) * Rgb::::dup(2048f32)) + .round() + .cast::(); let dst = dst_ptr.add(px); - let transferred = rgb.gamma(transfer_function); - let rgb8 = transferred.to_u8(); - dst.write_unaligned(rgb8.r); - dst.add(1).write_unaligned(rgb8.g); - dst.add(2).write_unaligned(rgb8.b); + dst.write_unaligned(*lut_table.get_unchecked(rgb.r.min(2048) as usize)); + dst.add(1) + .write_unaligned(*lut_table.get_unchecked(rgb.g.min(2048) as usize)); + dst.add(2) + .write_unaligned(*lut_table.get_unchecked(rgb.b.min(2048) as usize)); if USE_ALPHA && image_configuration.has_alpha() { let a = src_slice @@ -124,79 +96,54 @@ fn linear_to_gamma_channels::new( - r.min(1f32).max(0f32), - g.min(1f32).max(0f32), - b.min(1f32).max(0f32), - ); + let rgb = (Rgb::::new( + r.min(1f32).max(0f32), + g.min(1f32).max(0f32), + b.min(1f32).max(0f32), + ) * Rgb::::dup(2048f32)) + .round() + .cast::(); - let dst = unsafe { dst_ptr.add(px) }; - let transferred = rgb.gamma(transfer_function); - let rgb8 = transferred.to_u8(); + let dst = dst_ptr.add(px); - unsafe { - dst.write_unaligned(rgb8.r); - dst.add(1).write_unaligned(rgb8.g); - dst.add(2).write_unaligned(rgb8.b); - } + dst.write_unaligned(*lut_table.get_unchecked(rgb.r.min(2048) as usize)); + dst.add(1) + .write_unaligned(*lut_table.get_unchecked(rgb.g.min(2048) as usize)); + dst.add(2) + .write_unaligned(*lut_table.get_unchecked(rgb.b.min(2048) as usize)); - if USE_ALPHA && image_configuration.has_alpha() { - let a = unsafe { - src_slice + if USE_ALPHA && image_configuration.has_alpha() { + let a = src_slice .add(image_configuration.get_a_channel_offset()) - .read_unaligned() - }; - let a_lin = (a * 255f32).round() as u8; - unsafe { + .read_unaligned(); + let a_lin = (a * 255f32).round() as u8; dst.add(3).write_unaligned(a_lin); } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; } } } diff --git a/src/linear_to_image_u8.rs b/src/linear_to_image_u8.rs index b3cdd31..2cf1a36 100644 --- a/src/linear_to_image_u8.rs +++ b/src/linear_to_image_u8.rs @@ -7,10 +7,6 @@ use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::*; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse::sse_image_to_linear_unsigned::sse_channels_to_linear_u8; use crate::Rgb; #[cfg(feature = "rayon")] use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -36,20 +32,11 @@ fn linear_to_gamma_channels usize, - > = None; 
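Going the other way in linear_to_image.rs above, the input is continuous f32, so the table needs a quantization step: clamped linear values are scaled by 2048 and rounded into a 2049-entry gamma table, i.e. an 11-bit index over [0, 1]. A sketch of that shape, with `gamma` standing in for `TransferFunction::gamma`:

```rust
// 2049 entries cover indices 0..=2048, so linear 1.0 maps exactly to the
// last slot; construction mirrors the ceil+clamp in the hunk above.
fn build_gamma_lut(gamma: impl Fn(f32) -> f32) -> Vec<u8> {
    (0..=2048)
        .map(|i| (gamma(i as f32 * (1.0 / 2048.0)) * 255.0).ceil().min(255.0) as u8)
        .collect()
}

// Clamp first, as the patch does, so the rounded index stays in bounds.
fn encode(lut: &[u8], linear: f32) -> u8 {
    let idx = (linear.clamp(0.0, 1.0) * 2048.0).round() as usize;
    lut[idx]
}

fn main() {
    let lut = build_gamma_lut(|v| v.powf(1.0 / 2.2)); // toy transfer
    assert_eq!(encode(&lut, 1.0), 255);
    println!("0.5 encodes to {}", encode(&lut, 0.5));
}
```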
- - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - if std::arch::is_x86_feature_detected!("sse4.1") { - _wide_row_handler = - Some(sse_channels_to_linear_u8::); - } - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - { - _wide_row_handler = - Some(neon_channels_to_linear_u8::); + let mut lut_table = vec![0u8; 256]; + for i in 0..256 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 255.0)) * 255.) + .ceil() + .min(255.) as u8; } #[cfg(feature = "rayon")] @@ -59,18 +46,6 @@ fn linear_to_gamma_channels::new(r, g, b); - let mut rgb = rgb.to_rgb_f32(); - rgb = rgb.gamma(transfer_function); - let new_rgb = rgb.to_u8(); - - *dst.get_unchecked_mut(px) = new_rgb.r; - *dst.get_unchecked_mut(px + 1) = new_rgb.g; - *dst.get_unchecked_mut(px + 2) = new_rgb.b; + *dst.get_unchecked_mut(px) = *lut_table.get_unchecked(rgb.r as usize); + *dst.get_unchecked_mut(px + 1) = *lut_table.get_unchecked(rgb.g as usize); + *dst.get_unchecked_mut(px + 2) = *lut_table.get_unchecked(rgb.b as usize); if USE_ALPHA && image_configuration.has_alpha() { let a = src.get_unchecked(px + image_configuration.get_a_channel_offset()); @@ -103,20 +74,6 @@ fn linear_to_gamma_channels) -> Self { let xyz = Xyz::from_srgb(rgb); + Self::from_xyz(xyz) + } + + /// Converts CIE XYZ to CIE Luv + #[inline] + #[allow(clippy::manual_clamp)] + pub fn from_xyz(xyz: Xyz) -> Self { let [x, y, z] = [xyz.x, xyz.y, xyz.z]; let den = x + 15.0 * y + 3.0 * z; @@ -106,9 +113,9 @@ impl Luv { } #[inline] - pub fn to_rgb(&self) -> Rgb { + pub fn to_xyz(&self) -> Xyz { if self.l <= 0f32 { - return Xyz::new(0f32, 0f32, 0f32).to_srgb(); + return Xyz::new(0f32, 0f32, 0f32); } let l13 = 1f32 / (13f32 * self.l); let u = self.u * l13 + LUV_WHITE_U_PRIME; @@ -128,7 +135,20 @@ impl Luv { z = 0f32; } - Xyz::new(x, y, z).to_srgb() + Xyz::new(x, y, z) + } + + /// Converts CIE [Luv] into linear [Rgb] + #[inline] + pub fn to_linear_rgb(&self, matrix: &[[f32; 3]; 3]) -> Rgb { + let xyz = self.to_xyz(); + xyz.to_linear_rgb(matrix) + } + + #[inline] + pub fn to_rgb(&self) -> Rgb { + let xyz = self.to_xyz(); + xyz.to_srgb() } pub fn new(l: f32, u: f32, v: f32) -> Luv { @@ -169,10 +189,23 @@ impl LCh { } } + #[inline] pub fn to_rgb(&self) -> Rgb { self.to_luv().to_rgb() } + #[inline] + pub fn to_xyz(&self) -> Xyz { + self.to_luv().to_xyz() + } + + #[inline] + pub fn to_linear_rgb(&self, matrix: &[[f32; 3]; 3]) -> Rgb { + let xyz = self.to_xyz(); + xyz.to_linear_rgb(matrix) + } + + #[inline] pub fn to_luv(&self) -> Luv { Luv { l: self.l, diff --git a/src/neon/cie.rs b/src/neon/cie.rs index 49bfcc7..9456929 100644 --- a/src/neon/cie.rs +++ b/src/neon/cie.rs @@ -10,16 +10,14 @@ use crate::luv::{ LUV_WHITE_V_PRIME, }; use crate::neon::math::{prefer_vfmaq_f32, vcolorq_matrix_f32, vcubeq_f32}; -use crate::neon::neon_perform_linear_transfer; -use crate::TransferFunction; use erydanos::{vatan2q_f32, vcbrtq_fast_f32, vcosq_f32, vhypotq_fast_f32, vsinq_f32}; use std::arch::aarch64::*; #[inline(always)] pub(crate) unsafe fn neon_triple_to_xyz( - r: uint32x4_t, - g: uint32x4_t, - b: uint32x4_t, + r: float32x4_t, + g: float32x4_t, + b: float32x4_t, c1: float32x4_t, c2: float32x4_t, c3: float32x4_t, @@ -29,17 +27,9 @@ pub(crate) unsafe fn neon_triple_to_xyz( c7: float32x4_t, c8: float32x4_t, c9: float32x4_t, - transfer_function: TransferFunction, ) -> (float32x4_t, float32x4_t, float32x4_t) { - let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32(g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32(b), 
1f32 / 255f32); - let r_linear = neon_perform_linear_transfer(transfer_function, r_f); - let g_linear = neon_perform_linear_transfer(transfer_function, g_f); - let b_linear = neon_perform_linear_transfer(transfer_function, b_f); - let (x, y, z) = vcolorq_matrix_f32( - r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, + r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); (x, y, z) } diff --git a/src/neon/image_to_oklab.rs b/src/neon/image_to_oklab.rs index 151e936..9e72371 100644 --- a/src/neon/image_to_oklab.rs +++ b/src/neon/image_to_oklab.rs @@ -7,11 +7,7 @@ use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::neon::math::vcolorq_matrix_f32; -use crate::neon::neon_perform_linear_transfer; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, - TransferFunction, -}; +use crate::load_f32_and_deinterleave; use erydanos::{vatan2q_f32, vcbrtq_fast_f32, vhypotq_fast_f32}; use std::arch::aarch64::*; @@ -20,15 +16,8 @@ macro_rules! triple_to_oklab { $c0:expr, $c1:expr, $c2: expr, $c3: expr, $c4:expr, $c5: expr, $c6:expr, $c7: expr, $c8: expr, $m0: expr, $m1: expr, $m2: expr, $m3: expr, $m4: expr, $m5: expr, $m6: expr, $m7: expr, $m8: expr ) => {{ - let r_f = vmulq_n_f32(vcvtq_f32_u32($r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32($g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32($b), 1f32 / 255f32); - let dl_l = neon_perform_linear_transfer($transfer, r_f); - let dl_m = neon_perform_linear_transfer($transfer, g_f); - let dl_s = neon_perform_linear_transfer($transfer, b_f); - let (l_l, l_m, l_s) = vcolorq_matrix_f32( - dl_l, dl_m, dl_s, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, + $r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, ); let l_ = vcbrtq_fast_f32(l_l); @@ -52,12 +41,9 @@ macro_rules! 
triple_to_oklab { #[inline(always)] pub unsafe fn neon_image_to_oklab( start_cx: usize, - src: *const u8, - src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -90,297 +76,15 @@ pub unsafe fn neon_image_to_oklab( - src: *const f32, - transfer_function: TransferFunction, -) -> (uint32x4_t, uint32x4_t, uint32x4_t, uint32x4_t) { - let v_scale_alpha = vdupq_n_f32(255f32); - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (mut r_f32, mut g_f32, mut b_f32, mut a_f32) = - load_f32_and_deinterleave!(src, image_configuration); - - r_f32 = neon_perform_gamma_transfer(transfer_function, r_f32); - g_f32 = neon_perform_gamma_transfer(transfer_function, g_f32); - b_f32 = neon_perform_gamma_transfer(transfer_function, b_f32); - r_f32 = vmulq_f32(r_f32, v_scale_alpha); - g_f32 = vmulq_f32(g_f32, v_scale_alpha); - b_f32 = vmulq_f32(b_f32, v_scale_alpha); - if USE_ALPHA { - a_f32 = vmulq_f32(a_f32, v_scale_alpha); - } - ( - vcvtaq_u32_f32(r_f32), - vcvtaq_u32_f32(g_f32), - vcvtaq_u32_f32(b_f32), - vcvtaq_u32_f32(a_f32), - ) -} - -pub unsafe fn neon_linear_to_gamma( - start_cx: usize, - src: *const f32, - src_offset: u32, - dst: *mut u8, - dst_offset: u32, - width: u32, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - neon_gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_gamma_vld::(src_ptr_1, transfer_function); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - neon_gamma_vld::(src_ptr_2, transfer_function); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - neon_gamma_vld::(src_ptr_3, transfer_function); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); - let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); - let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); - - let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); - let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); - let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); - let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x4_t(b_row, g_row, r_row, a_row) - } - }; - 
vst4q_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x3_t(b_row, g_row, r_row) - } - }; - vst3q_u8(dst_ptr, store_rows); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - neon_gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_gamma_vld::(src_ptr_1, transfer_function); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row = vqmovn_u16(a_row01); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) - } - }; - vst3_u8(dst_ptr, store_rows); - } - - cx += 8; - } - - while cx + 4 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - neon_gamma_vld::(src_ptr_0, transfer_function); - - let zero = vdup_n_u16(0); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), zero); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), zero); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), zero); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), zero); - let a_row = vqmovn_u16(a_row01); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) - } - }; - let mut transient: [u8; 24] = [0; 24]; - vst3_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 3); - } - - cx += 4; - } - - cx -} diff --git a/src/neon/mod.rs b/src/neon/mod.rs 
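The neon/mod.rs hunk just below unregisters the deleted modules; with gamma and linearization handled by the scalar LUT passes, the surviving NEON kernels take and return linear f32 lanes and drop their `TransferFunction` parameter. An illustrative before/after of the kernel shape (signatures simplified, not the crate's exact items):

```rust
// Signature sketch only: the reworked kernels reduce to a 3x3 matrix apply
// on lanes that are already linear. Compiles on aarch64 targets.
#[cfg(target_arch = "aarch64")]
mod shape {
    use std::arch::aarch64::*;

    // Before (sketch): unsafe fn triple_to_xyz(r: uint32x4_t, /* ... */,
    //     transfer: TransferFunction) -> (float32x4_t, float32x4_t, float32x4_t)

    // After (sketch): m[0..3], m[3..6], m[6..9] are splatted matrix rows.
    pub unsafe fn triple_to_xyz_linear(
        r: float32x4_t,
        g: float32x4_t,
        b: float32x4_t,
        m: [float32x4_t; 9],
    ) -> (float32x4_t, float32x4_t, float32x4_t) {
        // vfmaq_f32(a, b, c) computes a + b * c per lane.
        let x = vfmaq_f32(vfmaq_f32(vmulq_f32(r, m[0]), g, m[1]), b, m[2]);
        let y = vfmaq_f32(vfmaq_f32(vmulq_f32(r, m[3]), g, m[4]), b, m[5]);
        let z = vfmaq_f32(vfmaq_f32(vmulq_f32(r, m[6]), g, m[7]), b, m[8]);
        (x, y, z)
    }
}

fn main() {}
```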
index c7c20f9..22c8d9d 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -14,15 +14,12 @@ mod image_to_hsv; mod image_to_jzazbz; mod image_to_oklab; mod jzazbz_to_image; -mod linear_to_image; pub mod linear_to_planar; mod math; mod oklab_to_image; pub mod planar_to_linear; mod routines; mod sigmoidal; -mod to_linear; -mod to_linear_u8; mod to_sigmoidal; mod to_xyz_lab; mod to_xyza_laba; @@ -37,10 +34,7 @@ pub use image_to_hsv::*; pub use image_to_jzazbz::neon_image_to_jzazbz; pub use image_to_oklab::neon_image_to_oklab; pub use jzazbz_to_image::neon_jzazbz_to_image; -pub use linear_to_image::*; pub use oklab_to_image::neon_oklab_to_image; -pub use to_linear::*; -pub use to_linear_u8::*; pub use to_sigmoidal::neon_image_to_sigmoidal; pub use to_xyz_lab::*; pub use to_xyza_laba::*; diff --git a/src/neon/oklab_to_image.rs b/src/neon/oklab_to_image.rs index e883cda..ca94daf 100644 --- a/src/neon/oklab_to_image.rs +++ b/src/neon/oklab_to_image.rs @@ -11,13 +11,11 @@ use erydanos::{vcosq_f32, vsinq_f32}; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::neon::math::vcolorq_matrix_f32; -use crate::neon::neon_perform_gamma_transfer; -use crate::{load_f32_and_deinterleave_direct, TransferFunction}; +use crate::load_f32_and_deinterleave_direct; #[inline(always)] unsafe fn neon_oklab_gamma_vld( src: *const f32, - transfer_function: TransferFunction, m0: float32x4_t, m1: float32x4_t, m2: float32x4_t, @@ -36,11 +34,10 @@ unsafe fn neon_oklab_gamma_vld (uint32x4_t, uint32x4_t, uint32x4_t, uint32x4_t) { +) -> (float32x4_t, float32x4_t, float32x4_t, float32x4_t) { let target: OklabTarget = TARGET.into(); - let v_scale_alpha = vdupq_n_f32(255f32); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (l, mut a, mut b, mut a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); + let (l, mut a, mut b, a_f32) = load_f32_and_deinterleave_direct!(src, image_configuration); if target == OklabTarget::Oklch { let a0 = vmulq_f32(a, vcosq_f32(b)); @@ -57,34 +54,17 @@ unsafe fn neon_oklab_gamma_vld( start_cx: usize, src: *const f32, - src_offset: u32, - dst: *mut u8, + src_offset: usize, + dst: *mut f32, dst_offset: u32, width: u32, - transfer_function: TransferFunction, ) -> usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); let channels = image_configuration.get_channels_count(); @@ -114,259 +94,13 @@ pub unsafe fn neon_oklab_to_image( - src_ptr_0, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_oklab_gamma_vld::( - src_ptr_1, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - neon_oklab_gamma_vld::( - src_ptr_2, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - neon_oklab_gamma_vld::( - src_ptr_3, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = 
vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); - let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); - let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); - - let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); - let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); - let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); - let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4q_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x3_t(b_row, g_row, r_row) - } - }; - vst3q_u8(dst_ptr, store_rows); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - neon_oklab_gamma_vld::( - src_ptr_0, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_oklab_gamma_vld::( - src_ptr_1, - transfer_function, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row = vqmovn_u16(a_row01); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) - } - }; - vst3_u8(dst_ptr, store_rows); - } - - cx += 8; - } - while cx + 4 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; + let v_src_ptr = + ((src as *mut u8).add(src_offset) as *mut f32).add(cx * channels); 
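[Note on the addressing convention visible in the reworked NEON loop above: row offsets in this crate are byte counts, so the f32 base pointer is first cast to a byte pointer, advanced by the byte offset, and only then reinterpreted back to *mut f32 before the per-pixel element offset is applied. A minimal standalone sketch of that idiom, with illustrative names (row_ptr is not a crate function):

    // Advance a float row pointer by a byte stride, then by `cx` pixels of
    // `channels` f32 components each. Mirrors the inline expression used in
    // the hunk above; the caller must keep the offsets in bounds.
    unsafe fn row_ptr(
        base: *const f32,
        offset_bytes: usize,
        cx: usize,
        channels: usize,
    ) -> *const f32 {
        ((base as *const u8).add(offset_bytes) as *const f32).add(cx * channels)
    }
]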
let (r_row0_, g_row0_, b_row0_, a_row0_) = neon_oklab_gamma_vld::( - src_ptr_0, - transfer_function, + v_src_ptr, m0, m1, m2, @@ -387,44 +121,29 @@ pub unsafe fn neon_oklab_to_image { - uint8x8x4_t(r_row, g_row, b_row, a_row) + float32x4x4_t(r_row0_, g_row0_, b_row0_, a_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) + float32x4x4_t(b_row0_, g_row0_, r_row0_, a_row0_) } }; - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); + vst4q_f32(in_place_ptr, store_rows); } else { let store_rows = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) + float32x4x3_t(r_row0_, g_row0_, b_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) + float32x4x3_t(b_row0_, g_row0_, r_row0_) } }; - let mut transient: [u8; 24] = [0; 24]; - vst3_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 3); + vst3q_f32(in_place_ptr, store_rows); } cx += 4; diff --git a/src/neon/to_linear.rs b/src/neon/to_linear.rs deleted file mode 100644 index ed0cc8d..0000000 --- a/src/neon/to_linear.rs +++ /dev/null @@ -1,312 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. - */ - -use crate::gamma_curves::TransferFunction; -use crate::image::ImageConfiguration; -use crate::neon::*; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, -}; -use std::arch::aarch64::*; - -#[inline(always)] -pub(crate) unsafe fn neon_triple_to_linear( - r: uint32x4_t, - g: uint32x4_t, - b: uint32x4_t, - transfer_function: TransferFunction, -) -> (float32x4_t, float32x4_t, float32x4_t) { - let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32(g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32(b), 1f32 / 255f32); - let r_linear = neon_perform_linear_transfer(transfer_function, r_f); - let g_linear = neon_perform_linear_transfer(transfer_function, g_f); - let b_linear = neon_perform_linear_transfer(transfer_function, b_f); - (r_linear, g_linear, b_linear) -} - -pub unsafe fn neon_channels_to_linear( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut f32, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - neon_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - if USE_ALPHA { - let a_low_low = - 
vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_low, y_low_low, x_low_low, a_low_low) - } - }; - vst4q_f32(dst_ptr.add(cx * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_low, y_low_low, z_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_low, y_low_low, x_low_low) - } - }; - vst3q_f32(dst_ptr.add(cx * channels), store_rows); - } - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (x_low_high, y_low_high, z_low_high) = - neon_triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_high, y_low_high, z_low_high, a_low_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_high, y_low_high, x_low_high, a_low_high) - } - }; - vst4q_f32(dst_ptr.add(cx * channels + 4 * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_high, y_low_high, z_low_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_high, y_low_high, x_low_high) - } - }; - vst3q_f32(dst_ptr.add(cx * channels + 4 * channels), store_rows); - } - - let r_high = vmovl_high_u8(r_chan); - let g_high = vmovl_high_u8(g_chan); - let b_high = vmovl_high_u8(b_chan); - - let r_high_low = vmovl_u16(vget_low_u16(r_high)); - let g_high_low = vmovl_u16(vget_low_u16(g_high)); - let b_high_low = vmovl_u16(vget_low_u16(b_high)); - - let (x_high_low, y_high_low, z_high_low) = - neon_triple_to_linear(r_high_low, g_high_low, b_high_low, transfer_function); - - let a_high = vmovl_high_u8(a_chan); - - if USE_ALPHA { - let a_high_low = vmulq_n_f32( - vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_high))), - 1f32 / 255f32, - ); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_high_low, y_high_low, z_high_low, a_high_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_high_low, y_high_low, x_high_low, a_high_low) - } - }; - vst4q_f32(dst_ptr.add(cx * channels + 4 * channels * 2), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_high_low, y_high_low, z_high_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_high_low, y_high_low, x_high_low) - } - }; - vst3q_f32(dst_ptr.add(cx * channels + 4 * channels * 2), store_rows); - } - - let r_high_high = vmovl_high_u16(r_high); - let g_high_high = vmovl_high_u16(g_high); - let b_high_high = vmovl_high_u16(b_high); - - let (x_high_high, y_high_high, z_high_high) = - neon_triple_to_linear(r_high_high, g_high_high, b_high_high, transfer_function); - - if USE_ALPHA { - let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); - let 
store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_high_high, y_high_high, z_high_high, a_high_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_high_high, y_high_high, x_high_high, a_high_high) - } - }; - vst4q_f32(dst_ptr.add(cx * channels + 4 * channels * 3), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_high_high, y_high_high, z_high_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_high_high, y_high_high, x_high_high) - } - }; - vst3q_f32(dst_ptr.add(cx * channels + 4 * channels * 3), store_rows); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - neon_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - if USE_ALPHA { - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_low, y_low_low, x_low_low, a_low_low) - } - }; - vst4q_f32(dst_ptr.add(cx * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_low, y_low_low, z_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_low, y_low_low, x_low_low) - } - }; - vst3q_f32(dst_ptr.add(cx * channels), store_rows); - } - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (x_low_high, y_low_high, z_low_high) = - neon_triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_high, y_low_high, z_low_high, a_low_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_high, y_low_high, x_low_high, a_low_high) - } - }; - vst4q_f32(dst_ptr.add(cx * channels + 4 * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_high, y_low_high, z_low_high) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_high, y_low_high, x_low_high) - } - }; - vst3q_f32(dst_ptr.add(cx * channels + 4 * channels), store_rows); - } - - cx += 8; - } - - while cx + 4 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_quarter!(src_ptr, image_configuration); - - let r_low = 
vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = - neon_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - if USE_ALPHA { - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x4_t(z_low_low, y_low_low, x_low_low, a_low_low) - } - }; - vst4q_f32(dst_ptr.add(cx * channels), store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - float32x4x3_t(x_low_low, y_low_low, z_low_low) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - float32x4x3_t(z_low_low, y_low_low, x_low_low) - } - }; - vst3q_f32(dst_ptr.add(cx * channels), store_rows); - } - - cx += 4; - } - - cx -} diff --git a/src/neon/to_linear_u8.rs b/src/neon/to_linear_u8.rs deleted file mode 100644 index bdd15a3..0000000 --- a/src/neon/to_linear_u8.rs +++ /dev/null @@ -1,256 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. - */ - -use crate::image::ImageConfiguration; -use crate::neon::{neon_perform_gamma_transfer, neon_perform_linear_transfer}; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, - TransferFunction, -}; -use std::arch::aarch64::*; - -#[inline(always)] -pub(crate) unsafe fn neon_triple_to_linear_u8( - r: uint32x4_t, - g: uint32x4_t, - b: uint32x4_t, - transfer_function: TransferFunction, -) -> (uint32x4_t, uint32x4_t, uint32x4_t) { - let r_f = vmulq_n_f32(vcvtq_f32_u32(r), 1f32 / 255f32); - let g_f = vmulq_n_f32(vcvtq_f32_u32(g), 1f32 / 255f32); - let b_f = vmulq_n_f32(vcvtq_f32_u32(b), 1f32 / 255f32); - let r_linear = vmulq_n_f32( - match INTO_LINEAR { - true => neon_perform_linear_transfer(transfer_function, r_f), - false => neon_perform_gamma_transfer(transfer_function, r_f), - }, - 255f32, - ); - let g_linear = vmulq_n_f32( - match INTO_LINEAR { - true => neon_perform_linear_transfer(transfer_function, g_f), - false => neon_perform_gamma_transfer(transfer_function, g_f), - }, - 255f32, - ); - let b_linear = vmulq_n_f32( - match INTO_LINEAR { - true => neon_perform_linear_transfer(transfer_function, b_f), - false => neon_perform_gamma_transfer(transfer_function, b_f), - }, - 255f32, - ); - - ( - vcvtaq_u32_f32(r_linear), - vcvtaq_u32_f32(g_linear), - vcvtaq_u32_f32(b_linear), - ) -} - -#[inline] -pub unsafe fn neon_channels_to_linear_u8< - const CHANNELS_CONFIGURATION: u8, - const USE_ALPHA: bool, - const INTO_LINEAR: bool, ->( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut u8, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = dst.add(dst_offset); - - while cx + 16 < width as usize { - 
let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = neon_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (x_low_high, y_low_high, z_low_high) = neon_triple_to_linear_u8::( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - ); - - let r_high = vmovl_high_u8(r_chan); - let g_high = vmovl_high_u8(g_chan); - let b_high = vmovl_high_u8(b_chan); - - let r_high_low = vmovl_u16(vget_low_u16(r_high)); - let g_high_low = vmovl_u16(vget_low_u16(g_high)); - let b_high_low = vmovl_u16(vget_low_u16(b_high)); - - let (x_high_low, y_high_low, z_high_low) = neon_triple_to_linear_u8::( - r_high_low, - g_high_low, - b_high_low, - transfer_function, - ); - - let r_high_high = vmovl_high_u16(r_high); - let g_high_high = vmovl_high_u16(g_high); - let b_high_high = vmovl_high_u16(b_high); - - let (x_high_high, y_high_high, z_high_high) = neon_triple_to_linear_u8::( - r_high_high, - g_high_high, - b_high_high, - transfer_function, - ); - - let r_u_norm = vcombine_u8( - vqmovn_u16(vcombine_u16(vmovn_u32(x_low_low), vmovn_u32(x_low_high))), - vqmovn_u16(vcombine_u16(vmovn_u32(x_high_low), vmovn_u32(x_high_high))), - ); - - let g_u_norm = vcombine_u8( - vqmovn_u16(vcombine_u16(vmovn_u32(y_low_low), vmovn_u32(y_low_high))), - vqmovn_u16(vcombine_u16(vmovn_u32(y_high_low), vmovn_u32(y_high_high))), - ); - - let b_u_norm = vcombine_u8( - vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), vmovn_u32(z_low_high))), - vqmovn_u16(vcombine_u16(vmovn_u32(z_high_low), vmovn_u32(z_high_high))), - ); - - if USE_ALPHA { - let v_4 = uint8x16x4_t(r_u_norm, g_u_norm, b_u_norm, a_chan); - vst4q_u8(dst_ptr.add(cx * channels), v_4); - } else { - let v_4 = uint8x16x3_t(r_u_norm, g_u_norm, b_u_norm); - vst3q_u8(dst_ptr.add(cx * channels), v_4); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = neon_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (x_low_high, y_low_high, z_low_high) = neon_triple_to_linear_u8::( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - ); - - let r_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(x_low_low), vmovn_u32(x_low_high))); - - let g_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(y_low_low), vmovn_u32(y_low_high))); - - let b_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), vmovn_u32(z_low_high))); - - let dst = 
dst_ptr.add(cx * channels); - - if USE_ALPHA { - let v_4 = uint8x8x4_t(r_u_norm, g_u_norm, b_u_norm, vget_low_u8(a_chan)); - vst4_u8(dst, v_4); - } else { - let v_4 = uint8x8x3_t(r_u_norm, g_u_norm, b_u_norm); - vst3_u8(dst, v_4); - } - - cx += 8; - } - - while cx + 4 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_quarter!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (x_low_low, y_low_low, z_low_low) = neon_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let zeros = vdup_n_u16(0); - - let r_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(x_low_low), zeros)); - - let g_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(y_low_low), zeros)); - - let b_u_norm = vqmovn_u16(vcombine_u16(vmovn_u32(z_low_low), zeros)); - - let dst = dst_ptr.add(cx * channels); - - if USE_ALPHA { - let v_4 = uint8x8x4_t(r_u_norm, g_u_norm, b_u_norm, vget_low_u8(a_chan)); - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), v_4); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst, 4 * 4); - } else { - let v_3 = uint8x8x3_t(r_u_norm, g_u_norm, b_u_norm); - let mut transient: [u8; 24] = [0; 24]; - vst3_u8(transient.as_mut_ptr(), v_3); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst, 4 * 3); - } - - cx += 4; - } - - cx -} diff --git a/src/neon/to_xyz_lab.rs b/src/neon/to_xyz_lab.rs index a1df476..4ad96bd 100644 --- a/src/neon/to_xyz_lab.rs +++ b/src/neon/to_xyz_lab.rs @@ -5,15 +5,12 @@ * // license that can be found in the LICENSE file. 
*/ -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; +use crate::load_f32_and_deinterleave; use crate::neon::cie::{ neon_triple_to_lab, neon_triple_to_lch, neon_triple_to_luv, neon_triple_to_xyz, }; use crate::xyz_target::XyzTarget; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, -}; use std::arch::aarch64::*; #[inline(always)] @@ -23,7 +20,7 @@ pub unsafe fn neon_channels_to_xyz_or_lab< const TARGET: u8, >( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, @@ -31,7 +28,6 @@ pub unsafe fn neon_channels_to_xyz_or_lab< a_linearized: *mut f32, a_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { if USE_ALPHA && a_linearized.is_null() { panic!("Null alpha channel with requirements of linearized alpha if not supported"); @@ -53,364 +49,15 @@ pub unsafe fn neon_channels_to_xyz_or_lab< let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let xyz_low_low = float32x4x3_t(x_low_low, y_low_low, z_low_low); - vst3q_f32(dst_ptr.add(cx * 3), xyz_low_low); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = neon_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let xyz_low_low = float32x4x3_t(x_low_high, y_low_high, z_low_high); - vst3q_f32(dst_ptr.add(cx * 3 + 4 * 3), xyz_low_low); - - let r_high = vmovl_high_u8(r_chan); - let g_high = vmovl_high_u8(g_chan); - let b_high = vmovl_high_u8(b_chan); - - let r_high_low = vmovl_u16(vget_low_u16(r_high)); - let g_high_low = vmovl_u16(vget_low_u16(g_high)); - let b_high_low = 
vmovl_u16(vget_low_u16(b_high)); - - let (mut x_high_low, mut y_high_low, mut z_high_low) = neon_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let xyz_low_low = float32x4x3_t(x_high_low, y_high_low, z_high_low); - vst3q_f32(dst_ptr.add(cx * 3 + 4 * 3 * 2), xyz_low_low); - - let r_high_high = vmovl_high_u16(r_high); - let g_high_high = vmovl_high_u16(g_high); - let b_high_high = vmovl_high_u16(b_high); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = neon_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = c; - z_high_high = h; - } - } - - let xyz_low_low = float32x4x3_t(x_high_high, y_high_high, z_high_high); - vst3q_f32(dst_ptr.add(cx * 3 + 4 * 3 * 3), xyz_low_low); - - if USE_ALPHA { - let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx), a_low_low); - - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx + 4), a_low_high); - - let a_high = vmovl_high_u8(a_chan); - - let a_high_low = vmulq_n_f32( - vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_high))), - 1f32 / 255f32, - ); - - vst1q_f32(a_ptr.add(cx + 4 * 2), a_high_low); - - let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx + 4 * 3), a_high_high); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { 
- let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let xyz_low_low = float32x4x3_t(x_low_low, y_low_low, z_low_low); - vst3q_f32(dst_ptr.add(cx * 3), xyz_low_low); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = neon_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let xyz_low_low = float32x4x3_t(x_low_high, y_low_high, z_low_high); - vst3q_f32(dst_ptr.add(cx * 3 + 4 * 3), xyz_low_low); - - if USE_ALPHA { - let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx), a_low_low); - - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx + 4), a_low_high); - } - - cx += 8; - } - while cx + 4 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + let src_ptr = ((src as * const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_quarter!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); + load_f32_and_deinterleave!(src_ptr, image_configuration); let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, + r_chan, + g_chan, + b_chan, cq1, cq2, cq3, @@ -420,7 +67,6 @@ pub unsafe fn neon_channels_to_xyz_or_lab< cq7, cq8, cq9, - transfer_function, ); match target { @@ -451,16 +97,12 @@ pub unsafe fn neon_channels_to_xyz_or_lab< if USE_ALPHA { let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = - vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - vst1q_f32(a_ptr.add(cx), a_low_low); + vst1q_f32(a_ptr.add(cx), a_chan); } cx += 4; } + cx } diff --git a/src/neon/to_xyza_laba.rs b/src/neon/to_xyza_laba.rs index 8e93f19..0542683 100644 --- a/src/neon/to_xyza_laba.rs +++ b/src/neon/to_xyza_laba.rs @@ -5,27 +5,23 @@ * // license that can be found in the LICENSE file. 
*/ -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::neon::cie::{ neon_triple_to_lab, neon_triple_to_lch, neon_triple_to_luv, neon_triple_to_xyz, }; use crate::xyz_target::XyzTarget; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, load_u8_and_deinterleave_quarter, -}; +use crate::load_f32_and_deinterleave; use std::arch::aarch64::*; #[inline(always)] pub unsafe fn neon_channels_to_xyza_or_laba( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { let target: XyzTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -44,351 +40,13 @@ pub unsafe fn neon_channels_to_xyza_or_laba { - let (l, a, b) = neon_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - } - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low); - vst4q_f32(dst_ptr.add(cx * 4), xyz_low_low); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = neon_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_low_high, y_low_high, z_low_high, a_low_high); - vst4q_f32(dst_ptr.add(cx * 4 + 4 * 4), xyz_low_low); - - let r_high = vmovl_high_u8(r_chan); - let g_high = vmovl_high_u8(g_chan); - let b_high = vmovl_high_u8(b_chan); - - let r_high_low = vmovl_u16(vget_low_u16(r_high)); - let g_high_low = vmovl_u16(vget_low_u16(g_high)); - let b_high_low = vmovl_u16(vget_low_u16(b_high)); - - let (mut x_high_low, mut y_high_low, mut z_high_low) = neon_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_high_low, y_high_low, 
z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let a_high = vmovl_high_u8(a_chan); - let a_high_low = vmulq_n_f32( - vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_high))), - 1f32 / 255f32, - ); - - let xyz_low_low = float32x4x4_t(x_high_low, y_high_low, z_high_low, a_high_low); - vst4q_f32(dst_ptr.add(cx * 4 + 4 * 4 * 2), xyz_low_low); - - let r_high_high = vmovl_high_u16(r_high); - let g_high_high = vmovl_high_u16(g_high); - let b_high_high = vmovl_high_u16(b_high); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = neon_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = c; - z_high_high = h; - } - } - - let a_high_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_high)), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_high_high, y_high_high, z_high_high, a_high_high); - vst4q_f32(dst_ptr.add(cx * 4 + 4 * 4 * 3), xyz_low_low); - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - } - - let a_low = vmovl_u8(vget_low_u8(a_chan)); - - let a_low_low = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_low))), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_low_low, y_low_low, z_low_low, a_low_low); - vst4q_f32(dst_ptr.add(cx * 4), xyz_low_low); - - let r_low_high = vmovl_high_u16(r_low); - let g_low_high = vmovl_high_u16(g_low); - let b_low_high = vmovl_high_u16(b_low); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = neon_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = neon_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = neon_triple_to_luv(x_low_high, 
y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = neon_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let a_low_high = vmulq_n_f32(vcvtq_f32_u32(vmovl_high_u16(a_low)), 1f32 / 255f32); - - let xyz_low_low = float32x4x4_t(x_low_high, y_low_high, z_low_high, a_low_high); - vst4q_f32(dst_ptr.add(cx * 4 + 4 * 4), xyz_low_low); - - cx += 8; - } - while cx + 4 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_quarter!(src_ptr, image_configuration); - - let r_low = vmovl_u8(vget_low_u8(r_chan)); - let g_low = vmovl_u8(vget_low_u8(g_chan)); - let b_low = vmovl_u8(vget_low_u8(b_chan)); - - let r_low_low = vmovl_u16(vget_low_u16(r_low)); - let g_low_low = vmovl_u16(vget_low_u16(g_low)); - let b_low_low = vmovl_u16(vget_low_u16(b_low)); + load_f32_and_deinterleave!(src_ptr, image_configuration); let (mut x_low_low, mut y_low_low, mut z_low_low) = neon_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, + r_chan, g_chan, b_chan, cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, ); match target { @@ -413,11 +71,7 @@ pub unsafe fn neon_channels_to_xyza_or_laba( src: *const f32, - transfer_function: TransferFunction, c1: float32x4_t, c2: float32x4_t, c3: float32x4_t, @@ -30,9 +27,8 @@ pub(crate) unsafe fn neon_xyz_lab_vld< c7: float32x4_t, c8: float32x4_t, c9: float32x4_t, -) -> (uint32x4_t, uint32x4_t, uint32x4_t) { +) -> (float32x4_t, float32x4_t, float32x4_t) { let target: XyzTarget = TARGET.into(); - let v_scale_color = vdupq_n_f32(255f32); let lab_pixel = vld3q_f32(src); let (mut r_f32, mut g_f32, mut b_f32) = (lab_pixel.0, lab_pixel.1, lab_pixel.2); @@ -61,22 +57,7 @@ pub(crate) unsafe fn neon_xyz_lab_vld< let (linear_r, linear_g, linear_b) = vcolorq_matrix_f32(r_f32, g_f32, b_f32, c1, c2, c3, c4, c5, c6, c7, c8, c9); - r_f32 = linear_r; - g_f32 = linear_g; - b_f32 = linear_b; - - r_f32 = neon_perform_gamma_transfer(transfer_function, r_f32); - g_f32 = neon_perform_gamma_transfer(transfer_function, g_f32); - b_f32 = neon_perform_gamma_transfer(transfer_function, b_f32); - - r_f32 = vmulq_f32(r_f32, v_scale_color); - g_f32 = vmulq_f32(g_f32, v_scale_color); - b_f32 = vmulq_f32(b_f32, v_scale_color); - ( - vcvtaq_u32_f32(r_f32), - vcvtaq_u32_f32(g_f32), - vcvtaq_u32_f32(b_f32), - ) + (linear_r, linear_g, linear_b) } #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -91,11 +72,10 @@ pub unsafe fn neon_xyz_to_channels< src_offset: usize, a_channel: *const f32, a_offset: usize, - dst: *mut u8, + dst: *mut f32, dst_offset: usize, width: u32, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if USE_ALPHA && !image_configuration.has_alpha() { @@ -118,133 +98,6 @@ pub unsafe fn neon_xyz_to_channels< let src_channels = 3usize; - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_) = - neon_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); 
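[For context on the wide 16-pixel paths being deleted here and below: they converted four float32x4 registers to bytes through a saturating narrowing chain before storing. A sketch of that pattern, assuming an aarch64 NEON target (pack_to_u8x16 is an illustrative name, not a crate function):

    #[cfg(target_arch = "aarch64")]
    unsafe fn pack_to_u8x16(
        v0: std::arch::aarch64::float32x4_t,
        v1: std::arch::aarch64::float32x4_t,
        v2: std::arch::aarch64::float32x4_t,
        v3: std::arch::aarch64::float32x4_t,
    ) -> std::arch::aarch64::uint8x16_t {
        use std::arch::aarch64::*;
        let scale = vdupq_n_f32(255f32);
        // vcvtaq rounds to nearest, ties away from zero, into u32 lanes.
        let u0 = vcvtaq_u32_f32(vmulq_f32(v0, scale));
        let u1 = vcvtaq_u32_f32(vmulq_f32(v1, scale));
        let u2 = vcvtaq_u32_f32(vmulq_f32(v2, scale));
        let u3 = vcvtaq_u32_f32(vmulq_f32(v3, scale));
        // Saturating narrow: u32 -> u16, then u16 -> u8.
        let w01 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1));
        let w23 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3));
        vcombine_u8(vqmovn_u16(w01), vqmovn_u16(w23))
    }

The rework drops this stage from the SIMD kernels entirely: the vld/vst paths now keep float32x4 lanes and store them with vst3q_f32/vst4q_f32, deferring quantization to u8 to a later shared pass.]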
- - let (r_row1_, g_row1_, b_row1_) = - neon_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * src_channels); - - let (r_row2_, g_row2_, b_row2_) = - neon_xyz_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * src_channels); - - let (r_row3_, g_row3_, b_row3_) = - neon_xyz_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); - let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); - let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); - - let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); - let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); - let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - if USE_ALPHA { - let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = vld1q_f32(offset_a_src_ptr); - let a_row0_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_0_f, 255f32)); - - let a_low_1_f = vld1q_f32(offset_a_src_ptr.add(4)); - let a_row1_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_1_f, 255f32)); - - let a_low_2_f = vld1q_f32(offset_a_src_ptr.add(8)); - let a_row2_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_2_f, 255f32)); - - let a_low_3_f = vld1q_f32(offset_a_src_ptr.add(12)); - let a_row3_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_3_f, 255f32)); - - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); - let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4q_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x3_t(b_row, g_row, r_row) - } - }; - vst3q_u8(dst_ptr, store_rows); - } - - cx += 16; - } - while cx + 4 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); @@ -253,160 +106,33 @@ pub unsafe fn neon_xyz_to_channels< let (r_row0_, g_row0_, b_row0_) = neon_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); - - let (r_row1_, g_row1_, b_row1_) = - neon_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = 
vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); if USE_ALPHA { let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = vld1q_f32(offset_a_src_ptr); - let a_row0_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_0_f, 255f32)); - - let a_low_1_f = vld1q_f32(offset_a_src_ptr.add(4)); - let a_row1_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_1_f, 255f32)); - - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row = vqmovn_u16(a_row01); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4_u8(dst_ptr, store_rows); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) - } - }; - vst3_u8(dst_ptr, store_rows); - } - - cx += 8; - } - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_) = - neon_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); - - let (r_row1_, g_row1_, b_row1_) = - neon_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - if USE_ALPHA { - let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = vld1q_f32(offset_a_src_ptr); - let a_row0_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_0_f, 255f32)); - - let a_low_1_f = vld1q_f32(offset_a_src_ptr.add(4)); - let a_row1_ = vcvtaq_u32_f32(vmulq_n_f32(a_low_1_f, 255f32)); - - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - let a_row = vqmovn_u16(a_row01); + let a_row = vld1q_f32(offset_a_src_ptr); let store_rows = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) + float32x4x4_t(r_row0_, g_row0_, b_row0_, a_row) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) + float32x4x4_t(b_row0_, g_row0_, r_row0_, a_row) } }; - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); + vst4q_f32(dst_ptr, store_rows); } else { let store_rows = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x3_t(r_row, g_row, b_row) + float32x4x3_t(r_row0_, g_row0_, b_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x3_t(b_row, g_row, r_row) + float32x4x3_t(b_row0_, g_row0_, r_row0_) } }; - let mut transient: [u8; 24] = [0; 24]; - 
vst3_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 3); + vst3q_f32(dst_ptr, store_rows); } cx += 4; diff --git a/src/neon/xyza_laba_to_image.rs b/src/neon/xyza_laba_to_image.rs index c1e0062..ed829ea 100644 --- a/src/neon/xyza_laba_to_image.rs +++ b/src/neon/xyza_laba_to_image.rs @@ -8,15 +8,12 @@ use crate::image::ImageConfiguration; use crate::neon::cie::{neon_lab_to_xyz, neon_lch_to_xyz, neon_luv_to_xyz}; use crate::neon::math::vcolorq_matrix_f32; -use crate::neon::*; use crate::xyz_target::XyzTarget; -use crate::TransferFunction; use std::arch::aarch64::*; #[inline(always)] pub(crate) unsafe fn neon_xyza_lab_vld( src: *const f32, - transfer_function: TransferFunction, c1: float32x4_t, c2: float32x4_t, c3: float32x4_t, @@ -26,9 +23,8 @@ pub(crate) unsafe fn neon_xyza_lab_vld (uint32x4_t, uint32x4_t, uint32x4_t, uint32x4_t) { +) -> (float32x4_t, float32x4_t, float32x4_t, float32x4_t) { let target: XyzTarget = TARGET.into(); - let v_scale_color = vdupq_n_f32(255f32); let lab_pixel = vld4q_f32(src); let (mut r_f32, mut g_f32, mut b_f32) = (lab_pixel.0, lab_pixel.1, lab_pixel.2); @@ -57,23 +53,7 @@ pub(crate) unsafe fn neon_xyza_lab_vld usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if !image_configuration.has_alpha() { @@ -108,169 +87,6 @@ pub unsafe fn neon_xyza_to_image( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * CHANNELS); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - neon_xyza_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * CHANNELS); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - neon_xyza_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - - let r_row23 = vcombine_u16(vqmovn_u32(r_row2_), vqmovn_u32(r_row3_)); - let g_row23 = vcombine_u16(vqmovn_u32(g_row2_), vqmovn_u32(g_row3_)); - let b_row23 = vcombine_u16(vqmovn_u32(b_row2_), vqmovn_u32(b_row3_)); - let a_row23 = vcombine_u16(vqmovn_u32(a_row2_), vqmovn_u32(a_row3_)); - - let r_row = vcombine_u8(vqmovn_u16(r_row01), vqmovn_u16(r_row23)); - let g_row = vcombine_u8(vqmovn_u16(g_row01), vqmovn_u16(g_row23)); - let b_row = vcombine_u8(vqmovn_u16(b_row01), vqmovn_u16(b_row23)); - let a_row = vcombine_u8(vqmovn_u16(a_row01), vqmovn_u16(a_row23)); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x16x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x16x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4q_u8(dst_ptr, store_rows); - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, 
g_row0_, b_row0_, a_row0_) = - neon_xyza_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - neon_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = vcombine_u16(vqmovn_u32(r_row0_), vqmovn_u32(r_row1_)); - let g_row01 = vcombine_u16(vqmovn_u32(g_row0_), vqmovn_u32(g_row1_)); - let b_row01 = vcombine_u16(vqmovn_u32(b_row0_), vqmovn_u32(b_row1_)); - let a_row01 = vcombine_u16(vqmovn_u32(a_row0_), vqmovn_u32(a_row1_)); - - let r_row = vqmovn_u16(r_row01); - let g_row = vqmovn_u16(g_row01); - let b_row = vqmovn_u16(b_row01); - let a_row = vqmovn_u16(a_row01); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - uint8x8x4_t(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) - } - }; - vst4_u8(dst_ptr, store_rows); - - cx += 8; - } - while cx + 4 < width as usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); @@ -279,7 +95,6 @@ pub unsafe fn neon_xyza_to_image( src_ptr_0, - transfer_function, c1, c2, c3, @@ -291,31 +106,17 @@ pub unsafe fn neon_xyza_to_image { - uint8x8x4_t(r_row, g_row, b_row, a_row) + float32x4x4_t(r_row0_, g_row0_, b_row0_, a_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - uint8x8x4_t(b_row, g_row, r_row, a_row) + float32x4x4_t(b_row0_, g_row0_, r_row0_, a_row0_) } }; - let mut transient: [u8; 32] = [0; 32]; - vst4_u8(transient.as_mut_ptr(), store_rows); - std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, 4 * 4); + vst4q_f32(dst_ptr, store_rows); cx += 4; } diff --git a/src/oklab.rs b/src/oklab.rs index e2e03aa..a4986b9 100644 --- a/src/oklab.rs +++ b/src/oklab.rs @@ -42,10 +42,16 @@ impl Oklab { Self::linear_rgb_to_oklab(linearized) } + #[inline] + /// Convert Linear Rgb to [Oklab] + pub fn from_linear_rgb(rgb: Rgb) -> Oklab { + Self::linear_rgb_to_oklab(rgb) + } + #[inline] /// Converts [Oklab] to [Rgb] using sRGB transfer function pub fn to_srgb(&self) -> Rgb { - let linear_rgb = self.to_linear_srgb(); + let linear_rgb = self.to_linear_rgb(); let transferred = linear_rgb.gamma(TransferFunction::Srgb); transferred.to_u8() } @@ -53,7 +59,7 @@ impl Oklab { #[inline] /// Converts [Oklab] to [Rgb] using provided [TransferFunction] pub fn to_rgb(&self, transfer_function: TransferFunction) -> Rgb { - let linear_rgb = self.to_linear_srgb(); + let linear_rgb = self.to_linear_rgb(); let transferred = linear_rgb.gamma(transfer_function); transferred.to_u8() } @@ -61,14 +67,14 @@ impl Oklab { #[inline] /// Converts [Oklab] to linear [Rgb] using sRGB transfer function pub fn to_srgb_f32(&self) -> Rgb { - let linear_rgb = self.to_linear_srgb(); + let linear_rgb = self.to_linear_rgb(); linear_rgb.gamma(TransferFunction::Srgb) } #[inline] /// Converts [Oklab] to [Rgb] using provided [TransferFunction] pub fn to_rgb_f32(&self, transfer_function: TransferFunction) -> Rgb { - let linear_rgb = self.to_linear_srgb(); + let linear_rgb = self.to_linear_rgb(); linear_rgb.gamma(transfer_function) } @@ -91,7 +97,7 @@ impl Oklab { #[inline] /// Converts to linear RGB - pub fn to_linear_srgb(&self) -> Rgb { + pub fn to_linear_rgb(&self) -> Rgb { let l_ = self.l + 0.3963377774f32 * self.a + 0.2158037573f32 * 
diff --git a/src/oklab.rs b/src/oklab.rs
index e2e03aa..a4986b9 100644
--- a/src/oklab.rs
+++ b/src/oklab.rs
@@ -42,10 +42,16 @@ impl Oklab {
         Self::linear_rgb_to_oklab(linearized)
     }
 
+    #[inline]
+    /// Converts linear [Rgb] to [Oklab]
+    pub fn from_linear_rgb(rgb: Rgb<f32>) -> Oklab {
+        Self::linear_rgb_to_oklab(rgb)
+    }
+
     #[inline]
     /// Converts [Oklab] to [Rgb] using sRGB transfer function
     pub fn to_srgb(&self) -> Rgb<u8> {
-        let linear_rgb = self.to_linear_srgb();
+        let linear_rgb = self.to_linear_rgb();
         let transferred = linear_rgb.gamma(TransferFunction::Srgb);
         transferred.to_u8()
     }
@@ -53,7 +59,7 @@ impl Oklab {
     #[inline]
     /// Converts [Oklab] to [Rgb] using provided [TransferFunction]
     pub fn to_rgb(&self, transfer_function: TransferFunction) -> Rgb<u8> {
-        let linear_rgb = self.to_linear_srgb();
+        let linear_rgb = self.to_linear_rgb();
         let transferred = linear_rgb.gamma(transfer_function);
         transferred.to_u8()
     }
@@ -61,14 +67,14 @@ impl Oklab {
     #[inline]
     /// Converts [Oklab] to linear [Rgb] using sRGB transfer function
     pub fn to_srgb_f32(&self) -> Rgb<f32> {
-        let linear_rgb = self.to_linear_srgb();
+        let linear_rgb = self.to_linear_rgb();
         linear_rgb.gamma(TransferFunction::Srgb)
     }
 
     #[inline]
     /// Converts [Oklab] to [Rgb] using provided [TransferFunction]
     pub fn to_rgb_f32(&self, transfer_function: TransferFunction) -> Rgb<f32> {
-        let linear_rgb = self.to_linear_srgb();
+        let linear_rgb = self.to_linear_rgb();
         linear_rgb.gamma(transfer_function)
     }
 
@@ -91,7 +97,7 @@ impl Oklab {
 
     #[inline]
     /// Converts to linear RGB
-    pub fn to_linear_srgb(&self) -> Rgb<f32> {
+    pub fn to_linear_rgb(&self) -> Rgb<f32> {
         let l_ = self.l + 0.3963377774f32 * self.a + 0.2158037573f32 * self.b;
         let m_ = self.l - 0.1055613458f32 * self.a - 0.0638541728f32 * self.b;
         let s_ = self.l - 0.0894841775f32 * self.a - 1.2914855480f32 * self.b;
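Note: with to_linear_srgb renamed to to_linear_rgb and the new from_linear_rgb, Oklab now exposes both halves of the conversion without a transfer function. A usage sketch under the new API (values illustrative; gamma is applied once, at the very end):

    use colorutils_rs::{Oklab, Rgb, TransferFunction};

    fn oklab_roundtrip_linear() {
        let linear = Rgb::<f32>::new(0.214, 0.214, 0.214); // already linearized RGB
        let oklab = Oklab::from_linear_rgb(linear); // no transfer function involved
        let restored = oklab.to_linear_rgb(); // still linear RGB
        let encoded = restored.gamma(TransferFunction::Srgb); // encode at the end
        let _ = encoded;
    }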
diff --git a/src/oklab_to_image.rs b/src/oklab_to_image.rs
index 157075e..35377dd 100644
--- a/src/oklab_to_image.rs
+++ b/src/oklab_to_image.rs
@@ -13,12 +13,11 @@ use crate::neon::neon_oklab_to_image;
 use crate::oklch::Oklch;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 use crate::sse::sse_oklab_to_image;
-use crate::{Oklab, TransferFunction};
+use crate::{Oklab, Rgb, TransferFunction};
 #[cfg(feature = "rayon")]
 use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 #[cfg(feature = "rayon")]
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
-#[cfg(feature = "rayon")]
 use std::slice;
 
 #[allow(clippy::type_complexity)]
@@ -35,7 +34,7 @@ fn oklab_to_image(
     let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
 
     let mut _wide_row_handle: Option<
-        unsafe fn(usize, *const f32, u32, *mut u8, u32, u32, TransferFunction) -> usize,
+        unsafe fn(usize, *const f32, usize, *mut f32, u32, u32) -> usize,
     > = None;
 
     #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
@@ -53,34 +52,35 @@ fn oklab_to_image(
         _wide_row_handle = Some(neon_oklab_to_image::<CHANNELS_CONFIGURATION, TARGET>);
     }
 
+    let mut lut_table = vec![0u8; 2049];
+    for i in 0..2049 {
+        lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.)
+            .ceil()
+            .min(255.) as u8;
+    }
+
+    let channels = image_configuration.get_channels_count();
+
+    let src_slice_safe_align = unsafe {
+        slice::from_raw_parts(
+            src.as_ptr() as *const u8,
+            src_stride as usize * height as usize,
+        )
+    };
+
     #[cfg(feature = "rayon")]
     {
-        let src_slice_safe_align = unsafe {
-            slice::from_raw_parts(
-                src.as_ptr() as *const u8,
-                src_stride as usize * height as usize,
-            )
-        };
         dst.par_chunks_exact_mut(dst_stride as usize)
             .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize))
             .for_each(|(dst, src)| unsafe {
-                let channels = image_configuration.get_channels_count();
-
                 let mut _cx = 0usize;
 
+                let mut transient_row = vec![0f32; width as usize * channels];
+
                 let src_ptr = src.as_ptr() as *mut f32;
-                let dst_ptr = dst.as_mut_ptr();
 
                 if let Some(dispatcher) = _wide_row_handle {
-                    _cx = dispatcher(
-                        _cx,
-                        src.as_ptr() as *const f32,
-                        0,
-                        dst.as_mut_ptr(),
-                        0,
-                        width,
-                        transfer_function,
-                    )
+                    _cx = dispatcher(_cx, src_ptr, 0, transient_row.as_mut_ptr(), 0, width)
                 }
 
                 for x in _cx..width as usize {
@@ -92,26 +92,38 @@ fn oklab_to_image(
                     let rgb = match target {
                         OklabTarget::Oklab => {
                             let oklab = Oklab::new(l_x, l_y, l_z);
-                            oklab.to_rgb(transfer_function)
+                            oklab.to_linear_rgb()
                         }
                         OklabTarget::Oklch => {
                             let oklch = Oklch::new(l_x, l_y, l_z);
-                            oklch.to_rgb(transfer_function)
+                            oklch.to_linear_rgb()
                         }
                     };
 
-                    let dst = dst_ptr.add(x * channels);
-                    dst.add(image_configuration.get_r_channel_offset())
-                        .write_unaligned(rgb.r);
-                    dst.add(image_configuration.get_g_channel_offset())
-                        .write_unaligned(rgb.g);
-                    dst.add(image_configuration.get_b_channel_offset())
-                        .write_unaligned(rgb.b);
+                    let v_dst = transient_row.get_unchecked_mut((x * channels)..);
+                    *v_dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r;
+                    *v_dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g;
+                    *v_dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b;
                     if image_configuration.has_alpha() {
                         let l_a = source_p.add(3).read_unaligned();
-                        let a_value = (l_a * 255f32).max(0f32);
-                        dst.add(image_configuration.get_a_channel_offset())
-                            .write_unaligned(a_value as u8);
+                        *v_dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a;
+                    }
+                }
+
+                for (dst_chunks, src_chunks) in dst
+                    .chunks_exact_mut(channels)
+                    .zip(transient_row.chunks_exact_mut(channels))
+                {
+                    let rgb = (Rgb::<f32>::new(src_chunks[0], src_chunks[1], src_chunks[2])
+                        * Rgb::<f32>::dup(2048f32))
+                    .cast::<u16>();
+
+                    dst_chunks[0] = *lut_table.get_unchecked((rgb.r as usize).min(2048));
+                    dst_chunks[1] = *lut_table.get_unchecked((rgb.g as usize).min(2048));
+                    dst_chunks[2] = *lut_table.get_unchecked((rgb.b as usize).min(2048));
+                    if image_configuration.has_alpha() {
+                        let a_lin = (src_chunks[3] * 255f32).round() as u8;
+                        dst_chunks[3] = a_lin;
+                    }
+                }
             });
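Note: the parallel path above no longer calls the transfer function per pixel; it encodes the whole transient row through the 2049-entry lut_table built earlier, where a linear value in [0, 1] is scaled by 2048 and the clamped integer part indexes the table. Out-of-gamut Oklab/Oklch inputs can yield linear values outside [0, 1], which is why the index must stay within 0..=2048. A scalar sketch of the same row encoding (function name illustrative):

    // Gamma-encode one row of linear f32 samples through a 2049-entry u8 LUT.
    fn encode_row(lut: &[u8; 2049], linear_row: &[f32], out: &mut [u8]) {
        for (dst, &v) in out.iter_mut().zip(linear_row.iter()) {
            let idx = ((v.max(0.) * 2048.) as usize).min(2048);
            *dst = lut[idx];
        }
    }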
+ for (dst_chunks, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact_mut(channels)) + { + let rgb = (Rgb::::new(src_chunks[0], src_chunks[1], src_chunks[2]) + * Rgb::::dup(2048f32)) + .cast::(); + + dst_chunks[0] = *lut_table.get_unchecked(rgb.r as usize); + dst_chunks[1] = *lut_table.get_unchecked(rgb.g as usize); + dst_chunks[2] = *lut_table.get_unchecked(rgb.b as usize); + if image_configuration.has_alpha() { + let a_lin = (src_chunks[4] * 255f32).round() as u8; + dst_chunks[0] = a_lin; + } + } + } } } } diff --git a/src/oklch.rs b/src/oklch.rs index cea1392..758bb06 100644 --- a/src/oklch.rs +++ b/src/oklch.rs @@ -38,7 +38,17 @@ impl Oklch { Oklch::from_oklab(oklab) } - /// Converts *Oklch* into *Rgb* + /// Converts Linear [Rgb] into [Oklch] + /// + /// # Arguments + /// `transfer_function` - Transfer function into linear colorspace and its inverse + #[inline] + pub fn from_linear_rgb(rgb: Rgb) -> Oklch { + let oklab = Oklab::from_linear_rgb(rgb); + Oklch::from_oklab(oklab) + } + + /// Converts [Oklch] into [Rgb] /// /// # Arguments /// `transfer_function` - Transfer function into linear colorspace and its inverse @@ -48,6 +58,13 @@ impl Oklch { oklab.to_rgb(transfer_function) } + /// Converts [Oklch] into linear [Rgb] + #[inline] + pub fn to_linear_rgb(&self) -> Rgb { + let oklab = self.to_oklab(); + oklab.to_linear_rgb() + } + /// Converts *Oklab* to *Oklch* #[inline] pub fn from_oklab(oklab: Oklab) -> Oklch { diff --git a/src/rgb.rs b/src/rgb.rs index 2639ecc..f234bf6 100644 --- a/src/rgb.rs +++ b/src/rgb.rs @@ -985,3 +985,22 @@ where Rgb::::new(self.r.powf(rhs.r), self.g.powf(rhs.g), self.b.powf(rhs.b)) } } + +impl Rgb { + pub fn cast(self) -> Rgb + where + T: AsPrimitive, + V: Copy + 'static, + { + Rgb::new(self.r.as_(), self.g.as_(), self.b.as_()) + } +} + +impl Rgb +where + T: Float + 'static, +{ + pub fn round(self) -> Rgb { + Rgb::new(self.r.round(), self.g.round(), self.b.round()) + } +} diff --git a/src/sse/cie.rs b/src/sse/cie.rs index 2825467..ff2f349 100644 --- a/src/sse/cie.rs +++ b/src/sse/cie.rs @@ -10,9 +10,8 @@ use crate::luv::{ LUV_WHITE_V_PRIME, }; use crate::sse::{ - _mm_color_matrix_ps, _mm_cube_ps, _mm_prefer_fma_ps, _mm_select_ps, perform_sse_linear_transfer, + _mm_color_matrix_ps, _mm_cube_ps, _mm_prefer_fma_ps, _mm_select_ps, }; -use crate::TransferFunction; use erydanos::{_mm_atan2_ps, _mm_cbrt_fast_ps, _mm_cos_ps, _mm_hypot_ps, _mm_sin_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; @@ -20,10 +19,10 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline(always)] -pub(crate) unsafe fn sse_triple_to_xyz( - r: __m128i, - g: __m128i, - b: __m128i, +pub unsafe fn sse_triple_to_xyz( + r: __m128, + g: __m128, + b: __m128, c1: __m128, c2: __m128, c3: __m128, @@ -33,24 +32,15 @@ pub(crate) unsafe fn sse_triple_to_xyz( c7: __m128, c8: __m128, c9: __m128, - transfer_function: TransferFunction, ) -> (__m128, __m128, __m128) { - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps(g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps(b), u8_scale); - let r_linear = perform_sse_linear_transfer(transfer_function, r_f); - let g_linear = perform_sse_linear_transfer(transfer_function, g_f); - let b_linear = perform_sse_linear_transfer(transfer_function, b_f); - let (x, y, z) = _mm_color_matrix_ps( - r_linear, g_linear, b_linear, c1, c2, c3, c4, c5, c6, c7, c8, c9, + r, g, b, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); (x, y, z) } #[inline(always)] 
 #[inline(always)]
-pub(crate) unsafe fn sse_triple_to_luv(
+pub unsafe fn sse_triple_to_luv(
     x: __m128,
     y: __m128,
     z: __m128,
@@ -80,7 +70,7 @@ pub(crate) unsafe fn sse_triple_to_luv(
 }
 
 #[inline(always)]
-pub(crate) unsafe fn sse_triple_to_lab(
+pub unsafe fn sse_triple_to_lab(
     x: __m128,
     y: __m128,
     z: __m128,
@@ -106,7 +96,7 @@ pub(crate) unsafe fn sse_triple_to_lab(
 }
 
 #[inline(always)]
-pub(crate) unsafe fn sse_triple_to_lch(
+pub unsafe fn sse_triple_to_lch(
     x: __m128,
     y: __m128,
     z: __m128,
@@ -118,7 +108,7 @@
-pub(crate) unsafe fn sse_lab_to_xyz(l: __m128, a: __m128, b: __m128) -> (__m128, __m128, __m128) {
+pub unsafe fn sse_lab_to_xyz(l: __m128, a: __m128, b: __m128) -> (__m128, __m128, __m128) {
     let y = _mm_mul_ps(
         _mm_add_ps(l, _mm_set1_ps(16f32)),
         _mm_set1_ps(1f32 / 116f32),
@@ -144,7 +134,7 @@
-pub(crate) unsafe fn sse_luv_to_xyz(l: __m128, u: __m128, v: __m128) -> (__m128, __m128, __m128) {
+pub unsafe fn sse_luv_to_xyz(l: __m128, u: __m128, v: __m128) -> (__m128, __m128, __m128) {
     let zeros = _mm_setzero_ps();
     let zero_mask = _mm_cmpeq_ps(l, zeros);
     let l13 = _mm_rcp_ps(_mm_mul_ps(l, _mm_set1_ps(13f32)));
@@ -183,7 +173,7 @@
-pub(crate) unsafe fn sse_lch_to_xyz(l: __m128, c: __m128, h: __m128) -> (__m128, __m128, __m128) {
+pub unsafe fn sse_lch_to_xyz(l: __m128, c: __m128, h: __m128) -> (__m128, __m128, __m128) {
     let u = _mm_mul_ps(c, _mm_cos_ps(h));
     let v = _mm_mul_ps(c, _mm_sin_ps(h));
     sse_luv_to_xyz(l, u, v)
diff --git a/src/sse/image_to_linear_u8.rs b/src/sse/image_to_linear_u8.rs
deleted file mode 100644
index 35e73c0..0000000
--- a/src/sse/image_to_linear_u8.rs
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved.
- * //
- * // Use of this source code is governed by a BSD-style
- * // license that can be found in the LICENSE file.
- */ - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub mod sse_image_to_linear_unsigned { - #[cfg(target_arch = "x86")] - use std::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use std::arch::x86_64::*; - - use crate::image::ImageConfiguration; - use crate::sse::*; - use crate::{ - load_u8_and_deinterleave, store_and_interleave_v3_half_u8, store_and_interleave_v3_u8, - store_and_interleave_v4_half_u8, store_and_interleave_v4_u8, TransferFunction, - }; - - #[inline(always)] - unsafe fn sse_triple_to_linear_u8( - r: __m128i, - g: __m128i, - b: __m128i, - transfer_function: TransferFunction, - ) -> (__m128i, __m128i, __m128i) { - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps(g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps(b), u8_scale); - let u8_backwards = _mm_set1_ps(255f32); - let r_linear = _mm_mul_ps( - match INTO_LINEAR { - true => perform_sse_linear_transfer(transfer_function, r_f), - false => perform_sse_gamma_transfer(transfer_function, r_f), - }, - u8_backwards, - ); - let g_linear = _mm_mul_ps( - match INTO_LINEAR { - true => perform_sse_linear_transfer(transfer_function, g_f), - false => perform_sse_gamma_transfer(transfer_function, g_f), - }, - u8_backwards, - ); - let b_linear = _mm_mul_ps( - match INTO_LINEAR { - true => perform_sse_linear_transfer(transfer_function, b_f), - false => perform_sse_gamma_transfer(transfer_function, b_f), - }, - u8_backwards, - ); - ( - _mm_cvtps_epi32(r_linear), - _mm_cvtps_epi32(g_linear), - _mm_cvtps_epi32(b_linear), - ) - } - - #[target_feature(enable = "sse4.1")] - pub(crate) unsafe fn sse_channels_to_linear_u8< - const CHANNELS_CONFIGURATION: u8, - const USE_ALPHA: bool, - const INTO_LINEAR: bool, - >( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut u8, - dst_offset: usize, - transfer_function: TransferFunction, - ) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = dst.add(dst_offset); - - let zeros = _mm_setzero_si128(); - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (x_low_low, y_low_low, z_low_low) = sse_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (x_low_high, y_low_high, z_low_high) = sse_triple_to_linear_u8::( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - ); - - let r_high = _mm_unpackhi_epi8(r_chan, zeros); - let g_high = _mm_unpackhi_epi8(g_chan, zeros); - let b_high = _mm_unpackhi_epi8(b_chan, zeros); - - let r_high_low = _mm_cvtepu16_epi32(r_high); - let g_high_low = _mm_cvtepu16_epi32(g_high); - let b_high_low = _mm_cvtepu16_epi32(b_high); - - let (x_high_low, y_high_low, z_high_low) = sse_triple_to_linear_u8::( - r_high_low, - g_high_low, - b_high_low, - transfer_function, - ); - - let r_high_high = 
_mm_unpackhi_epi16(r_high, zeros); - let g_high_high = _mm_unpackhi_epi16(g_high, zeros); - let b_high_high = _mm_unpackhi_epi16(b_high, zeros); - - let (x_high_high, y_high_high, z_high_high) = sse_triple_to_linear_u8::( - r_high_high, - g_high_high, - b_high_high, - transfer_function, - ); - - let r_u_norm = _mm_packus_epi16( - _mm_packus_epi32(x_low_low, x_low_high), - _mm_packus_epi32(x_high_low, x_high_high), - ); - - let g_u_norm = _mm_packus_epi16( - _mm_packus_epi32(y_low_low, y_low_high), - _mm_packus_epi32(y_high_low, y_high_high), - ); - - let b_u_norm = _mm_packus_epi16( - _mm_packus_epi32(z_low_low, z_low_high), - _mm_packus_epi32(z_high_low, z_high_high), - ); - - let dst = dst_ptr.add(cx * channels); - - if USE_ALPHA { - store_and_interleave_v4_u8!( - dst, - image_configuration, - r_u_norm, - g_u_norm, - b_u_norm, - a_chan - ); - } else { - store_and_interleave_v3_u8!(dst, image_configuration, r_u_norm, g_u_norm, b_u_norm); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (x_low_low, y_low_low, z_low_low) = sse_triple_to_linear_u8::( - r_low_low, - g_low_low, - b_low_low, - transfer_function, - ); - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (x_low_high, y_low_high, z_low_high) = sse_triple_to_linear_u8::( - r_low_high, - g_low_high, - b_low_high, - transfer_function, - ); - - let r_u_norm = _mm_packus_epi16(_mm_packus_epi32(x_low_low, x_low_high), zeros); - - let g_u_norm = _mm_packus_epi16(_mm_packus_epi32(y_low_low, y_low_high), zeros); - - let b_u_norm = _mm_packus_epi16(_mm_packus_epi32(z_low_low, z_low_high), zeros); - - let dst = dst_ptr.add(cx * channels); - - if USE_ALPHA { - store_and_interleave_v4_half_u8!( - dst, - image_configuration, - r_u_norm, - g_u_norm, - b_u_norm, - a_chan - ); - } else { - store_and_interleave_v3_half_u8!( - dst, - image_configuration, - r_u_norm, - g_u_norm, - b_u_norm - ); - } - - cx += 8; - } - - cx - } -} diff --git a/src/sse/image_to_oklab.rs b/src/sse/image_to_oklab.rs index 897f91f..58befe8 100644 --- a/src/sse/image_to_oklab.rs +++ b/src/sse/image_to_oklab.rs @@ -4,7 +4,7 @@ * // Use of this source code is governed by a BSD-style * // license that can be found in the LICENSE file. 
*/ -use crate::sse::perform_sse_linear_transfer; +use crate::sse::{sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps}; use erydanos::{_mm_atan2_ps, _mm_cbrt_fast_ps, _mm_hypot_fast_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; @@ -14,29 +14,18 @@ use std::arch::x86_64::*; use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::sse::{ - _mm_color_matrix_ps, sse_deinterleave_rgb, sse_deinterleave_rgba, sse_interleave_ps_rgb, + _mm_color_matrix_ps, sse_interleave_ps_rgb, sse_interleave_ps_rgba, }; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, store_and_interleave_v3_direct_f32, - store_and_interleave_v4_direct_f32, TransferFunction, -}; +use crate::{load_f32_and_deinterleave, store_and_interleave_v3_direct_f32, store_and_interleave_v4_direct_f32}; macro_rules! triple_to_oklab { - ($r: expr, $g: expr, $b: expr, $transfer: expr, $target: expr, + ($r: expr, $g: expr, $b: expr, $target: expr, $c0:expr, $c1:expr, $c2: expr, $c3: expr, $c4:expr, $c5: expr, $c6:expr, $c7: expr, $c8: expr, $m0: expr, $m1: expr, $m2: expr, $m3: expr, $m4: expr, $m5: expr, $m6: expr, $m7: expr, $m8: expr ) => {{ - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps($r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps($g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps($b), u8_scale); - let r_linear = perform_sse_linear_transfer($transfer, r_f); - let g_linear = perform_sse_linear_transfer($transfer, g_f); - let b_linear = perform_sse_linear_transfer($transfer, b_f); - let (l_l, l_m, l_s) = _mm_color_matrix_ps( - r_linear, g_linear, b_linear, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, + $r, $g, $b, $c0, $c1, $c2, $c3, $c4, $c5, $c6, $c7, $c8, ); let l_ = _mm_cbrt_fast_ps(l_l); @@ -60,12 +49,9 @@ macro_rules! 
triple_to_oklab { #[target_feature(enable = "sse4.1")] pub unsafe fn sse_image_to_oklab( start_cx: usize, - src: *const u8, - src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -98,217 +84,15 @@ pub unsafe fn sse_image_to_oklab( - src: *const f32, - transfer_function: TransferFunction, -) -> (__m128i, __m128i, __m128i, __m128i) { - let v_scale_alpha = _mm_set1_ps(255f32); - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - - let (mut r_f32, mut g_f32, mut b_f32, mut a_f32) = - load_f32_and_deinterleave!(src, image_configuration); - - r_f32 = perform_sse_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_sse_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_sse_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm_mul_ps(r_f32, v_scale_alpha); - g_f32 = _mm_mul_ps(g_f32, v_scale_alpha); - b_f32 = _mm_mul_ps(b_f32, v_scale_alpha); - if USE_ALPHA { - a_f32 = _mm_mul_ps(a_f32, v_scale_alpha); - } - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - - if USE_ALPHA { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_cvtps_epi32(_mm_round_ps::(a_f32)), - ) - } else { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_set1_epi32(255), - ) - } -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn sse_linear_to_gamma( - start_cx: usize, - src: *const f32, - src_offset: u32, - dst: *mut u8, - dst_offset: u32, - width: u32, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - while cx + 16 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - sse_gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - sse_gamma_vld::(src_ptr_1, transfer_function); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = - sse_gamma_vld::(src_ptr_2, transfer_function); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = - sse_gamma_vld::(src_ptr_3, transfer_function); - - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row23 = _mm_packus_epi32(r_row2_, r_row3_); - let g_row23 = _mm_packus_epi32(g_row2_, g_row3_); - let b_row23 = _mm_packus_epi32(b_row2_, b_row3_); - - let r_row = _mm_packus_epi16(r_row01, r_row23); - let g_row = _mm_packus_epi16(g_row01, g_row23); - let b_row = _mm_packus_epi16(b_row01, b_row23); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row23 = _mm_packus_epi32(a_row2_, a_row3_); - let a_row = _mm_packus_epi16(a_row01, a_row23); - store_and_interleave_v4_u8!(dst_ptr, image_configuration, r_row, g_row, b_row, a_row); - } else { - 
store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 16; - } - - let zeros = _mm_setzero_si128(); - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_, a_row0_) = - sse_gamma_vld::(src_ptr_0, transfer_function); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = - sse_gamma_vld::(src_ptr_1, transfer_function); - - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row = _mm_packus_epi16(r_row01, zeros); - let g_row = _mm_packus_epi16(g_row01, zeros); - let b_row = _mm_packus_epi16(b_row01, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if USE_ALPHA { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row = _mm_packus_epi16(a_row01, zeros); - store_and_interleave_v4_half_u8!( - dst_ptr, - image_configuration, - r_row, - g_row, - b_row, - a_row - ); - } else { - store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 8; - } - - cx -} diff --git a/src/sse/mod.rs b/src/sse/mod.rs index bbfe330..536b84a 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -12,16 +12,10 @@ mod hsv_to_image; mod image_to_hsv; -mod image_to_linear_u8; - -mod linear_to_image; - mod math; mod support; -mod to_linear; - mod to_xyz_lab; mod to_xyza_laba; @@ -46,18 +40,16 @@ pub use gamma_curves::*; pub use hsv_to_image::*; pub use image_to_hsv::*; pub use image_to_jzazbz::sse_image_to_jzazbz; -pub use image_to_linear_u8::*; pub use image_to_oklab::sse_image_to_oklab; pub use jzazbz_to_image::sse_jzazbz_to_image; -pub use linear_to_image::*; pub use linear_to_planar::sse_linear_plane_to_gamma; pub use math::*; pub use oklab_to_image::sse_oklab_to_image; pub use planar_to_linear::sse_plane_to_linear; pub use support::*; -pub use to_linear::*; pub use to_sigmoidal::sse_image_to_sigmoidal_row; pub use to_xyz_lab::*; pub use to_xyza_laba::*; pub use xyz_lab_to_image::*; pub use xyza_laba_to_image::*; +pub use cie::*; \ No newline at end of file diff --git a/src/sse/oklab_to_image.rs b/src/sse/oklab_to_image.rs index 011162e..77119d1 100644 --- a/src/sse/oklab_to_image.rs +++ b/src/sse/oklab_to_image.rs @@ -7,13 +7,10 @@ use crate::image::ImageConfiguration; use crate::image_to_oklab::OklabTarget; use crate::sse::{ - _mm_color_matrix_ps, _mm_cube_ps, perform_sse_gamma_transfer, sse_deinterleave_rgb_ps, - sse_deinterleave_rgba_ps, sse_interleave_rgb, sse_interleave_rgba, -}; -use crate::{ - load_f32_and_deinterleave, store_and_interleave_v3_half_u8, store_and_interleave_v3_u8, - store_and_interleave_v4_half_u8, store_and_interleave_v4_u8, TransferFunction, + _mm_color_matrix_ps, _mm_cube_ps, sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps, }; +use crate::sse::{sse_interleave_ps_rgb, sse_interleave_ps_rgba}; +use crate::{load_f32_and_deinterleave, store_and_interleave_v3_f32, store_and_interleave_v4_f32}; use erydanos::{_mm_cos_ps, _mm_sin_ps}; #[cfg(target_arch = "x86")] use std::arch::x86::*; @@ -23,7 +20,6 @@ use std::arch::x86_64::*; #[inline(always)] unsafe fn sse_oklab_vld( src: *const f32, - transfer: TransferFunction, oklab_target: OklabTarget, m0: __m128, m1: __m128, @@ -43,11 +39,10 @@ unsafe fn sse_oklab_vld( c6: __m128, c7: __m128, c8: __m128, -) -> (__m128i, __m128i, __m128i, 
__m128i) { - let v_scale_alpha = _mm_set1_ps(255f32); +) -> (__m128, __m128, __m128, __m128) { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let (l, mut a, mut b, mut a_f32) = load_f32_and_deinterleave!(src, image_configuration); + let (l, mut a, mut b, a_f32) = load_f32_and_deinterleave!(src, image_configuration); if oklab_target == OklabTarget::Oklch { let a0 = _mm_mul_ps(a, _mm_cos_ps(b)); @@ -64,45 +59,17 @@ unsafe fn sse_oklab_vld( l_s = _mm_cube_ps(l_s); let (r_l, g_l, b_l) = _mm_color_matrix_ps(l_l, l_m, l_s, c0, c1, c2, c3, c4, c5, c6, c7, c8); - - let mut r_f32 = perform_sse_gamma_transfer(transfer, r_l); - let mut g_f32 = perform_sse_gamma_transfer(transfer, g_l); - let mut b_f32 = perform_sse_gamma_transfer(transfer, b_l); - - r_f32 = _mm_mul_ps(r_f32, v_scale_alpha); - g_f32 = _mm_mul_ps(g_f32, v_scale_alpha); - b_f32 = _mm_mul_ps(b_f32, v_scale_alpha); - if image_configuration.has_alpha() { - a_f32 = _mm_mul_ps(a_f32, v_scale_alpha); - } - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - - if image_configuration.has_alpha() { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_cvtps_epi32(_mm_round_ps::(a_f32)), - ) - } else { - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_set1_epi32(255), - ) - } + (r_l, g_l, b_l, a_f32) } #[target_feature(enable = "sse4.1")] pub unsafe fn sse_oklab_to_image( start_cx: usize, src: *const f32, - src_offset: u32, - dst: *mut u8, + src_offset: usize, + dst: *mut f32, dst_offset: u32, width: u32, - transfer_function: TransferFunction, ) -> usize { let target: OklabTarget = TARGET.into(); let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); @@ -133,224 +100,32 @@ pub unsafe fn sse_oklab_to_image( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_oklab_vld::( - src_ptr_1, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * channels); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = sse_oklab_vld::( - src_ptr_2, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * channels); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = sse_oklab_vld::( - src_ptr_3, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row23 = _mm_packus_epi32(r_row2_, r_row3_); - let g_row23 = _mm_packus_epi32(g_row2_, g_row3_); - let b_row23 = _mm_packus_epi32(b_row2_, b_row3_); - - let r_row = _mm_packus_epi16(r_row01, r_row23); - let g_row = _mm_packus_epi16(g_row01, g_row23); - let b_row = _mm_packus_epi16(b_row01, b_row23); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); - - if image_configuration.has_alpha() { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row23 
= _mm_packus_epi32(a_row2_, a_row3_); - let a_row = _mm_packus_epi16(a_row01, a_row23); - store_and_interleave_v4_u8!(dst_ptr, image_configuration, r_row, g_row, b_row, a_row); - } else { - store_and_interleave_v3_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset as usize) as *const f32).add(cx * channels); + while cx + 4 < width as usize { + let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let src_ptr_0 = offset_src_ptr; let (r_row0_, g_row0_, b_row0_, a_row0_) = sse_oklab_vld::( - src_ptr_0, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * channels); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_oklab_vld::( - src_ptr_1, - transfer_function, - target, - m0, - m1, - m2, - m3, - m4, - m5, - m6, - m7, - m8, - c0, - c1, - c2, - c3, - c4, - c5, - c6, - c7, + src_ptr_0, target, m0, m1, m2, m3, m4, m5, m6, m7, m8, c0, c1, c2, c3, c4, c5, c6, c7, c8, ); - let r_row01 = _mm_packus_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packus_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packus_epi32(b_row0_, b_row1_); - - let r_row = _mm_packus_epi16(r_row01, zeros); - let g_row = _mm_packus_epi16(g_row01, zeros); - let b_row = _mm_packus_epi16(b_row01, zeros); - - let dst_ptr = dst.add(dst_offset as usize + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset as usize) as *mut f32).add(cx * channels); if image_configuration.has_alpha() { - let a_row01 = _mm_packus_epi32(a_row0_, a_row1_); - let a_row = _mm_packus_epi16(a_row01, zeros); - store_and_interleave_v4_half_u8!( + store_and_interleave_v4_f32!( dst_ptr, image_configuration, - r_row, - g_row, - b_row, - a_row + r_row0_, + g_row0_, + b_row0_, + a_row0_ ); } else { - store_and_interleave_v3_half_u8!(dst_ptr, image_configuration, r_row, g_row, b_row); + store_and_interleave_v3_f32!(dst_ptr, image_configuration, r_row0_, g_row0_, b_row0_); } - cx += 8; + cx += 4; } cx diff --git a/src/sse/to_linear.rs b/src/sse/to_linear.rs deleted file mode 100644 index 1706f20..0000000 --- a/src/sse/to_linear.rs +++ /dev/null @@ -1,264 +0,0 @@ -/* - * // Copyright 2024 (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */ - -use crate::gamma_curves::TransferFunction; -use crate::image::ImageConfiguration; -use crate::sse::*; -use crate::{ - load_u8_and_deinterleave, load_u8_and_deinterleave_half, store_and_interleave_v3_f32, - store_and_interleave_v4_f32, -}; -#[cfg(target_arch = "x86")] -use std::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use std::arch::x86_64::*; - -#[inline(always)] -unsafe fn sse_triple_to_linear( - r: __m128i, - g: __m128i, - b: __m128i, - transfer_function: TransferFunction, -) -> (__m128, __m128, __m128) { - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let r_f = _mm_mul_ps(_mm_cvtepi32_ps(r), u8_scale); - let g_f = _mm_mul_ps(_mm_cvtepi32_ps(g), u8_scale); - let b_f = _mm_mul_ps(_mm_cvtepi32_ps(b), u8_scale); - let r_linear = perform_sse_linear_transfer(transfer_function, r_f); - let g_linear = perform_sse_linear_transfer(transfer_function, g_f); - let b_linear = perform_sse_linear_transfer(transfer_function, b_f); - (r_linear, g_linear, b_linear) -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn sse_channels_to_linear( - start_cx: usize, - src: *const u8, - src_offset: usize, - width: u32, - dst: *mut f32, - dst_offset: usize, - transfer_function: TransferFunction, -) -> usize { - let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); - let channels = image_configuration.get_channels_count(); - let mut cx = start_cx; - - let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - - let zeros = _mm_setzero_si128(); - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (x_low_low, y_low_low, z_low_low) = - sse_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm_cvtepu8_epi16(a_chan); - - let u8_scale = _mm_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - let ptr = dst_ptr.add(cx * 4); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - store_and_interleave_v3_f32!(ptr, image_configuration, x_low_low, y_low_low, z_low_low); - } - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (x_low_high, y_low_high, z_low_high) = - sse_triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_low, zeros)), u8_scale); - - let ptr = dst_ptr.add(cx * 4 + 16); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3); - store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high - ); - } - - let r_high = _mm_unpackhi_epi8(r_chan, zeros); - let g_high = _mm_unpackhi_epi8(g_chan, zeros); - let b_high = _mm_unpackhi_epi8(b_chan, zeros); - - let r_high_low = _mm_cvtepu16_epi32(r_high); - let g_high_low = _mm_cvtepu16_epi32(g_high); - let b_high_low = 
_mm_cvtepu16_epi32(b_high); - - let (x_high_low, y_high_low, z_high_low) = - sse_triple_to_linear(r_high_low, g_high_low, b_high_low, transfer_function); - - let a_high = _mm_unpackhi_epi8(a_chan, zeros); - - if USE_ALPHA { - let a_high_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); - - let ptr = dst_ptr.add(cx * 4 + 4 * 4 * 2); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_high_low, - y_high_low, - z_high_low, - a_high_low - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3 * 2); - store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_high_low, - y_high_low, - z_high_low - ); - } - - let r_high_high = _mm_unpackhi_epi16(r_high, zeros); - let g_high_high = _mm_unpackhi_epi16(g_high, zeros); - let b_high_high = _mm_unpackhi_epi16(b_high, zeros); - - let (x_high_high, y_high_high, z_high_high) = - sse_triple_to_linear(r_high_high, g_high_high, b_high_high, transfer_function); - - if USE_ALPHA { - let a_high_high = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); - - let ptr = dst_ptr.add(cx * 4 + 4 * 4 * 3); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_high_high, - y_high_high, - z_high_high, - a_high_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3 * 3); - store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_high_high, - y_high_high, - z_high_high - ); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (x_low_low, y_low_low, z_low_low) = - sse_triple_to_linear(r_low_low, g_low_low, b_low_low, transfer_function); - - let a_low = _mm_cvtepu8_epi16(a_chan); - - let u8_scale = _mm_set1_ps(1f32 / 255f32); - - if USE_ALPHA { - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - let ptr = dst_ptr.add(cx * 4); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_low, - y_low_low, - z_low_low, - a_low_low - ); - } else { - let ptr = dst_ptr.add(cx * 3); - store_and_interleave_v3_f32!(ptr, image_configuration, x_low_low, y_low_low, z_low_low); - } - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (x_low_high, y_low_high, z_low_high) = - sse_triple_to_linear(r_low_high, g_low_high, b_low_high, transfer_function); - - if USE_ALPHA { - let a_low_high = - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_low, zeros)), u8_scale); - - let ptr = dst_ptr.add(cx * 4 + 16); - store_and_interleave_v4_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - } else { - let ptr = dst_ptr.add(cx * 3 + 4 * 3); - store_and_interleave_v3_f32!( - ptr, - image_configuration, - x_low_high, - y_low_high, - z_low_high - ); - } - - cx += 8; - } - - cx -} diff --git a/src/sse/to_xyz_lab.rs b/src/sse/to_xyz_lab.rs index e04019e..af7416d 100644 --- a/src/sse/to_xyz_lab.rs +++ b/src/sse/to_xyz_lab.rs @@ -5,12 +5,11 @@ * // license that can be found in the LICENSE file. 
*/ -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::sse::cie::{sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, sse_triple_to_xyz}; use crate::sse::*; use crate::xyz_target::XyzTarget; -use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half}; +use crate::load_f32_and_deinterleave; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -23,7 +22,7 @@ pub unsafe fn sse_channels_to_xyz_or_lab< const TARGET: u8, >( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, @@ -31,7 +30,6 @@ pub unsafe fn sse_channels_to_xyz_or_lab< a_linearized: *mut f32, a_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { if USE_ALPHA && a_linearized.is_null() { panic!("Null alpha channel with requirements of linearized alpha if not supported"); @@ -53,25 +51,15 @@ pub unsafe fn sse_channels_to_xyz_or_lab< let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut f32; - let zeros = _mm_setzero_si128(); - - while cx + 16 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + while cx + 4 < width as usize { + let src_ptr = ((src as * const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); + load_f32_and_deinterleave!(src_ptr, image_configuration); let (mut x_low_low, mut y_low_low, mut z_low_low) = sse_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, + r_chan, + g_chan, + b_chan, cq1, cq2, cq3, @@ -81,7 +69,6 @@ pub unsafe fn sse_channels_to_xyz_or_lab< cq7, cq8, cq9, - transfer_function, ); match target { @@ -111,307 +98,13 @@ pub unsafe fn sse_channels_to_xyz_or_lab< _mm_storeu_ps(dst_ptr.add(cx * 3 + 4), v1); _mm_storeu_ps(dst_ptr.add(cx * 3 + 8), v2); - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = sse_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_high, y_low_high, z_low_high); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 + 8), v2); - - let r_high = _mm_unpackhi_epi8(r_chan, zeros); - let g_high = _mm_unpackhi_epi8(g_chan, zeros); - let b_high = _mm_unpackhi_epi8(b_chan, zeros); - - let r_high_low = _mm_cvtepu16_epi32(r_high); - let g_high_low = _mm_cvtepu16_epi32(g_high); - let b_high_low = _mm_cvtepu16_epi32(b_high); - - let (mut x_high_low, 
mut y_high_low, mut z_high_low) = sse_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_low, y_high_low, z_high_low); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 2), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 2 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 2 + 8), v2); - - let r_high_high = _mm_unpackhi_epi16(r_high, zeros); - let g_high_high = _mm_unpackhi_epi16(g_high, zeros); - let b_high_high = _mm_unpackhi_epi16(b_high, zeros); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = sse_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = c; - z_high_high = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_high_high, y_high_high, z_high_high); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 * 3 + 8), v2); - if USE_ALPHA { let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - let a_low = _mm_cvtepu8_epi16(a_chan); - - let u8_scale = _mm_set1_ps(1f32 / 255f32); - - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - _mm_storeu_ps(a_ptr.add(cx), a_low_low); - - let a_low_high = _mm_mul_ps( - _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(a_low))), - u8_scale, - ); - - _mm_storeu_ps(a_ptr.add(cx + 4), a_low_high); - - let a_high = _mm_unpackhi_epi8(a_chan, zeros); - - let a_high_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); - - _mm_storeu_ps(a_ptr.add(cx + 4 * 2), a_high_low); - - let a_high_high = - _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_high, zeros)), u8_scale); - - _mm_storeu_ps(a_ptr.add(cx + 4 * 3), a_high_high); - } - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); - let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); - - let (mut x_low_low, mut y_low_low, mut z_low_low) = sse_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - 
cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_low, y_low_low, z_low_low); - _mm_storeu_ps(dst_ptr.add(cx * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 8), v2); - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = sse_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let (v0, v1, v2) = sse_interleave_ps_rgb(x_low_high, y_low_high, z_low_high); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * 3 + 4 * 3 + 8), v2); - - if USE_ALPHA { - let a_ptr = (a_linearized as *mut u8).add(a_offset) as *mut f32; - - let a_low = _mm_cvtepu8_epi16(a_chan); - - let u8_scale = _mm_set1_ps(1f32 / 255f32); - - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - _mm_storeu_ps(a_ptr.add(cx), a_low_low); - - let a_low_high = _mm_mul_ps( - _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(a_low))), - u8_scale, - ); - - _mm_storeu_ps(a_ptr.add(cx + 4), a_low_high); + _mm_storeu_ps(a_ptr.add(cx), a_chan); } - cx += 8; + cx += 4; } cx diff --git a/src/sse/to_xyza_laba.rs b/src/sse/to_xyza_laba.rs index cb236fb..30f5b73 100644 --- a/src/sse/to_xyza_laba.rs +++ b/src/sse/to_xyza_laba.rs @@ -5,12 +5,11 @@ * // license that can be found in the LICENSE file. 
*/ -use crate::gamma_curves::TransferFunction; use crate::image::ImageConfiguration; use crate::sse::cie::{sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, sse_triple_to_xyz}; use crate::sse::*; use crate::xyz_target::XyzTarget; -use crate::{load_u8_and_deinterleave, load_u8_and_deinterleave_half, store_and_interleave_v4_f32}; +use crate::load_f32_and_deinterleave; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -19,13 +18,12 @@ use std::arch::x86_64::*; #[target_feature(enable = "sse4.1")] pub unsafe fn sse_channels_to_xyza_laba( start_cx: usize, - src: *const u8, + src: *const f32, src_offset: usize, width: u32, dst: *mut f32, dst_offset: usize, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { const CHANNELS: usize = 4; let target: XyzTarget = TARGET.into(); @@ -48,264 +46,13 @@ pub unsafe fn sse_channels_to_xyza_laba { - let (l, a, b) = sse_triple_to_lab(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = a; - z_low_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = u; - z_low_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_low, y_low_low, z_low_low); - x_low_low = l; - y_low_low = c; - z_low_low = h; - } - } - - let a_low = _mm_cvtepu8_epi16(a_chan); - let u8_scale = _mm_set1_ps(1f32 / 255f32); - let a_low_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_low)), u8_scale); - - let (v0, v1, v2, v3) = sse_interleave_ps_rgba(x_low_low, y_low_low, z_low_low, a_low_low); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS), v0); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 8), v2); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 12), v3); - - let r_low_high = _mm_unpackhi_epi16(r_low, zeros); - let g_low_high = _mm_unpackhi_epi16(g_low, zeros); - let b_low_high = _mm_unpackhi_epi16(b_low, zeros); - - let (mut x_low_high, mut y_low_high, mut z_low_high) = sse_triple_to_xyz( - r_low_high, - g_low_high, - b_low_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let a_low_high = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_low, zeros)), u8_scale); - - let ptr0 = dst_ptr.add(cx * CHANNELS + 4 * CHANNELS); - store_and_interleave_v4_f32!( - ptr0, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - - let r_high = _mm_unpackhi_epi8(r_chan, _mm_setzero_si128()); - let g_high = _mm_unpackhi_epi8(g_chan, _mm_setzero_si128()); - let b_high = _mm_unpackhi_epi8(b_chan, _mm_setzero_si128()); - - let r_high_low = _mm_cvtepu16_epi32(r_high); - let g_high_low = _mm_cvtepu16_epi32(g_high); - let b_high_low = _mm_cvtepu16_epi32(b_high); - - let (mut x_high_low, mut y_high_low, mut z_high_low) = sse_triple_to_xyz( - r_high_low, - g_high_low, - b_high_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - 
XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = a; - z_high_low = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = u; - z_high_low = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_high_low, y_high_low, z_high_low); - x_high_low = l; - y_high_low = c; - z_high_low = h; - } - } - - let a_high = _mm_unpackhi_epi8(a_chan, _mm_setzero_si128()); - - let a_high_low = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_high)), u8_scale); - - let (v0, v1, v2, v3) = - sse_interleave_ps_rgba(x_high_low, y_high_low, z_high_low, a_high_low); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 2), v0); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 2 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 2 + 8), v2); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 2 + 12), v3); - - let r_high_high = _mm_unpackhi_epi16(r_high, _mm_setzero_si128()); - let g_high_high = _mm_unpackhi_epi16(g_high, _mm_setzero_si128()); - let b_high_high = _mm_unpackhi_epi16(b_high, _mm_setzero_si128()); - - let (mut x_high_high, mut y_high_high, mut z_high_high) = sse_triple_to_xyz( - r_high_high, - g_high_high, - b_high_high, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, - ); - - match target { - XyzTarget::Lab => { - let (l, a, b) = sse_triple_to_lab(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = a; - z_high_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = u; - z_high_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_high_high, y_high_high, z_high_high); - x_high_high = l; - y_high_high = c; - z_high_high = h; - } - } - - let a_high_high = _mm_mul_ps( - _mm_cvtepi32_ps(_mm_unpackhi_epi16(a_high, _mm_setzero_si128())), - u8_scale, - ); - - let (v0, v1, v2, v3) = - sse_interleave_ps_rgba(x_high_high, y_high_high, z_high_high, a_high_high); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 3), v0); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 3 + 4), v1); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 3 + 8), v2); - _mm_storeu_ps(dst_ptr.add(cx * CHANNELS + 4 * CHANNELS * 3 + 12), v3); - - cx += 16; - } - - while cx + 8 < width as usize { - let src_ptr = src.add(src_offset + cx * channels); + while cx + 4 < width as usize { + let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels); let (r_chan, g_chan, b_chan, a_chan) = - load_u8_and_deinterleave_half!(src_ptr, image_configuration); - - let r_low = _mm_cvtepu8_epi16(r_chan); - let g_low = _mm_cvtepu8_epi16(g_chan); - let b_low = _mm_cvtepu8_epi16(b_chan); - - let r_low_low = _mm_cvtepu16_epi32(r_low); - let g_low_low = _mm_cvtepu16_epi32(g_low); - let b_low_low = _mm_cvtepu16_epi32(b_low); + load_f32_and_deinterleave!(src_ptr, image_configuration); let (mut x_low_low, mut y_low_low, mut z_low_low) = sse_triple_to_xyz( - r_low_low, - g_low_low, - b_low_low, - cq1, - cq2, - cq3, - cq4, - cq5, - cq6, - cq7, - cq8, - cq9, - transfer_function, + r_chan, g_chan, b_chan, cq1, cq2, cq3, cq4, cq5, cq6, cq7, cq8, cq9, ); match target { @@ -330,71 +77,13 @@ pub unsafe fn sse_channels_to_xyza_laba { - let (l, a, b) = sse_triple_to_lab(x_low_high, y_low_high, z_low_high); - 
x_low_high = l; - y_low_high = a; - z_low_high = b; - } - XyzTarget::Xyz => {} - XyzTarget::Luv => { - let (l, u, v) = sse_triple_to_luv(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = u; - z_low_high = v; - } - XyzTarget::Lch => { - let (l, c, h) = sse_triple_to_lch(x_low_high, y_low_high, z_low_high); - x_low_high = l; - y_low_high = c; - z_low_high = h; - } - } - - let a_low_high = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_low, zeros)), u8_scale); - - let ptr0 = dst_ptr.add(cx * CHANNELS + 4 * CHANNELS); - store_and_interleave_v4_f32!( - ptr0, - image_configuration, - x_low_high, - y_low_high, - z_low_high, - a_low_high - ); - - cx += 8; + cx += 4; } cx diff --git a/src/sse/xyz_lab_to_image.rs b/src/sse/xyz_lab_to_image.rs index 28b9098..db8bb99 100644 --- a/src/sse/xyz_lab_to_image.rs +++ b/src/sse/xyz_lab_to_image.rs @@ -7,25 +7,22 @@ use crate::image::ImageConfiguration; use crate::sse::cie::{sse_lab_to_xyz, sse_lch_to_xyz, sse_luv_to_xyz}; -use crate::sse::{ - _mm_color_matrix_ps, perform_sse_gamma_transfer, sse_deinterleave_rgb_ps, sse_interleave_rgb, - sse_interleave_rgba, -}; +use crate::sse::{_mm_color_matrix_ps, sse_deinterleave_rgb_ps}; +use crate::sse::{sse_interleave_ps_rgb, sse_interleave_ps_rgba}; use crate::xyz_target::XyzTarget; -use crate::TransferFunction; +use crate::{store_and_interleave_v3_f32, store_and_interleave_v4_f32}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; #[inline(always)] -unsafe fn sse_xyz_lab_vld< +pub unsafe fn sse_xyz_lab_vld< const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, const TARGET: u8, >( src: *const f32, - transfer_function: TransferFunction, c1: __m128, c2: __m128, c3: __m128, @@ -35,9 +32,8 @@ unsafe fn sse_xyz_lab_vld< c7: __m128, c8: __m128, c9: __m128, -) -> (__m128i, __m128i, __m128i) { +) -> (__m128, __m128, __m128) { let target: XyzTarget = TARGET.into(); - let v_scale_color = _mm_set1_ps(255f32); let lab_pixel_0 = _mm_loadu_ps(src); let lab_pixel_1 = _mm_loadu_ps(src.add(4)); let lab_pixel_2 = _mm_loadu_ps(src.add(8)); @@ -68,23 +64,7 @@ unsafe fn sse_xyz_lab_vld< let (linear_r, linear_g, linear_b) = _mm_color_matrix_ps(r_f32, g_f32, b_f32, c1, c2, c3, c4, c5, c6, c7, c8, c9); - - r_f32 = linear_r; - g_f32 = linear_g; - b_f32 = linear_b; - - r_f32 = perform_sse_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_sse_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_sse_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm_mul_ps(r_f32, v_scale_color); - g_f32 = _mm_mul_ps(g_f32, v_scale_color); - b_f32 = _mm_mul_ps(b_f32, v_scale_color); - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - ) + (linear_r, linear_g, linear_b) } #[target_feature(enable = "sse4.1")] @@ -98,11 +78,10 @@ pub unsafe fn sse_xyz_to_channels< src_offset: usize, a_channel: *const f32, a_offset: usize, - dst: *mut u8, + dst: *mut f32, dst_offset: usize, width: u32, matrix: &[[f32; 3]; 3], - transfer_function: TransferFunction, ) -> usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if USE_ALPHA && !image_configuration.has_alpha() { @@ -125,11 +104,7 @@ pub unsafe fn sse_xyz_to_channels< let src_channels = 3usize; - let color_rescale = _mm_set1_ps(255f32); - - let zeros = _mm_setzero_si128(); - - while cx + 16 < width as usize { + while cx + 4 < width as 
usize { let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); @@ -137,233 +112,28 @@ pub unsafe fn sse_xyz_to_channels< let (r_row0_, g_row0_, b_row0_) = sse_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); - - let (r_row1_, g_row1_, b_row1_) = - sse_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * src_channels); - - let (r_row2_, g_row2_, b_row2_) = - sse_xyz_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let src_ptr_3 = offset_src_ptr.add(4 * 3 * src_channels); - - let (r_row3_, g_row3_, b_row3_) = - sse_xyz_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = _mm_packs_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packs_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packs_epi32(b_row0_, b_row1_); - - let r_row23 = _mm_packs_epi32(r_row2_, r_row3_); - let g_row23 = _mm_packs_epi32(g_row2_, g_row3_); - let b_row23 = _mm_packs_epi32(b_row2_, b_row3_); - - let r_row = _mm_packus_epi16(r_row01, r_row23); - let g_row = _mm_packus_epi16(g_row01, g_row23); - let b_row = _mm_packus_epi16(b_row01, b_row23); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); if USE_ALPHA { - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_0_f, - color_rescale, - ))); - - let a_low_1_f = _mm_loadu_ps(offset_a_src_ptr.add(4)); - let a_row1_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_1_f, - color_rescale, - ))); - - let a_low_2_f = _mm_loadu_ps(offset_a_src_ptr.add(8)); - let a_row2_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_2_f, - color_rescale, - ))); - - let a_low_3_f = _mm_loadu_ps(offset_a_src_ptr.add(12)); - let a_row3_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_3_f, - color_rescale, - ))); - - let a_row01 = _mm_packs_epi32(a_row0_, a_row1_); - let a_row23 = _mm_packs_epi32(a_row2_, a_row3_); - let a_row = _mm_packus_epi16(a_row01, a_row23); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgba(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgba(b_row, g_row, r_row, a_row) - } - }; - _mm_storeu_si128(dst_ptr as *mut __m128i, store_rows.0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, store_rows.1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, store_rows.2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, store_rows.3); - } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgb(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgb(b_row, g_row, r_row) - } - }; - _mm_storeu_si128(dst_ptr as *mut __m128i, store_rows.0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, store_rows.1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, store_rows.2); - } - - cx += 16; 
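
Dropping the u8 output path also changes how results are stored: the deleted code above narrows through `_mm_packs_epi32` / `_mm_packus_epi16` before `_mm_storeu_si128`, while the f32 path interleaves four planar registers and writes them with `_mm_storeu_ps`. A sketch of the planar-to-interleaved step behind `sse_interleave_ps_rgba` / `store_and_interleave_v4_f32!` (an assumed shape, not necessarily the crate's exact helper); it is a 4x4 f32 transpose:

    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    // Turn four plane registers (RRRR / GGGG / BBBB / AAAA) into four
    // interleaved RGBA pixels, ready for four `_mm_storeu_ps` stores.
    #[cfg(target_arch = "x86_64")]
    unsafe fn interleave_ps_rgba(
        r: __m128,
        g: __m128,
        b: __m128,
        a: __m128,
    ) -> (__m128, __m128, __m128, __m128) {
        let rg_lo = _mm_unpacklo_ps(r, g); // r0 g0 r1 g1
        let rg_hi = _mm_unpackhi_ps(r, g); // r2 g2 r3 g3
        let ba_lo = _mm_unpacklo_ps(b, a); // b0 a0 b1 a1
        let ba_hi = _mm_unpackhi_ps(b, a); // b2 a2 b3 a3
        (
            _mm_movelh_ps(rg_lo, ba_lo), // pixel 0: r0 g0 b0 a0
            _mm_movehl_ps(ba_lo, rg_lo), // pixel 1: r1 g1 b1 a1
            _mm_movelh_ps(rg_hi, ba_hi), // pixel 2: r2 g2 b2 a2
            _mm_movehl_ps(ba_hi, rg_hi), // pixel 3: r3 g3 b3 a3
        )
    }

Four pixels per iteration is the natural width here: one `__m128` per plane holds exactly four f32 lanes, which is why the 8- and 16-pixel unrolls of the old u8 path (and their tail loops) disappear throughout this patch.
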
- } - - while cx + 8 < width as usize { - let offset_src_ptr = - ((src as *const u8).add(src_offset) as *const f32).add(cx * src_channels); - - let src_ptr_0 = offset_src_ptr; - - let (r_row0_, g_row0_, b_row0_) = - sse_xyz_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + let a_row = _mm_loadu_ps(offset_a_src_ptr); + + store_and_interleave_v4_f32!( + dst_ptr, + image_configuration, + r_row0_, + g_row0_, + b_row0_, + a_row ); - - let src_ptr_1 = offset_src_ptr.add(4 * src_channels); - - let (r_row1_, g_row1_, b_row1_) = - sse_xyz_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = _mm_packs_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packs_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packs_epi32(b_row0_, b_row1_); - - let r_row = _mm_packus_epi16(r_row01, zeros); - let g_row = _mm_packus_epi16(g_row01, zeros); - let b_row = _mm_packus_epi16(b_row01, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - if USE_ALPHA { - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - let offset_a_src_ptr = ((a_channel as *const u8).add(a_offset) as *const f32).add(cx); - let a_low_0_f = _mm_loadu_ps(offset_a_src_ptr); - let a_row0_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_0_f, - color_rescale, - ))); - - let a_low_1_f = _mm_loadu_ps(offset_a_src_ptr.add(4)); - let a_row1_ = _mm_cvtps_epi32(_mm_round_ps::(_mm_mul_ps( - a_low_1_f, - color_rescale, - ))); - - let a_row01 = _mm_packs_epi32(a_row0_, a_row1_); - let a_row = _mm_packus_epi16(a_row01, zeros); - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgba(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgba(b_row, g_row, r_row, a_row) - } - }; - _mm_storeu_si128(dst_ptr as *mut __m128i, store_rows.0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, store_rows.1); } else { - let store_rows = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgb(r_row, g_row, b_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgb(b_row, g_row, r_row) - } - }; - _mm_storeu_si128(dst_ptr as *mut __m128i, store_rows.0); - let regi = store_rows.1; - std::ptr::copy_nonoverlapping(®i as *const _ as *const u8, dst_ptr.add(16), 8); + store_and_interleave_v3_f32!(dst_ptr, image_configuration, r_row0_, g_row0_, b_row0_); } - cx += 8; + cx += 4; } cx diff --git a/src/sse/xyza_laba_to_image.rs b/src/sse/xyza_laba_to_image.rs index fac74d4..d2cd846 100644 --- a/src/sse/xyza_laba_to_image.rs +++ b/src/sse/xyza_laba_to_image.rs @@ -7,20 +7,16 @@ use crate::image::ImageConfiguration; use crate::sse::cie::{sse_lab_to_xyz, sse_lch_to_xyz, sse_luv_to_xyz}; -use crate::sse::{ - _mm_color_matrix_ps, perform_sse_gamma_transfer, sse_deinterleave_rgba_ps, sse_interleave_rgba, -}; +use crate::sse::{_mm_color_matrix_ps, sse_deinterleave_rgba_ps, sse_interleave_ps_rgba}; use crate::xyz_target::XyzTarget; -use crate::TransferFunction; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; #[inline(always)] -unsafe fn sse_xyza_lab_vld( +pub unsafe fn sse_xyza_lab_vld( src: *const f32, - transfer_function: TransferFunction, c1: __m128, c2: __m128, c3: __m128, @@ -30,9 +26,8 @@ unsafe fn sse_xyza_lab_vld( c7: __m128, c8: __m128, c9: __m128, -) -> (__m128i, 
__m128i, __m128i, __m128i) { +) -> (__m128, __m128, __m128, __m128) { let target: XyzTarget = TARGET.into(); - let v_scale_color = _mm_set1_ps(255f32); let pixel_0 = _mm_loadu_ps(src); let pixel_1 = _mm_loadu_ps(src.add(4)); let pixel_2 = _mm_loadu_ps(src.add(8)); @@ -68,21 +63,7 @@ unsafe fn sse_xyza_lab_vld( r_f32 = linear_r; g_f32 = linear_g; b_f32 = linear_b; - - r_f32 = perform_sse_gamma_transfer(transfer_function, r_f32); - g_f32 = perform_sse_gamma_transfer(transfer_function, g_f32); - b_f32 = perform_sse_gamma_transfer(transfer_function, b_f32); - r_f32 = _mm_mul_ps(r_f32, v_scale_color); - g_f32 = _mm_mul_ps(g_f32, v_scale_color); - b_f32 = _mm_mul_ps(b_f32, v_scale_color); - let a_f32 = _mm_mul_ps(a_f32, v_scale_color); - const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; - ( - _mm_cvtps_epi32(_mm_round_ps::(r_f32)), - _mm_cvtps_epi32(_mm_round_ps::(g_f32)), - _mm_cvtps_epi32(_mm_round_ps::(b_f32)), - _mm_cvtps_epi32(_mm_round_ps::(a_f32)), - ) + (r_f32, g_f32, b_f32, a_f32) } #[target_feature(enable = "sse4.1")] @@ -90,11 +71,10 @@ pub unsafe fn sse_xyza_to_image usize { let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into(); if !image_configuration.has_alpha() { @@ -117,169 +97,32 @@ pub unsafe fn sse_xyza_to_image( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_2 = offset_src_ptr.add(4 * 2 * CHANNELS); - - let (r_row2_, g_row2_, b_row2_, a_row2_) = sse_xyza_lab_vld::( - src_ptr_2, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_3 = offset_src_ptr.add(4 * 3 * CHANNELS); - - let (r_row3_, g_row3_, b_row3_, a_row3_) = sse_xyza_lab_vld::( - src_ptr_3, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, + src_ptr_0, c1, c2, c3, c4, c5, c6, c7, c8, c9, ); - let r_row01 = _mm_packs_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packs_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packs_epi32(b_row0_, b_row1_); - let a_row01 = _mm_packs_epi32(a_row0_, a_row1_); - - let r_row23 = _mm_packs_epi32(r_row2_, r_row3_); - let g_row23 = _mm_packs_epi32(g_row2_, g_row3_); - let b_row23 = _mm_packs_epi32(b_row2_, b_row3_); - let a_row23 = _mm_packs_epi32(a_row2_, a_row3_); - - let r_row = _mm_packus_epi16(r_row01, r_row23); - let g_row = _mm_packus_epi16(g_row01, g_row23); - let b_row = _mm_packus_epi16(b_row01, b_row23); - let a_row = _mm_packus_epi16(a_row01, a_row23); - - let dst_ptr = dst.add(dst_offset + cx * channels); + let dst_ptr = ((dst as *mut u8).add(dst_offset) as *mut f32).add(cx * channels); let (rgba0, rgba1, rgba2, rgba3) = match image_configuration { ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgba(r_row, g_row, b_row, a_row) - } - ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgba(b_row, g_row, r_row, a_row) - } - }; - - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); - _mm_storeu_si128(dst_ptr.add(32) as *mut __m128i, rgba2); - _mm_storeu_si128(dst_ptr.add(48) as *mut __m128i, rgba3); - - cx += 16; - } - - while cx + 8 < width as usize { - let offset_src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * CHANNELS); - - let src_ptr_0 = offset_src_ptr; - - let 
(r_row0_, g_row0_, b_row0_, a_row0_) = sse_xyza_lab_vld::( - src_ptr_0, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let src_ptr_1 = offset_src_ptr.add(4 * CHANNELS); - - let (r_row1_, g_row1_, b_row1_, a_row1_) = sse_xyza_lab_vld::( - src_ptr_1, - transfer_function, - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - ); - - let r_row01 = _mm_packs_epi32(r_row0_, r_row1_); - let g_row01 = _mm_packs_epi32(g_row0_, g_row1_); - let b_row01 = _mm_packs_epi32(b_row0_, b_row1_); - let a_row01 = _mm_packs_epi32(a_row0_, a_row1_); - - let r_row = _mm_packus_epi16(r_row01, zeros); - let g_row = _mm_packus_epi16(g_row01, zeros); - let b_row = _mm_packus_epi16(b_row01, zeros); - let a_row = _mm_packus_epi16(a_row01, zeros); - - let dst_ptr = dst.add(dst_offset + cx * channels); - - let (rgba0, rgba1, _, _) = match image_configuration { - ImageConfiguration::Rgb | ImageConfiguration::Rgba => { - sse_interleave_rgba(r_row, g_row, b_row, a_row) + sse_interleave_ps_rgba(r_row0_, g_row0_, b_row0_, a_row0_) } ImageConfiguration::Bgra | ImageConfiguration::Bgr => { - sse_interleave_rgba(b_row, g_row, r_row, a_row) + sse_interleave_ps_rgba(b_row0_, g_row0_, r_row0_, a_row0_) } }; - _mm_storeu_si128(dst_ptr as *mut __m128i, rgba0); - _mm_storeu_si128(dst_ptr.add(16) as *mut __m128i, rgba1); + _mm_storeu_ps(dst_ptr, rgba0); + _mm_storeu_ps(dst_ptr.add(4), rgba1); + _mm_storeu_ps(dst_ptr.add(8), rgba2); + _mm_storeu_ps(dst_ptr.add(12), rgba3); - cx += 8; + cx += 4; } cx diff --git a/src/xyz.rs b/src/xyz.rs index cbdca90..2401883 100644 --- a/src/xyz.rs +++ b/src/xyz.rs @@ -129,7 +129,7 @@ impl Xyz { /// * `matrix` - Transformation matrix from RGB to XYZ, for example `SRGB_TO_XYZ_D65` /// * `transfer_function` - Transfer functions for current colorspace #[inline] - pub fn from_linear_rgb(rgb: &Rgb, matrix: &[[f32; 3]; 3]) -> Self { + pub fn from_linear_rgb(rgb: Rgb, matrix: &[[f32; 3]; 3]) -> Self { unsafe { Self::new( (*(*matrix.get_unchecked(0)).get_unchecked(0)) * rgb.r diff --git a/src/xyz_lab_to_image.rs b/src/xyz_lab_to_image.rs index ba8c885..3cd011b 100644 --- a/src/xyz_lab_to_image.rs +++ b/src/xyz_lab_to_image.rs @@ -19,7 +19,6 @@ use crate::{LCh, Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; fn xyz_to_channels( @@ -50,11 +49,10 @@ fn xyz_to_channels usize, > = None; @@ -73,15 +71,22 @@ fn xyz_to_channels); } + let src_slice_safe_align = unsafe { + slice::from_raw_parts_mut( + src.as_ptr() as *mut u8, + src_stride as usize * height as usize, + ) + }; + + let mut lut_table = vec![0u8; 2049]; + for i in 0..2049 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.) + .ceil() + .min(255.) 
as u8; + } + #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts_mut( - src.as_ptr() as *mut u8, - src_stride as usize * height as usize, - ) - }; - if USE_ALPHA { let a_slice_safe_align = unsafe { slice::from_raw_parts_mut( @@ -96,6 +101,8 @@ fn xyz_to_channels { let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() + lab.to_linear_rgb(matrix) } XyzTarget::Xyz => { let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) + xyz.to_linear_rgb(matrix) } XyzTarget::Luv => { let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() + luv.to_linear_rgb(matrix) } XyzTarget::Lch => { let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() + lch.to_linear_rgb(matrix) } }; - let dst = dst_ptr.add(x * channels); - - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; if image_configuration.has_alpha() { let a_ptr = a_channel.as_ptr() as *const f32; let a_f = a_ptr.add(x).read_unaligned(); - let a_value = (a_f * 255f32).max(0f32); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = + a_f; + } + } + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) * 2048f32).round() as usize; + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round() as usize; + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round() as usize; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast.min(2048)); + dst_chunk[1] = *lut_table.get_unchecked(g_cast.min(2048)); + dst_chunk[2] = *lut_table.get_unchecked(b_cast.min(2048)); + + if image_configuration.has_alpha() { + let a_cast = (src_chunks[3] * 255.).min(255.).max(0.) as u8; + dst_chunk[3] = a_cast; } } }); @@ -161,23 +179,23 @@ fn xyz_to_channels { let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() + lab.to_linear_rgb(matrix) } XyzTarget::Xyz => { let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) + xyz.to_linear_rgb(matrix) } XyzTarget::Luv => { let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() + luv.to_linear_rgb(matrix) } XyzTarget::Lch => { let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() + lch.to_linear_rgb(matrix) } }; - let dst = dst_ptr.add(x * channels); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + } - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) 
* 2048f32).round() as usize; + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round() as usize; + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round() as usize; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast.min(2048)); + dst_chunk[1] = *lut_table.get_unchecked(g_cast.min(2048)); + dst_chunk[2] = *lut_table.get_unchecked(b_cast.min(2048)); } }); } @@ -218,82 +245,165 @@ fn xyz_to_channels { - let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() + let mut transient_row = vec![0f32; width as usize * channels]; + + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + src.as_ptr() as *const f32, + 0, + a_channel.as_ptr() as *const f32, + 0, + transient_row.as_mut_ptr(), + 0, + width, + matrix, + ); } - XyzTarget::Xyz => { - let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) + + let src_ptr = src.as_ptr() as *mut f32; + + for x in _cx..width as usize { + let src_slice = src_ptr.add(x * 3); + let l_x = src_slice.read_unaligned(); + let l_y = src_slice.add(1).read_unaligned(); + let l_z = src_slice.add(2).read_unaligned(); + let rgb = match source { + XyzTarget::Lab => { + let lab = Lab::new(l_x, l_y, l_z); + lab.to_linear_rgb(matrix) + } + XyzTarget::Xyz => { + let xyz = Xyz::new(l_x, l_y, l_z); + xyz.to_linear_rgb(matrix) + } + XyzTarget::Luv => { + let luv = Luv::new(l_x, l_y, l_z); + luv.to_linear_rgb(matrix) + } + XyzTarget::Lch => { + let lch = LCh::new(l_x, l_y, l_z); + lch.to_linear_rgb(matrix) + } + }; + + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + if image_configuration.has_alpha() { + let a_ptr = a_channel.as_ptr() as *const f32; + let a_f = a_ptr.add(x).read_unaligned(); + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = + a_f; + } } - XyzTarget::Luv => { - let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) * 2048f32).round() as usize; + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round() as usize; + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round() as usize; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast.min(2048)); + dst_chunk[1] = *lut_table.get_unchecked(g_cast.min(2048)); + dst_chunk[2] = *lut_table.get_unchecked(b_cast.min(2048)); + + if image_configuration.has_alpha() { + let a_cast = (src_chunks[3] * 255.).min(255.).max(0.) 
as u8; + dst_chunk[3] = a_cast; + } } - XyzTarget::Lch => { - let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() + } + } + } else { + for (dst, src) in dst + .chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.chunks_exact_mut(src_stride as usize)) + { + unsafe { + let mut _cx = 0usize; + + let mut transient_row = vec![0f32; width as usize * channels]; + + if let Some(dispatcher) = _wide_row_handler { + _cx = dispatcher( + _cx, + src.as_ptr() as *const f32, + 0, + a_channel.as_ptr(), + 0, + transient_row.as_mut_ptr(), + 0, + width, + matrix, + ); } - }; - let dst = unsafe { dst_ptr.add(x * channels) }; + let src_ptr = src.as_ptr() as *mut f32; - unsafe { - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); - } - if image_configuration.has_alpha() { - let a_ptr = - unsafe { (a_channel.as_ptr() as *const u8).add(a_offset) as *const f32 }; - let a_f = unsafe { a_ptr.add(x).read_unaligned() }; - let a_value = (a_f * 255f32).max(0f32); - unsafe { - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + for x in _cx..width as usize { + let src_slice = src_ptr.add(x * 3); + let l_x = src_slice.read_unaligned(); + let l_y = src_slice.add(1).read_unaligned(); + let l_z = src_slice.add(2).read_unaligned(); + let rgb = match source { + XyzTarget::Lab => { + let lab = Lab::new(l_x, l_y, l_z); + lab.to_linear_rgb(matrix) + } + XyzTarget::Xyz => { + let xyz = Xyz::new(l_x, l_y, l_z); + xyz.to_linear_rgb(matrix) + } + XyzTarget::Luv => { + let luv = Luv::new(l_x, l_y, l_z); + luv.to_linear_rgb(matrix) + } + XyzTarget::Lch => { + let lch = LCh::new(l_x, l_y, l_z); + lch.to_linear_rgb(matrix) + } + }; + + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + } + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) * 2048f32).round() as usize; + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round() as usize; + let b_cast = (src_chunks[2].min(1.).max(0.) 
* 2048f32).round() as usize; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast.min(2048)); + dst_chunk[1] = *lut_table.get_unchecked(g_cast.min(2048)); + dst_chunk[2] = *lut_table.get_unchecked(b_cast.min(2048)); } } } - - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; - a_offset += a_stride as usize; } } } diff --git a/src/xyza_laba_to_image.rs b/src/xyza_laba_to_image.rs index 9032b3e..6e7048c 100644 --- a/src/xyza_laba_to_image.rs +++ b/src/xyza_laba_to_image.rs @@ -19,7 +19,6 @@ use crate::{LCh, Lab, Luv, Xyz, XYZ_TO_SRGB_D65}; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; #[cfg(feature = "rayon")] use rayon::prelude::{ParallelSlice, ParallelSliceMut}; -#[cfg(feature = "rayon")] use std::slice; #[allow(clippy::type_complexity)] @@ -40,16 +39,7 @@ fn xyz_with_alpha_to_channels usize, + unsafe fn(usize, *const f32, usize, *mut f32, usize, u32, &[[f32; 3]; 3]) -> usize, > = None; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -67,14 +57,22 @@ fn xyz_with_alpha_to_channels); } + let mut lut_table = vec![0u8; 2049]; + for i in 0..2049 { + lut_table[i] = (transfer_function.gamma(i as f32 * (1. / 2048.0)) * 255.) + .ceil() + .min(255.) as u8; + } + + let src_slice_safe_align = unsafe { + slice::from_raw_parts( + src.as_ptr() as *const u8, + src_stride as usize * height as usize, + ) + }; + #[cfg(feature = "rayon")] { - let src_slice_safe_align = unsafe { - slice::from_raw_parts( - src.as_ptr() as *const u8, - src_stride as usize * height as usize, - ) - }; dst.par_chunks_exact_mut(dst_stride as usize) .zip(src_slice_safe_align.par_chunks_exact(src_stride as usize)) .for_each(|(dst, src)| unsafe { @@ -82,21 +80,21 @@ fn xyz_with_alpha_to_channels { let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() + lab.to_linear_rgb(matrix) } XyzTarget::Xyz => { let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) + xyz.to_linear_rgb(matrix) } XyzTarget::Luv => { let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() + luv.to_linear_rgb(matrix) } XyzTarget::Lch => { let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() + lch.to_linear_rgb(matrix) } }; let l_a = src_ptr.add(px + 3).read_unaligned(); - let a_value = (l_a * 255f32).max(0f32); - let dst = dst_ptr.add(x * channels); - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; + } + + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = (src_chunks[0].min(1.).max(0.) * 2048f32).round(); + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round(); + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round(); + let a_cast = (src_chunks[3] * 255.).min(255.).max(0.) 
as u8; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast as usize); + dst_chunk[1] = *lut_table.get_unchecked(g_cast as usize); + dst_chunk[2] = *lut_table.get_unchecked(b_cast as usize); + dst_chunk[3] = a_cast; } }); } #[cfg(not(feature = "rayon"))] { - let mut src_offset = 0usize; - let mut dst_offset = 0usize; + for (dst, src) in dst + .chunks_exact_mut(dst_stride as usize) + .zip(src_slice_safe_align.chunks_exact(src_stride as usize)) + { + unsafe { + let channels = image_configuration.get_channels_count(); - let channels = image_configuration.get_channels_count(); + let mut _cx = 0usize; - for _ in 0..height as usize { - let mut _cx = 0usize; + let mut transient_row = vec![0f32; width as usize * channels]; - if let Some(dispatcher) = _wide_row_handler { - unsafe { + if let Some(dispatcher) = _wide_row_handler { _cx = dispatcher( _cx, - src.as_ptr(), - src_offset, - dst.as_mut_ptr(), - dst_offset, + src.as_ptr() as *const f32, + 0, + transient_row.as_mut_ptr(), + 0, width, matrix, - transfer_function, ) } - } - let src_ptr = unsafe { (src.as_ptr() as *const u8).add(src_offset) as *mut f32 }; - let dst_ptr = unsafe { dst.as_mut_ptr().add(dst_offset) }; - - for x in _cx..width as usize { - let px = x * 4; - let l_x = unsafe { src_ptr.add(px).read_unaligned() }; - let l_y = unsafe { src_ptr.add(px + 1).read_unaligned() }; - let l_z = unsafe { src_ptr.add(px + 2).read_unaligned() }; - let rgb = match source { - XyzTarget::Lab => { - let lab = Lab::new(l_x, l_y, l_z); - lab.to_rgb() - } - XyzTarget::Xyz => { - let xyz = Xyz::new(l_x, l_y, l_z); - xyz.to_rgb(matrix, transfer_function) - } - XyzTarget::Luv => { - let luv = Luv::new(l_x, l_y, l_z); - luv.to_rgb() - } - XyzTarget::Lch => { - let lch = LCh::new(l_x, l_y, l_z); - lch.to_rgb() - } - }; - - let l_a = unsafe { src_ptr.add(px + 3).read_unaligned() }; - let a_value = (l_a * 255f32).max(0f32); - unsafe { - let dst = dst_ptr.add(x * channels); - dst.add(image_configuration.get_r_channel_offset()) - .write_unaligned(rgb.r); - dst.add(image_configuration.get_g_channel_offset()) - .write_unaligned(rgb.g); - dst.add(image_configuration.get_b_channel_offset()) - .write_unaligned(rgb.b); - dst.add(image_configuration.get_a_channel_offset()) - .write_unaligned(a_value as u8); + let src_ptr = src.as_ptr() as *mut f32; + + for x in _cx..width as usize { + let px = x * 4; + let l_x = src_ptr.add(px).read_unaligned(); + let l_y = src_ptr.add(px + 1).read_unaligned(); + let l_z = src_ptr.add(px + 2).read_unaligned(); + let rgb = match source { + XyzTarget::Lab => { + let lab = Lab::new(l_x, l_y, l_z); + lab.to_linear_rgb(matrix) + } + XyzTarget::Xyz => { + let xyz = Xyz::new(l_x, l_y, l_z); + xyz.to_linear_rgb(matrix) + } + XyzTarget::Luv => { + let luv = Luv::new(l_x, l_y, l_z); + luv.to_linear_rgb(matrix) + } + XyzTarget::Lch => { + let lch = LCh::new(l_x, l_y, l_z); + lch.to_linear_rgb(matrix) + } + }; + + let l_a = src_ptr.add(px + 3).read_unaligned(); + let dst = transient_row.get_unchecked_mut((x * channels)..); + *dst.get_unchecked_mut(image_configuration.get_r_channel_offset()) = rgb.r; + *dst.get_unchecked_mut(image_configuration.get_g_channel_offset()) = rgb.g; + *dst.get_unchecked_mut(image_configuration.get_b_channel_offset()) = rgb.b; + *dst.get_unchecked_mut(image_configuration.get_a_channel_offset()) = l_a; } - } - src_offset += src_stride as usize; - dst_offset += dst_stride as usize; + for (dst_chunk, src_chunks) in dst + .chunks_exact_mut(channels) + .zip(transient_row.chunks_exact(channels)) + { + let r_cast = 
(src_chunks[0].min(1.).max(0.) * 2048f32).round(); + let g_cast = (src_chunks[1].min(1.).max(0.) * 2048f32).round(); + let b_cast = (src_chunks[2].min(1.).max(0.) * 2048f32).round(); + let a_cast = (src_chunks[3] * 255.).min(255.).max(0.) as u8; + + dst_chunk[0] = *lut_table.get_unchecked(r_cast as usize); + dst_chunk[1] = *lut_table.get_unchecked(g_cast as usize); + dst_chunk[2] = *lut_table.get_unchecked(b_cast as usize); + dst_chunk[3] = a_cast; + } + } } } }
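
The scalar paths in xyz_lab_to_image.rs and xyza_laba_to_image.rs now share one shape: convert into a transient linear f32 row, then gamma-encode the whole row through a 2049-entry lookup table (2048 steps plus an entry for exactly 1.0) instead of evaluating the transfer function per pixel. A self-contained model of that table, with a local `srgb_gamma` standing in for the crate's `TransferFunction::gamma`:

    // Linear f32 in [0, 1] -> gamma-encoded u8 via a 2049-entry LUT.
    fn srgb_gamma(v: f32) -> f32 {
        if v <= 0.0031308 {
            v * 12.92
        } else {
            1.055 * v.powf(1.0 / 2.4) - 0.055
        }
    }

    fn build_gamma_lut() -> Vec<u8> {
        // Indices 0..=2048 cover the inclusive range [0.0, 1.0],
        // mirroring the table construction in the patch.
        (0..2049)
            .map(|i| (srgb_gamma(i as f32 * (1. / 2048.)) * 255.).ceil().min(255.) as u8)
            .collect()
    }

    fn encode(linear: f32, lut: &[u8]) -> u8 {
        // Clamp, scale onto the table grid, round to the nearest entry; the
        // final `.min(2048)` mirrors the patch's defensive bound on the index.
        let idx = (linear.min(1.).max(0.) * 2048.).round() as usize;
        lut[idx.min(2048)]
    }

    fn main() {
        let lut = build_gamma_lut();
        assert_eq!(encode(0.0, &lut), 0);
        assert_eq!(encode(1.0, &lut), 255);
        // Linear mid-gray (~0.214) encodes near 128 under sRGB gamma.
        println!("0.214 -> {}", encode(0.214, &lut));
    }

Quantizing the linear value onto a 1/2048 grid before encoding bounds the error at half a table step; for sRGB, whose slope is capped at 12.92 in the toe, that stays below one 8-bit output step, while transfer functions with steeper toes give up slightly more precision near black. Alpha bypasses the table entirely and is scaled straight to 0..=255, as in the row loops above.
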