diff --git a/src/app/src/main.rs b/src/app/src/main.rs
index 61a5d61..b707b7a 100644
--- a/src/app/src/main.rs
+++ b/src/app/src/main.rs
@@ -1,9 +1,6 @@
-use std::arch::aarch64::{vdupq_n_f32, vdupq_n_u32, vgetq_lane_f32, vgetq_lane_u32};
 use colorutils_rs::*;
 use image::io::Reader as ImageReader;
 use image::{EncodableLayout, GenericImageView};
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
 use std::time::Instant;
 
 #[cfg(target_arch = "x86_64")]
@@ -36,31 +33,64 @@ fn main() {
     //     println!("Cbrt {}", l);
     // }
 
-    let rgb = Rgb::<u8>::new(140, 164, 177);
+    let r = 140;
+    let g = 164;
+    let b = 177;
+    let rgb = Rgb::<u8>::new(r, g, b);
     let hsl = rgb.to_hsl();
     println!("RGB {:?}", rgb);
     println!("HSL {:?}", hsl);
     println!("Back RGB {:?}", hsl.to_rgb8());
 
-    // unsafe  {
-    //     let (h, s, l) = neon_rgb_to_hsl(vdupq_n_u32(255), vdupq_n_u32(156), vdupq_n_u32(255), vdupq_n_f32(1f32));
-    //     println!("NEON HSL {}, {}, {}", vgetq_lane_f32::<0>(h), vgetq_lane_f32::<0>(s), vgetq_lane_f32::<0>(l));
-    //     let (r1, g1, b1) = neon_hsl_to_rgb(h, s, l, vdupq_n_f32(1f32));
+    // unsafe {
+    //     let (h, s, l) = sse_rgb_to_hsl(
+    //         _mm_set1_epi32(r as i32),
+    //         _mm_set1_epi32(g as i32),
+    //         _mm_set1_epi32(b as i32),
+    //         _mm_set1_ps(1f32),
+    //     );
+    //     println!(
+    //         "SSE HSL {}, {}, {}",
+    //         f32::from_bits(_mm_extract_ps::<0>(h) as u32),
+    //         f32::from_bits(_mm_extract_ps::<0>(s) as u32),
+    //         f32::from_bits(_mm_extract_ps::<0>(l) as u32)
+    //     );
+    //     let (r1, g1, b1) = sse_hsl_to_rgb(h, s, l, _mm_set1_ps(1f32));
     //
-    //     println!("NEON HSL -> RHB {}, {}, {}", vgetq_lane_u32::<0>(r1), vgetq_lane_u32::<0>(g1), vgetq_lane_u32::<0>(b1));
+    //     println!(
+    //         "SSE HSL -> RGB {}, {}, {}",
+    //         _mm_extract_epi32::<0>(r1),
+    //         _mm_extract_epi32::<0>(g1),
+    //         _mm_extract_epi32::<0>(b1)
+    //     );
     // }
     //
-    // unsafe  {
-    //     let (h, s, v) = neon_rgb_to_hsv(vdupq_n_u32(255), vdupq_n_u32(156), vdupq_n_u32(255), vdupq_n_f32(1f32));
+    // unsafe {
+    //     let (h, s, v) = sse_rgb_to_hsv(
+    //         _mm_set1_epi32(r as i32),
+    //         _mm_set1_epi32(g as i32),
+    //         _mm_set1_epi32(b as i32),
+    //         _mm_set1_ps(1f32),
+    //     );
     //     let hsv = rgb.to_hsv();
     //     println!("HSV {:?}", hsv);
-    //     println!("NEON HSV {}, {}, {}", vgetq_lane_f32::<0>(h), vgetq_lane_f32::<0>(s), vgetq_lane_f32::<0>(v));
-    //     let (r1, g1, b1) = neon_hsv_to_rgb(h, s,v, vdupq_n_f32(1f32));
-    //     println!("NEON RGB {}, {}, {}", vgetq_lane_u32::<0>(r1), vgetq_lane_u32::<0>(g1), vgetq_lane_u32::<0>(b1));
-
+    //     println!("HSV->RGB {:?}", hsv.to_rgb8());
+    //     println!(
+    //         "SSE HSV {}, {}, {}",
+    //         f32::from_bits(_mm_extract_ps::<0>(h) as u32),
+    //         f32::from_bits(_mm_extract_ps::<0>(s) as u32),
+    //         f32::from_bits(_mm_extract_ps::<0>(v) as u32)
+    //     );
+    //     let (r1, g1, b1) = sse_hsv_to_rgb(h, s, v, _mm_set1_ps(1f32));
+    //     println!(
+    //         "SSE RGB {}, {}, {}",
+    //         _mm_extract_epi32::<0>(r1),
+    //         _mm_extract_epi32::<0>(g1),
+    //         _mm_extract_epi32::<0>(b1)
+    //     );
     // }
 
-    let img = ImageReader::open("./assets/asset_middle.jpg")
+    let img = ImageReader::open("./assets/asset.jpg")
         .unwrap()
         .decode()
         .unwrap();
@@ -71,37 +101,42 @@ fn main() {
     let mut src_bytes = img.as_bytes();
     let width = dimensions.0;
     let height = dimensions.1;
-    let components = 4;
-
-    let mut dst_rgba = vec![];
-    dst_rgba.resize(4usize * width as usize * height as usize, 0u8);
-    rgb_to_rgba(
-        &src_bytes,
-        3u32 * width,
-        &mut dst_rgba,
-        4u32 * width,
-        width,
-        height,
-        255,
-    );
-    src_bytes = &dst_rgba;
+    let components = 3;
+
+    // let mut dst_rgba = vec![];
+    // dst_rgba.resize(4usize * width as usize * height as usize, 0u8);
+    // rgb_to_rgba(
+    //     &src_bytes,
+    //     3u32 * width,
+    //     &mut dst_rgba,
+    //     4u32 * width,
+    //     width,
+    //     height,
+    //     255,
+    // );
+    // src_bytes = &dst_rgba;
 
     let mut dst_slice: Vec<u8> = Vec::new();
-    dst_slice.resize(width as usize * 4 * height as usize, 0u8);
+    dst_slice.resize(width as usize * components * height as usize, 0u8);
 
     {
         let mut lab_store: Vec<u16> = vec![];
-        let store_stride = width as usize * 4usize * std::mem::size_of::<u16>();
-        lab_store.resize(width as usize * 4usize * height as usize, 0u16);
+        let store_stride = width as usize * components * std::mem::size_of::<u16>();
+        lab_store.resize(width as usize * components * height as usize, 0u16);
+        let src_stride = width * components as u32;
         let start_time = Instant::now();
-        rgba_to_hsl(
+        rgb_to_hsl(
             src_bytes,
-            4u32 * width,
+            src_stride,
             &mut lab_store,
             store_stride as u32,
             width,
-            height,100f32
+            height,
+            100f32,
         );
+        let elapsed_time = start_time.elapsed();
+        // Print the elapsed time in milliseconds
+        println!("RGBA To HSV: {:.2?}", elapsed_time);
         // let mut destination: Vec<f32> = vec![];
         // destination.resize(width as usize * height as usize * 4, 0f32);
         // let dst_stride = width * 4 * std::mem::size_of::<f32>() as u32;
@@ -124,18 +159,20 @@ fn main() {
         //     src_shift += src_stride as usize;
         // }
 
-        hsl_to_rgba(
+        let start_time = Instant::now();
+        hsl_to_rgb(
             &lab_store,
             store_stride as u32,
             &mut dst_slice,
-            4u32 * width,
+            src_stride,
             width,
-            height,100f32,
+            height,
+            100f32,
         );
 
         let elapsed_time = start_time.elapsed();
         // Print the elapsed time in milliseconds
-        println!("Fast image resize: {:.2?}", elapsed_time);
+        println!("HSV To RGBA: {:.2?}", elapsed_time);
 
         // laba_to_srgb(
         //     &lab_store,
diff --git a/src/avx/avx2_to_xyz_lab.rs b/src/avx/avx2_to_xyz_lab.rs
index 28b2c56..793f66d 100644
--- a/src/avx/avx2_to_xyz_lab.rs
+++ b/src/avx/avx2_to_xyz_lab.rs
@@ -4,8 +4,6 @@ use crate::gamma_curves::TransferFunction;
 use crate::image::ImageConfiguration;
 #[allow(unused_imports)]
 use crate::image_to_xyz_lab::XyzTarget;
-#[allow(unused_imports)]
-use crate::neon_gamma_curves::*;
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
diff --git a/src/hsv_to_image.rs b/src/hsv_to_image.rs
index d7e1e73..2e95d4b 100644
--- a/src/hsv_to_image.rs
+++ b/src/hsv_to_image.rs
@@ -1,9 +1,15 @@
 use std::slice;
 
-use crate::{Hsl, Hsv};
 use crate::image::ImageConfiguration;
 use crate::image_to_hsv_support::HsvTarget;
+#[cfg(all(
+    any(target_arch = "aarch64", target_arch = "arm"),
+    target_feature = "neon"
+))]
 use crate::neon::neon_hsv_u16_to_image;
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+use crate::sse::sse_hsv_u16_to_image;
+use crate::{Hsl, Hsv};
 
 #[inline(always)]
 fn hsv_u16_to_channels<
@@ -27,6 +33,17 @@ fn hsv_u16_to_channels<
         }
     }
 
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    let mut _has_sse = false;
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    if is_x86_feature_detected!("sse4.1") {
+        _has_sse = true;
+    }
+
     let mut src_offset = 0usize;
     let mut dst_offset = 0usize;
 
@@ -36,7 +53,22 @@ fn hsv_u16_to_channels<
 
     for _ in 0..height as usize {
         #[allow(unused_mut)]
-        let mut cx = 0usize;
+        let mut _cx = 0usize;
+
+        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+        unsafe {
+            if _has_sse {
+                _cx = sse_hsv_u16_to_image::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
+                    _cx,
+                    src.as_ptr(),
+                    src_offset,
+                    width,
+                    dst.as_mut_ptr(),
+                    dst_offset,
+                    scale,
+                )
+            }
+        }
 
         #[cfg(all(
             any(target_arch = "aarch64", target_arch = "arm"),
@@ -60,7 +92,7 @@ fn hsv_u16_to_channels<
         let src_slice = unsafe { slice::from_raw_parts(src_ptr, width as usize * channels) };
         let dst_slice = unsafe { slice::from_raw_parts_mut(dst_ptr, width as usize * channels) };
 
-        for x in cx..width as usize {
+        for x in _cx..width as usize {
             let px = x * channels;
             let h = unsafe { *src_slice.get_unchecked(px) };
             let s = unsafe { *src_slice.get_unchecked(px + 1) };
diff --git a/src/image_to_hsv.rs b/src/image_to_hsv.rs
index 2c4d3bc..40962f1 100644
--- a/src/image_to_hsv.rs
+++ b/src/image_to_hsv.rs
@@ -8,8 +8,10 @@ use crate::image_to_hsv_support::HsvTarget;
 ))]
 use crate::neon::neon_channels_to_hsv_u16;
 use crate::Rgb;
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+use crate::sse::sse_channels_to_hsv_u16;
 
-#[inline(always)]
+#[inline]
 fn channels_to_hsv_u16<
     const CHANNELS_CONFIGURATION: u8,
     const USE_ALPHA: bool,
@@ -31,6 +33,17 @@ fn channels_to_hsv_u16<
         }
     }
 
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    let mut _has_sse = false;
+
+    #[cfg(all(
+        any(target_arch = "x86_64", target_arch = "x86"),
+        target_feature = "sse4.1"
+    ))]
+    if is_x86_feature_detected!("sse4.1") {
+        _has_sse = true;
+    }
+
     let mut src_offset = 0usize;
     let mut dst_offset = 0usize;
 
@@ -40,6 +53,21 @@ fn channels_to_hsv_u16<
         #[allow(unused_mut)]
         let mut cx = 0usize;
 
+        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+        unsafe {
+            if _has_sse {
+                cx = sse_channels_to_hsv_u16::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
+                    cx,
+                    src.as_ptr(),
+                    src_offset,
+                    width,
+                    dst.as_mut_ptr(),
+                    dst_offset,
+                    scale,
+                )
+            }
+        }
+
         #[cfg(all(
             any(target_arch = "aarch64", target_arch = "arm"),
             target_feature = "neon"
diff --git a/src/lib.rs b/src/lib.rs
index 3be5f67..3ef571c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -79,8 +79,4 @@ pub use hsv_to_image::*;
 pub use image_to_linear_u8::*;
 pub use linear_to_image_u8::*;
 
-pub use rgb_expand::*;
-pub use neon::neon_rgb_to_hsv;
-pub use neon::neon_rgb_to_hsl;
-pub use neon::neon_hsv_to_rgb;
-pub use neon::neon_hsl_to_rgb;
\ No newline at end of file
+pub use rgb_expand::*;
\ No newline at end of file
diff --git a/src/luv.rs b/src/luv.rs
index e72aa88..6713263 100644
--- a/src/luv.rs
+++ b/src/luv.rs
@@ -49,7 +49,6 @@ const D65_XYZ: [f32; 3] = [95.047f32, 100.0f32, 108.883f32];
 use crate::rgb::Rgb;
 use crate::rgba::Rgba;
 use crate::xyz::Xyz;
-use clap::Parser;
 
 pub(crate) const LUV_WHITE_U_PRIME: f32 =
     4.0f32 * D65_XYZ[1] / (D65_XYZ[0] + 15.0 * D65_XYZ[1] + 3.0 * D65_XYZ[2]);
diff --git a/src/neon/mod.rs b/src/neon/mod.rs
index 6c33638..007729d 100644
--- a/src/neon/mod.rs
+++ b/src/neon/mod.rs
@@ -1,9 +1,17 @@
+#[cfg(all(
+    any(target_arch = "aarch64", target_arch = "arm"),
+    target_feature = "neon"
+))]
 mod neon_colors;
 #[cfg(all(
     any(target_arch = "aarch64", target_arch = "arm"),
     target_feature = "neon"
 ))]
 mod neon_gamma_curves;
+#[cfg(all(
+    any(target_arch = "aarch64", target_arch = "arm"),
+    target_feature = "neon"
+))]
 mod neon_image_to_hsv;
 #[cfg(all(
     any(target_arch = "aarch64", target_arch = "arm"),
diff --git a/src/neon/neon_colors.rs b/src/neon/neon_colors.rs
index 4447b09..5c9e423 100644
--- a/src/neon/neon_colors.rs
+++ b/src/neon/neon_colors.rs
@@ -147,10 +147,6 @@ pub unsafe fn neon_hsv_to_rgb(
     (vcvtaq_u32_f32(r), vcvtaq_u32_f32(g), vcvtaq_u32_f32(b))
 }
 
-#[cfg(all(
-    any(target_arch = "aarch64", target_arch = "arm"),
-    target_feature = "neon"
-))]
 #[inline(always)]
 pub unsafe fn neon_rgb_to_hsv(
     r: uint32x4_t,
@@ -209,10 +205,6 @@ pub unsafe fn neon_rgb_to_hsv(
     (h, vmulq_f32(s, scale), vmulq_f32(v, scale))
 }
 
-#[cfg(all(
-    any(target_arch = "aarch64", target_arch = "arm"),
-    target_feature = "neon"
-))]
 #[inline(always)]
 pub unsafe fn neon_rgb_to_hsl(
     r: uint32x4_t,
diff --git a/src/neon/neon_hsv_to_image.rs b/src/neon/neon_hsv_to_image.rs
index 0862881..cc1e0fb 100644
--- a/src/neon/neon_hsv_to_image.rs
+++ b/src/neon/neon_hsv_to_image.rs
@@ -8,7 +8,7 @@ use crate::image_to_hsv_support::HsvTarget;
     any(target_arch = "aarch64", target_arch = "arm"),
     target_feature = "neon"
 ))]
-#[inline(always)]
+#[inline]
 pub unsafe fn neon_hsv_u16_to_image<
     const CHANNELS_CONFIGURATION: u8,
     const USE_ALPHA: bool,
diff --git a/src/sse/mod.rs b/src/sse/mod.rs
index 098a10d..0e4cc9b 100644
--- a/src/sse/mod.rs
+++ b/src/sse/mod.rs
@@ -29,6 +29,10 @@ mod sse_xyza_laba_to_image;
 mod sse_color;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 mod sse_xyz_lab_to_image;
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+mod sse_image_to_hsv;
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+mod sse_hsv_to_image;
 
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 pub use sse_image_to_linear_u8::*;
@@ -49,4 +53,8 @@ pub use sse_xyza_laba_to_image::*;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 pub use sse_xyz_lab_to_image::*;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-pub use sse_linear_to_image::*;
\ No newline at end of file
+pub use sse_linear_to_image::*;
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+pub use sse_image_to_hsv::*;
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+pub use sse_hsv_to_image::*;
\ No newline at end of file
diff --git a/src/sse/sse_color.rs b/src/sse/sse_color.rs
index 4c24f8a..77db068 100644
--- a/src/sse/sse_color.rs
+++ b/src/sse/sse_color.rs
@@ -4,7 +4,7 @@ use std::arch::x86::*;
 use std::arch::x86_64::*;
 
 use crate::luv::{LUV_MULTIPLIER_INVERSE_Y, LUV_WHITE_U_PRIME, LUV_WHITE_V_PRIME};
-use crate::sse::{_mm_cube_ps, _mm_prefer_fma_ps, _mm_select_ps};
+use crate::sse::{_mm_abs_ps, _mm_cube_ps, _mm_fmod_ps, _mm_prefer_fma_ps, _mm_select_ps};
 
 #[inline(always)]
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
@@ -72,3 +72,304 @@ pub(crate) unsafe fn sse_luv_to_xyz(l: __m128, u: __m128, v: __m128) -> (__m128,
     z = _mm_select_ps(zero_mask_2, zeros, z);
     (x, y, z)
 }
+
+/// Converts four HSL pixels (one per lane) to RGB multiplied by 255.
+///
+/// `h` is taken as-is and tested against sector bounds expressed in degrees
+/// (`[0, 60)`, `[60, 120)`, ... `[300, 360)`); `s` and `l` are multiplied by
+/// `scale` before use. A lane whose hue falls outside `[0, 360)` matches no
+/// sector, so all three of its channels come out equal to `m * 255`.
+///
+/// Returns `(r, g, b)` as `i32` lanes holding each channel times 255, rounded
+/// to nearest. No clamping is applied here.
+///
+/// # Safety
+///
+/// Executes `_mm_round_ps`, an SSE4.1 instruction; the caller must make sure
+/// the running CPU supports it.
+#[inline(always)]
+pub unsafe fn sse_hsl_to_rgb(
+    h: __m128,
+    s: __m128,
+    l: __m128,
+    scale: __m128,
+) -> (__m128i, __m128i, __m128i) {
+    let zero = _mm_setzero_ps();
+    let one = _mm_set1_ps(1f32);
+    let two = _mm_set1_ps(2f32);
+    let sat = _mm_mul_ps(s, scale);
+    let light = _mm_mul_ps(l, scale);
+
+    // Chroma: c = (1 - |2l - 1|) * s
+    let doubled_light = _mm_mul_ps(light, two);
+    let chroma_span = _mm_sub_ps(one, _mm_abs_ps(_mm_sub_ps(doubled_light, one)));
+    let c = _mm_mul_ps(chroma_span, sat);
+
+    // Secondary component: x = (1 - |(h / 60) mod 2 - 1|) * c
+    let sector_pos = _mm_fmod_ps(_mm_mul_ps(h, _mm_set1_ps(1f32 / 60f32)), two);
+    let x_span = _mm_sub_ps(one, _mm_abs_ps(_mm_sub_ps(sector_pos, one)));
+    let x = _mm_mul_ps(x_span, c);
+
+    // Offset added to every channel: m = l - c / 2
+    let m = _mm_sub_ps(light, _mm_mul_ps(c, _mm_set1_ps(0.5f32)));
+
+    // Sector boundaries in degrees.
+    let deg60 = _mm_set1_ps(60f32);
+    let deg120 = _mm_set1_ps(120f32);
+    let deg180 = _mm_set1_ps(180f32);
+    let deg240 = _mm_set1_ps(240f32);
+    let deg300 = _mm_set1_ps(300f32);
+    let deg360 = _mm_set1_ps(360f32);
+
+    // One mask per half-open hue sector; at most one is set for any lane.
+    let in_0_60 = _mm_and_ps(_mm_cmpge_ps(h, zero), _mm_cmplt_ps(h, deg60));
+    let in_60_120 = _mm_and_ps(_mm_cmpge_ps(h, deg60), _mm_cmplt_ps(h, deg120));
+    let in_120_180 = _mm_and_ps(_mm_cmpge_ps(h, deg120), _mm_cmplt_ps(h, deg180));
+    let in_180_240 = _mm_and_ps(_mm_cmpge_ps(h, deg180), _mm_cmplt_ps(h, deg240));
+    let in_240_300 = _mm_and_ps(_mm_cmpge_ps(h, deg240), _mm_cmplt_ps(h, deg300));
+    let in_300_360 = _mm_and_ps(_mm_cmpge_ps(h, deg300), _mm_cmplt_ps(h, deg360));
+
+    // Red: c in [0, 60) and [300, 360), x in [60, 120) and [240, 300).
+    let mut r = _mm_select_ps(in_0_60, c, zero);
+    r = _mm_select_ps(in_60_120, x, r);
+    r = _mm_select_ps(in_240_300, x, r);
+    r = _mm_select_ps(in_300_360, c, r);
+
+    // Green: x in [0, 60) and [180, 240), c in [60, 180).
+    let mut g = _mm_select_ps(in_0_60, x, zero);
+    g = _mm_select_ps(in_60_120, c, g);
+    g = _mm_select_ps(in_120_180, c, g);
+    g = _mm_select_ps(in_180_240, x, g);
+
+    // Blue: x in [120, 180) and [300, 360), c in [180, 300).
+    let mut b = _mm_select_ps(in_120_180, x, zero);
+    b = _mm_select_ps(in_180_240, c, b);
+    b = _mm_select_ps(in_240_300, c, b);
+    b = _mm_select_ps(in_300_360, x, b);
+
+    // Add the lightness offset and multiply by 255.
+    let byte_range = _mm_set1_ps(255f32);
+    let r = _mm_mul_ps(_mm_add_ps(r, m), byte_range);
+    let g = _mm_mul_ps(_mm_add_ps(g, m), byte_range);
+    let b = _mm_mul_ps(_mm_add_ps(b, m), byte_range);
+
+    const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+    (
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(r)),
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(g)),
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(b)),
+    )
+}
+
+/// Converts four HSV pixels (one per lane) to RGB multiplied by 255.
+///
+/// `h` is in degrees: it is divided by 60 and reduced with `_mm_fmod_ps(_, 6)`
+/// to select one of six unit-wide sectors. `s` and `v` are multiplied by
+/// `scale` before use. A lane whose reduced hue is not inside `[0, 6)` matches
+/// no sector, so all three of its channels come out as `m * 255`.
+///
+/// Returns `(r, g, b)` as `i32` lanes rounded to nearest. No clamping is
+/// applied here.
+///
+/// # Safety
+///
+/// Executes `_mm_round_ps`, an SSE4.1 instruction; the caller must make sure
+/// the running CPU supports it.
+#[inline(always)]
+pub unsafe fn sse_hsv_to_rgb(
+    h: __m128,
+    s: __m128,
+    v: __m128,
+    scale: __m128,
+) -> (__m128i, __m128i, __m128i) {
+    let zero = _mm_setzero_ps();
+    let one = _mm_set1_ps(1f32);
+    let two = _mm_set1_ps(2f32);
+    let three = _mm_set1_ps(3f32);
+    let four = _mm_set1_ps(4f32);
+    let five = _mm_set1_ps(5f32);
+    let six = _mm_set1_ps(6f32);
+
+    let sat = _mm_mul_ps(s, scale);
+    let value = _mm_mul_ps(v, scale);
+
+    // Chroma c = s * v and the offset added to every channel, m = v - c.
+    let c = _mm_mul_ps(sat, value);
+    let m = _mm_sub_ps(value, c);
+
+    // Sector coordinate: (h / 60) mod 6.
+    let sixtieth = _mm_set1_ps(1f32 / 60f32);
+    let sector = _mm_fmod_ps(_mm_mul_ps(h, sixtieth), six);
+
+    // Secondary component: x = (1 - |sector mod 2 - 1|) * c
+    let x_span = _mm_sub_ps(one, _mm_abs_ps(_mm_sub_ps(_mm_fmod_ps(sector, two), one)));
+    let x = _mm_mul_ps(x_span, c);
+
+    // One mask per half-open sector; at most one is set for any lane.
+    let in_0_1 = _mm_and_ps(_mm_cmpge_ps(sector, zero), _mm_cmplt_ps(sector, one));
+    let in_1_2 = _mm_and_ps(_mm_cmpge_ps(sector, one), _mm_cmplt_ps(sector, two));
+    let in_2_3 = _mm_and_ps(_mm_cmpge_ps(sector, two), _mm_cmplt_ps(sector, three));
+    let in_3_4 = _mm_and_ps(_mm_cmpge_ps(sector, three), _mm_cmplt_ps(sector, four));
+    let in_4_5 = _mm_and_ps(_mm_cmpge_ps(sector, four), _mm_cmplt_ps(sector, five));
+    let in_5_6 = _mm_and_ps(_mm_cmpge_ps(sector, five), _mm_cmplt_ps(sector, six));
+
+    // Red: c in sectors 0 and 5, x in sectors 1 and 4.
+    let mut r = _mm_select_ps(in_0_1, c, zero);
+    r = _mm_select_ps(in_1_2, x, r);
+    r = _mm_select_ps(in_4_5, x, r);
+    r = _mm_select_ps(in_5_6, c, r);
+
+    // Green: x in sectors 0 and 3, c in sectors 1 and 2.
+    let mut g = _mm_select_ps(in_0_1, x, zero);
+    g = _mm_select_ps(in_1_2, c, g);
+    g = _mm_select_ps(in_2_3, c, g);
+    g = _mm_select_ps(in_3_4, x, g);
+
+    // Blue: x in sectors 2 and 5, c in sectors 3 and 4.
+    let mut b = _mm_select_ps(in_2_3, x, zero);
+    b = _mm_select_ps(in_3_4, c, b);
+    b = _mm_select_ps(in_4_5, c, b);
+    b = _mm_select_ps(in_5_6, x, b);
+
+    // Add the offset and multiply by 255.
+    let byte_range = _mm_set1_ps(255f32);
+    let r = _mm_mul_ps(_mm_add_ps(r, m), byte_range);
+    let g = _mm_mul_ps(_mm_add_ps(g, m), byte_range);
+    let b = _mm_mul_ps(_mm_add_ps(b, m), byte_range);
+
+    // Round to nearest, then convert to 32-bit integers.
+    const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+    (
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(r)),
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(g)),
+        _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(b)),
+    )
+}
+
+/// Converts four RGB pixels (one per lane) to HSV.
+///
+/// `r`, `g` and `b` are `i32` lanes; they are converted to `f32` and multiplied
+/// by `1 / 255`. The returned hue is in degrees and is not scaled, while
+/// saturation and value are multiplied by `scale`. Lanes with `delta == 0` get
+/// hue 0 and lanes with `c_max == 0` get saturation 0.
+///
+/// Both `1 / delta` and `1 / c_max` come from `_mm_rcp_ps`, which is an
+/// approximate reciprocal.
+///
+/// # Safety
+///
+/// Needs the CPU features required by `_mm_fmod_ps` and `_mm_select_ps`, which
+/// are defined outside this block — TODO confirm the exact requirement.
+#[inline(always)]
+pub unsafe fn sse_rgb_to_hsv(
+    r: __m128i,
+    g: __m128i,
+    b: __m128i,
+    scale: __m128,
+) -> (__m128, __m128, __m128) {
+    let zero = _mm_setzero_ps();
+    let to_unit = _mm_set1_ps(1f32 / 255f32);
+    let red = _mm_mul_ps(_mm_cvtepi32_ps(r), to_unit);
+    let green = _mm_mul_ps(_mm_cvtepi32_ps(g), to_unit);
+    let blue = _mm_mul_ps(_mm_cvtepi32_ps(b), to_unit);
+
+    let c_max = _mm_max_ps(_mm_max_ps(red, green), blue);
+    let c_min = _mm_min_ps(_mm_min_ps(red, green), blue);
+    let delta = _mm_sub_ps(c_max, c_min);
+    let inv_delta = _mm_rcp_ps(delta);
+
+    // Hue candidates in sixths of a turn, one per possible maximum channel.
+    let six = _mm_set1_ps(6f32);
+    let two = _mm_set1_ps(2f32);
+    let four = _mm_set1_ps(4f32);
+    let from_red = _mm_fmod_ps(_mm_mul_ps(_mm_sub_ps(green, blue), inv_delta), six);
+    let from_green = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blue, red), inv_delta), two);
+    let from_blue = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(red, green), inv_delta), four);
+
+    // Convert to degrees; when several channels tie for the maximum the last
+    // select wins (blue, then green, then red).
+    let sixty = _mm_set1_ps(60f32);
+    let hue_red = _mm_mul_ps(from_red, sixty);
+    let hue_green = _mm_mul_ps(from_green, sixty);
+    let hue_blue = _mm_mul_ps(from_blue, sixty);
+    let mut h = _mm_select_ps(_mm_cmpeq_ps(c_max, red), hue_red, zero);
+    h = _mm_select_ps(_mm_cmpeq_ps(c_max, green), hue_green, h);
+    h = _mm_select_ps(_mm_cmpeq_ps(c_max, blue), hue_blue, h);
+    // Achromatic lanes (delta == 0) are forced to hue 0.
+    h = _mm_select_ps(_mm_cmpeq_ps(delta, zero), zero, h);
+    // Negative hues are wrapped by adding 360 degrees.
+    h = _mm_select_ps(_mm_cmplt_ps(h, zero), _mm_add_ps(h, _mm_set1_ps(360f32)), h);
+
+    // Saturation is delta / c_max, or 0 where c_max is 0.
+    let ratio = _mm_mul_ps(delta, _mm_rcp_ps(c_max));
+    let s = _mm_select_ps(_mm_cmpeq_ps(c_max, zero), zero, ratio);
+
+    // Value is the largest channel.
+    let v = c_max;
+    (h, _mm_mul_ps(s, scale), _mm_mul_ps(v, scale))
+}
+
+/// Converts four RGB pixels (one per lane) to HSL.
+///
+/// `r`, `g` and `b` are `i32` lanes; they are converted to `f32` and multiplied
+/// by `1 / 255`. The returned hue is in degrees and is not scaled, while
+/// saturation and lightness are multiplied by `scale`.
+///
+/// Achromatic lanes (`delta == 0`) get hue 0 and saturation 0. The explicit
+/// saturation guard matters for pure black and pure white, where the
+/// denominator `1 - |2l - 1|` is 0 and the division would give `0 / 0 = NaN`.
+///
+/// # Safety
+///
+/// Needs the CPU features required by `_mm_fmod_ps`, `_mm_select_ps`,
+/// `_mm_abs_ps` and `_mm_prefer_fma_ps`, which are defined outside this
+/// block — TODO confirm the exact requirement.
+#[inline(always)]
+pub unsafe fn sse_rgb_to_hsl(
+    r: __m128i,
+    g: __m128i,
+    b: __m128i,
+    scale: __m128,
+) -> (__m128, __m128, __m128) {
+    let zeros = _mm_setzero_ps();
+    let rgb_scale = _mm_set1_ps(1f32 / 255f32);
+    let r = _mm_mul_ps(_mm_cvtepi32_ps(r), rgb_scale);
+    let g = _mm_mul_ps(_mm_cvtepi32_ps(g), rgb_scale);
+    let b = _mm_mul_ps(_mm_cvtepi32_ps(b), rgb_scale);
+
+    let c_max = _mm_max_ps(_mm_max_ps(r, g), b);
+    let c_min = _mm_min_ps(_mm_min_ps(r, g), b);
+    let delta = _mm_sub_ps(c_max, c_min);
+    // Approximate reciprocal; only the hue uses it.
+    let rcp_delta = _mm_rcp_ps(delta);
+    let is_achromatic = _mm_cmpeq_ps(delta, zeros);
+
+    // Hue candidates in sixths of a turn, one per possible maximum channel.
+    let six = _mm_set1_ps(6f32);
+    let two = _mm_set1_ps(2f32);
+    let four = _mm_set1_ps(4f32);
+    let from_r = _mm_fmod_ps(_mm_mul_ps(_mm_sub_ps(g, b), rcp_delta), six);
+    let from_g = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(b, r), rcp_delta), two);
+    let from_b = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(r, g), rcp_delta), four);
+
+    // Convert to degrees; on ties for the maximum the last select wins.
+    let sixty = _mm_set1_ps(60f32);
+    let mut h = _mm_select_ps(_mm_cmpeq_ps(c_max, r), _mm_mul_ps(from_r, sixty), zeros);
+    h = _mm_select_ps(_mm_cmpeq_ps(c_max, g), _mm_mul_ps(from_g, sixty), h);
+    h = _mm_select_ps(_mm_cmpeq_ps(c_max, b), _mm_mul_ps(from_b, sixty), h);
+    h = _mm_select_ps(is_achromatic, zeros, h);
+    // Negative hues are wrapped by adding 360 degrees.
+    h = _mm_select_ps(_mm_cmplt_ps(h, zeros), _mm_add_ps(h, _mm_set1_ps(360f32)), h);
+
+    let l = _mm_mul_ps(_mm_add_ps(c_max, c_min), _mm_set1_ps(0.5f32));
+
+    // s = delta / (1 - |2l - 1|), assuming `_mm_prefer_fma_ps(a, b, c)` computes
+    // `a + b * c` — TODO confirm against its definition.
+    let fma_term = _mm_prefer_fma_ps(_mm_set1_ps(-1f32), _mm_set1_ps(2f32), l);
+    let denominator = _mm_sub_ps(_mm_set1_ps(1f32), _mm_abs_ps(fma_term));
+    // Black and white have delta == 0 AND a zero denominator; select zero
+    // there so the result is 0 rather than the NaN of 0 / 0.
+    let s = _mm_select_ps(is_achromatic, zeros, _mm_div_ps(delta, denominator));
+
+    (h, _mm_mul_ps(s, scale), _mm_mul_ps(l, scale))
+}
diff --git a/src/sse/sse_hsv_to_image.rs b/src/sse/sse_hsv_to_image.rs
new file mode 100644
index 0000000..53a207b
--- /dev/null
+++ b/src/sse/sse_hsv_to_image.rs
@@ -0,0 +1,193 @@
+use crate::image::ImageConfiguration;
+use crate::image_to_hsv_support::HsvTarget;
+use crate::sse::sse_color::{sse_hsl_to_rgb, sse_hsv_to_rgb};
+use crate::sse::{
+    sse_deinterleave_rgb_epi16, sse_deinterleave_rgba_epi16, sse_interleave_rgb,
+    sse_interleave_rgba,
+};
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+/// Loads eight interleaved `u16` pixels starting at `src_ptr` and splits them
+/// into per-channel vectors `(h, s, v, a)`.
+///
+/// Three-channel layouts read 24 `u16` values and report a constant alpha of
+/// 255; four-channel layouts read 32 `u16` values and return the stored alpha.
+#[inline(always)]
+unsafe fn load_hsv_x8(
+    image_configuration: &ImageConfiguration,
+    src_ptr: *const u16,
+) -> (__m128i, __m128i, __m128i, __m128i) {
+    let row0 = _mm_loadu_si128(src_ptr as *const __m128i);
+    let row1 = _mm_loadu_si128(src_ptr.add(8) as *const __m128i);
+    let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i);
+    match image_configuration {
+        ImageConfiguration::Rgb | ImageConfiguration::Bgr => {
+            let (h, s, v) = sse_deinterleave_rgb_epi16(row0, row1, row2);
+            (h, s, v, _mm_set1_epi16(255))
+        }
+        ImageConfiguration::Rgba | ImageConfiguration::Bgra => {
+            let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i);
+            sse_deinterleave_rgba_epi16(row0, row1, row2, row3)
+        }
+    }
+}
+
+/// Converts eight HSV/HSL pixels held as `u16` channel vectors to RGB.
+///
+/// The lower and upper four lanes are widened to `f32`, converted with
+/// `sse_hsv_to_rgb` or `sse_hsl_to_rgb` depending on `target`, and packed back
+/// to eight unsigned 16-bit lanes per channel with saturation.
+#[inline(always)]
+unsafe fn hsv_x8_to_rgb_epi16(
+    target: &HsvTarget,
+    h_chan: __m128i,
+    s_chan: __m128i,
+    v_chan: __m128i,
+    v_scale: __m128,
+) -> (__m128i, __m128i, __m128i) {
+    // Lower four pixels: zero-extend u16 -> i32, then convert to f32.
+    let h_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(h_chan));
+    let s_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(s_chan));
+    let v_low = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(v_chan));
+    let (r_low, g_low, b_low) = match target {
+        HsvTarget::HSV => sse_hsv_to_rgb(h_low, s_low, v_low, v_scale),
+        HsvTarget::HSL => sse_hsl_to_rgb(h_low, s_low, v_low, v_scale),
+    };
+
+    // Upper four pixels: interleaving with zeros is the same zero-extension.
+    let zeros = _mm_setzero_si128();
+    let h_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(h_chan, zeros));
+    let s_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s_chan, zeros));
+    let v_high = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_chan, zeros));
+    let (r_high, g_high, b_high) = match target {
+        HsvTarget::HSV => sse_hsv_to_rgb(h_high, s_high, v_high, v_scale),
+        HsvTarget::HSL => sse_hsl_to_rgb(h_high, s_high, v_high, v_scale),
+    };
+
+    (
+        _mm_packus_epi32(r_low, r_high),
+        _mm_packus_epi32(g_low, g_high),
+        _mm_packus_epi32(b_low, b_high),
+    )
+}
+
+/// Converts one row of `u16` HSV/HSL pixels to 8-bit pixels using SSE, 16 and
+/// then 8 pixels at a time.
+///
+/// * `start_cx` - index of the first pixel of the row to convert.
+/// * `src` / `src_offset` - source base pointer and the row's offset in bytes.
+/// * `width` - row width in pixels.
+/// * `dst` / `dst_offset` - destination base pointer and the row's byte offset.
+/// * `scale` - factor applied to the stored S and V/L values before conversion.
+///
+/// Returns the index of the first pixel that was not converted. Both loops use
+/// a strict `<` bound, so the caller must convert `[returned, width)` itself.
+///
+/// NOTE(review): channels are always written in R, G, B order, also for the
+/// `Bgr`/`Bgra` layouts, and the store layout is chosen by `USE_ALPHA` rather
+/// than by the channel count. Confirm both against the scalar path.
+///
+/// # Panics
+///
+/// Panics if `USE_ALPHA` is set for a layout without an alpha channel.
+///
+/// # Safety
+///
+/// The CPU must support SSE4.1. `src` and `dst`, advanced by their byte
+/// offsets, must be valid for `width * channels` `u16` reads and `u8` writes
+/// respectively.
+#[inline]
+pub unsafe fn sse_hsv_u16_to_image<
+    const CHANNELS_CONFIGURATION: u8,
+    const USE_ALPHA: bool,
+    const TARGET: u8,
+>(
+    start_cx: usize,
+    src: *const u16,
+    src_offset: usize,
+    width: u32,
+    dst: *mut u8,
+    dst_offset: usize,
+    scale: f32,
+) -> usize {
+    let target: HsvTarget = TARGET.into();
+    let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
+    if USE_ALPHA && !image_configuration.has_alpha() {
+        panic!("Use alpha flag used on image without alpha");
+    }
+
+    let channels = image_configuration.get_channels_count();
+    let row_width = width as usize;
+    let v_scale = _mm_set1_ps(scale);
+
+    let dst_ptr = dst.add(dst_offset);
+    // `src_offset` is in bytes, hence the detour through a byte pointer.
+    let src_load_ptr = (src as *const u8).add(src_offset) as *const u16;
+
+    let mut cx = start_cx;
+
+    // Main loop: two groups of eight pixels per iteration.
+    while cx + 16 < row_width {
+        let src_ptr = src_load_ptr.add(cx * channels);
+        let (h0, s0, v0, a_lo) = load_hsv_x8(&image_configuration, src_ptr);
+        let (r_lo, g_lo, b_lo) = hsv_x8_to_rgb_epi16(&target, h0, s0, v0, v_scale);
+
+        let (h1, s1, v1, a_hi) = load_hsv_x8(&image_configuration, src_ptr.add(8 * channels));
+        let (r_hi, g_hi, b_hi) = hsv_x8_to_rgb_epi16(&target, h1, s1, v1, v_scale);
+
+        // Narrow 16 -> 8 bits with unsigned saturation.
+        let r_chan = _mm_packus_epi16(r_lo, r_hi);
+        let g_chan = _mm_packus_epi16(g_lo, g_hi);
+        let b_chan = _mm_packus_epi16(b_lo, b_hi);
+
+        let ptr = dst_ptr.add(cx * channels);
+        if USE_ALPHA {
+            let a_chan = _mm_packus_epi16(a_lo, a_hi);
+            let (rgba0, rgba1, rgba2, rgba3) = sse_interleave_rgba(r_chan, g_chan, b_chan, a_chan);
+            _mm_storeu_si128(ptr as *mut __m128i, rgba0);
+            _mm_storeu_si128(ptr.add(16) as *mut __m128i, rgba1);
+            _mm_storeu_si128(ptr.add(32) as *mut __m128i, rgba2);
+            _mm_storeu_si128(ptr.add(48) as *mut __m128i, rgba3);
+        } else {
+            let (rgb0, rgb1, rgb2) = sse_interleave_rgb(r_chan, g_chan, b_chan);
+            _mm_storeu_si128(ptr as *mut __m128i, rgb0);
+            _mm_storeu_si128(ptr.add(16) as *mut __m128i, rgb1);
+            _mm_storeu_si128(ptr.add(32) as *mut __m128i, rgb2);
+        }
+
+        cx += 16;
+    }
+
+    // Remainder loop: one group of eight pixels per iteration.
+    while cx + 8 < row_width {
+        let src_ptr = src_load_ptr.add(cx * channels);
+        let (h0, s0, v0, a_lo) = load_hsv_x8(&image_configuration, src_ptr);
+        let (r_lo, g_lo, b_lo) = hsv_x8_to_rgb_epi16(&target, h0, s0, v0, v_scale);
+
+        // Only the lower eight bytes of each packed vector carry pixels.
+        let zeros = _mm_setzero_si128();
+        let r_chan = _mm_packus_epi16(r_lo, zeros);
+        let g_chan = _mm_packus_epi16(g_lo, zeros);
+        let b_chan = _mm_packus_epi16(b_lo, zeros);
+
+        let ptr = dst_ptr.add(cx * channels);
+        if USE_ALPHA {
+            let a_chan = _mm_packus_epi16(a_lo, zeros);
+            let (rgba0, rgba1, _, _) = sse_interleave_rgba(r_chan, g_chan, b_chan, a_chan);
+            _mm_storeu_si128(ptr as *mut __m128i, rgba0);
+            _mm_storeu_si128(ptr.add(16) as *mut __m128i, rgba1);
+        } else {
+            let (rgb0, rgb1, _) = sse_interleave_rgb(r_chan, g_chan, b_chan);
+            _mm_storeu_si128(ptr as *mut __m128i, rgb0);
+            // Store the remaining 8 of the 24 interleaved bytes from `rgb1`.
+            std::ptr::copy_nonoverlapping(&rgb1 as *const _ as *const u8, ptr.add(16), 8);
+        }
+
+        cx += 8;
+    }
+
+    cx
+}
diff --git a/src/sse/sse_image_to_hsv.rs b/src/sse/sse_image_to_hsv.rs
new file mode 100644
index 0000000..84b9d78
--- /dev/null
+++ b/src/sse/sse_image_to_hsv.rs
@@ -0,0 +1,192 @@
+use crate::image::ImageConfiguration;
+use crate::image_to_hsv_support::HsvTarget;
+use crate::sse::sse_color::{sse_rgb_to_hsl, sse_rgb_to_hsv};
+use crate::sse::{
+    sse_deinterleave_rgb, sse_deinterleave_rgba, sse_interleave_rgb_epi16,
+    sse_interleave_rgba_epi16,
+};
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+/// Converts 8-bit RGB(A)/BGR(A) pixels to interleaved 16-bit HSV or HSL, 16 pixels per step.
+#[inline]
+pub unsafe fn sse_channels_to_hsv_u16<
+    const CHANNELS_CONFIGURATION: u8,
+    const USE_ALPHA: bool,
+    const TARGET: u8,
+>(
+    start_cx: usize, // first pixel index to process
+    src: *const u8,
+    src_offset: usize, // added to `src` in bytes
+    width: u32, // compared against the pixel index `cx`
+    dst: *mut u16,
+    dst_offset: usize, // added to `dst` in bytes (see the cast below)
+    scale: f32,
+) -> usize {
+    let target: HsvTarget = TARGET.into();
+    let image_configuration: ImageConfiguration = CHANNELS_CONFIGURATION.into();
+    let mut cx = start_cx;
+    if USE_ALPHA {
+        if !image_configuration.has_alpha() {
+            panic!("Use alpha flag used on image without alpha");
+        }
+    }
+    // Panics above when USE_ALPHA is requested for a layout that has no alpha channel.
+    let channels = image_configuration.get_channels_count();
+    // `scale` is broadcast and forwarded to the converters; its meaning is defined there.
+    let v_scale = _mm_set1_ps(scale);
+    // `dst_offset` is applied in bytes (via the `u8` cast) before indexing in `u16` units.
+    let dst_ptr = (dst as *mut u8).add(dst_offset) as *mut u16;
+    // Strict `<`: a vector step runs only while more than 16 pixels remain in the row.
+    while cx + 16 < width as usize {
+        let (r_chan, g_chan, b_chan, a_chan);
+        let src_ptr = src.add(src_offset + cx * channels);
+        let row1 = _mm_loadu_si128(src_ptr as *const __m128i);
+        let row2 = _mm_loadu_si128(src_ptr.add(16) as *const __m128i);
+        let row3 = _mm_loadu_si128(src_ptr.add(32) as *const __m128i); // 48 bytes so far
+        match image_configuration {
+            ImageConfiguration::Rgb | ImageConfiguration::Bgr => {
+                let (rgb0_, rgb1_, rgb2_) = sse_deinterleave_rgb(row1, row2, row3);
+                if image_configuration == ImageConfiguration::Rgb {
+                    r_chan = rgb0_;
+                    g_chan = rgb1_;
+                    b_chan = rgb2_;
+                } else {
+                    r_chan = rgb2_;
+                    g_chan = rgb1_;
+                    b_chan = rgb0_;
+                }
+                a_chan = _mm_setzero_si128(); // never stored: USE_ALPHA panics for these layouts
+            }
+            ImageConfiguration::Rgba => {
+                let row4 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i);
+                let (rgb0_, rgb1_, rgb2_, rgb3_) = sse_deinterleave_rgba(row1, row2, row3, row4);
+                r_chan = rgb0_;
+                g_chan = rgb1_;
+                b_chan = rgb2_;
+                a_chan = rgb3_;
+            }
+            ImageConfiguration::Bgra => {
+                let row4 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i);
+                let (rgb0_, rgb1_, rgb2_, rgb3_) = sse_deinterleave_rgba(row1, row2, row3, row4);
+                r_chan = rgb2_; // first and third channels swapped relative to Rgba
+                g_chan = rgb1_;
+                b_chan = rgb0_;
+                a_chan = rgb3_;
+            }
+        }
+
+        let zeros = _mm_setzero_si128();
+        // Widen u8 -> u16 -> u32 lanes by unpacking with zeros; the low 8 pixels go first.
+        let r_low = _mm_unpacklo_epi8(r_chan, zeros);
+        let g_low = _mm_unpacklo_epi8(g_chan, zeros);
+        let b_low = _mm_unpacklo_epi8(b_chan, zeros);
+
+        let r_low_low = _mm_unpacklo_epi16(r_low, zeros);
+        let g_low_low = _mm_unpacklo_epi16(g_low, zeros);
+        let b_low_low = _mm_unpacklo_epi16(b_low, zeros);
+
+        let (x_low_low, y_low_low, z_low_low) = match target {
+            HsvTarget::HSV => sse_rgb_to_hsv(r_low_low, g_low_low, b_low_low, v_scale),
+            HsvTarget::HSL => sse_rgb_to_hsl(r_low_low, g_low_low, b_low_low, v_scale),
+        };
+        // Alpha is only zero-extended to 16 bits; it is not passed through the converters.
+        let a_low = _mm_unpacklo_epi8(a_chan, zeros);
+
+        let r_low_high = _mm_unpackhi_epi16(r_low, zeros);
+        let g_low_high = _mm_unpackhi_epi16(g_low, zeros);
+        let b_low_high = _mm_unpackhi_epi16(b_low, zeros);
+
+        let (x_low_high, y_low_high, z_low_high) = match target {
+            HsvTarget::HSV => sse_rgb_to_hsv(r_low_high, g_low_high, b_low_high, v_scale),
+            HsvTarget::HSL => sse_rgb_to_hsl(r_low_high, g_low_high, b_low_high, v_scale),
+        };
+        // Round to nearest, convert to i32, then narrow to u16 with unsigned saturation.
+        const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
+        let x_low = _mm_packus_epi32(
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(x_low_low)),
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(x_low_high)),
+        );
+        let y_low = _mm_packus_epi32(
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(y_low_low)),
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(y_low_high)),
+        );
+        let z_low = _mm_packus_epi32(
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(z_low_low)),
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(z_low_high)),
+        );
+        // Store the first 8 converted pixels at `cx * channels` (counted in `u16` elements).
+        if USE_ALPHA {
+            let (row1, row2, row3, row4) = sse_interleave_rgba_epi16(x_low, y_low, z_low, a_low);
+            let ptr = dst_ptr.add(cx * channels);
+            _mm_storeu_si128(ptr as *mut __m128i, row1);
+            _mm_storeu_si128(ptr.add(8) as *mut __m128i, row2);
+            _mm_storeu_si128(ptr.add(16) as *mut __m128i, row3);
+            _mm_storeu_si128(ptr.add(24) as *mut __m128i, row4);
+        } else {
+            let (row1, row2, row3) = sse_interleave_rgb_epi16(x_low, y_low, z_low);
+            let ptr = dst_ptr.add(cx * channels);
+            _mm_storeu_si128(ptr as *mut __m128i, row1);
+            _mm_storeu_si128(ptr.add(8) as *mut __m128i, row2);
+            _mm_storeu_si128(ptr.add(16) as *mut __m128i, row3);
+        }
+        // Same pipeline for the upper 8 pixels of the 16-pixel group.
+        let r_high = _mm_unpackhi_epi8(r_chan, zeros);
+        let g_high = _mm_unpackhi_epi8(g_chan, zeros);
+        let b_high = _mm_unpackhi_epi8(b_chan, zeros);
+
+        let r_high_low = _mm_unpacklo_epi16(r_high, zeros);
+        let g_high_low = _mm_unpacklo_epi16(g_high, zeros);
+        let b_high_low = _mm_unpacklo_epi16(b_high, zeros);
+
+        let (x_high_low, y_high_low, z_high_low) = match target {
+            HsvTarget::HSV => sse_rgb_to_hsv(r_high_low, g_high_low, b_high_low, v_scale),
+            HsvTarget::HSL => sse_rgb_to_hsl(r_high_low, g_high_low, b_high_low, v_scale),
+        };
+
+        let a_high = _mm_unpackhi_epi8(a_chan, zeros);
+
+        let r_high_high = _mm_unpackhi_epi16(r_high, zeros);
+        let g_high_high = _mm_unpackhi_epi16(g_high, zeros);
+        let b_high_high = _mm_unpackhi_epi16(b_high, zeros);
+
+        let (x_high_high, y_high_high, z_high_high) = match target {
+            HsvTarget::HSV => sse_rgb_to_hsv(r_high_high, g_high_high, b_high_high, v_scale),
+            HsvTarget::HSL => sse_rgb_to_hsl(r_high_high, g_high_high, b_high_high, v_scale),
+        };
+
+        let x_high = _mm_packus_epi32(
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(x_high_low)),
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(x_high_high)),
+        );
+        let y_high = _mm_packus_epi32(
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(y_high_low)),
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(y_high_high)),
+        );
+        let z_high = _mm_packus_epi32(
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(z_high_low)),
+            _mm_cvtps_epi32(_mm_round_ps::<ROUNDING_FLAGS>(z_high_high)),
+        );
+        // The second store starts `8 * channels` elements after the first one.
+        if USE_ALPHA {
+            let (row1, row2, row3, row4) =
+                sse_interleave_rgba_epi16(x_high, y_high, z_high, a_high);
+            let ptr = dst_ptr.add(cx * channels + 8 * channels);
+            _mm_storeu_si128(ptr as *mut __m128i, row1);
+            _mm_storeu_si128(ptr.add(8) as *mut __m128i, row2);
+            _mm_storeu_si128(ptr.add(16) as *mut __m128i, row3);
+            _mm_storeu_si128(ptr.add(24) as *mut __m128i, row4);
+        } else {
+            let (row1, row2, row3) = sse_interleave_rgb_epi16(x_high, y_high, z_high);
+            let ptr = dst_ptr.add(cx * channels + 8 * channels);
+            _mm_storeu_si128(ptr as *mut __m128i, row1);
+            _mm_storeu_si128(ptr.add(8) as *mut __m128i, row2);
+            _mm_storeu_si128(ptr.add(16) as *mut __m128i, row3);
+        }
+
+        cx += 16;
+    }
+    // Index of the first pixel not handled here (presumably finished by the caller — confirm).
+    cx
+}
diff --git a/src/sse/sse_image_to_linear_u8.rs b/src/sse/sse_image_to_linear_u8.rs
index 30d8efe..57e399b 100644
--- a/src/sse/sse_image_to_linear_u8.rs
+++ b/src/sse/sse_image_to_linear_u8.rs
@@ -6,8 +6,6 @@ pub mod sse_image_to_linear_unsigned {
     use crate::image::ImageConfiguration;
     #[allow(unused_imports)]
     use crate::image_to_xyz_lab::XyzTarget;
-    #[allow(unused_imports)]
-    use crate::neon_gamma_curves::*;
     use crate::sse::*;
     #[cfg(target_arch = "x86")]
     use std::arch::x86::*;
diff --git a/src/sse/sse_math.rs b/src/sse/sse_math.rs
index ed61e1e..18db437 100644
--- a/src/sse/sse_math.rs
+++ b/src/sse/sse_math.rs
@@ -1,7 +1,7 @@
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
 
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
@@ -219,7 +219,11 @@ pub unsafe fn _mm_vilogbk_ps(d: __m128) -> __m128i {
     );
     let q = _mm_sub_epi32(
         q,
-        _mm_select_si128(_mm_castps_si128(o), _mm_set1_epi32(64 + 0x7f), _mm_set1_epi32(0x7f)),
+        _mm_select_si128(
+            _mm_castps_si128(o),
+            _mm_set1_epi32(64 + 0x7f),
+            _mm_set1_epi32(0x7f),
+        ),
     );
     return q;
 }
@@ -247,6 +251,14 @@ pub(crate) unsafe fn _mm_neg_epi32(x: __m128i) -> __m128i {
     return _mm_sub_epi32(high, x);
 }
 
+/// Lane-wise `0.0 - x` on four packed `f32`; a subtraction from zero, not a sign-bit flip.
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(always)]
+#[allow(dead_code)]
+pub(crate) unsafe fn _mm_neg_ps(x: __m128) -> __m128 {
+    _mm_sub_ps(_mm_setzero_ps(), x)
+}
+
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 #[allow(dead_code)]
@@ -268,7 +280,10 @@ pub unsafe fn _mm_cbrt_ps_ulp35(d: __m128) -> __m128 {
 
     let t = _mm_add_ps(_mm_cvtepi32_ps(e), _mm_set1_ps(6144f32));
     let qu = _mm_cvttps_epi32(_mm_mul_ps(t, _mm_set1_ps(1.0f32 / 3.0f32)));
-    let re = _mm_cvttps_epi32(_mm_sub_ps(t, _mm_mul_ps(_mm_cvtepi32_ps(qu), _mm_set1_ps(3f32))));
+    let re = _mm_cvttps_epi32(_mm_sub_ps(
+        t,
+        _mm_mul_ps(_mm_cvtepi32_ps(qu), _mm_set1_ps(3f32)),
+    ));
 
     q = _mm_selecti_ps(
         _mm_cmpeq_epi32(re, _mm_set1_epi32(1)),
@@ -327,3 +342,16 @@ pub unsafe fn _mm_color_matrix_ps(
     let new_b = _mm_prefer_fma_ps(_mm_prefer_fma_ps(_mm_mul_ps(g, c8), b, c9), r, c7);
     (new_r, new_g, new_b)
 }
+
+/// Lane-wise floored remainder of four packed `f32`: `a - floor(a / b) * b`.
+/// Uses exact division: a `_mm_rcp_ps` estimate can misplace the quotient around an integer.
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(always)]
+#[allow(dead_code)]
+pub(crate) unsafe fn _mm_fmod_ps(a: __m128, b: __m128) -> __m128 {
+    let quotient = _mm_div_ps(a, b);
+    // `floor`, not truncation: for mixed-sign inputs this differs from C `fmod`.
+    let whole = _mm_floor_ps(quotient);
+    // Remove the whole multiples of the divisor from the dividend.
+    _mm_sub_ps(a, _mm_mul_ps(whole, b))
+}
\ No newline at end of file
diff --git a/src/sse/sse_support.rs b/src/sse/sse_support.rs
index 3379922..411184f 100644
--- a/src/sse/sse_support.rs
+++ b/src/sse/sse_support.rs
@@ -1,8 +1,8 @@
+use crate::avx::shuffle;
 #[cfg(target_arch = "x86")]
 use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
-use crate::avx::shuffle;
 
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
@@ -22,7 +22,6 @@ pub unsafe fn sse_interleave_even(x: __m128i) -> __m128i {
     return new_lane;
 }
 
-
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 #[allow(dead_code)]
@@ -66,7 +65,6 @@ pub unsafe fn sse_transpose_x4(
     (row1, row2, row3, row4)
 }
 
-
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 #[allow(dead_code)]
@@ -254,6 +252,95 @@ pub unsafe fn sse_interleave_rgb(
     (v0, v1, v2)
 }
 
+/// Interleaves three planar vectors of eight 16-bit lanes each into the packed order
+/// `a0 b0 c0 a1 b1 c1 ...`; the three returned vectors are consecutive in that order.
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(always)]
+#[allow(dead_code)]
+pub unsafe fn sse_interleave_rgb_epi16(
+    a: __m128i,
+    b: __m128i,
+    c: __m128i,
+) -> (__m128i, __m128i, __m128i) {
+    let (rot_a, rot_b, rot_c) = (
+        _mm_shuffle_epi8(a, _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11)),
+        _mm_shuffle_epi8(b, _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5)),
+        _mm_shuffle_epi8(c, _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15)),
+    );
+    let head = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(rot_a, rot_b), rot_c);
+    let middle = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(rot_c, rot_a), rot_b);
+    let tail = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(rot_b, rot_c), rot_a);
+    (head, middle, tail)
+}
+
+/// Interleaves four planar vectors of eight 16-bit lanes into `a0 b0 c0 d0 a1 b1 c1 d1 ...`,
+/// returned as four consecutive vectors that each hold two 4-lane groups.
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(always)]
+#[allow(dead_code)]
+pub unsafe fn sse_interleave_rgba_epi16(
+    a: __m128i,
+    b: __m128i,
+    c: __m128i,
+    d: __m128i,
+) -> (__m128i, __m128i, __m128i, __m128i) {
+    let (ac_lo, ac_hi) = (_mm_unpacklo_epi16(a, c), _mm_unpackhi_epi16(a, c));
+    let (bd_lo, bd_hi) = (_mm_unpacklo_epi16(b, d), _mm_unpackhi_epi16(b, d));
+    (
+        _mm_unpacklo_epi16(ac_lo, bd_lo),
+        _mm_unpackhi_epi16(ac_lo, bd_lo),
+        _mm_unpacklo_epi16(ac_hi, bd_hi),
+        _mm_unpackhi_epi16(ac_hi, bd_hi),
+    )
+}
+
+/// Splits 32 packed 16-bit lanes laid out as `a0 b0 c0 d0 a1 b1 c1 d1 ...` (four consecutive
+/// vectors) into four planar vectors `(a, b, c, d)` of eight lanes each.
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(always)]
+#[allow(dead_code)]
+pub unsafe fn sse_deinterleave_rgba_epi16(
+    u0: __m128i,
+    u1: __m128i,
+    u2: __m128i,
+    u3: __m128i,
+) -> (__m128i, __m128i, __m128i, __m128i) {
+    // Stage 1: pair pixels that sit four apart -> `a0 a4 b0 b4 ...`, `a1 a5 b1 b5 ...`, etc.
+    let (p04, p15) = (_mm_unpacklo_epi16(u0, u2), _mm_unpackhi_epi16(u0, u2));
+    let (p26, p37) = (_mm_unpacklo_epi16(u1, u3), _mm_unpackhi_epi16(u1, u3));
+    // Stage 2: gather even / odd pixels of the a|b half and of the c|d half.
+    let (ab_even, cd_even) = (_mm_unpacklo_epi16(p04, p26), _mm_unpackhi_epi16(p04, p26));
+    let (ab_odd, cd_odd) = (_mm_unpacklo_epi16(p15, p37), _mm_unpackhi_epi16(p15, p37));
+    // Stage 3: zip even with odd pixels so every plane comes out in ascending order.
+    (
+        _mm_unpacklo_epi16(ab_even, ab_odd),
+        _mm_unpackhi_epi16(ab_even, ab_odd),
+        _mm_unpacklo_epi16(cd_even, cd_odd),
+        _mm_unpackhi_epi16(cd_even, cd_odd),
+    )
+}
+
+/// Splits 24 packed 16-bit lanes laid out as `a0 b0 c0 a1 b1 c1 ...` (three consecutive
+/// vectors) into three planar vectors `(a, b, c)` of eight lanes each.
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(always)]
+#[allow(dead_code)]
+pub unsafe fn sse_deinterleave_rgb_epi16(
+    v0: __m128i,
+    v1: __m128i,
+    v2: __m128i,
+) -> (__m128i, __m128i, __m128i) {
+    // Blend each channel's lanes into one vector first; the shuffles then restore lane order.
+    let ga = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(v0, v1), v2);
+    let gb = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(v2, v0), v1);
+    let gc = _mm_blend_epi16::<0x24>(_mm_blend_epi16::<0x92>(v1, v2), v0);
+    (
+        _mm_shuffle_epi8(ga, _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11)),
+        _mm_shuffle_epi8(gb, _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13)),
+        _mm_shuffle_epi8(gc, _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15)),
+    )
+}
+
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 #[allow(dead_code)]
@@ -264,7 +351,6 @@ pub unsafe fn sse_store_rgb_u8(ptr: *mut u8, r: __m128i, g: __m128i, b: __m128i)
     _mm_storeu_si128(ptr.add(32) as *mut __m128i, v2);
 }
 
-
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[inline(always)]
 #[allow(dead_code)]
@@ -294,4 +380,4 @@ pub unsafe fn sse_deinterleave_rgba_ps(
     let v2 = _mm_unpacklo_ps(t02hi, t13hi);
     let v3 = _mm_unpackhi_ps(t02hi, t13hi);
     (v0, v1, v2, v3)
-}
\ No newline at end of file
+}
diff --git a/src/sse/sse_to_linear.rs b/src/sse/sse_to_linear.rs
index b809789..c48276a 100644
--- a/src/sse/sse_to_linear.rs
+++ b/src/sse/sse_to_linear.rs
@@ -4,8 +4,6 @@ use crate::gamma_curves::TransferFunction;
 use crate::image::ImageConfiguration;
 #[allow(unused_imports)]
 use crate::image_to_xyz_lab::XyzTarget;
-#[allow(unused_imports)]
-use crate::neon_gamma_curves::*;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[allow(unused_imports)]
 use crate::sse::*;
diff --git a/src/sse/sse_to_xyz_lab.rs b/src/sse/sse_to_xyz_lab.rs
index 7fc35cd..1da973a 100644
--- a/src/sse/sse_to_xyz_lab.rs
+++ b/src/sse/sse_to_xyz_lab.rs
@@ -2,8 +2,6 @@ use crate::gamma_curves::TransferFunction;
 use crate::image::ImageConfiguration;
 use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y};
 #[allow(unused_imports)]
-use crate::neon_gamma_curves::*;
-#[allow(unused_imports)]
 use crate::sse::*;
 #[allow(unused_imports)]
 use crate::image_to_xyz_lab::XyzTarget;
diff --git a/src/sse/sse_to_xyza_laba.rs b/src/sse/sse_to_xyza_laba.rs
index 55ee60b..bbd259d 100644
--- a/src/sse/sse_to_xyza_laba.rs
+++ b/src/sse/sse_to_xyza_laba.rs
@@ -4,8 +4,6 @@ use crate::gamma_curves::TransferFunction;
 use crate::image::ImageConfiguration;
 #[allow(unused_imports)]
 use crate::image_to_xyz_lab::XyzTarget;
-#[allow(unused_imports)]
-use crate::neon_gamma_curves::*;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 #[allow(unused_imports)]
 use crate::sse::*;