Skip to content

Commit

Permalink
Lalpha beta bugfix, codegen
Browse files Browse the repository at this point in the history
  • Loading branch information
awxkee committed Oct 15, 2024
1 parent 8bf71c8 commit 8bb2ce9
Show file tree
Hide file tree
Showing 37 changed files with 826 additions and 1,708 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ workspace = { members = ["src/app"] }

[package]
name = "colorutils-rs"
version = "0.7.0"
version = "0.7.1"
edition = "2021"
description = "High performance utilities for color format handling and conversion."
readme = "README.md"
Expand Down
6 changes: 2 additions & 4 deletions src/app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,13 @@ fn main() {
lab_store.resize(width as usize * components * height as usize, 0.);
let src_stride = width * components as u32;
let start_time = Instant::now();
rgba_to_jzazbz(
rgba_to_lalphabeta(
src_bytes,
src_stride,
&mut lab_store,
store_stride as u32,
width,
height,
200.,
TransferFunction::Srgb,
);
let elapsed_time = start_time.elapsed();
Expand Down Expand Up @@ -104,14 +103,13 @@ fn main() {
// }

let start_time = Instant::now();
jzazbz_to_rgba(
lalphabeta_to_rgba(
&lab_store,
store_stride as u32,
&mut dst_slice,
src_stride,
width,
height,
200.,
TransferFunction::Srgb,
);

Expand Down
5 changes: 1 addition & 4 deletions src/avx/image_to_oklab.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@ use crate::avx::routines::avx_vld_f32_and_deinterleave;
use crate::avx::{_mm256_color_matrix_ps, avx2_interleave_rgb_ps, avx2_interleave_rgba_ps};
use crate::image::ImageConfiguration;
use crate::image_to_oklab::OklabTarget;
use crate::{
avx_store_and_interleave_v3_direct_f32, avx_store_and_interleave_v4_direct_f32
,
};
use crate::{avx_store_and_interleave_v3_direct_f32, avx_store_and_interleave_v4_direct_f32};
use erydanos::{_mm256_atan2_ps, _mm256_cbrt_fast_ps, _mm256_hypot_fast_ps};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
Expand Down
2 changes: 1 addition & 1 deletion src/avx/support.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,4 +452,4 @@ pub unsafe fn avx2_div_by255(v: __m256i) -> __m256i {
let multiplier = _mm256_set1_epi16(-32640);
let r = _mm256_mulhi_epu16(x, multiplier);
_mm256_srli_epi16::<7>(r)
}
}
9 changes: 6 additions & 3 deletions src/avx/to_xyz_lab.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,13 @@ use crate::avx::cie::{
use crate::avx::routines::avx_vld_f32_and_deinterleave;
use crate::avx::*;
use crate::image::ImageConfiguration;
use crate::sse::{sse_interleave_ps_rgb, sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv, sse_triple_to_xyz};
use crate::sse::{sse_deinterleave_rgb_ps, sse_deinterleave_rgba_ps};
use crate::sse::{
sse_interleave_ps_rgb, sse_triple_to_lab, sse_triple_to_lch, sse_triple_to_luv,
sse_triple_to_xyz,
};
use crate::xyz_target::XyzTarget;
use crate::{avx_store_and_interleave_v3_direct_f32, load_f32_and_deinterleave};
use crate::sse::{sse_deinterleave_rgba_ps, sse_deinterleave_rgb_ps};

#[target_feature(enable = "avx2")]
pub unsafe fn avx2_image_to_xyz_lab<
Expand Down Expand Up @@ -101,7 +104,7 @@ pub unsafe fn avx2_image_to_xyz_lab<
}

while cx + 4 < width as usize {
let src_ptr = ((src as * const u8).add(src_offset) as *const f32).add(cx * channels);
let src_ptr = ((src as *const u8).add(src_offset) as *const f32).add(cx * channels);
let (r_chan, g_chan, b_chan, a_chan) =
load_f32_and_deinterleave!(src_ptr, image_configuration);

Expand Down
212 changes: 62 additions & 150 deletions src/image_to_jzazbz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,177 +77,89 @@ fn channels_to_jzaz<const CHANNELS_CONFIGURATION: u8, const TARGET: u8>(
lut_table[i] = transfer_function.linearize(i as f32 * (1. / 255.0));
}

let iter;
#[cfg(feature = "rayon")]
{
dst_slice_safe_align
iter = dst_slice_safe_align
.par_chunks_exact_mut(dst_stride as usize)
.zip(src.par_chunks_exact(src_stride as usize))
.for_each(|(dst, src)| unsafe {
let mut _cx = 0usize;

let dst_ptr = dst.as_mut_ptr() as *mut f32;

let mut linearized_row = vec![0f32; width as usize * channels];
for (linear_chunk, src_chunk) in linearized_row
.chunks_exact_mut(channels)
.zip(src.chunks_exact(channels))
{
linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table
.get_unchecked(
src_chunk[image_configuration.get_r_channel_offset()] as usize,
);
linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table
.get_unchecked(
src_chunk[image_configuration.get_g_channel_offset()] as usize,
);
linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table
.get_unchecked(
src_chunk[image_configuration.get_b_channel_offset()] as usize,
);
if image_configuration.has_alpha() {
linear_chunk[image_configuration.get_a_channel_offset()] =
src_chunk[image_configuration.get_a_channel_offset()] as f32
* (1. / 255.0);
}
}

if let Some(dispatcher) = _wide_row_handle {
_cx = dispatcher(
_cx,
linearized_row.as_ptr(),
0,
width,
dst.as_mut_ptr() as *mut f32,
0,
display_luminance,
);
}

for x in _cx..width as usize {
let px = x * channels;

let src = linearized_row.get_unchecked(px..);
let r = *src.get_unchecked(image_configuration.get_r_channel_offset());
let g = *src.get_unchecked(image_configuration.get_g_channel_offset());
let b = *src.get_unchecked(image_configuration.get_b_channel_offset());

let xyz = Xyz::from_linear_rgb(Rgb::<f32>::new(r, g, b), &SRGB_TO_XYZ_D65);

let dst_store = dst_ptr.add(px);

match target {
JzazbzTarget::Jzazbz => {
let jzazbz =
Jzazbz::from_xyz_with_display_luminance(xyz, display_luminance);

dst_store.write_unaligned(jzazbz.jz);
dst_store.add(1).write_unaligned(jzazbz.az);
dst_store.add(2).write_unaligned(jzazbz.bz);
}
JzazbzTarget::Jzczhz => {
let jzczhz =
Jzczhz::from_xyz_with_display_luminance(xyz, display_luminance);

dst_store.write_unaligned(jzczhz.jz);
dst_store.add(1).write_unaligned(jzczhz.cz);
dst_store.add(2).write_unaligned(jzczhz.hz);
}
}

if image_configuration.has_alpha() {
let a = *src.get_unchecked(image_configuration.get_a_channel_offset());
dst_store.add(3).write_unaligned(a);
}
}
});
.zip(src.par_chunks_exact(src_stride as usize));
}

#[cfg(not(feature = "rayon"))]
{
for (dst, src) in dst_slice_safe_align
iter = dst_slice_safe_align
.chunks_exact_mut(dst_stride as usize)
.zip(src.chunks_exact(src_stride as usize))
{
unsafe {
let mut _cx = 0usize;
.zip(src.chunks_exact(src_stride as usize));
}

let dst_ptr = dst.as_mut_ptr() as *mut f32;
iter.for_each(|(dst, src)| unsafe {
let mut _cx = 0usize;

let mut linearized_row = vec![0f32; width as usize * channels];
for (linear_chunk, src_chunk) in linearized_row
.chunks_exact_mut(channels)
.zip(src.chunks_exact(channels))
{
linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table
.get_unchecked(
src_chunk[image_configuration.get_r_channel_offset()] as usize,
);
linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table
.get_unchecked(
src_chunk[image_configuration.get_g_channel_offset()] as usize,
);
linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table
.get_unchecked(
src_chunk[image_configuration.get_b_channel_offset()] as usize,
);
if image_configuration.has_alpha() {
linear_chunk[image_configuration.get_a_channel_offset()] =
src_chunk[image_configuration.get_a_channel_offset()] as f32
* (1. / 255.0);
}
}
let dst_ptr = dst.as_mut_ptr() as *mut f32;

if let Some(dispatcher) = _wide_row_handle {
_cx = dispatcher(
_cx,
linearized_row.as_ptr(),
0,
width,
dst.as_mut_ptr() as *mut f32,
0,
display_luminance,
);
}
let mut linearized_row = vec![0f32; width as usize * channels];
for (linear_chunk, src_chunk) in linearized_row
.chunks_exact_mut(channels)
.zip(src.chunks_exact(channels))
{
linear_chunk[image_configuration.get_r_channel_offset()] = *lut_table
.get_unchecked(src_chunk[image_configuration.get_r_channel_offset()] as usize);
linear_chunk[image_configuration.get_g_channel_offset()] = *lut_table
.get_unchecked(src_chunk[image_configuration.get_g_channel_offset()] as usize);
linear_chunk[image_configuration.get_b_channel_offset()] = *lut_table
.get_unchecked(src_chunk[image_configuration.get_b_channel_offset()] as usize);
if image_configuration.has_alpha() {
linear_chunk[image_configuration.get_a_channel_offset()] =
src_chunk[image_configuration.get_a_channel_offset()] as f32 * (1. / 255.0);
}
}

for x in _cx..width as usize {
let px = x * channels;
if let Some(dispatcher) = _wide_row_handle {
_cx = dispatcher(
_cx,
linearized_row.as_ptr(),
0,
width,
dst.as_mut_ptr() as *mut f32,
0,
display_luminance,
);
}

let src = linearized_row.get_unchecked(px..);
let r = *src.get_unchecked(image_configuration.get_r_channel_offset());
let g = *src.get_unchecked(image_configuration.get_g_channel_offset());
let b = *src.get_unchecked(image_configuration.get_b_channel_offset());
for x in _cx..width as usize {
let px = x * channels;

let xyz = Xyz::from_linear_rgb(Rgb::<f32>::new(r, g, b), &SRGB_TO_XYZ_D65);
let src = linearized_row.get_unchecked(px..);
let r = *src.get_unchecked(image_configuration.get_r_channel_offset());
let g = *src.get_unchecked(image_configuration.get_g_channel_offset());
let b = *src.get_unchecked(image_configuration.get_b_channel_offset());

let dst_store = dst_ptr.add(px);
let xyz = Xyz::from_linear_rgb(Rgb::<f32>::new(r, g, b), &SRGB_TO_XYZ_D65);

match target {
JzazbzTarget::Jzazbz => {
let jzazbz =
Jzazbz::from_xyz_with_display_luminance(xyz, display_luminance);
let dst_store = dst_ptr.add(px);

dst_store.write_unaligned(jzazbz.jz);
dst_store.add(1).write_unaligned(jzazbz.az);
dst_store.add(2).write_unaligned(jzazbz.bz);
}
JzazbzTarget::Jzczhz => {
let jzczhz =
Jzczhz::from_xyz_with_display_luminance(xyz, display_luminance);
match target {
JzazbzTarget::Jzazbz => {
let jzazbz = Jzazbz::from_xyz_with_display_luminance(xyz, display_luminance);

dst_store.write_unaligned(jzczhz.jz);
dst_store.add(1).write_unaligned(jzczhz.cz);
dst_store.add(2).write_unaligned(jzczhz.hz);
}
}
dst_store.write_unaligned(jzazbz.jz);
dst_store.add(1).write_unaligned(jzazbz.az);
dst_store.add(2).write_unaligned(jzazbz.bz);
}
JzazbzTarget::Jzczhz => {
let jzczhz = Jzczhz::from_xyz_with_display_luminance(xyz, display_luminance);

if image_configuration.has_alpha() {
let a = *src.get_unchecked(image_configuration.get_a_channel_offset());
dst_store.add(3).write_unaligned(a);
}
dst_store.write_unaligned(jzczhz.jz);
dst_store.add(1).write_unaligned(jzczhz.cz);
dst_store.add(2).write_unaligned(jzczhz.hz);
}
}

if image_configuration.has_alpha() {
let a = *src.get_unchecked(image_configuration.get_a_channel_offset());
dst_store.add(3).write_unaligned(a);
}
}
}
});
}

/// This function converts RGB to Jzazbz against the D65 white point. This is much more efficient than a naive direct transformation
Expand Down
Loading

0 comments on commit 8bb2ce9

Please sign in to comment.