From 9d834309babbdb66fb0cbf64c250ffcc35012d85 Mon Sep 17 00:00:00 2001 From: Tuomas Tonteri Date: Sat, 8 Jun 2024 15:13:47 +0300 Subject: [PATCH] Add support for b4_SSE2 batched mode (20) Signed-off-by: Tuomas Tonteri --- .github/workflows/ci.yml | 11 + src/cmake/compiler.cmake | 2 +- src/include/OSL/batched_texture.h | 9 +- src/include/OSL/llvm_util.h | 3 + src/include/OSL/rendererservices.h | 1 + src/liboslexec/CMakeLists.txt | 4 + src/liboslexec/batched_analysis.cpp | 8 +- src/liboslexec/batched_backendllvm.cpp | 1 + src/liboslexec/batched_llvm_instance.cpp | 44 +++ src/liboslexec/batched_rendservices.cpp | 1 + src/liboslexec/context.cpp | 1 + src/liboslexec/llvm_passes.h | 2 + src/liboslexec/llvm_util.cpp | 290 ++++++++++++++++-- src/liboslexec/rendservices.cpp | 7 + src/liboslexec/shadingsys.cpp | 31 +- src/testshade/batched_simplerend.cpp | 1 + src/testshade/simplerend.cpp | 4 +- src/testshade/simplerend.h | 5 + src/testshade/testshade.cpp | 19 +- .../oslbatcheddeformer.cpp | 5 + 20 files changed, 418 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3fb26aefd..1874ede0a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,6 +74,17 @@ jobs: pybind11_ver: v2.5.0 simd: sse4.2 setenvs: export CONAN_LLVM_VERSION=10.0.1 + - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2 + nametag: linux-vfx2021 + runner: ubuntu-latest + container: aswftesting/ci-osl:2021-clang11 + vfxyear: 2021 + cxx_std: 17 + openimageio_ver: v2.4.13.0 + python_ver: 3.7 + pybind11_ver: v2.7.0 + simd: sse2 + batched: b4_SSE2 - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2 nametag: linux-vfx2021 runner: ubuntu-latest diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake index c97316681..172bf1a81 100644 --- a/src/cmake/compiler.cmake +++ b/src/cmake/compiler.cmake @@ -329,7 +329,7 @@ endif () # # The USE_BATCHED option may be set to indicate that support for batched # SIMD shader execution be compiled along with targe specific libraries -set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") +set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF) set (BATCHED_SUPPORT_DEFINES "") set (BATCHED_TARGET_LIBS "") diff --git a/src/include/OSL/batched_texture.h b/src/include/OSL/batched_texture.h index c720e9bed..787664472 100644 --- a/src/include/OSL/batched_texture.h +++ b/src/include/OSL/batched_texture.h @@ -49,6 +49,9 @@ static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); template struct BatchedTextureOptions { VaryingTextureOptions varying; @@ -90,11 +93,15 @@ static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); #ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH // Code here is to validate our OSL BatchedTextureOptions is 
binary compatible // and safe to reinterpret_cast -static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8), +static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8) + || (OIIO::Tex::BatchWidth == 4), "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16"); namespace validate_offsets { diff --git a/src/include/OSL/llvm_util.h b/src/include/OSL/llvm_util.h index 7f112ccf5..49df62891 100644 --- a/src/include/OSL/llvm_util.h +++ b/src/include/OSL/llvm_util.h @@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Constant* constant(uint32_t i); /// Return an llvm::Constant holding the given integer constant. + llvm::Constant* constant4(int8_t i); + llvm::Constant* constant4(uint8_t i); llvm::Constant* constant8(int8_t i); llvm::Constant* constant8(uint8_t i); llvm::Constant* constant16(int16_t i); @@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index); llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index); + llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index); std::array op_split_16x(llvm::Value* vector_val); std::array op_split_8x(llvm::Value* vector_val); std::array op_quarter_16x(llvm::Value* vector_val); diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h index 04e5269ae..62a6b6179 100644 --- a/src/include/OSL/rendererservices.h +++ b/src/include/OSL/rendererservices.h @@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices { /// Unless overridden, a nullptr is returned. virtual BatchedRendererServices<16>* batched(WidthOf<16>); virtual BatchedRendererServices<8>* batched(WidthOf<8>); + virtual BatchedRendererServices<4>* batched(WidthOf<4>); protected: TextureSystem* m_texturesys; // A place to hold a TextureSystem diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt index 328565af6..5f2bd048a 100644 --- a/src/liboslexec/CMakeLists.txt +++ b/src/liboslexec/CMakeLists.txt @@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=core-avx2") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=corei7-avx") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=core2") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () @@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=haswell") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=sandybridge") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=core2") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp index 888f19874..9f76c1acf 100644 --- a/src/liboslexec/batched_analysis.cpp +++ b/src/liboslexec/batched_analysis.cpp @@ -1813,10 +1813,16 @@ struct Analyzer { // specific BatchedRendererServices. 
// Right here we don't know which width will be used, // so we will just require all widths provide the same answer + auto rs4 = m_ba.renderer()->batched(WidthOf<4>()); auto rs8 = m_ba.renderer()->batched(WidthOf<8>()); auto rs16 = m_ba.renderer()->batched(WidthOf<16>()); - if (rs8 || rs16) { + if (rs4 || rs8 || rs16) { get_attr_is_uniform = true; + if (rs4) { + get_attr_is_uniform + &= rs4->is_attribute_uniform(obj_name, + attr_name); + } if (rs8) { get_attr_is_uniform &= rs8->is_attribute_uniform(obj_name, diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp index e94122ef4..79f87ca90 100644 --- a/src/liboslexec/batched_backendllvm.cpp +++ b/src/liboslexec/batched_backendllvm.cpp @@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys, switch (vector_width()) { case 16: m_true_mask_value = Mask<16>(true).value(); break; case 8: m_true_mask_value = Mask<8>(true).value(); break; + case 4: m_true_mask_value = Mask<4>(true).value(); break; default: OSL_ASSERT(0 && "unsupported vector width"); } ll.dumpasm(shadingsys.m_llvm_dumpasm); diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp index 8e6ff0a76..218063786 100644 --- a/src/liboslexec/batched_llvm_instance.cpp +++ b/src/liboslexec/batched_llvm_instance.cpp @@ -537,6 +537,33 @@ const char* = "b8_AVX_"; #endif +#ifdef __OSL_SUPPORTS_b4_SSE2 +template<> +const NameAndSignature + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[] + = { +# define DECL_INDIRECT(name, signature) \ + NameAndSignature { #name, signature }, +# define DECL(name, signature) DECL_INDIRECT(name, signature) +# define __OSL_WIDTH 4 +# define __OSL_TARGET_ISA SSE2 +// Don't allow order of xmacro includes be rearranged +// clang-format off +# include "wide/define_opname_macros.h" +# include "builtindecl_wide_xmacro.h" +# include "wide/undef_opname_macros.h" +// clang-format on +# undef __OSL_TARGET_ISA +# undef __OSL_WIDTH +# undef DECL +# undef DECL_INDIRECT + }; +template<> +const char* + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string + = "b4_SSE2_"; +#endif + std::unique_ptr @@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context, default: break; } break; + case 4: + switch (target_isa) { +#ifdef __OSL_SUPPORTS_b4_SSE2 + case TargetISA::x64: + return RetType( + new ConcreteTargetLibraryHelper<4, TargetISA::x64>()); +#endif + default: break; + } + break; + default: OSL_ASSERT(0 && "unsupported vector width"); } std::cerr << "Build is not configured to support TargetISA of " @@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedTextureOptions<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedTextureOptions<8>(offset_by_index); break; @@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedShaderGlobals<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedShaderGlobals<8>(offset_by_index); break; diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp index fbff377b2..1c5fcaa4a 100644 --- a/src/liboslexec/batched_rendservices.cpp +++ b/src/liboslexec/batched_rendservices.cpp @@ -328,5 +328,6 @@ BatchedRendererServices::getmessage(BatchedShaderGlobals* bsg, // Explicitly instantiate BatchedRendererServices template template class 
OSLEXECPUBLIC BatchedRendererServices<16>; template class OSLEXECPUBLIC BatchedRendererServices<8>; +template class OSLEXECPUBLIC BatchedRendererServices<4>; OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp index a97b427e1..b001315a8 100644 --- a/src/liboslexec/context.cpp +++ b/src/liboslexec/context.cpp @@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg) // Explicit template instantiation for supported batch sizes template class ShadingContext::Batched<16>; template class ShadingContext::Batched<8>; +template class ShadingContext::Batched<4>; #endif diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h index 852ec82f9..43c7a7289 100644 --- a/src/liboslexec/llvm_passes.h +++ b/src/liboslexec/llvm_passes.h @@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final // including this file will need its own static members defined. LLVM will // assign IDs when they get registered, so this initialization value is not // important. +template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0; + template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0; template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0; diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp index 3dd888cab..807f8ab3b 100644 --- a/src/liboslexec/llvm_util.cpp +++ b/src/liboslexec/llvm_util.cpp @@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM() #ifndef OSL_LLVM_NEW_PASS_MANAGER // LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks + static llvm::RegisterPass< + LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>> + sRegCustomPass2( + "PreventBitMasksFromBeingLiveinsToBasicBlocks<4>", + "Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass", + false /* Only looks at CFG */, false /* Analysis Pass */); static llvm::RegisterPass< LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>> sRegCustomPass0( @@ -2305,7 +2311,11 @@ LLVM_Util::setup_new_optimization_passes(int optlevel, bool target_host) break; } case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! + m_new_pass_manager->module_pass_manager.addPass( + createModuleToFunctionPassAdaptor( + NewPreventBitMasksFromBeingLiveinsToBasicBlocks<4>( + context()))); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -2618,7 +2628,9 @@ LLVM_Util::setup_legacy_optimization_passes(int optlevel, bool target_host) new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>()); break; case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! 
+ mpm.add( + new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>()); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -3393,6 +3405,19 @@ LLVM_Util::constant(uint32_t i) return llvm::ConstantInt::get(context(), llvm::APInt(32, i)); } +llvm::Constant* +LLVM_Util::constant4(int8_t i) +{ + return llvm::ConstantInt::get(context(), + llvm::APInt(4, i, true /*signed*/)); +} + +llvm::Constant* +LLVM_Util::constant4(uint8_t i) +{ + return llvm::ConstantInt::get(context(), llvm::APInt(4, i)); +} + llvm::Constant* LLVM_Util::constant8(int8_t i) { @@ -3592,6 +3617,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast a 4 bit mask to an 8 bit integer + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -3659,6 +3689,33 @@ LLVM_Util::mask_as_int(llvm::Value* mask) int8_mask = builder().CreateCall(func, toArrayRef(args)); return int8_mask; } + case 4: { + // We need to do more than a simple cast to an int. Since we + // know vectorized comparison for SSE2 ends up setting 4 + // 32 bit integers to 0xFFFFFFFF or 0x00000000, we must + // extract the sign bits to build the integer mask. + + // Convert <4 x i1> -> <4 x i32> + llvm::Value* wide_int_mask = builder().CreateSExt(mask, + type_wide_int()); + + // Convert <4 x i32> -> <4 x f32> + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. However the SSE movmskps + // intrinsic works on floats, so we will cast from int32 to + // float beforehand + llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); + llvm::Value* w4_float_mask = builder().CreateBitCast(wide_int_mask, + w4_float_type); + + llvm::Function* func = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_sse_movmsk_ps); + + llvm::Value* args[1] = { w4_float_mask }; + llvm::Value* int8_mask; + int8_mask = builder().CreateCall(func, toArrayRef(args)); + return int8_mask; + } default: { OSL_ASSERT(0 && "unsupported native bit mask width"); return mask; @@ -3828,14 +3885,19 @@ LLVM_Util::int_as_mask(llvm::Value* value) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast an 8 bit integer to a 4 bit mask + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; llvm::Value* intMask = builder().CreateTrunc(value, intMaskType); result = builder().CreateBitCast(intMask, type_wide_bool()); } else { - // Since we know vectorized comparisons for AVX&AVX2 end up setting - // 8 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more + // Since we know vectorized comparisons for SSE2&AVX&AVX2 end up setting + // 32 bit integers to 0xFFFFFFFF or 0x00000000, we need to do more // than a simple cast to an int.
// Broadcast out the int32 mask to all data lanes @@ -3950,23 +4012,20 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; -#if 0 // WIP - case 4: - { - // We can just reinterpret cast a 8 bit mask to a 8 bit integer - // and all types are happy - intMaskType = type_int8(); + case 4: { + // We can just reinterpret cast a 4 bit mask to a 8 bit integer + // and all types are happy + intMaskType = type_int8(); -// extended_int_vector_type = (llvm::Type *) llvm::VectorType::get(llvm::Type::getInt32Ty (*m_llvm_context), m_vector_width); -// llvm::Value * wide_int_mask = builder().CreateSExt(mask, extended_int_vector_type); -// -// int_reinterpret_cast_vector_type = (llvm::Type *) llvm::Type::getInt128Ty (*m_llvm_context); -// zeroConstant = constant128(0); -// -// llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type); - break; - } -#endif + // extended_int_vector_type = (llvm::Type *) llvm::VectorType::get(llvm::Type::getInt32Ty (*m_llvm_context), m_vector_width); + // llvm::Value * wide_int_mask = builder().CreateSExt(mask, extended_int_vector_type); + // + // int_reinterpret_cast_vector_type = (llvm::Type *) llvm::Type::getInt128Ty (*m_llvm_context); + // zeroConstant = constant128(0); + // + // llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type); + break; + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -4455,6 +4514,19 @@ LLVM_Util::op_linearize_8x_indices(llvm::Value* wide_index) } +llvm::Value* +LLVM_Util::op_linearize_4x_indices(llvm::Value* wide_index) +{ + llvm::Value* strided_indices = op_mul(wide_index, wide_constant(4, 4)); + llvm::Constant* offsets_to_lane[4] = { constant(0), constant(1), + constant(2), constant(3) }; + llvm::Value* const_vec_offsets = llvm::ConstantVector::get( + llvm::ArrayRef(&offsets_to_lane[0], 4)); + + return op_add(strided_indices, const_vec_offsets); +} + + std::array LLVM_Util::op_split_16x(llvm::Value* vector_val) { @@ -4613,6 +4685,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_si); @@ -4663,6 +4736,16 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr), + wide_index, wide_int_mask, + constant4((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } + default: OSL_ASSERT(0 && "unsupported width"); }; } else { @@ -4680,6 +4763,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf); @@ -4739,6 +4823,17 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather; } + case 4: { + llvm::Value* args[] = { + avx2_unmasked_value, void_ptr(src_ptr), wide_index, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant4((uint8_t)4) + }; + llvm::Value* gather = builder().CreateCall(func_avx2_gather_ps, + 
                                                       toArrayRef(args)); + return gather; + } } } else { return clamped_gather_from_uniform(type_wide_float()); } @@ -4805,6 +4900,29 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } + case 4: { + // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring + llvm::Function* func_avx512_gather_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv4_di); + OSL_ASSERT(func_avx512_gather_dpq); + + auto w4_bit_masks = current_mask(); + auto w4_int_indices = wide_index; + + llvm::Value* unmasked_value + = builder().CreateVectorSplat(4, constant64((uint64_t)0)); + llvm::Value* args[] + = { unmasked_value, void_ptr(src_ptr), w4_int_indices, + mask4_as_int8(w4_bit_masks), constant(4) }; + llvm::Value* gather1 + = builder().CreateCall(func_avx512_gather_dpq, + toArrayRef(args)); + args[2] = w4_int_indices; + args[3] = mask4_as_int8(w4_bit_masks); + + return builder().CreateIntToPtr(gather1, type_wide_ustring()); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else { @@ -4841,6 +4959,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_ps, toArrayRef(args)); } + case 4: { + llvm::Function* func_avx512_gather_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf); + OSL_ASSERT(func_avx512_gather_ps); + + llvm::Value* unmasked_value = wide_constant(0.0f); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_ps, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -4889,6 +5021,19 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + auto int_indices = op_linearize_4x_indices(wide_index); + llvm::Value* args[] = { + avx2_unmasked_value, void_ptr(src_ptr), int_indices, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant8((uint8_t)4) + }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_ps, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -4926,6 +5071,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_pi, toArrayRef(args)); } + case 4: { + llvm::Function* func_avx512_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv8_si); + OSL_ASSERT(func_avx512_gather_pi); + + llvm::Value* unmasked_value = wide_constant(0); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_pi, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else if (m_supports_avx2) { @@ -4975,6 +5134,26 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Function* func_avx2_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx2_gather_d_d_256); + OSL_ASSERT(func_avx2_gather_pi); + + llvm::Constant* avx2_unmasked_value = wide_constant(8, 0); + + // Convert <4 x i1> -> <4 x i32> + llvm::Value* 
wide_int_mask + = builder().CreateSExt(current_mask(), type_wide_int()); + auto int_indices = op_linearize_4x_indices(wide_index); + llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr), + int_indices, wide_int_mask, + constant8((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -5017,7 +5196,8 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } - case 8: { + case 8: + case 4: { // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring llvm::Function* func_avx512_gather_dpq = llvm::Intrinsic::getDeclaration( @@ -5093,6 +5273,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, linear_indices = op_linearize_16x_indices(wide_index); break; case 8: linear_indices = op_linearize_8x_indices(wide_index); break; + case 4: linear_indices = op_linearize_4x_indices(wide_index); break; default: OSL_ASSERT(0 && "unsupported vector width for scatter"); }; } else { @@ -5150,6 +5331,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf); @@ -5182,6 +5364,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_si); @@ -5256,6 +5439,25 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = wide_index; + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5295,6 +5497,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); return; } + case 4: { + llvm::Function* func_avx512_scatter_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf); + OSL_ASSERT(func_avx512_scatter_ps); + + llvm::Value* args[] = { void_ptr(src_ptr), + mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5338,6 +5553,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); return; } + case 4: { + llvm::Function* func_avx512_scatter_pi + = llvm::Intrinsic::getDeclaration( + module(), 
llvm::Intrinsic::x86_avx512_scattersiv8_si); + OSL_ASSERT(func_avx512_scatter_pi); + + llvm::Value* args[] = { void_ptr(src_ptr), + mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5407,6 +5635,26 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = op_linearize_4x_indices( + wide_index); + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scatter_dpq_512); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } diff --git a/src/liboslexec/rendservices.cpp b/src/liboslexec/rendservices.cpp index c0c84b03d..b3bd5c898 100644 --- a/src/liboslexec/rendservices.cpp +++ b/src/liboslexec/rendservices.cpp @@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>) return nullptr; } +BatchedRendererServices<4>* +RendererServices::batched(WidthOf<4>) +{ + // No default implementation for batched services + return nullptr; +} + OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp index 307d57355..620c09cee 100644 --- a/src/liboslexec/shadingsys.cpp +++ b/src/liboslexec/shadingsys.cpp @@ -618,6 +618,29 @@ ShadingSystem::configure_batch_execution_at(int width) m_impl->attribute("llvm_jit_fma", 0); return true; } +# endif + if (target_requested) { + break; + } + // fallthrough + default: return false; + }; + return false; + case 4: + switch (requestedISA) { + case TargetISA::UNKNOWN: + // fallthrough + case TargetISA::x64: +# ifdef __OSL_SUPPORTS_b4_SSE2 + if (LLVM_Util::supports_isa(TargetISA::x64)) { + if (!target_requested) + m_impl->attribute("llvm_jit_target", + LLVM_Util::target_isa_name( + TargetISA::x64)); + // SSE2 doesn't support FMA + m_impl->attribute("llvm_jit_fma", 0); + return true; + } # endif if (target_requested) { break; @@ -885,6 +908,7 @@ ShadingSystem::BatchedExecutor::jit_all_groups(int nthreads) // Explicitly instantiate template class ShadingSystem::BatchedExecutor<16>; template class ShadingSystem::BatchedExecutor<8>; +template class ShadingSystem::BatchedExecutor<4>; #endif @@ -1079,7 +1103,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer, , m_opt_groupdata(true) #if OSL_USE_BATCHED , m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr) - || (renderer->batched(WidthOf<8>()) != nullptr)) + || (renderer->batched(WidthOf<8>()) != nullptr) + || (renderer->batched(WidthOf<4>()) != nullptr)) #else , m_opt_batched_analysis(false) #endif @@ -3794,7 +3819,8 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx, // the batch jit has already happened, // as it requires the ops so we can't delete them yet! 
if (((renderer()->batched(WidthOf<16>()) == nullptr) - && (renderer()->batched(WidthOf<8>()) == nullptr)) + && (renderer()->batched(WidthOf<8>()) == nullptr) + && (renderer()->batched(WidthOf<4>()) == nullptr)) || group.batch_jitted()) { group_post_jit_cleanup(group); } @@ -4015,6 +4041,7 @@ ShadingSystemImpl::Batched::jit_all_groups(int nthreads, int mythread, // machine as well, start with just the batch size template class pvt::ShadingSystemImpl::Batched<16>; template class pvt::ShadingSystemImpl::Batched<8>; +template class pvt::ShadingSystemImpl::Batched<4>; #endif int diff --git a/src/testshade/batched_simplerend.cpp b/src/testshade/batched_simplerend.cpp index 937655af4..ea2acbdf9 100644 --- a/src/testshade/batched_simplerend.cpp +++ b/src/testshade/batched_simplerend.cpp @@ -1001,6 +1001,7 @@ BatchedSimpleRenderer::get_camera_screen_window(ustringhash /*object*/, // Explicitly instantiate BatchedSimpleRenderer template template class BatchedSimpleRenderer<16>; template class BatchedSimpleRenderer<8>; +template class BatchedSimpleRenderer<4>; OSL_NAMESPACE_EXIT diff --git a/src/testshade/simplerend.cpp b/src/testshade/simplerend.cpp index 65862c2db..3582c9cc4 100644 --- a/src/testshade/simplerend.cpp +++ b/src/testshade/simplerend.cpp @@ -218,7 +218,9 @@ register_closures(OSL::ShadingSystem* shadingsys) SimpleRenderer::SimpleRenderer() #if OSL_USE_BATCHED - : m_batch_16_simple_renderer(*this), m_batch_8_simple_renderer(*this) + : m_batch_16_simple_renderer(*this) + , m_batch_8_simple_renderer(*this) + , m_batch_4_simple_renderer(*this) #endif { Matrix44 M; diff --git a/src/testshade/simplerend.h b/src/testshade/simplerend.h index 87d0b96dd..8ebe1c1fc 100644 --- a/src/testshade/simplerend.h +++ b/src/testshade/simplerend.h @@ -177,12 +177,17 @@ class SimpleRenderer : public RendererServices { { return &m_batch_8_simple_renderer; } + BatchedRendererServices<4>* batched(WidthOf<4>) override + { + return &m_batch_4_simple_renderer; + } #endif protected: #if OSL_USE_BATCHED BatchedSimpleRenderer<16> m_batch_16_simple_renderer; BatchedSimpleRenderer<8> m_batch_8_simple_renderer; + BatchedSimpleRenderer<4> m_batch_4_simple_renderer; #endif // Camera parameters diff --git a/src/testshade/testshade.cpp b/src/testshade/testshade.cpp index 39834f638..bd80a0f41 100644 --- a/src/testshade/testshade.cpp +++ b/src/testshade/testshade.cpp @@ -306,6 +306,9 @@ set_shadingsys_options() } else if ((!batch_size_requested || batch_size == 8) && shadingsys->configure_batch_execution_at(8)) { batch_size = 8; + } else if ((!batch_size_requested || batch_size == 4) + && shadingsys->configure_batch_execution_at(4)) { + batch_size = 4; } else { OSL::print( "WARNING: Hardware or library requirements to utilize batched execution"); @@ -1194,9 +1197,11 @@ setup_output_images(SimpleRenderer* rend, ShadingSystem* shadingsys, // jit_group will optimize the group if necesssary if (batch_size == 16) { shadingsys->batched<16>().jit_group(shadergroup.get(), ctx); - } else { - ASSERT((batch_size == 8) && "Unsupported batch size"); + } else if (batch_size == 8) { shadingsys->batched<8>().jit_group(shadergroup.get(), ctx); + } else { + ASSERT((batch_size == 4) && "Unsupported batch size"); + shadingsys->batched<4>().jit_group(shadergroup.get(), ctx); } } else #endif @@ -2195,13 +2200,19 @@ test_shade(int argc, const char* argv[]) batched_shade_region<16>(rend, shadergroup.get(), sub_roi, save); }); - } else { - ASSERT((batch_size == 8) && "Unsupported batch size"); + } else if (batch_size == 8) { 
OIIO::ImageBufAlgo::parallel_image( roi, num_threads, [&](OIIO::ROI sub_roi) -> void { batched_shade_region<8>(rend, shadergroup.get(), sub_roi, save); }); + } else { + ASSERT((batch_size == 4) && "Unsupported batch size"); + OIIO::ImageBufAlgo::parallel_image( + roi, num_threads, [&](OIIO::ROI sub_roi) -> void { + batched_shade_region<4>(rend, shadergroup.get(), + sub_roi, save); + }); } } else # endif diff --git a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp index 0b7af16e4..fe620fefa 100644 --- a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp +++ b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp @@ -182,10 +182,15 @@ class MyRendererServices final : public OSL::RendererServices { { return &m_batch_8_rs; } + OSL::BatchedRendererServices<4>* batched(OSL::WidthOf<4>) override + { + return &m_batch_4_rs; + } private: MyBatchedRendererServices<16> m_batch_16_rs; MyBatchedRendererServices<8> m_batch_8_rs; + MyBatchedRendererServices<4> m_batch_4_rs; };
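
For reference, the new width is used the same way as the existing 8- and 16-wide modes: build liboslexec with USE_BATCHED=b4_SSE2 (the CMake cache option documented above) and then request 4-wide execution at run time. The sketch below is illustrative only and is not part of the patch; it borrows the MyRendererServices / MyBatchedRendererServices names from the oslbatcheddeformer example above, and assumes shadingsys is an OSL::ShadingSystem created with that renderer, group is a ShaderGroupRef, and ctx is a ShadingContext.

    // Renderer side: expose a 4-wide batched interface, mirroring the
    // 8- and 16-wide overrides shown in this patch.
    OSL::BatchedRendererServices<4>* batched(OSL::WidthOf<4>) override
    {
        return &m_batch_4_rs;  // a MyBatchedRendererServices<4> member
    }

    // Host side: ask for 4-wide (SSE2) batched execution. This returns false
    // when the library was built without b4_SSE2 support or the requested
    // ISA is unavailable, in which case the host should fall back to a wider
    // batch size or to scalar execution.
    if (shadingsys->configure_batch_execution_at(4)) {
        shadingsys->batched<4>().jit_group(group.get(), ctx);
    }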