diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3fb26aefdf..1874ede0a4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,6 +74,17 @@ jobs: pybind11_ver: v2.5.0 simd: sse4.2 setenvs: export CONAN_LLVM_VERSION=10.0.1 + - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2 + nametag: linux-vfx2021 + runner: ubuntu-latest + container: aswftesting/ci-osl:2021-clang11 + vfxyear: 2021 + cxx_std: 17 + openimageio_ver: v2.4.13.0 + python_ver: 3.7 + pybind11_ver: v2.7.0 + simd: sse2 + batched: b4_SSE2 - desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 avx2 batch-b8avx2 nametag: linux-vfx2021 runner: ubuntu-latest diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c612a27e2..ee864a5c6d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,7 @@ else () endif () set (OSL_LIBNAME_SUFFIX "" CACHE STRING "Optional name appended to ${PROJECT_NAME} libraries that are built") -option (OSL_BUILD_TESTS "Build the unit tests, testshade, testrender" ON) +option (OSL_BUILD_TESTS "Build the unit tests, testminimal, testshade, testrender" ON) if (WIN32) option (USE_LLVM_BITCODE "Generate embedded LLVM bitcode" OFF) else () @@ -220,6 +220,7 @@ add_subdirectory (src/oslc) add_subdirectory (src/oslinfo) if (OSL_BUILD_TESTS AND BUILD_TESTING) + add_subdirectory (src/testminimal) add_subdirectory (src/testshade) add_subdirectory (src/testrender) endif () diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake index c97316681c..172bf1a818 100644 --- a/src/cmake/compiler.cmake +++ b/src/cmake/compiler.cmake @@ -329,7 +329,7 @@ endif () # # The USE_BATCHED option may be set to indicate that support for batched # SIMD shader execution be compiled along with targe specific libraries -set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") +set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, 
b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)") option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF) set (BATCHED_SUPPORT_DEFINES "") set (BATCHED_TARGET_LIBS "") diff --git a/src/cmake/testing.cmake b/src/cmake/testing.cmake index fc5e956b05..c3c0bee88e 100644 --- a/src/cmake/testing.cmake +++ b/src/cmake/testing.cmake @@ -270,7 +270,7 @@ macro (osl_add_all_tests) bug-array-heapoffsets bug-locallifetime bug-outputinit bug-param-duplicate bug-peep bug-return calculatenormal-reg - cellnoise closure closure-array closure-layered closure-parameters closure-zero closure-conditional + cellnoise closure closure-array closure-layered closure-parameters closure-string closure-zero closure-conditional color color-reg colorspace comparison complement-reg compile-buffer compassign-bool compassign-reg component-range diff --git a/src/include/OSL/batched_texture.h b/src/include/OSL/batched_texture.h index c720e9bedc..7876644720 100644 --- a/src/include/OSL/batched_texture.h +++ b/src/include/OSL/batched_texture.h @@ -49,6 +49,9 @@ static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); template struct BatchedTextureOptions { VaryingTextureOptions varying; @@ -90,11 +93,15 @@ static_assert(std::alignment_of>::value static_assert(std::alignment_of>::value == VecReg<8>::alignment, "Expect alignment of data member to set alignment of struct"); +static_assert(std::alignment_of>::value + == VecReg<4>::alignment, + "Expect alignment of data member to set alignment of struct"); #ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH // Code here is to validate our OSL BatchedTextureOptions is binary compatible // and safe to reinterpret_cast -static_assert((OIIO::Tex::BatchWidth == 16) 
|| (OIIO::Tex::BatchWidth == 8), +static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8) + || (OIIO::Tex::BatchWidth == 4), "This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16"); namespace validate_offsets { diff --git a/src/include/OSL/llvm_util.h b/src/include/OSL/llvm_util.h index 7f112ccf52..49df628917 100644 --- a/src/include/OSL/llvm_util.h +++ b/src/include/OSL/llvm_util.h @@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Constant* constant(uint32_t i); /// Return an llvm::Constant holding the given integer constant. + llvm::Constant* constant4(int8_t i); + llvm::Constant* constant4(uint8_t i); llvm::Constant* constant8(int8_t i); llvm::Constant* constant8(uint8_t i); llvm::Constant* constant16(int16_t i); @@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util { llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index); llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index); + llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index); std::array op_split_16x(llvm::Value* vector_val); std::array op_split_8x(llvm::Value* vector_val); std::array op_quarter_16x(llvm::Value* vector_val); diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h index 04e5269ae0..62a6b61793 100644 --- a/src/include/OSL/rendererservices.h +++ b/src/include/OSL/rendererservices.h @@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices { /// Unless overridden, a nullptr is returned. 
virtual BatchedRendererServices<16>* batched(WidthOf<16>); virtual BatchedRendererServices<8>* batched(WidthOf<8>); + virtual BatchedRendererServices<4>* batched(WidthOf<4>); protected: TextureSystem* m_texturesys; // A place to hold a TextureSystem diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt index 328565af68..5f2bd048a4 100644 --- a/src/liboslexec/CMakeLists.txt +++ b/src/liboslexec/CMakeLists.txt @@ -380,6 +380,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=core-avx2") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=corei7-avx") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=core2") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () @@ -455,6 +457,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST}) list (APPEND TARGET_CXX_OPTS "-march=haswell") elseif (${TARGET_OPT_ISA} STREQUAL "AVX") list (APPEND TARGET_CXX_OPTS "-march=sandybridge") + elseif (${TARGET_OPT_ISA} STREQUAL "SSE2") + list (APPEND TARGET_CXX_OPTS "-march=core2") else () message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}") endif () diff --git a/src/liboslexec/batched_analysis.cpp b/src/liboslexec/batched_analysis.cpp index 888f198741..9f76c1acf6 100644 --- a/src/liboslexec/batched_analysis.cpp +++ b/src/liboslexec/batched_analysis.cpp @@ -1813,10 +1813,16 @@ struct Analyzer { // specific BatchedRendererServices. 
// Right here we don't know which width will be used, // so we will just require all widths provide the same answer + auto rs4 = m_ba.renderer()->batched(WidthOf<4>()); auto rs8 = m_ba.renderer()->batched(WidthOf<8>()); auto rs16 = m_ba.renderer()->batched(WidthOf<16>()); - if (rs8 || rs16) { + if (rs4 || rs8 || rs16) { get_attr_is_uniform = true; + if (rs4) { + get_attr_is_uniform + &= rs4->is_attribute_uniform(obj_name, + attr_name); + } if (rs8) { get_attr_is_uniform &= rs8->is_attribute_uniform(obj_name, diff --git a/src/liboslexec/batched_backendllvm.cpp b/src/liboslexec/batched_backendllvm.cpp index e94122ef43..79f87ca900 100644 --- a/src/liboslexec/batched_backendllvm.cpp +++ b/src/liboslexec/batched_backendllvm.cpp @@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys, switch (vector_width()) { case 16: m_true_mask_value = Mask<16>(true).value(); break; case 8: m_true_mask_value = Mask<8>(true).value(); break; + case 4: m_true_mask_value = Mask<4>(true).value(); break; default: OSL_ASSERT(0 && "unsupported vector width"); } ll.dumpasm(shadingsys.m_llvm_dumpasm); diff --git a/src/liboslexec/batched_llvm_instance.cpp b/src/liboslexec/batched_llvm_instance.cpp index 8e6ff0a76d..2180637861 100644 --- a/src/liboslexec/batched_llvm_instance.cpp +++ b/src/liboslexec/batched_llvm_instance.cpp @@ -537,6 +537,33 @@ const char* = "b8_AVX_"; #endif +#ifdef __OSL_SUPPORTS_b4_SSE2 +template<> +const NameAndSignature + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[] + = { +# define DECL_INDIRECT(name, signature) \ + NameAndSignature { #name, signature }, +# define DECL(name, signature) DECL_INDIRECT(name, signature) +# define __OSL_WIDTH 4 +# define __OSL_TARGET_ISA SSE2 +// Don't allow order of xmacro includes be rearranged +// clang-format off +# include "wide/define_opname_macros.h" +# include "builtindecl_wide_xmacro.h" +# include "wide/undef_opname_macros.h" +// clang-format on +# undef __OSL_TARGET_ISA +# undef 
__OSL_WIDTH +# undef DECL +# undef DECL_INDIRECT + }; +template<> +const char* + ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string + = "b4_SSE2_"; +#endif + std::unique_ptr @@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context, default: break; } break; + case 4: + switch (target_isa) { +#ifdef __OSL_SUPPORTS_b4_SSE2 + case TargetISA::x64: + return RetType( + new ConcreteTargetLibraryHelper<4, TargetISA::x64>()); +#endif + default: break; + } + break; + default: OSL_ASSERT(0 && "unsupported vector width"); } std::cerr << "Build is not configured to support TargetISA of " @@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedTextureOptions<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedTextureOptions<8>(offset_by_index); break; @@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run() { std::vector offset_by_index; switch (m_width) { + case 4: + build_offsets_of_BatchedShaderGlobals<4>(offset_by_index); + break; case 8: build_offsets_of_BatchedShaderGlobals<8>(offset_by_index); break; diff --git a/src/liboslexec/batched_rendservices.cpp b/src/liboslexec/batched_rendservices.cpp index fbff377b25..1c5fcaa4a6 100644 --- a/src/liboslexec/batched_rendservices.cpp +++ b/src/liboslexec/batched_rendservices.cpp @@ -328,5 +328,6 @@ BatchedRendererServices::getmessage(BatchedShaderGlobals* bsg, // Explicitly instantiate BatchedRendererServices template template class OSLEXECPUBLIC BatchedRendererServices<16>; template class OSLEXECPUBLIC BatchedRendererServices<8>; +template class OSLEXECPUBLIC BatchedRendererServices<4>; OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/context.cpp b/src/liboslexec/context.cpp index a97b427e1b..b001315a8e 100644 --- a/src/liboslexec/context.cpp +++ b/src/liboslexec/context.cpp @@ -674,6 +674,7 @@ osl_incr_layers_executed(ShaderGlobals* sg) // Explicit template instantiation 
for supported batch sizes template class ShadingContext::Batched<16>; template class ShadingContext::Batched<8>; +template class ShadingContext::Batched<4>; #endif diff --git a/src/liboslexec/llvm_passes.h b/src/liboslexec/llvm_passes.h index 852ec82f94..43c7a72894 100644 --- a/src/liboslexec/llvm_passes.h +++ b/src/liboslexec/llvm_passes.h @@ -435,6 +435,8 @@ class LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks final // including this file will need its own static members defined. LLVM will // assign IDs when they get registered, so this initialization value is not // important. +template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>::ID = 0; + template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>::ID = 0; template<> char LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<16>::ID = 0; diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp index 3dd888cab0..2e758ec309 100644 --- a/src/liboslexec/llvm_util.cpp +++ b/src/liboslexec/llvm_util.cpp @@ -619,6 +619,12 @@ LLVM_Util::SetupLLVM() #ifndef OSL_LLVM_NEW_PASS_MANAGER // LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks + static llvm::RegisterPass< + LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>> + sRegCustomPass2( + "PreventBitMasksFromBeingLiveinsToBasicBlocks<4>", + "Prevent Bit Masks <4xi1> From Being Liveins To Basic Blocks Pass", + false /* Only looks at CFG */, false /* Analysis Pass */); static llvm::RegisterPass< LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>> sRegCustomPass0( @@ -2305,7 +2311,11 @@ LLVM_Util::setup_new_optimization_passes(int optlevel, bool target_host) break; } case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! 
+ m_new_pass_manager->module_pass_manager.addPass( + createModuleToFunctionPassAdaptor( + NewPreventBitMasksFromBeingLiveinsToBasicBlocks<4>( + context()))); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -2618,7 +2628,9 @@ LLVM_Util::setup_legacy_optimization_passes(int optlevel, bool target_host) new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<8>()); break; case 4: - // We don't use masking or SIMD shading for 4-wide + // MUST BE THE FINAL PASS! + mpm.add( + new LegacyPreventBitMasksFromBeingLiveinsToBasicBlocks<4>()); break; default: std::cout << "m_vector_width = " << m_vector_width << "\n"; @@ -3393,6 +3405,19 @@ LLVM_Util::constant(uint32_t i) return llvm::ConstantInt::get(context(), llvm::APInt(32, i)); } +llvm::Constant* +LLVM_Util::constant4(int8_t i) +{ + return llvm::ConstantInt::get(context(), + llvm::APInt(4, i, true /*signed*/)); +} + +llvm::Constant* +LLVM_Util::constant4(uint8_t i) +{ + return llvm::ConstantInt::get(context(), llvm::APInt(4, i)); +} + llvm::Constant* LLVM_Util::constant8(int8_t i) { @@ -3592,6 +3617,11 @@ LLVM_Util::mask_as_int(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; + case 4: + // We can just reinterpret cast a 4 bit mask to a 8 bit integer + // and all types are happy + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -3659,6 +3689,33 @@ LLVM_Util::mask_as_int(llvm::Value* mask) int8_mask = builder().CreateCall(func, toArrayRef(args)); return int8_mask; } + case 4: { + // We need to do more than a simple cast to an int. Since we + // know vectorized comparison for SSE2 ends up setting 4 + // 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to + // do more than a simple cast to an int. 
+ + // Convert <4 x i1> -> <4 x i32> + llvm::Value* wide_int_mask = builder().CreateSExt(mask, + type_wide_int()); + + // Convert <4 x i32> -> <4 x f32> + // Now we will use the horizontal sign extraction intrinsic + // to build a 32 bit mask value. However the 128bit SSE + // version (movmskps) works on floats, so we will cast from + // int32 to float beforehand + llvm::Type* w4_float_type = llvm_vector_type(m_llvm_type_float, 4); + llvm::Value* w4_float_mask = builder().CreateBitCast(wide_int_mask, + w4_float_type); + + llvm::Function* func = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_sse_movmsk_ps); + + llvm::Value* args[1] = { w4_float_mask }; + llvm::Value* int8_mask; + int8_mask = builder().CreateCall(func, toArrayRef(args)); + return int8_mask; + } default: { OSL_ASSERT(0 && "unsupported native bit mask width"); return mask; @@ -3828,14 +3885,19 @@ LLVM_Util::int_as_mask(llvm::Value* value) // and all types are happy intMaskType = type_int8(); break; + case 4: + // NOTE(review): i8 is 8 bits wide but a 4 lane bit mask is only 4; + // confirm the CreateBitCast below accepts that size mismatch + intMaskType = type_int8(); + break; default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; llvm::Value* intMask = builder().CreateTrunc(value, intMaskType); result = builder().CreateBitCast(intMask, type_wide_bool()); } else { - // Since we know vectorized comparisons for AVX&AVX2 end up setting - // 8 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more + // Since we know vectorized comparisons for SSE2&AVX&AVX2 end up setting + // 32 bit integers to 0xFFFFFFFF or 0x00000000, We need to do more // than a simple cast to an int.
// Broadcast out the int32 mask to all data lanes @@ -3950,23 +4012,20 @@ LLVM_Util::op_1st_active_lane_of(llvm::Value* mask) // and all types are happy intMaskType = type_int8(); break; -#if 0 // WIP - case 4: - { - // We can just reinterpret cast a 8 bit mask to a 8 bit integer - // and all types are happy - intMaskType = type_int8(); + case 4: { + // We can just reinterpret cast a 4 bit mask to a 8 bit integer + // and all types are happy + intMaskType = type_int8(); -// extended_int_vector_type = (llvm::Type *) llvm::VectorType::get(llvm::Type::getInt32Ty (*m_llvm_context), m_vector_width); -// llvm::Value * wide_int_mask = builder().CreateSExt(mask, extended_int_vector_type); -// -// int_reinterpret_cast_vector_type = (llvm::Type *) llvm::Type::getInt128Ty (*m_llvm_context); -// zeroConstant = constant128(0); -// -// llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type); - break; - } -#endif + // extended_int_vector_type = (llvm::Type *) llvm::VectorType::get(llvm::Type::getInt32Ty (*m_llvm_context), m_vector_width); + // llvm::Value * wide_int_mask = builder().CreateSExt(mask, extended_int_vector_type); + // + // int_reinterpret_cast_vector_type = (llvm::Type *) llvm::Type::getInt128Ty (*m_llvm_context); + // zeroConstant = constant128(0); + // + // llvm::Value * mask_as_int = builder().CreateBitCast (wide_int_mask, int_reinterpret_cast_vector_type); + break; + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -4455,6 +4514,19 @@ LLVM_Util::op_linearize_8x_indices(llvm::Value* wide_index) } +llvm::Value* +LLVM_Util::op_linearize_4x_indices(llvm::Value* wide_index) +{ + llvm::Value* strided_indices = op_mul(wide_index, wide_constant(4, 4)); + llvm::Constant* offsets_to_lane[4] = { constant(0), constant(1), + constant(2), constant(3) }; + llvm::Value* const_vec_offsets = llvm::ConstantVector::get( + llvm::ArrayRef(&offsets_to_lane[0], 4)); + + return op_add(strided_indices, const_vec_offsets); 
+} + + std::array LLVM_Util::op_split_16x(llvm::Value* vector_val) { @@ -4613,6 +4685,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_si); @@ -4663,6 +4736,16 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr), + wide_index, wide_int_mask, + constant8((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } + default: OSL_ASSERT(0 && "unsupported width"); }; } else { @@ -4680,6 +4763,7 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, module(), llvm::Intrinsic::x86_avx512_gather_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_gather_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_gather3siv8_sf); @@ -4739,6 +4823,17 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather; } + case 4: { + llvm::Value* args[] = { + avx2_unmasked_value, void_ptr(src_ptr), wide_index, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant8((uint8_t)4) + }; + llvm::Value* gather = builder().CreateCall(func_avx2_gather_ps, + toArrayRef(args)); + return gather; + } } } else { return clamped_gather_from_uniform(type_wide_float()); @@ -4805,6 +4900,29 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } + case 4: { + // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring + llvm::Function* func_avx512_gather_dpq + = llvm::Intrinsic::getDeclaration( + module(),
llvm::Intrinsic::x86_avx512_gather3siv4_di); + OSL_ASSERT(func_avx512_gather_dpq); + + auto w4_bit_masks = current_mask(); + auto w4_int_indices = wide_index; + + llvm::Value* unmasked_value + = builder().CreateVectorSplat(4, constant64((uint64_t)0)); + llvm::Value* args[] + = { unmasked_value, void_ptr(src_ptr), w4_int_indices, + mask4_as_int8(w4_bit_masks), constant(4) }; + llvm::Value* gather1 + = builder().CreateCall(func_avx512_gather_dpq, + toArrayRef(args)); + // A single gather fills all 4 lanes, so there is no second + // call that would need args[] to be refreshed. + + return builder().CreateIntToPtr(gather1, type_wide_ustring()); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else { @@ -4841,6 +4959,20 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_ps, toArrayRef(args)); } + case 4: { + llvm::Function* func_avx512_gather_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv4_sf); + OSL_ASSERT(func_avx512_gather_ps); + + llvm::Value* unmasked_value = wide_constant(0.0f); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_ps, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); }; @@ -4889,6 +5021,19 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + auto int_indices = op_linearize_4x_indices(wide_index); + llvm::Value* args[] = { + avx2_unmasked_value, void_ptr(src_ptr), int_indices, + builder().CreateBitCast(wide_int_mask, + llvm_vector_type(type_float(), 4)), + constant8((uint8_t)4) + }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_ps, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -4926,6 +5071,20 @@
LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, return builder().CreateCall(func_avx512_gather_pi, toArrayRef(args)); } + case 4: { + llvm::Function* func_avx512_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_gather3siv4_si); + OSL_ASSERT(func_avx512_gather_pi); + + llvm::Value* unmasked_value = wide_constant(0); + llvm::Value* args[] = { unmasked_value, void_ptr(src_ptr), + op_linearize_4x_indices(wide_index), + mask_as_int8(current_mask()), + constant(4) }; + return builder().CreateCall(func_avx512_gather_pi, + toArrayRef(args)); + } default: OSL_ASSERT(0 && "unsupported native bit mask width"); } } else if (m_supports_avx2) { @@ -4975,6 +5134,26 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, toArrayRef(args)); return gather_result; } + case 4: { + llvm::Function* func_avx2_gather_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx2_gather_d_d); + OSL_ASSERT(func_avx2_gather_pi); + + llvm::Constant* avx2_unmasked_value = wide_constant(4, 0); + + // Convert <4 x i1> -> <4 x i32>, the mask form the 128bit gather takes + llvm::Value* wide_int_mask + = builder().CreateSExt(current_mask(), type_wide_int()); + auto int_indices = op_linearize_4x_indices(wide_index); + llvm::Value* args[] = { avx2_unmasked_value, void_ptr(src_ptr), + int_indices, wide_int_mask, + constant8((uint8_t)4) }; + llvm::Value* gather_result + = builder().CreateCall(func_avx2_gather_pi, + toArrayRef(args)); + return gather_result; + } default: OSL_ASSERT(0 && "unsupported vector width for avx2 gather"); } @@ -5017,7 +5196,8 @@ LLVM_Util::op_gather(llvm::Type* src_type, llvm::Value* src_ptr, gather2), type_wide_ustring()); } - case 8: { + case 8: + case 4: { // Gather 64bit integer, as that is binary compatible with 64bit pointers of ustring llvm::Function* func_avx512_gather_dpq = llvm::Intrinsic::getDeclaration( @@ -5093,6 +5273,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type,
linear_indices = op_linearize_16x_indices(wide_index); break; case 8: linear_indices = op_linearize_8x_indices(wide_index); break; + case 4: linear_indices = op_linearize_4x_indices(wide_index); break; default: OSL_ASSERT(0 && "unsupported vector width for scatter"); }; } else { @@ -5150,6 +5331,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dps_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_ps = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_sf); @@ -5182,6 +5364,7 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, module(), llvm::Intrinsic::x86_avx512_scatter_dpi_512); break; case 8: + case 4: int_mask = mask_as_int8(current_mask()); func_avx512_scatter_pi = llvm::Intrinsic::getDeclaration( module(), llvm::Intrinsic::x86_avx512_scattersiv8_si); @@ -5256,6 +5439,25 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = wide_index; + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv4_di); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5295,6 +5497,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); return; } + case 4: { + llvm::Function*
func_avx512_scatter_ps + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv4_sf); + OSL_ASSERT(func_avx512_scatter_ps); + + llvm::Value* args[] = { void_ptr(src_ptr), + mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_ps, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5338,6 +5553,19 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); return; } + case 4: { + llvm::Function* func_avx512_scatter_pi + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv4_si); + OSL_ASSERT(func_avx512_scatter_pi); + + llvm::Value* args[] = { void_ptr(src_ptr), + mask_as_int8(current_mask()), + op_linearize_4x_indices(wide_index), + wide_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_pi, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } @@ -5407,6 +5635,26 @@ LLVM_Util::op_scatter(llvm::Value* wide_val, llvm::Type* src_type, builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); return; } + case 4: { + llvm::Value* linear_indices = op_linearize_4x_indices( + wide_index); + + llvm::Function* func_avx512_scatter_dpq + = llvm::Intrinsic::getDeclaration( + module(), llvm::Intrinsic::x86_avx512_scattersiv4_di); + OSL_ASSERT(func_avx512_scatter_dpq); + + llvm::Type* wide_address_int_type + = llvm_vector_type(type_addrint(), 4); + llvm::Value* address_int_val + = builder().CreatePtrToInt(wide_val, wide_address_int_type); + + llvm::Value* args[] + = { void_ptr(src_ptr), mask_as_int8(current_mask()), + linear_indices, address_int_val, constant(4) }; + builder().CreateCall(func_avx512_scatter_dpq, toArrayRef(args)); + return; + } default: OSL_ASSERT(0 && "incomplete vector width for AVX512 scatter"); } diff --git
a/src/liboslexec/rendservices.cpp b/src/liboslexec/rendservices.cpp index c0c84b03d6..b3bd5c8989 100644 --- a/src/liboslexec/rendservices.cpp +++ b/src/liboslexec/rendservices.cpp @@ -524,4 +524,11 @@ RendererServices::batched(WidthOf<8>) return nullptr; } +BatchedRendererServices<4>* +RendererServices::batched(WidthOf<4>) +{ + // No default implementation for batched services + return nullptr; +} + OSL_NAMESPACE_EXIT diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp index 307d57355e..620c09cee8 100644 --- a/src/liboslexec/shadingsys.cpp +++ b/src/liboslexec/shadingsys.cpp @@ -618,6 +618,29 @@ ShadingSystem::configure_batch_execution_at(int width) m_impl->attribute("llvm_jit_fma", 0); return true; } +# endif + if (target_requested) { + break; + } + // fallthrough + default: return false; + }; + return false; + case 4: + switch (requestedISA) { + case TargetISA::UNKNOWN: + // fallthrough + case TargetISA::x64: +# ifdef __OSL_SUPPORTS_b4_SSE2 + if (LLVM_Util::supports_isa(TargetISA::x64)) { + if (!target_requested) + m_impl->attribute("llvm_jit_target", + LLVM_Util::target_isa_name( + TargetISA::x64)); + // SSE2 doesn't support FMA + m_impl->attribute("llvm_jit_fma", 0); + return true; + } # endif if (target_requested) { break; @@ -885,6 +908,7 @@ ShadingSystem::BatchedExecutor::jit_all_groups(int nthreads) // Explicitly instantiate template class ShadingSystem::BatchedExecutor<16>; template class ShadingSystem::BatchedExecutor<8>; +template class ShadingSystem::BatchedExecutor<4>; #endif @@ -1079,7 +1103,8 @@ ShadingSystemImpl::ShadingSystemImpl(RendererServices* renderer, , m_opt_groupdata(true) #if OSL_USE_BATCHED , m_opt_batched_analysis((renderer->batched(WidthOf<16>()) != nullptr) - || (renderer->batched(WidthOf<8>()) != nullptr)) + || (renderer->batched(WidthOf<8>()) != nullptr) + || (renderer->batched(WidthOf<4>()) != nullptr)) #else , m_opt_batched_analysis(false) #endif @@ -3794,7 +3819,8 @@ 
ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx, // the batch jit has already happened, // as it requires the ops so we can't delete them yet! if (((renderer()->batched(WidthOf<16>()) == nullptr) - && (renderer()->batched(WidthOf<8>()) == nullptr)) + && (renderer()->batched(WidthOf<8>()) == nullptr) + && (renderer()->batched(WidthOf<4>()) == nullptr)) || group.batch_jitted()) { group_post_jit_cleanup(group); } @@ -4015,6 +4041,7 @@ ShadingSystemImpl::Batched::jit_all_groups(int nthreads, int mythread, // machine as well, start with just the batch size template class pvt::ShadingSystemImpl::Batched<16>; template class pvt::ShadingSystemImpl::Batched<8>; +template class pvt::ShadingSystemImpl::Batched<4>; #endif int diff --git a/src/testminimal/CMakeLists.txt b/src/testminimal/CMakeLists.txt new file mode 100644 index 0000000000..42e6e9d115 --- /dev/null +++ b/src/testminimal/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright Contributors to the Open Shading Language project. 
+# SPDX-License-Identifier: BSD-3-Clause +# https://github.com/AcademySoftwareFoundation/OpenShadingLanguage + +# The 'testminimal' executable +set ( testminimal_srcs + testminimal.cpp + oslmaterial.cpp ) + +set(include_dirs ${CMAKE_CURRENT_SOURCE_DIR}) +list(APPEND include_dirs ${CMAKE_SOURCE_DIR}/src/include) +list(APPEND include_dirs ${CMAKE_BINARY_DIR}/include) +list(APPEND include_dirs ${IMATH_INCLUDES}) +list(APPEND include_dirs ${OPENEXR_INCLUDES}) +list(APPEND include_dirs ${OpenImageIO_INCLUDES}) + +set ( rs_srcs + oslmaterial.cpp ) + +EMBED_LLVM_BITCODE_IN_CPP ( "${rs_srcs}" "_host" "testminimal_llvm_compiled_rs" testminimal_srcs "-DOSL_HOST_RS_BITCODE=1" "${include_dirs}") + +add_executable ( testminimal ${testminimal_srcs} ) + +target_link_libraries (testminimal + PRIVATE + oslexec oslquery) + +install (TARGETS testminimal RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) + +osl_optix_target(testminimal) diff --git a/src/testminimal/oslmaterial.cpp b/src/testminimal/oslmaterial.cpp new file mode 100644 index 0000000000..dd509aa6e7 --- /dev/null +++ b/src/testminimal/oslmaterial.cpp @@ -0,0 +1,205 @@ +// Copyright Contributors to the Open Shading Language project. 
+// SPDX-License-Identifier: BSD-3-Clause +// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage + + +#include "oslmaterial.h" +#include + +using std::cout; +using std::endl; + +#if OSL_USE_BATCHED +template +CustomBatchedRendererServices::CustomBatchedRendererServices( + BatchedOSLMaterial& m) + : OSL::BatchedRendererServices(m.texturesys()), m_sr(m) +{ +} +#endif + +OSLMaterial::OSLMaterial() {} + +#if OSL_USE_BATCHED +template +BatchedOSLMaterial::BatchedOSLMaterial() : m_batch(*this) +{ +} + +template BatchedOSLMaterial<4>::BatchedOSLMaterial(); +template BatchedOSLMaterial<8>::BatchedOSLMaterial(); +template BatchedOSLMaterial<16>::BatchedOSLMaterial(); +#endif + +// Supported closures and parameters +struct EmptyParams {}; + +enum ClosureIDs { + EMISSION_ID, + BACKGROUND_ID, + MICROFACET_ID, +}; + +struct MicrofacetParams { + OSL::ustringhash dist; + OSL::Vec3 N, U; + float xalpha, yalpha, eta; + int refract; +}; + +void +register_closures(OSL::ShadingSystem* ss) +{ + // "Describe the memory layout of each closure type to the OSL runtime" + constexpr int MaxParams = 32; + struct BuiltinClosures { + const char* name; + int id; + OSL::ClosureParam params[MaxParams]; // "upper bound" + }; + + using namespace OSL; + + // Closures with support built into OSL, connected by the 1st string + BuiltinClosures supported[] = { + { "emission", EMISSION_ID, { CLOSURE_FINISH_PARAM(EmptyParams) } }, + { "background", BACKGROUND_ID, { CLOSURE_FINISH_PARAM(EmptyParams) } }, + { "microfacet", + MICROFACET_ID, + { CLOSURE_STRING_PARAM(MicrofacetParams, dist), + CLOSURE_VECTOR_PARAM(MicrofacetParams, N), + CLOSURE_VECTOR_PARAM(MicrofacetParams, U), + CLOSURE_FLOAT_PARAM(MicrofacetParams, xalpha), + CLOSURE_FLOAT_PARAM(MicrofacetParams, yalpha), + CLOSURE_FLOAT_PARAM(MicrofacetParams, eta), + CLOSURE_INT_PARAM(MicrofacetParams, refract), + CLOSURE_FINISH_PARAM(MicrofacetParams) } }, + }; + // Closure registration here enables that type of closure, when executing or 
compiling a shader + for (const BuiltinClosures& c : supported) + ss->register_closure(c.name, c.id, c.params, nullptr, nullptr); +} + +void +process_bsdf_closure(const OSL::ClosureColor* closure) +{ + static const ::OSL::ustringhash uh_ggx(OIIO::Strutil::strhash("ggx")); + //static const ::OSL::ustringhash uh_beckmann(OIIO::Strutil::strhash("beckmann")); + if (!closure) + return; + switch (closure->id) { + case OSL::ClosureColor::MUL: { + process_bsdf_closure(closure->as_mul()->closure); + break; + } + case OSL::ClosureColor::ADD: { + process_bsdf_closure(closure->as_add()->closureA); + process_bsdf_closure(closure->as_add()->closureB); + break; + } + default: { + const OSL::ClosureComponent* comp = closure->as_comp(); + switch (comp->id) { + case EMISSION_ID: cout << "parsing emission closure" << endl; break; + case MICROFACET_ID: { + cout << "parsing microfacet closure" << endl; + const MicrofacetParams* mp = comp->as(); + if (mp->dist.hash() == uh_ggx.hash()) { + cout << "uh_ggx" << endl; + } else { + cout << "uh_beckmann or default" << endl; + } + } break; + default: + OSL_ASSERT(false && "Invalid closure invoked in surface shader"); + break; + } + } break; + } +} + +void +OSLMaterial::run_test(OSL::ShadingSystem* ss, OSL::PerThreadInfo* thread_info, + OSL::ShadingContext* context, char* shader_name) +{ + register_closures(ss); + OSL::ShaderGlobals globals; + globals_from_hit(globals); + + std::vector options; + + // Create a new shader group + m_shaders.emplace_back(); + m_shaders[0] = ss->ShaderGroupBegin(std::to_string(0)); + OSL::ShaderGroupRef group = m_shaders[0]; + + //{ + // OSL::OSLCompiler compiler; + // std::string name = std::string(shader_name) + ".osl"; + // compiler.compile(name.c_str(), options); + //} + + ss->Shader(*group, "surface", shader_name, "Test"); + ss->ShaderGroupEnd(*group); + + ss->execute(context, *group, globals); + const OSL::ClosureColor* closure = globals.Ci; + process_bsdf_closure(closure); +} + +#if OSL_USE_BATCHED +template 
+void +BatchedOSLMaterial::run_test(OSL::ShadingSystem* ss, + OSL::PerThreadInfo* thread_info, + OSL::ShadingContext* context, + char* shader_name) +{ + register_closures(ss); + OSL::BatchedShaderGlobals batched_globals; + + m_batch.globals_from_hit(batched_globals); + + std::vector options; + + // Create a new shader group + m_shaders.emplace_back(); + m_shaders[0] = ss->ShaderGroupBegin(std::to_string(0)); + OSL::ShaderGroupRef group = m_shaders[0]; + + //{ + // OSL::OSLCompiler compiler; + // std::string name = std::string(shader_name) + ".osl"; + // compiler.compile(name.c_str(), options); + //} + + ss->Shader(*group, "surface", shader_name, "Test"); + ss->ShaderGroupEnd(*group); + + // Run the shader that was just created + OSL::Block wide_shadeindex_block; + char* userdata_base_ptr = NULL; + char* output_base_ptr = NULL; + ss->batched().execute(*context, *group, batch_width, + wide_shadeindex_block, batched_globals, + userdata_base_ptr, output_base_ptr); + const OSL::ClosureColor* closure = batched_globals.varying.Ci[0]; + process_bsdf_closure(closure); +} + +template void +BatchedOSLMaterial<4>::run_test(OSL::ShadingSystem* ss, + OSL::PerThreadInfo* thread_info, + OSL::ShadingContext* context, + char* shader_name); +template void +BatchedOSLMaterial<8>::run_test(OSL::ShadingSystem* ss, + OSL::PerThreadInfo* thread_info, + OSL::ShadingContext* context, + char* shader_name); +template void +BatchedOSLMaterial<16>::run_test(OSL::ShadingSystem* ss, + OSL::PerThreadInfo* thread_info, + OSL::ShadingContext* context, + char* shader_name); +#endif diff --git a/src/testminimal/oslmaterial.h b/src/testminimal/oslmaterial.h new file mode 100644 index 0000000000..b3499ae231 --- /dev/null +++ b/src/testminimal/oslmaterial.h @@ -0,0 +1,236 @@ +// Copyright Contributors to the Open Shading Language project. 
+// SPDX-License-Identifier: BSD-3-Clause +// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage + + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#if OSL_USE_BATCHED +# include +# include +#endif + +class OSLMaterial; + +#if OSL_USE_BATCHED +template class BatchedOSLMaterial; + +using OSL::Vec3; + +/// Custom BatchedRendererServices +template +class CustomBatchedRendererServices + : public OSL::BatchedRendererServices { +public: + explicit CustomBatchedRendererServices(BatchedOSLMaterial& m); + + //OIIO::ErrorHandler& errhandler() const { return *m_errhandler; } + /// Turn information at hitpoint into ShaderGlobals for OSL + void globals_from_hit(OSL::BatchedShaderGlobals& bsg) + { + // Uniform + auto& usg = bsg.uniform; + // Zero it all + std::memset(&usg, 0, sizeof(OSL::UniformShaderGlobals)); + usg.raytype = 1; // 1 stands for camera ray? + // Varying + auto& vsg = bsg.varying; + + //assign_all(vsg.shader2common, TransformationPtr(&Mshad)); + //assign_all(vsg.object2common, TransformationPtr(&Mobj)); + + for (int i = 0; i < batch_width; i++) + vsg.P[i] = { 0.0f, 0.0f, 0.0f }; + + for (int i = 0; i < batch_width; i++) + vsg.I[i] = { 0.0f, 0.0f, -1.0f }; // incident ray + for (int i = 0; i < batch_width; i++) + vsg.N[i] = { 0.0f, 0.0f, 1.0f }; // shading normal + for (int i = 0; i < batch_width; i++) + vsg.Ng[i] = { 0.0f, 0.0f, 1.0f }; // true geometric normal + + assign_all(vsg.u, + 0.5f); // 2D surface parameter u, and its differentials. + assign_all(vsg.v, + 0.5f); // 2D surface parameter u, and its differentials. 
+ + + //if (false == vary_udxdy) { + assign_all(vsg.dudx, 0.0f); //uscale / xres); + assign_all(vsg.dudy, 0.0f); + //} + //if (false == vary_vdxdy) { + assign_all(vsg.dvdx, 0.0f); + assign_all(vsg.dvdy, 0.0f); //vscale / yres); + //} + + + //if (false == vary_Pdxdy) { + // assign_all(vsg.dPdx, Vec3(vsg.dudx[0], vsg.dudy[0], 0.0f)); + // assign_all(vsg.dPdy, Vec3(vsg.dvdx[0], vsg.dvdy[0], 0.0f)); + //} + + assign_all(vsg.dPdz, + Vec3(0.0f, 0.0f, 0.0f)); // just use 0 for volume tangent + + // Tangents of P with respect to surface u,v + assign_all(vsg.dPdu, Vec3(1.0f, 0.0f, 0.0f)); + assign_all(vsg.dPdv, Vec3(0.0f, 1.0f, 0.0f)); + + assign_all(vsg.I, Vec3(0, 0, 0)); + assign_all(vsg.dIdx, Vec3(0, 0, 0)); + assign_all(vsg.dIdy, Vec3(0, 0, 0)); + + // That also implies that our normal points to (0,0,1) + assign_all(vsg.N, Vec3(0, 0, 1)); + assign_all(vsg.Ng, Vec3(0, 0, 1)); + + assign_all(vsg.time, 0.0f); + assign_all(vsg.dtime, 0.0f); + assign_all(vsg.dPdtime, Vec3(0, 0, 0)); + + assign_all(vsg.Ps, Vec3(0, 0, 0)); + assign_all(vsg.dPsdx, Vec3(0, 0, 0)); + assign_all(vsg.dPsdy, Vec3(0, 0, 0)); + + assign_all(vsg.surfacearea, 1.0f); + assign_all(vsg.flipHandedness, 0); + assign_all(vsg.backfacing, 0); + + for (int i = 0; i < 4; i++) + vsg.Ci[i] = NULL; + } + + bool is_overridden_get_inverse_matrix_WmWxWf() const override + { + return false; + }; + bool is_overridden_get_matrix_WmWsWf() const override { return false; }; + bool is_overridden_get_inverse_matrix_WmsWf() const override + { + return false; + }; + bool is_overridden_get_inverse_matrix_WmWsWf() const override + { + return false; + }; + bool is_overridden_texture() const override { return false; }; + bool is_overridden_texture3d() const override { return false; }; + bool is_overridden_environment() const override { return false; }; + bool is_overridden_pointcloud_search() const override { return false; }; + bool is_overridden_pointcloud_get() const override { return false; }; + bool 
is_overridden_pointcloud_write() const override { return false; }; + + BatchedOSLMaterial& m_sr; + +private: +}; +#endif + +/// Custom RendererServices for non-batched case +class OSLMaterial : public OSL::RendererServices { +public: + OSLMaterial(); + + void run_test(OSL::ShadingSystem* ss, OSL::PerThreadInfo* thread_info, + OSL::ShadingContext* context, char* shader_name); + + OIIO::ErrorHandler& errhandler() const { return *m_errhandler; } + + /// Turn information at hitpoint into ShaderGlobals for OSL + void globals_from_hit(OSL::ShaderGlobals& sg) + { + sg.P = { 0.0f, 0.0f, 0.0f }; // surface pos + sg.dPdx = { 0.0f, 0.0f, 0.0f }; + sg.dPdy = { 0.0f, 0.0f, 0.0f }; + sg.dPdz = { 0.0f, 0.0f, 0.0f }; // for volume shading only + + sg.I = { 0.0f, 0.0f, -1.0f }; // incident ray + sg.dIdx = { 0.0f, 0.0f, 0.0f }; + sg.dIdy = { 0.0f, 0.0f, 0.0f }; + + sg.N = { 0.0f, 0.0f, 1.0f }; // shading normal + sg.Ng = { 0.0f, 0.0f, 1.0f }; // true geometric normal + + sg.u = 0.5f; // 2D surface parameter u, and its differentials. + sg.dudx = 0.0f; + sg.dudy = 0.0f; + sg.v = 0.5f; // 2D surface parameter v, and its differentials. + sg.dvdx = 0.0f; + sg.dvdy = 0.0f; + + // Surface tangents: derivative of P with respect to surface u and v. + sg.dPdu = { 1.0f, 0.0f, 0.0f }; + sg.dPdv = { 0.0f, 1.0f, 0.0f }; + + sg.time = 0.0f; + sg.dtime = 0.001f; + + // Velocity vector: derivative of position P with respect to time. + sg.dPdtime = { 0.0f, 0.0f, 0.0f }; + + // For lights or light attenuation shaders: the point being illuminated (???) + sg.Ps = { 0.0f, 0.0f, 0.0f }; + sg.dPsdx = { 0.0f, 0.0f, 0.0f }; + sg.dPsdy = { 0.0f, 0.0f, 0.0f }; + + // Renderer user pointers + sg.renderstate = NULL; + sg.tracedata = NULL; + sg.objdata = NULL; + + sg.renderer = this; + + sg.raytype = 1; // 1 stands for camera ray? 
+ sg.flipHandedness = 0; + sg.backfacing = 0; + + // output closure, needs to be null initialized + sg.Ci = NULL; + } + + // ShaderGroupRef storage + std::vector& shaders() { return m_shaders; } + std::vector m_shaders; + +private: + std::unique_ptr m_errhandler; +}; + +#if OSL_USE_BATCHED + +/// Custom RendererServices for batched case +template +class BatchedOSLMaterial : public OSL::RendererServices { +public: + BatchedOSLMaterial(); + + void run_test(OSL::ShadingSystem* ss, OSL::PerThreadInfo* thread_info, + OSL::ShadingContext* context, char* shader_name); + + OIIO::ErrorHandler& errhandler() const { return *m_errhandler; } + + // ShaderGroupRef storage + std::vector& shaders() { return m_shaders; } + std::vector m_shaders; + + OSL::BatchedRendererServices* + batched(OSL::WidthOf) override + { + return &m_batch; + } + + CustomBatchedRendererServices m_batch; + +private: + std::unique_ptr m_errhandler; +}; + +#endif diff --git a/src/testminimal/testminimal.cpp b/src/testminimal/testminimal.cpp new file mode 100644 index 0000000000..6f9a5fb0c6 --- /dev/null +++ b/src/testminimal/testminimal.cpp @@ -0,0 +1,139 @@ +// Copyright Contributors to the Open Shading Language project. 
+// SPDX-License-Identifier: BSD-3-Clause +// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage + + +#include +#include +#include +#include "oslmaterial.h" + +using namespace OSL; + +int +main(int argc, char** argv) +{ + int batch_width; + char* shader_name; + if (argc < 2) { + std::cout + << "usage: shader_name(without .osl) [+optional] batch_width (0/4/8/16)" + << std::endl; + return 0; + } else if (argc >= 3) { + shader_name = argv[1]; + batch_width = atoi(argv[2]); + batch_width = std::max(batch_width, 1); + if (batch_width != 1 && batch_width != 4 && batch_width != 8 + && batch_width != 16) + batch_width = 1; + } else { + shader_name = argv[1]; + batch_width = -1; + } + + OSLMaterial* oslmat = NULL; +#if OSL_USE_BATCHED + BatchedOSLMaterial<4>* boslmat4 = NULL; + BatchedOSLMaterial<8>* boslmat8 = NULL; + BatchedOSLMaterial<16>* boslmat16 = NULL; +#endif + + TextureSystem* texturesys = TextureSystem::create(); + ShadingSystem* ss = NULL; + + if (batch_width == -1) { +#if OSL_USE_BATCHED + oslmat = new OSLMaterial(); + ss = new ShadingSystem(oslmat, NULL, &oslmat->errhandler()); + if (ss->configure_batch_execution_at(16)) + batch_width = 16; + else if (ss->configure_batch_execution_at(8)) + batch_width = 8; + else if (ss->configure_batch_execution_at(4)) + batch_width = 4; + else + batch_width = 1; + delete oslmat; + oslmat = NULL; + delete ss; + ss = NULL; +#else + batch_width = 1; +#endif + } + + switch (batch_width) { + case 1: + oslmat = new OSLMaterial(); + ss = new ShadingSystem(oslmat, texturesys, &oslmat->errhandler()); + break; +#if OSL_USE_BATCHED + case 4: + boslmat4 = new BatchedOSLMaterial<4>(); + ss = new ShadingSystem(boslmat4, texturesys, &boslmat4->errhandler()); + break; + case 8: + boslmat8 = new BatchedOSLMaterial<8>(); + ss = new ShadingSystem(boslmat8, texturesys, &boslmat8->errhandler()); + break; + case 16: + boslmat16 = new BatchedOSLMaterial<16>(); + ss = new ShadingSystem(boslmat16, texturesys, &boslmat16->errhandler()); 
+ break; +#endif + } + +#if OSL_USE_BATCHED + if (batch_width > 1) { + //ss->attribute("llvm_jit_fma", true); + ss->configure_batch_execution_at(batch_width); + + // build searchpath for ISA specific OSL shared libraries based on expected + // location of library directories relative to the executables path. + static const char* relative_lib_dirs[] = +# if (defined(_WIN32) || defined(_WIN64)) + { "\\..\\lib64", "\\..\\lib" }; +# else + { "/../lib64", "/../lib" }; +# endif + auto executable_directory = OIIO::Filesystem::parent_path( + OIIO::Sysutil::this_program_path()); + int dirNum = 0; + std::string librarypath; + for (const char* relative_lib_dir : relative_lib_dirs) { + if (dirNum++ > 0) + librarypath += ":"; + librarypath += executable_directory + relative_lib_dir; + } + ss->attribute("searchpath:library", librarypath); + } +#endif + + PerThreadInfo* thread_info; + ShadingContext* context; + thread_info = ss->create_thread_info(); + context = ss->get_context(thread_info); + + switch (batch_width) { + case 1: oslmat->run_test(ss, thread_info, context, shader_name); break; +#if OSL_USE_BATCHED + case 4: boslmat4->run_test(ss, thread_info, context, shader_name); break; + case 8: boslmat8->run_test(ss, thread_info, context, shader_name); break; + case 16: boslmat16->run_test(ss, thread_info, context, shader_name); break; +#endif + } + + ss->release_context(context); + ss->destroy_thread_info(thread_info); + + delete oslmat; +#if OSL_USE_BATCHED + delete boslmat4; + delete boslmat8; + delete boslmat16; +#endif + delete ss; + + return 0; +} diff --git a/src/testshade/batched_simplerend.cpp b/src/testshade/batched_simplerend.cpp index 937655af4d..ea2acbdf97 100644 --- a/src/testshade/batched_simplerend.cpp +++ b/src/testshade/batched_simplerend.cpp @@ -1001,6 +1001,7 @@ BatchedSimpleRenderer::get_camera_screen_window(ustringhash /*object*/, // Explicitly instantiate BatchedSimpleRenderer template template class BatchedSimpleRenderer<16>; template class 
BatchedSimpleRenderer<8>; +template class BatchedSimpleRenderer<4>; OSL_NAMESPACE_EXIT diff --git a/src/testshade/simplerend.cpp b/src/testshade/simplerend.cpp index 65862c2dba..3582c9cc48 100644 --- a/src/testshade/simplerend.cpp +++ b/src/testshade/simplerend.cpp @@ -218,7 +218,9 @@ register_closures(OSL::ShadingSystem* shadingsys) SimpleRenderer::SimpleRenderer() #if OSL_USE_BATCHED - : m_batch_16_simple_renderer(*this), m_batch_8_simple_renderer(*this) + : m_batch_16_simple_renderer(*this) + , m_batch_8_simple_renderer(*this) + , m_batch_4_simple_renderer(*this) #endif { Matrix44 M; diff --git a/src/testshade/simplerend.h b/src/testshade/simplerend.h index 87d0b96dda..8ebe1c1fc4 100644 --- a/src/testshade/simplerend.h +++ b/src/testshade/simplerend.h @@ -177,12 +177,17 @@ class SimpleRenderer : public RendererServices { { return &m_batch_8_simple_renderer; } + BatchedRendererServices<4>* batched(WidthOf<4>) override + { + return &m_batch_4_simple_renderer; + } #endif protected: #if OSL_USE_BATCHED BatchedSimpleRenderer<16> m_batch_16_simple_renderer; BatchedSimpleRenderer<8> m_batch_8_simple_renderer; + BatchedSimpleRenderer<4> m_batch_4_simple_renderer; #endif // Camera parameters diff --git a/src/testshade/testshade.cpp b/src/testshade/testshade.cpp index 39834f6380..bd80a0f415 100644 --- a/src/testshade/testshade.cpp +++ b/src/testshade/testshade.cpp @@ -306,6 +306,9 @@ set_shadingsys_options() } else if ((!batch_size_requested || batch_size == 8) && shadingsys->configure_batch_execution_at(8)) { batch_size = 8; + } else if ((!batch_size_requested || batch_size == 4) + && shadingsys->configure_batch_execution_at(4)) { + batch_size = 4; } else { OSL::print( "WARNING: Hardware or library requirements to utilize batched execution"); @@ -1194,9 +1197,11 @@ setup_output_images(SimpleRenderer* rend, ShadingSystem* shadingsys, // jit_group will optimize the group if necesssary if (batch_size == 16) { shadingsys->batched<16>().jit_group(shadergroup.get(), ctx); - } 
else { - ASSERT((batch_size == 8) && "Unsupported batch size"); + } else if (batch_size == 8) { shadingsys->batched<8>().jit_group(shadergroup.get(), ctx); + } else { + ASSERT((batch_size == 4) && "Unsupported batch size"); + shadingsys->batched<4>().jit_group(shadergroup.get(), ctx); } } else #endif @@ -2195,13 +2200,19 @@ test_shade(int argc, const char* argv[]) batched_shade_region<16>(rend, shadergroup.get(), sub_roi, save); }); - } else { - ASSERT((batch_size == 8) && "Unsupported batch size"); + } else if (batch_size == 8) { OIIO::ImageBufAlgo::parallel_image( roi, num_threads, [&](OIIO::ROI sub_roi) -> void { batched_shade_region<8>(rend, shadergroup.get(), sub_roi, save); }); + } else { + ASSERT((batch_size == 4) && "Unsupported batch size"); + OIIO::ImageBufAlgo::parallel_image( + roi, num_threads, [&](OIIO::ROI sub_roi) -> void { + batched_shade_region<4>(rend, shadergroup.get(), + sub_roi, save); + }); } } else # endif diff --git a/testsuite/closure-string/BATCHED b/testsuite/closure-string/BATCHED new file mode 100644 index 0000000000..e69de29bb2 diff --git a/testsuite/closure-string/ref/out.txt b/testsuite/closure-string/ref/out.txt new file mode 100644 index 0000000000..d497c44d99 --- /dev/null +++ b/testsuite/closure-string/ref/out.txt @@ -0,0 +1,3 @@ +Compiled test.osl -> test.oso +parsing microfacet closure +uh_ggx diff --git a/testsuite/closure-string/run.py b/testsuite/closure-string/run.py new file mode 100755 index 0000000000..5e688f5cb9 --- /dev/null +++ b/testsuite/closure-string/run.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python + +# Copyright Contributors to the Open Shading Language project. 
+# SPDX-License-Identifier: BSD-3-Clause +# https://github.com/AcademySoftwareFoundation/OpenShadingLanguage + +command = testminimal("test") diff --git a/testsuite/closure-string/test.osl b/testsuite/closure-string/test.osl new file mode 100644 index 0000000000..4071f5d9e4 --- /dev/null +++ b/testsuite/closure-string/test.osl @@ -0,0 +1,4 @@ +shader test(string distribution = "ggx") +{ + Ci = microfacet(distribution, N, N, 0.1, 0.1, 0.0, 0); +} diff --git a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp index 0b7af16e40..fe620fefab 100644 --- a/testsuite/example-batched-deformer/oslbatcheddeformer.cpp +++ b/testsuite/example-batched-deformer/oslbatcheddeformer.cpp @@ -182,10 +182,15 @@ class MyRendererServices final : public OSL::RendererServices { { return &m_batch_8_rs; } + OSL::BatchedRendererServices<4>* batched(OSL::WidthOf<4>) override + { + return &m_batch_4_rs; + } private: MyBatchedRendererServices<16> m_batch_16_rs; MyBatchedRendererServices<8> m_batch_8_rs; + MyBatchedRendererServices<4> m_batch_4_rs; }; diff --git a/testsuite/runtest.py b/testsuite/runtest.py index befe278a46..eceec81be2 100755 --- a/testsuite/runtest.py +++ b/testsuite/runtest.py @@ -249,6 +249,14 @@ def oiiodiff (fileA, fileB, extraargs="", silent=True, concat=True) : command += " ;\n" return command +# Construct a command that run testminimal with the specified arguments, +# appending output to the file "out.txt". +def testminimal (args) : + if os.environ.__contains__('OSL_TESTMINIMAL_NAME') : + testminimalname = os.environ['OSL_TESTMINIMAL_NAME'] + " " + else : + testminimalname = osl_app("testminimal") + return (testminimalname + args + redirect + " ;\n") # Construct a command that run testshade with the specified arguments, # appending output to the file "out.txt".