Global scan #665
base: master
Conversation
…but not working yet
[CI]: Can one of the admins verify this patch?
#ifndef NBL_BUILTIN_MAX_SCAN_LEVELS
#define NBL_BUILTIN_MAX_SCAN_LEVELS 7
#endif
do as NBL_CONSTEXPR instead of a #define
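A minimal sketch of the suggested form (the constant name below is illustrative, and NBL_CONSTEXPR is assumed to be available from Nabla's cpp_compat headers):

NBL_CONSTEXPR uint32_t MaxScanLevels = 7; // a typed constant visible to both HLSL and C++, instead of a preprocessor define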
also this better be formulated slightly differently, in terms of reduction passes, not reduction + downsweep
uint32_t topLevel;
uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2];
use uint16_t for this
uint32_t topLevel;
uint32_t temporaryStorageOffset[NBL_BUILTIN_MAX_SCAN_LEVELS/2];
};

Parameters_t getParameters();
no need for such a forward decl anymore
struct DefaultSchedulerParameters_t
{
    uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1];
    uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS];
};

DefaultSchedulerParameters_t getSchedulerParameters();
split out schedulers into separate headers
template<typename Storage_t>
void getData(
-    inout Storage_t data,
-    in uint levelInvocationIndex,
-    in uint localWorkgroupIndex,
-    in uint treeLevel,
-    in uint pseudoLevel
+    NBL_REF_ARG(Storage_t) data,
+    NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex,
+    NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex,
+    NBL_CONST_REF_ARG(uint32_t) treeLevel,
+    NBL_CONST_REF_ARG(uint32_t) pseudoLevel
);
}
}
}
#define _NBL_HLSL_SCAN_GET_PADDED_DATA_DECLARED_
#endif

#ifndef _NBL_HLSL_SCAN_SET_DATA_DECLARED_
namespace nbl
{
namespace hlsl
{
namespace scan
{
template<typename Storage_t>
void setData(
    in Storage_t data,
    in uint levelInvocationIndex,
no more forward declarations, just let it be an accessor. Also the level const-ref-args can be rolled up into a single struct SDataIndex
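A hypothetical sketch of that rolled-up struct (the SDataIndex name comes from the comment above; the members are an assumption based on the current parameter list):

struct SDataIndex
{
    uint32_t levelInvocationIndex; // global invocation index within the level
    uint32_t localWorkgroupIndex;  // workgroup index within the level
    uint32_t treeLevel;            // current level in the scan tree
    uint32_t pseudoLevel;          // level used for temporary-storage indexing
};

getData and setData would then take a single NBL_CONST_REF_ARG(SDataIndex) instead of four separate const-ref-args.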
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h

#ifndef _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_
#define _NBL_HLSL_SCAN_DESCRIPTORS_INCLUDED_

-// choerent -> globallycoherent
#include "nbl/builtin/hlsl/scan/declarations.hlsl"
#include "nbl/builtin/hlsl/workgroup/basic.hlsl"

+// coherent -> globallycoherent

namespace nbl
{
namespace hlsl
{
namespace scan
{

template<uint32_t scratchElementCount=scratchSz> // (REVIEW): This should be externally defined. Maybe change the scratch buffer to RWByteAddressBuffer? Annoying to manage though...
struct Scratch
{
    uint32_t workgroupsStarted;
    uint32_t data[scratchElementCount];
};

[[vk::binding(0,0)]] RWStructuredBuffer<uint32_t /*Storage_t*/> scanBuffer; // (REVIEW): Make the type externalizable. Decide how (#define?)
[[vk::binding(1,0)]] RWStructuredBuffer<Scratch> globallycoherent scanScratchBuf; // (REVIEW): Check if globallycoherent can be used with the Vulkan Memory Model

template<typename Storage_t, bool isExclusive=false>
void getData(
    NBL_REF_ARG(Storage_t) data,
    NBL_CONST_REF_ARG(uint32_t) levelInvocationIndex,
    NBL_CONST_REF_ARG(uint32_t) localWorkgroupIndex,
    NBL_CONST_REF_ARG(uint32_t) treeLevel,
    NBL_CONST_REF_ARG(uint32_t) pseudoLevel
)
{
    const Parameters_t params = getParameters(); // defined differently for direct and indirect shaders

    uint32_t offset = levelInvocationIndex;
    const bool notFirstOrLastLevel = bool(pseudoLevel);
    if (notFirstOrLastLevel)
        offset += params.temporaryStorageOffset[pseudoLevel-1u];
remove this whole file, it should be userspace
// TODO: Can we make it a static variable?
groupshared uint32_t wgScratch[SharedScratchSz];

#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"

template<uint16_t offset>
struct WGScratchProxy
{
    uint32_t get(const uint32_t ix)
    {
        return wgScratch[ix+offset];
    }
    void set(const uint32_t ix, const uint32_t value)
    {
        wgScratch[ix+offset] = value;
    }

    uint32_t atomicAdd(uint32_t ix, uint32_t val)
    {
        return glsl::atomicAdd(wgScratch[ix + offset], val);
    }

    void workgroupExecutionAndMemoryBarrier()
    {
        nbl::hlsl::glsl::barrier();
        //nbl::hlsl::glsl::memoryBarrierShared(); implied by the above
    }
};
static WGScratchProxy<0> accessor;
scratches are userspace
use accessors
/**
 * Required since we rely on SubgroupContiguousIndex instead of
 * gl_LocalInvocationIndex which means to match the global index
 * we can't use the gl_GlobalInvocationID but an index based on
 * SubgroupContiguousIndex.
 */
uint32_t globalIndex()
{
    return nbl::hlsl::glsl::gl_WorkGroupID().x*WORKGROUP_SIZE+nbl::hlsl::workgroup::SubgroupContiguousIndex();
}
we have this in a header already
struct ScanPushConstants
{
    nbl::hlsl::scan::Parameters_t scanParams;
    nbl::hlsl::scan::DefaultSchedulerParameters_t schedulerParams;
};

[[vk::push_constant]]
ScanPushConstants spc;
everything affecting the pipeline layout should be userspace
#ifndef _NBL_HLSL_MAIN_DEFINED_
-[numthreads(_NBL_HLSL_WORKGROUP_SIZE_, 1, 1)]
-void CSMain()
+[numthreads(WORKGROUP_SIZE,1,1)]
+void main()
{
-    if (bool(nbl::hlsl::scan::getIndirectElementCount()))
-        nbl::hlsl::scan::main();
+    if(bool(nbl::hlsl::scan::getIndirectElementCount())) {
+        // TODO call main from virtual_workgroup.hlsl
+    }
all this should be userspace, also there will be very little difference between direct and indirect
namespace nbl
{
namespace hlsl
{
namespace scan
{
-template<class Binop, class Storage_t>
-void virtualWorkgroup(in uint treeLevel, in uint localWorkgroupIndex)
+template<class Binop, typename Storage_t, bool isExclusive, uint16_t ItemCount, class Accessor, class device_capabilities=void>
why not template on the workgroup scan instead? Within you can alias and extract (see the sketch after this list):
- binop
- storage_t
- exclusive or not
- item count
- smem accessor
- device traits necessary
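A rough sketch of what that could look like, assuming the workgroup scan type exposes its configuration as member typedefs and constants (all names below are assumptions, not the actual Nabla API):

template<class WorkgroupScan, class Accessor>
void virtualWorkgroup(const uint32_t treeLevel, const uint32_t localWorkgroupIndex, NBL_REF_ARG(Accessor) accessor)
{
    // alias and extract everything from the single template parameter
    typedef typename WorkgroupScan::binop_t binop_t;              // binop
    typedef typename WorkgroupScan::storage_t storage_t;          // storage_t
    NBL_CONSTEXPR bool IsExclusive = WorkgroupScan::IsExclusive;  // exclusive or not
    NBL_CONSTEXPR uint16_t ItemCount = WorkgroupScan::ItemCount;  // item count
    // the smem accessor and the required device traits can be member types in the same way
}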
const uint32_t levelInvocationIndex = localWorkgroupIndex * glsl::gl_WorkGroupSize().x + SubgroupContiguousIndex();
const bool lastInvocationInGroup = SubgroupContiguousIndex() == (gl_WorkGroupSize().x - 1);
shouldn't ItemCount be used instead of glsl::gl_WorkGroupSize().x?
most definitely
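Applied to the first of the two lines above, that would read roughly as follows (a sketch, assuming ItemCount is the per-workgroup item count extracted from the template parameters):

const uint32_t levelInvocationIndex = localWorkgroupIndex * uint32_t(ItemCount) + SubgroupContiguousIndex();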
src/nbl/video/utilities/CScanner.cpp (outdated)
{
    core::smart_refctd_ptr<asset::ICPUShader> base = createBaseShader("nbl/builtin/hlsl/scan/direct.hlsl", dataType, op, scratchSz);
    return asset::CHLSLCompiler::createOverridenCopy(base.get(), "#define IS_EXCLUSIVE %s\n#define IS_SCAN %s\n", scanType == EST_EXCLUSIVE ? "true" : "false", "true");
}
same comment as for CReduce
src/nbl/video/utilities/CScanner.cpp (outdated)
asset::ICPUShader* CScanner::getDefaultShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
{
    if (!m_shaders[scanType][dataType][op])
        m_shaders[scanType][dataType][op] = createShader(false,scanType,dataType,op,scratchSz);
    return m_shaders[scanType][dataType][op].get();
}

#if 0 // TODO: port
core::smart_refctd_ptr<asset::ICPUShader> CScanner::createShader(const bool indirect, const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op) const
IGPUShader* CScanner::getDefaultSpecializedShader(const E_SCAN_TYPE scanType, const E_DATA_TYPE dataType, const E_OPERATOR op, const uint32_t scratchSz)
{
    auto system = m_device->getPhysicalDevice()->getSystem();
    core::smart_refctd_ptr<const system::IFile> glsl;
    {
        auto loadBuiltinData = [&](const std::string _path) -> core::smart_refctd_ptr<const nbl::system::IFile>
        {
            nbl::system::ISystem::future_t<core::smart_refctd_ptr<nbl::system::IFile>> future;
            system->createFile(future, system::path(_path), core::bitflag(nbl::system::IFileBase::ECF_READ) | nbl::system::IFileBase::ECF_MAPPABLE);
            if (future.wait())
                return future.copy();
            return nullptr;
        };
    if (!m_specialized_shaders[scanType][dataType][op])
    {
        auto cpuShader = core::smart_refctd_ptr<asset::ICPUShader>(getDefaultShader(scanType,dataType,op,scratchSz));
        cpuShader->setFilePathHint("nbl/builtin/hlsl/scan/direct.hlsl");
        cpuShader->setShaderStage(asset::IShader::ESS_COMPUTE);

        auto gpushader = m_device->createShader(cpuShader.get());

        m_specialized_shaders[scanType][dataType][op] = gpushader;
    }
    return m_specialized_shaders[scanType][dataType][op].get();
}

if(indirect)
these 3 utilities are practically identical for CScan and CReduce, why not hoist them to CArithmeticOps?
void computeParameters(NBL_CONST_REF_ARG(uint32_t) elementCount, out Parameters_t _scanParams, out DefaultSchedulerParameters_t _schedulerParams)
{
The parameters are "married" to the scheduling logic, so make these methods of DefaultSchedulerParameters_t. This one is really just a constructor, so turn it into a static DefaultSchedulerParameters_t create(...) method.
Hint: whenever a function takes an inout or out struct T as its first parameter, it likely means that I wrote it in GLSL to be a pseudo-method (it would have a C fn-ptr signature matching a method). Bonus hint: whenever the first param is an in T, it would have been a const method in C++.
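A minimal sketch of the suggested shape (the body is only a placeholder):

struct DefaultSchedulerParameters_t
{
    uint32_t finishedFlagOffset[NBL_BUILTIN_MAX_SCAN_LEVELS-1];
    uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_SCAN_LEVELS];

    // pseudo-constructor replacing the out-parameter style of computeParameters
    static DefaultSchedulerParameters_t create(const uint32_t elementCount)
    {
        DefaultSchedulerParameters_t retval;
        // fill finishedFlagOffset and cumulativeWorkgroupCount from elementCount here
        return retval;
    }
};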
 * The CScanner.h parameter computation calculates the number of virtual workgroups that will have to be launched for the Scan operation
 * (always based on the elementCount) as well as different offsets for the results of each step of the Scan operation, flag positions
 * that are used for synchronization etc.
The logic needs to change to "last one out closes the door", and the comments too. We will now only launch RoundUp(N/ItemsPerWorkgroup) virtual workgroups, not the geometric series.
 * Keep in mind that each virtual workgroup executes a single step of the whole algorithm, which is why we have the cumulativeWorkgroupCount.
 * The first virtual workgroups will work on the upsweep phase, the next on the downsweep phase.
Since we do "last workgroup takes over" we will now only launch RoundUp(N/ItemsPerWorkgroup) virtual workgroups, not the geometric series. So no need for the cumulativeWorkgroupCount.
 * |> finishedFlagOffset - an index in the scratch buffer where each virtual workgroup indicates that ALL its invocations have finished their work. This helps
 * synchronize between workgroups with while-loop spinning.
amend comments as needed
uint32_t cumulativeWorkgroupCount[NBL_BUILTIN_MAX_LEVELS];
uint32_t workgroupFinishFlagsOffset[NBL_BUILTIN_MAX_LEVELS];
uint32_t lastWorkgroupSetCountForLevel[NBL_BUILTIN_MAX_LEVELS];
not sure you need either cumulativeWorkgroupCount or lastWorkgroupSetCountForLevel for a "last one out closes the door" upsweep
    default:
        break;
#if NBL_BUILTIN_MAX_SCAN_LEVELS>7
const uint32_t workgroupSizeLog2 = firstbithigh(glsl::gl_WorkGroupSize().x);
modify it to not use Vulkan/SPIR-V environment builtins, that way we can use it from C++ too! (So take the workgroup size - or more accurately the items-per-workgroup count - from the outside.)
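For illustration, a builtin-free helper along these lines compiles as both HLSL and C++ (the function name and exact signature are assumptions):

// how many reduction levels does a scan of elementCount items need,
// when each virtual workgroup consumes (1<<itemsPerWorkgroupLog2) items?
uint32_t computeTopLevel(uint32_t elementCount, const uint16_t itemsPerWorkgroupLog2)
{
    const uint32_t itemsPerWorkgroup = uint32_t(1u) << itemsPerWorkgroupLog2;
    uint32_t topLevel = 0u;
    // keep reducing until one workgroup covers all remaining elements
    while (elementCount > itemsPerWorkgroup)
    {
        elementCount = (elementCount + itemsPerWorkgroup - 1u) >> itemsPerWorkgroupLog2;
        topLevel++;
    }
    return topLevel;
}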
#endif
// REVIEW: Putting topLevel second allows better alignment for packing of constant variables, assuming lastElement has length 4. (https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules)
struct Parameters_t {
    uint32_t lastElement[NBL_BUILTIN_MAX_LEVELS/2+1];
add documentation about what lastElement is used for
        _schedulerParams.finishedFlagOffset[1] = 1u;

        _scanParams.temporaryStorageOffset[0] = 2u;
        break;
    case 2u:
        _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1);
        _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+1u;
        _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+WorkgroupCount(1);
        _schedulerParams.cumulativeWorkgroupCount[4] = _schedulerParams.cumulativeWorkgroupCount[3]+WorkgroupCount(0);
        // climb up
        _schedulerParams.finishedFlagOffset[1] = WorkgroupCount(1);
        _schedulerParams.finishedFlagOffset[2] = _schedulerParams.finishedFlagOffset[1]+1u;
        // climb down
        _schedulerParams.finishedFlagOffset[3] = _schedulerParams.finishedFlagOffset[1]+2u;

        _scanParams.temporaryStorageOffset[0] = _schedulerParams.finishedFlagOffset[3]+WorkgroupCount(1);
        _scanParams.temporaryStorageOffset[1] = _scanParams.temporaryStorageOffset[0]+WorkgroupCount(0);
        break;
    case 3u:
        _schedulerParams.cumulativeWorkgroupCount[1] = _schedulerParams.cumulativeWorkgroupCount[0]+WorkgroupCount(1);
        _schedulerParams.cumulativeWorkgroupCount[2] = _schedulerParams.cumulativeWorkgroupCount[1]+WorkgroupCount(2);
        _schedulerParams.cumulativeWorkgroupCount[3] = _schedulerParams.cumulativeWorkgroupCount[2]+1u;
because due to forward progress guarantees you can only do upsweep or downsweep in a single dispatch, you don't need the "climb down".
You probably don't even need the cumulative workgroup counts, as the way you'd find out whether your current workgroup truly is the last one is by checking the return value of the finished-flag atomic counter, e.g. markFinished:

uint32_t howManyDone = flagsAccessor.atomicAdd(finishedOffset[level]+virtualWorkgroupIndexInThisLevel, 1u);
//uint32_t currentLastWorkgroup = currentLastElement>>ItemsPerWorkgroupLog2;
uint32_t nextLevelLastWorkgroup = currentLastWorkgroup>>ItemsPerWorkgroupLog2;
uint32_t virtualWorkgroupIndexInNextLevel = virtualWorkgroupIndexInThisLevel>>ItemsPerWorkgroupLog2;
// NOT: either last workgroup, or in the last bundle of workgroups and dynamically the last workgroup there
if (howManyDone!=(ItemsPerWorkgroup-1) && (virtualWorkgroupIndexInNextLevel!=nextLevelLastWorkgroup || (virtualWorkgroupIndexInThisLevel&(ItemsPerWorkgroup-1))!=howManyDone))
    return;
// last one takes over
currentLastWorkgroup = nextLevelLastWorkgroup;
btw I'd try to rewrite this as a loop instead of a weird switch that was only written like this to avoid recursion in GLSL
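For illustration, the upsweep bookkeeping could collapse into a loop along these lines (illustrative only; the offset layout is simplified relative to the switch above, and WorkgroupCount(level) is used as in the existing code):

uint32_t cumulative = 0u;
uint32_t flagOffset = 0u;
for (uint32_t level=0u; level<=topLevel; level++)
{
    cumulative += WorkgroupCount(level);
    _schedulerParams.cumulativeWorkgroupCount[level] = cumulative;
    if (level!=topLevel)
    {
        // one finished-flag counter per workgroup of the next level up
        _schedulerParams.finishedFlagOffset[level] = flagOffset;
        flagOffset += WorkgroupCount(level+1u);
    }
}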
/**
 * treeLevel - the current level in the Blelloch Scan
 * localWorkgroupIndex - the workgroup index the current invocation is a part of in the specific virtual dispatch.
 * For example, if we have dispatched 10 workgroups and the virtual workgroup number is 35, then the localWorkgroupIndex should be 5.
 */
template<class Accessor>
bool getWork(NBL_CONST_REF_ARG(DefaultSchedulerParameters_t) params, NBL_CONST_REF_ARG(uint32_t) topLevel, NBL_REF_ARG(uint32_t) treeLevel, NBL_REF_ARG(uint32_t) localWorkgroupIndex, NBL_REF_ARG(Accessor) sharedScratch)
{
you can't do any flagging via shared memory because you want inter-workgroup communication; it must be a BDA address!
even better a bda::__ptr<uint32_t>, cause then you'll have atomicAdd methods
    sharedScratch.workgroupExecutionAndMemoryBarrier();
    const uint32_t globalWorkgroupIndex = sharedScratch.get(0u);

    treeLevel = sharedScratch.get(1u);
    if(treeLevel>lastLevel)
        return true;

    localWorkgroupIndex = globalWorkgroupIndex;
    const bool dependentLevel = treeLevel != 0u;
    if(dependentLevel)
    {
        const uint32_t prevLevel = treeLevel - 1u;
        localWorkgroupIndex -= params.cumulativeWorkgroupCount[prevLevel];
        if(workgroup::SubgroupContiguousIndex() == 0u)
        {
            uint32_t dependentsCount = 1u;
            if(treeLevel <= topLevel)
            {
                dependentsCount = glsl::gl_WorkGroupSize().x;
                const bool lastWorkgroup = (globalWorkgroupIndex+1u)==params.cumulativeWorkgroupCount[treeLevel];
                if (lastWorkgroup)
                {
                    const Parameters_t scanParams = getParameters();
                    dependentsCount = scanParams.lastElement[treeLevel]+1u;
                    if (treeLevel<topLevel)
                    {
                        dependentsCount -= scanParams.lastElement[treeLevel+1u] * glsl::gl_WorkGroupSize().x;
                    }
                }
            }
            uint32_t dependentsFinishedFlagOffset = localWorkgroupIndex;
            if (treeLevel>topLevel) // !(prevLevel<topLevel) TODO: merge with `else` above?
                dependentsFinishedFlagOffset /= glsl::gl_WorkGroupSize().x;
            dependentsFinishedFlagOffset += params.finishedFlagOffset[prevLevel];
            while (scanScratchBuf[0].data[dependentsFinishedFlagOffset]!=dependentsCount)
                glsl::memoryBarrierBuffer();
I expect a rewrite of this into "last one out closes the door"
    if (treeLevel<topLevel)
    {
        // upsweep below the top: many workgroups feed one parent, so atomically bump the shared counter of this workgroup's bundle
        finishedFlagOffset += localWorkgroupIndex/glsl::gl_WorkGroupSize().x;
        glsl::atomicAdd(scanScratchBuf[0].data[finishedFlagOffset], 1u);
    }
    else if (treeLevel!=(topLevel<<1u))
    {
        // top level and downsweep (except the final level): each workgroup has its own flag, so a plain store suffices
        finishedFlagOffset += localWorkgroupIndex;
        scanScratchBuf[0].data[finishedFlagOffset] = 1u;
    }
comments about what and why is happening in the if statements
template<class Accessor>
bool getWork(NBL_REF_ARG(Accessor) accessor)
what's the accessor for? It needs a better name. Also why don't you store the accessor past the create as a member?
It should now work with all sizes of inputs.
// could do scanScratchBuf[0u].workgroupsStarted[SubgroupContiguousIndex()] = 0u but don't know how many invocations are live during this call
if(workgroup::SubgroupContiguousIndex() == 0u)
{
    for(uint32_t i = 0; i < params.topLevel; i++)
    {
        scanScratchBuf[0u].workgroupsStarted[i] = 0u;
    }
}
don't you know the workgroup size?
My bad, this is actually not used anymore. I think it was being used inside an inRange at some point? Not sure. However, we can't reset the workgroupsStarted buffer within the shader at the end of the Reduce step (which was the initial goal), because it's possible that there are still workgroups "alive" that won't be doing any work but will still increase the workgroupsStarted counter, and sometimes we end up doing the reset before those WGs exit, ending up with some 1 values instead of all 0s.
Description
Initial implementation of the Global Scan in HLSL
Testing
Not yet finished
TODO list:
Still need to test the HLSL code, as well as the port of the example code to the new API of the vulkan_1_3 branch