Skip to content

Commit 3e5428a

Browse files
committed
Support not writing top mip for HiZ.
It's very unlikely that it'll matter in practice and it saves a lot of GPU time to not have to write out full-res.
1 parent 2a95dcb commit 3e5428a

File tree

4 files changed

+63
-37
lines changed

4 files changed

+63
-37
lines changed

assets/shaders/inc/meshlet_render.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ layout(set = MESHLET_RENDER_DESCRIPTOR_SET, binding = MESHLET_RENDER_FRUSTUM_BIN
5757
mat4 view;
5858
vec4 viewport_scale_bias;
5959
ivec2 hiz_resolution;
60+
int hiz_min_lod;
6061
int hiz_max_lod;
6162
} frustum;
6263

@@ -114,13 +115,16 @@ bool hiz_cull(vec2 view_range_x, vec2 view_range_y, float closest_z)
114115
// We need to sample from a LOD where there is at most one texel delta
115116
// between lo/hi values.
116117
int max_delta = max(ix.y - ix.x, iy.y - iy.x);
117-
int lod = min(findMSB(max_delta - 1) + 1, frustum.hiz_max_lod);
118+
int lod = clamp(findMSB(max_delta - 1) + 1, frustum.hiz_min_lod, frustum.hiz_max_lod);
118119
ivec2 lod_max_coord = max(frustum.hiz_resolution >> lod, ivec2(1)) - 1;
119120
ix = min(ix >> lod, lod_max_coord.xx);
120121
iy = min(iy >> lod, lod_max_coord.yy);
121122

122123
ivec2 hiz_coord = ivec2(ix.x, iy.x);
123124

125+
// We didn't write the top LOD.
126+
lod -= frustum.hiz_min_lod;
127+
124128
float d = texelFetch(uHiZDepth, hiz_coord, lod).x;
125129
bool nx = ix.y != ix.x;
126130
bool ny = iy.y != iy.x;

assets/shaders/post/hiz.comp

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
#version 450
22
#extension GL_KHR_shader_subgroup_basic : require
3-
#extension GL_KHR_shader_subgroup_shuffle : require
43
#extension GL_KHR_shader_subgroup_quad : require
54

65
// A rewrite of SPD to support HiZ correctly and moar wave ops for good measure.
76

87
layout(local_size_x = 256) in;
98

10-
layout(set = 0, binding = 0, r32f) uniform writeonly image2D uImageTop;
9+
#if defined(WRITE_TOP_LEVEL) && WRITE_TOP_LEVEL
10+
layout(set = 0, binding = 0, r32f) coherent writeonly uniform image2D uImageTop;
11+
#endif
1112
layout(set = 0, binding = 1, r32f) coherent uniform image2D uImages[12];
1213
layout(set = 1, binding = 0) uniform sampler2D uTexture;
1314
layout(set = 1, binding = 1) buffer Counter
@@ -66,13 +67,12 @@ void write_image(ivec2 coord, int mip, float v)
6667
imageStore(uImages[mip - 1], coord, vec4(v));
6768
}
6869

69-
void write_image4_top(ivec2 coord, int mip, vec4 v)
70+
#if defined(WRITE_TOP_LEVEL) && WRITE_TOP_LEVEL
71+
void write_image_top(ivec2 coord, float v)
7072
{
71-
imageStore(uImageTop, coord + ivec2(0, 0), v.xxxx);
72-
imageStore(uImageTop, coord + ivec2(1, 0), v.yyyy);
73-
imageStore(uImageTop, coord + ivec2(0, 1), v.zzzz);
74-
imageStore(uImageTop, coord + ivec2(1, 1), v.wwww);
73+
imageStore(uImageTop, coord, vec4(v, 0, 0, 0));
7574
}
75+
#endif
7676

7777
const int SHARED_WIDTH = 32;
7878
const int SHARED_HEIGHT = 32;
@@ -120,11 +120,35 @@ float fetch_image_mip6(ivec2 coord)
120120
return imageLoad(uImages[5], coord).x;
121121
}
122122

123-
vec4 write_mip0_transformed(vec4 v, ivec2 base_coord)
123+
vec4 write_mip0_transformed(vec4 v, ivec2 base_coord, ivec2 local_coord)
124124
{
125125
v = transform_z(v);
126+
127+
#if defined(WRITE_TOP_LEVEL) && WRITE_TOP_LEVEL
128+
// Ensure that top-level image is written with full cache lines per warp.
129+
// Writing in the strided 2x2 pattern is noticeably bad for L2 performance.
130+
// Taking extra time on the shader cores to reshuffle data is actually beneficial since we're fully bandwidth bound
131+
// in these shaders, so we should give the memory system all the help it can get.
132+
store_shared(2 * local_coord + ivec2(0, 0), v.x);
133+
store_shared(2 * local_coord + ivec2(1, 0), v.y);
134+
store_shared(2 * local_coord + ivec2(0, 1), v.z);
135+
store_shared(2 * local_coord + ivec2(1, 1), v.w);
136+
137+
barrier();
138+
126139
// Write out transformed LOD 0
127-
write_image4_top(base_coord, 0, v);
140+
for (int y = 0; y < 2; y++)
141+
{
142+
for (int x = 0; x < 2; x++)
143+
{
144+
ivec2 tile_offset = ivec2(x, y) * 16;
145+
write_image_top(base_coord + tile_offset + local_coord, load_shared(local_coord + tile_offset));
146+
}
147+
}
148+
149+
barrier();
150+
#endif
151+
128152
return v;
129153
}
130154

@@ -241,10 +265,14 @@ void main()
241265
// It seems like we need to be super careful about memory access patterns to get optimal bandwidth.
242266

243267
// LOD 0 feedback with transform.
244-
vec4 tile00 = write_mip0_transformed(fetch_2x2_texture(base_coord_00), base_coord_00);
245-
vec4 tile10 = write_mip0_transformed(fetch_2x2_texture(base_coord_10), base_coord_10);
246-
vec4 tile01 = write_mip0_transformed(fetch_2x2_texture(base_coord_01), base_coord_01);
247-
vec4 tile11 = write_mip0_transformed(fetch_2x2_texture(base_coord_11), base_coord_11);
268+
vec4 tile00 = write_mip0_transformed(
269+
fetch_2x2_texture(base_coord_00), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(0, 0), ivec2(local_coord));
270+
vec4 tile10 = write_mip0_transformed(
271+
fetch_2x2_texture(base_coord_10), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(32, 0), ivec2(local_coord));
272+
vec4 tile01 = write_mip0_transformed(
273+
fetch_2x2_texture(base_coord_01), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(0, 32), ivec2(local_coord));
274+
vec4 tile11 = write_mip0_transformed(
275+
fetch_2x2_texture(base_coord_11), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(32, 32), ivec2(local_coord));
248276
if (registers.mips <= 1)
249277
return;
250278

@@ -275,6 +303,7 @@ void main()
275303
store_shared(local_coord_shared + ivec2(0, 8), reduced01);
276304
store_shared(local_coord_shared + ivec2(8, 8), reduced11);
277305
}
306+
278307
barrier();
279308

280309
// Write LOD 3

tests/hiz.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,14 @@ int main()
9797
dev.begin_renderdoc_capture();
9898

9999
auto cmd = dev.request_command_buffer();
100-
cmd->set_program("builtin://shaders/post/hiz.comp");
100+
cmd->set_program("builtin://shaders/post/hiz.comp", {{ "WRITE_TOP_LEVEL", 1 }});
101101
for (unsigned i = 0; i < 13; i++)
102102
cmd->set_storage_texture(0, i, *views[i < push.mips ? i : (push.mips - 1)]);
103103
cmd->set_texture(1, 0, img->get_view(), StockSampler::NearestClamp);
104104
cmd->set_storage_buffer(1, 1, *counter_buffer);
105105
cmd->push_constants(&push, 0, sizeof(push));
106106
cmd->enable_subgroup_size_control(true);
107-
cmd->set_subgroup_size_log2(true, 4, 7);
107+
cmd->set_subgroup_size_log2(true, 2, 7);
108108
cmd->dispatch(wg_x, wg_y, 1);
109109
dev.submit(cmd);
110110

tests/meshlet_viewer.cpp

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,7 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler //
438438
mat4 view;
439439
vec4 viewport_scale_bias;
440440
uvec2 hiz_resolution;
441+
uint hiz_min_lod;
441442
uint hiz_max_lod;
442443
};
443444

@@ -463,9 +464,10 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler //
463464

464465
ubo->view = render_context.get_render_parameters().view;
465466
ubo->viewport_scale_bias = viewport_scale_bias;
466-
ubo->hiz_resolution.x = hiz->get_view_width();
467-
ubo->hiz_resolution.y = hiz->get_view_height();
468-
ubo->hiz_max_lod = hiz->get_create_info().levels - 1;
467+
ubo->hiz_resolution.x = hiz->get_view_width() * 2;
468+
ubo->hiz_resolution.y = hiz->get_view_height() * 2;
469+
ubo->hiz_min_lod = 1;
470+
ubo->hiz_max_lod = hiz->get_create_info().levels;
469471
}
470472
};
471473

@@ -805,24 +807,15 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler //
805807
(depth_view.get_view_width() + 63u) & ~63u,
806808
(depth_view.get_view_height() + 63u) & ~63u,
807809
VK_FORMAT_R32_SFLOAT);
810+
info.width /= 2;
811+
info.height /= 2;
808812
info.usage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
809813
info.initial_layout = VK_IMAGE_LAYOUT_UNDEFINED;
810-
info.levels = Util::floor_log2(max(depth_view.get_view_width(), depth_view.get_view_height()));
814+
info.levels = Util::floor_log2(max(depth_view.get_view_width(), depth_view.get_view_height())) - 1;
815+
info.misc |= IMAGE_MISC_CREATE_PER_MIP_LEVEL_VIEWS_BIT;
811816

812817
auto hiz = device.create_image(info);
813818

814-
ImageViewHandle views[13];
815-
for (unsigned i = 0; i < info.levels; i++)
816-
{
817-
ImageViewCreateInfo view = {};
818-
view.base_level = i;
819-
view.levels = 1;
820-
view.image = hiz.get();
821-
view.view_type = VK_IMAGE_VIEW_TYPE_2D;
822-
view.aspect = VK_IMAGE_ASPECT_COLOR_BIT;
823-
views[i] = device.create_image_view(view);
824-
}
825-
826819
struct Push
827820
{
828821
mat2 z_transform;
@@ -846,22 +839,22 @@ struct MeshletViewerApplication : Granite::Application, Granite::EventHandler //
846839

847840
Push push = {};
848841
push.z_transform = inv_z;
849-
push.resolution = uvec2(info.width, info.height);
842+
push.resolution = uvec2(info.width * 2, info.height * 2);
850843
push.inv_resolution = vec2(1.0f / float(depth_view.get_view_width()), 1.0f / float(depth_view.get_view_height()));
851-
push.mips = info.levels;
844+
push.mips = info.levels + 1;
852845

853846
uint32_t wg_x = (push.resolution.x + 63) / 64;
854847
uint32_t wg_y = (push.resolution.y + 63) / 64;
855848
push.target_counter = wg_x * wg_y;
856849

857850
cmd->set_program("builtin://shaders/post/hiz.comp");
858-
for (unsigned i = 0; i < 13; i++)
859-
cmd->set_storage_texture(0, i, *views[i < push.mips ? i : (push.mips - 1)]);
851+
for (unsigned i = 0; i < 12; i++)
852+
cmd->set_storage_texture_level(0, i + 1, hiz->get_view(), i < info.levels ? i : (info.levels - 1));
860853
cmd->set_texture(1, 0, depth_view, StockSampler::NearestClamp);
861854
cmd->set_storage_buffer(1, 1, *counter);
862855
cmd->push_constants(&push, 0, sizeof(push));
863856
cmd->enable_subgroup_size_control(true);
864-
cmd->set_subgroup_size_log2(true, 4, 7);
857+
cmd->set_subgroup_size_log2(true, 2, 7);
865858

866859
auto start_ts = cmd->write_timestamp(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
867860

0 commit comments

Comments
 (0)