|
1 | 1 | #version 450
|
2 | 2 | #extension GL_KHR_shader_subgroup_basic : require
|
3 |
| -#extension GL_KHR_shader_subgroup_shuffle : require |
4 | 3 | #extension GL_KHR_shader_subgroup_quad : require
|
5 | 4 |
|
6 | 5 | // A rewrite of SPD to support HiZ correctly and moar wave ops for good measure.
|
7 | 6 |
|
8 | 7 | layout(local_size_x = 256) in;
|
9 | 8 |
|
10 |
| -layout(set = 0, binding = 0, r32f) uniform writeonly image2D uImageTop; |
| 9 | +#if defined(WRITE_TOP_LEVEL) && WRITE_TOP_LEVEL |
| 10 | +layout(set = 0, binding = 0, r32f) coherent writeonly uniform image2D uImageTop; |
| 11 | +#endif |
11 | 12 | layout(set = 0, binding = 1, r32f) coherent uniform image2D uImages[12];
|
12 | 13 | layout(set = 1, binding = 0) uniform sampler2D uTexture;
|
13 | 14 | layout(set = 1, binding = 1) buffer Counter
|
@@ -66,13 +67,12 @@ void write_image(ivec2 coord, int mip, float v)
|
66 | 67 | imageStore(uImages[mip - 1], coord, vec4(v));
|
67 | 68 | }
|
68 | 69 |
|
69 |
| -void write_image4_top(ivec2 coord, int mip, vec4 v) |
| 70 | +#if defined(WRITE_TOP_LEVEL) && WRITE_TOP_LEVEL |
| 71 | +void write_image_top(ivec2 coord, float v) |
70 | 72 | {
|
71 |
| - imageStore(uImageTop, coord + ivec2(0, 0), v.xxxx); |
72 |
| - imageStore(uImageTop, coord + ivec2(1, 0), v.yyyy); |
73 |
| - imageStore(uImageTop, coord + ivec2(0, 1), v.zzzz); |
74 |
| - imageStore(uImageTop, coord + ivec2(1, 1), v.wwww); |
| 73 | + imageStore(uImageTop, coord, vec4(v, 0, 0, 0)); |
75 | 74 | }
|
| 75 | +#endif |
76 | 76 |
|
77 | 77 | const int SHARED_WIDTH = 32;
|
78 | 78 | const int SHARED_HEIGHT = 32;
|
@@ -120,11 +120,35 @@ float fetch_image_mip6(ivec2 coord)
|
120 | 120 | return imageLoad(uImages[5], coord).x;
|
121 | 121 | }
|
122 | 122 |
|
123 |
| -vec4 write_mip0_transformed(vec4 v, ivec2 base_coord) |
| 123 | +vec4 write_mip0_transformed(vec4 v, ivec2 base_coord, ivec2 local_coord) |
124 | 124 | {
|
125 | 125 | v = transform_z(v);
|
| 126 | + |
| 127 | +#if defined(WRITE_TOP_LEVEL) && WRITE_TOP_LEVEL |
| 128 | + // Ensure that top-level image is written with full cache lines per warp. |
| 129 | + // Writing in the strided 2x2 pattern is noticeably bad for L2 performance. |
| 130 | + // Taking extra time on the shader cores to reshuffle data is actually beneficial since we're fully bandwidth bound |
| 131 | + // in these shaders, so we should give the memory system all the help it can get. |
| 132 | + store_shared(2 * local_coord + ivec2(0, 0), v.x); |
| 133 | + store_shared(2 * local_coord + ivec2(1, 0), v.y); |
| 134 | + store_shared(2 * local_coord + ivec2(0, 1), v.z); |
| 135 | + store_shared(2 * local_coord + ivec2(1, 1), v.w); |
| 136 | + |
| 137 | + barrier(); |
| 138 | + |
126 | 139 | // Write out transformed LOD 0
|
127 |
| - write_image4_top(base_coord, 0, v); |
| 140 | + for (int y = 0; y < 2; y++) |
| 141 | + { |
| 142 | + for (int x = 0; x < 2; x++) |
| 143 | + { |
| 144 | + ivec2 tile_offset = ivec2(x, y) * 16; |
| 145 | + write_image_top(base_coord + tile_offset + local_coord, load_shared(local_coord + tile_offset)); |
| 146 | + } |
| 147 | + } |
| 148 | + |
| 149 | + barrier(); |
| 150 | +#endif |
| 151 | + |
128 | 152 | return v;
|
129 | 153 | }
|
130 | 154 |
|
@@ -241,10 +265,14 @@ void main()
|
241 | 265 | // It seems like we need to be super careful about memory access patterns to get optimal bandwidth.
|
242 | 266 |
|
243 | 267 | // LOD 0 feedback with transform.
|
244 |
| - vec4 tile00 = write_mip0_transformed(fetch_2x2_texture(base_coord_00), base_coord_00); |
245 |
| - vec4 tile10 = write_mip0_transformed(fetch_2x2_texture(base_coord_10), base_coord_10); |
246 |
| - vec4 tile01 = write_mip0_transformed(fetch_2x2_texture(base_coord_01), base_coord_01); |
247 |
| - vec4 tile11 = write_mip0_transformed(fetch_2x2_texture(base_coord_11), base_coord_11); |
| 268 | + vec4 tile00 = write_mip0_transformed( |
| 269 | + fetch_2x2_texture(base_coord_00), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(0, 0), ivec2(local_coord)); |
| 270 | + vec4 tile10 = write_mip0_transformed( |
| 271 | + fetch_2x2_texture(base_coord_10), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(32, 0), ivec2(local_coord)); |
| 272 | + vec4 tile01 = write_mip0_transformed( |
| 273 | + fetch_2x2_texture(base_coord_01), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(0, 32), ivec2(local_coord)); |
| 274 | + vec4 tile11 = write_mip0_transformed( |
| 275 | + fetch_2x2_texture(base_coord_11), ivec2(gl_WorkGroupID.xy * 64u) + ivec2(32, 32), ivec2(local_coord)); |
248 | 276 | if (registers.mips <= 1)
|
249 | 277 | return;
|
250 | 278 |
|
@@ -275,6 +303,7 @@ void main()
|
275 | 303 | store_shared(local_coord_shared + ivec2(0, 8), reduced01);
|
276 | 304 | store_shared(local_coord_shared + ivec2(8, 8), reduced11);
|
277 | 305 | }
|
| 306 | + |
278 | 307 | barrier();
|
279 | 308 |
|
280 | 309 | // Write LOD 3
|
|
0 commit comments