-
Notifications
You must be signed in to change notification settings - Fork 145
feat[gpu]: dyn dispatch patches infrastructure #7431
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
0ax1
wants to merge
7
commits into
develop
Choose a base branch
from
ad/cuda-patches-clean
base: develop
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
bbcf7ff
refactor(cuda): per-stage source patches in fused dynamic dispatch
0ax1 514e1bb
perf(cuda): narrow dispatch plan struct fields and reorder for alignment
0ax1 64f6237
test(cuda): add fused dispatch tests for BitPacked with patches
0ax1 eb4be24
style(cuda): clang-format dynamic dispatch files
0ax1 5f6fcae
fix(cuda): restore patches gate in is_dyn_dispatch_compatible
0ax1 771d471
doc
0ax1 3204a32
can't be 0
0ax1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,7 +107,7 @@ union SourceParams { | |
| /// Unpack FastLanes bit-packed data. | ||
| struct BitunpackParams { | ||
| uint8_t bit_width; | ||
| uint32_t element_offset; // Sub-byte offset | ||
| uint16_t element_offset; // Sub-block offset (0..1023) | ||
| } bitunpack; | ||
|
|
||
| /// Copy from global to shared memory. | ||
|
|
@@ -120,10 +120,10 @@ union SourceParams { | |
| /// The smem offsets are byte offsets so that ends and values can have | ||
| /// different element widths. | ||
| struct RunEndParams { | ||
| uint32_t ends_smem_byte_offset; // byte offset to decoded ends in smem | ||
| uint32_t values_smem_byte_offset; // byte offset to decoded values in smem | ||
| uint64_t num_runs; | ||
| uint64_t offset; // slice offset into the run-end encoded array | ||
| uint16_t ends_smem_byte_offset; // byte offset to decoded ends in smem | ||
| uint16_t values_smem_byte_offset; // byte offset to decoded values in smem | ||
| uint32_t num_runs; | ||
| uint32_t offset; // slice offset into the run-end encoded array | ||
| } runend; | ||
|
|
||
| /// Generate a linear sequence: `value[i] = base + i * multiplier`. | ||
|
|
@@ -134,8 +134,8 @@ union SourceParams { | |
| }; | ||
|
|
||
| struct SourceOp { | ||
| enum SourceOpCode { BITUNPACK, LOAD, RUNEND, SEQUENCE } op_code; | ||
| union SourceParams params; | ||
| enum SourceOpCode : uint8_t { BITUNPACK, LOAD, RUNEND, SEQUENCE } op_code; | ||
| }; | ||
|
|
||
| /// Scalar ops: element-wise transforms in registers. | ||
|
|
@@ -166,17 +166,17 @@ union ScalarParams { | |
| /// `output_ptype` (on the enclosing ScalarOp) to determine the values' | ||
| /// element type. | ||
| struct DictParams { | ||
| uint32_t values_smem_byte_offset; // byte offset to decoded dict values in smem | ||
| uint16_t values_smem_byte_offset; // byte offset to decoded dict values in smem | ||
| } dict; | ||
| }; | ||
|
|
||
| struct ScalarOp { | ||
| enum ScalarOpCode { FOR, ZIGZAG, ALP, DICT } op_code; | ||
| union ScalarParams params; | ||
| enum ScalarOpCode : uint8_t { FOR, ZIGZAG, ALP, DICT } op_code; | ||
| /// The PType this op produces. For type-preserving ops (FOR, ZIGZAG) | ||
| /// this equals the input PType. For type-changing ops (ALP, DICT) this | ||
| /// is the new output PType. | ||
| enum PTypeTag output_ptype; | ||
| union ScalarParams params; | ||
| }; | ||
|
|
||
| /// Packed stage header, followed by `num_scalar_ops` inline ScalarOps. | ||
|
|
@@ -188,11 +188,11 @@ struct ScalarOp { | |
| /// `smem_byte_offset` is a byte offset into the dynamic shared memory | ||
| /// pool so that stages with different element widths can coexist. | ||
| struct PackedStage { | ||
| uint64_t input_ptr; // global memory pointer to this stage's encoded input | ||
| uint32_t smem_byte_offset; // byte offset within dynamic shared memory for output | ||
| uint32_t len; // number of elements this stage produces | ||
|
|
||
| uint64_t input_ptr; // global memory pointer to this stage's encoded input | ||
| uint64_t patches_ptr; // device ptr to packed source patches (0 = none) | ||
| struct SourceOp source; | ||
| uint32_t len; // number of elements this stage produces | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Happy to expand this again to u64, but for now let's assume and guard we stay within u32. If we go past u32, we should also try micro/macro-benchmarking in that range. |
||
| uint16_t smem_byte_offset; // byte offset within dynamic shared memory for output | ||
| uint8_t num_scalar_ops; | ||
| enum PTypeTag source_ptype; // PType produced by the source op | ||
| }; | ||
|
|
@@ -221,12 +221,13 @@ struct __attribute__((aligned(8))) PlanHeader { | |
| /// `output_ptype` (or `source_ptype` if there are no scalar ops). | ||
| struct Stage { | ||
| uint64_t input_ptr; // encoded input in global memory | ||
| uint32_t smem_byte_offset; // byte offset within dynamic shared memory | ||
| const struct ScalarOp *scalar_ops; // pointer into packed plan buffer | ||
| uint64_t patches_ptr; // device ptr to packed source patches (0 = none) | ||
| uint32_t len; // elements produced | ||
| uint16_t smem_byte_offset; // byte offset within dynamic shared memory | ||
| enum PTypeTag source_ptype; // PType produced by the source op | ||
| struct SourceOp source; // source decode op | ||
| uint8_t num_scalar_ops; // number of scalar ops | ||
| const struct ScalarOp *scalar_ops; // scalar decode ops | ||
| struct SourceOp source; // source decode op | ||
| }; | ||
|
|
||
| /// Parse a single stage from the packed plan byte buffer and advance the cursor. | ||
|
|
@@ -243,12 +244,13 @@ __device__ inline Stage parse_stage(const uint8_t *&cursor) { | |
|
|
||
| return Stage { | ||
| .input_ptr = packed_stage->input_ptr, | ||
| .smem_byte_offset = packed_stage->smem_byte_offset, | ||
| .scalar_ops = ops, | ||
| .patches_ptr = packed_stage->patches_ptr, | ||
| .len = packed_stage->len, | ||
| .smem_byte_offset = packed_stage->smem_byte_offset, | ||
| .source_ptype = packed_stage->source_ptype, | ||
| .source = packed_stage->source, | ||
| .num_scalar_ops = packed_stage->num_scalar_ops, | ||
| .scalar_ops = ops, | ||
| .source = packed_stage->source, | ||
| }; | ||
| } | ||
|
|
||
|
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Current max shared memory with guard is 48KB.