diff --git a/csrc/kernels/internode_ll.cu b/csrc/kernels/internode_ll.cu index 9cd2bfb5..3e9ead7d 100644 --- a/csrc/kernels/internode_ll.cu +++ b/csrc/kernels/internode_ll.cu @@ -72,7 +72,7 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales, const size_t hidden_bytes = kHidden * (kUseFP8 ? sizeof(__nv_fp8_storage_t) : sizeof(nv_bfloat16)); const size_t hidden_int4 = hidden_bytes / sizeof(int4); - // Message package: hidden data, FP8 scales, index at source + // Message package: index at source (int), 3 reserved int fields, hidden data, FP8 scales // NOTES: currently we have 3 reserved int fields for future use using vec_t = std::conditional_t; const size_t num_bytes_per_msg = sizeof(int4) + (kUseFP8 ? (kHidden + num_scales * sizeof(float)) : (kHidden * sizeof(nv_bfloat16)));