 use_qwix_quantization: False # Whether to use qwix for quantization. If set to True, the model will be quantized using qwix.
 # Quantization calibration method used for weights and activations. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
-quantization_calibration_method: "absmax"
+weight_quantization_calibration_method: "absmax"
+act_quantization_calibration_method: "absmax"
+bwd_quantization_calibration_method: "absmax"
 # Shard the range finding operation for quantization. By default this is set to number of slices.
 quantization_local_shard_count: -1
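The hunk above replaces the single `quantization_calibration_method` knob with separate methods for weights, activations, and the backward pass, so each tensor class can be calibrated independently (the supported method strings are those listed in the linked `qwix/qconfig.py`). As a rough illustration of what an "absmax" calibration does, here is a generic sketch, not qwix's implementation: the scale is taken from the maximum absolute value of the tensor and values are rounded into the integer range.

```python
import jax
import jax.numpy as jnp

def absmax_quantize_int8(x: jax.Array):
  """Generic absmax calibration sketch: scale from max(|x|), then round to int8."""
  scale = jnp.max(jnp.abs(x)) / 127.0
  q = jnp.clip(jnp.round(x / scale), -128, 127).astype(jnp.int8)
  return q, scale

def dequantize(q: jax.Array, scale: jax.Array) -> jax.Array:
  return q.astype(jnp.float32) * scale

x = jax.random.normal(jax.random.PRNGKey(0), (4, 8))
q, scale = absmax_quantize_int8(x)
x_hat = dequantize(q, scale)  # per-element reconstruction error is bounded by ~scale/2
```

Exposing separate knobs is useful because activation and gradient distributions are often heavier-tailed than weight distributions, so they may benefit from a different calibration method than plain absmax.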
@@ -177,10 +179,26 @@ load_balance_loss_weight: 0.01 # weight for the load balance loss
 use_random_routing: False # whether to use random routing for debug/test purpose
 use_custom_sort_vjp: True # whether to use a custom sort vjp for sparse matmul ops
 use_ring_of_experts: False # whether to use ring of experts for sparse matmul expert parallelism
-# Tunable tiling dimensions used for Megablox
-tile_batch_seq: 512
-tile_embed_dim: 1024
-tile_mlp_dim: 1024
+# Tunable tiling dimensions used for MLP GMM, includes Tokamax ragged_dot and Megablox
+wi_tile_fwd_batch_seq: 512
+wi_tile_fwd_embed_dim: 1024
+wi_tile_fwd_mlp_dim: 1024
+wi_tile_dlhs_batch_seq: 512
+wi_tile_dlhs_embed_dim: 1024
+wi_tile_dlhs_mlp_dim: 1024
+wi_tile_drhs_batch_seq: 512
+wi_tile_drhs_embed_dim: 1024
+wi_tile_drhs_mlp_dim: 1024
+
+wo_tile_fwd_batch_seq: 512
+wo_tile_fwd_embed_dim: 1024
+wo_tile_fwd_mlp_dim: 1024
+wo_tile_dlhs_batch_seq: 512
+wo_tile_dlhs_embed_dim: 1024
+wo_tile_dlhs_mlp_dim: 1024
+wo_tile_drhs_batch_seq: 512
+wo_tile_drhs_embed_dim: 1024
+wo_tile_drhs_mlp_dim: 1024
 norm_topk_prob: False # Boolean to enable the top-k probability normalization. Qwen3-specific normalization of router weights.

 # How the expert axis is used to shard attention weights and activations
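The single set of `tile_*` dimensions is expanded into per-matmul knobs: `wi_*` and `wo_*` for the two expert MLP projections, each with `fwd`, `dlhs`, and `drhs` variants (presumably the forward matmul and the two backward matmuls producing the activation and weight gradients), in all three dimensions (batch×sequence, embed, mlp). These act as block-size hints for the grouped matmul (GMM) kernel that computes per-expert products over a ragged token-to-expert assignment. A minimal sketch of that underlying operation using `jax.lax.ragged_dot` (the tile sizes themselves are consumed inside the Megablox/Tokamax kernels, not by this public API; shapes below are illustrative):

```python
import jax
import jax.numpy as jnp

# Illustrative shapes: tokens are assumed to be pre-sorted by expert assignment.
num_experts, tokens, embed_dim, mlp_dim = 4, 512, 1024, 1024

key = jax.random.PRNGKey(0)
x = jax.random.normal(key, (tokens, embed_dim))                 # sorted token activations
wi = jax.random.normal(key, (num_experts, embed_dim, mlp_dim))  # per-expert "wi" weights
# Number of consecutive tokens routed to each expert (ragged group sizes).
group_sizes = jnp.array([128, 128, 192, 64], dtype=jnp.int32)

# Grouped matmul: the i-th row block of x (group_sizes[i] rows) is multiplied by wi[i].
# A tiled GMM kernel walks this product in blocks; the wi_tile_fwd_*, wi_tile_dlhs_*,
# and wi_tile_drhs_* config values are (presumably) the block sizes chosen for the
# forward matmul and the two backward matmuls.
h = jax.lax.ragged_dot(x, wi, group_sizes)                      # (tokens, mlp_dim)
print(h.shape)
```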
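`norm_topk_prob` follows the Qwen3 convention of renormalizing the router probabilities over the selected top-k experts so that the weights applied to each token's experts sum to 1. A small sketch of the difference (illustrative routing logic, not the MaxText implementation):

```python
import jax
import jax.numpy as jnp

def route(logits: jax.Array, k: int, norm_topk_prob: bool):
  """Top-k routing weights; optionally renormalize over the chosen experts."""
  probs = jax.nn.softmax(logits, axis=-1)          # (tokens, num_experts)
  topk_probs, topk_idx = jax.lax.top_k(probs, k)   # (tokens, k) each
  if norm_topk_prob:
    # Qwen3-style: selected weights sum to 1 per token.
    topk_probs = topk_probs / jnp.sum(topk_probs, axis=-1, keepdims=True)
  return topk_probs, topk_idx

logits = jax.random.normal(jax.random.PRNGKey(0), (8, 16))
w_norm, idx = route(logits, k=2, norm_topk_prob=True)
w_raw, _ = route(logits, k=2, norm_topk_prob=False)
```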