Skip to content

Commit

Permalink
ADLR/megatron-lm!1913 - bugfix for multiple context managers
Browse files Browse the repository at this point in the history
Co-authored-by: Xin Yao <[email protected]>
  • Loading branch information
2 people authored and ko3n1g committed Nov 25, 2024
1 parent 3a32fbc commit a1fbf86
Show file tree
Hide file tree
Showing 14 changed files with 1,810 additions and 1,057 deletions.
2 changes: 1 addition & 1 deletion megatron/core/transformer/transformer_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ def forward(
else:
fp8_context = nullcontext()

- with rng_context and fp8_context:
+ with rng_context, fp8_context:
# Forward pass.
if self.config.recompute_granularity == 'full' and self.training:
hidden_states = self._checkpointed_forward(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,33 @@
"end_step": 50,
"step_interval": 5,
"values": [
10.81962,
10.8674,
10.8579,
10.80754,
10.71119,
10.63665,
10.16221,
10.27928,
10.18799,
9.89003
10.82445,
10.86393,
10.85733,
10.80809,
10.70951,
10.63738,
10.16425,
10.28201,
10.19003,
9.88697
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
12597.0,
15988.0,
16507.0,
15995.0,
14088.0,
14994.0,
12887.0,
15815.0,
17017.0,
17439.0
12678.0,
16220.0,
16626.0,
16055.0,
13829.0,
14904.0,
12931.0,
15765.0,
16771.0,
17621.0
]
},
"iteration-time": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,33 @@
"end_step": 50,
"step_interval": 5,
"values": [
10.81962,
10.8674,
10.8579,
10.80754,
10.71119,
10.63665,
10.16221,
10.27928,
10.18787,
9.88951
10.82445,
10.86393,
10.85733,
10.80809,
10.70951,
10.63738,
10.16425,
10.28201,
10.19003,
9.88697
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
12597.0,
15988.0,
16507.0,
15995.0,
14088.0,
14994.0,
12887.0,
15815.0,
17049.0,
17592.0
12678.0,
16220.0,
16626.0,
16055.0,
13829.0,
14904.0,
12931.0,
15765.0,
16771.0,
17621.0
]
},
"iteration-time": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,33 @@
"end_step": 50,
"step_interval": 5,
"values": [
10.79806,
10.86449,
10.87223,
10.80743,
10.71153,
10.63864,
10.19312,
10.30941,
10.22013,
9.91591
10.79987,
10.85947,
10.86478,
10.80039,
10.70971,
10.63893,
10.19526,
10.31102,
10.22247,
9.91425
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
31034.0,
36990.0,
37990.0,
36195.0,
33575.0,
34963.0,
31002.0,
34952.0,
36574.0,
37403.0
30798.0,
37696.0,
37844.0,
36275.0,
33140.0,
35137.0,
30638.0,
35309.0,
36677.0,
37604.0
]
},
"iteration-time": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,37 @@
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.79806, 10.86508, 10.87232, 10.80773, 10.71115, 10.63886, 10.19259, 10.30975, 10.22077, 9.9157]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31010.0, 37093.0, 37540.0, 35923.0, 33445.0, 34824.0, 30686.0, 35286.0, 36691.0, 37420.0]}, "iteration_timing_avg": 0.3566726470588235}
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.79987,
10.85983,
10.865,
10.799,
10.70987,
10.63782,
10.1965,
10.3099,
10.22262,
9.91423
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
30784.0,
37528.0,
37616.0,
36105.0,
33464.0,
34923.0,
30806.0,
35663.0,
36661.0,
37641.0
]
},
"iteration_timing_avg": 0.3566726470588235
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,33 @@
"end_step": 50,
"step_interval": 5,
"values": [
10.80392,
10.86451,
10.86407,
10.80254,
10.71523,
10.64479,
10.21223,
10.32267,
10.22495,
9.93003
10.8029,
10.86149,
10.86819,
10.80829,
10.72062,
10.64588,
10.21132,
10.32324,
10.2265,
9.92918
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
31227.0,
37874.0,
37773.0,
35936.0,
33255.0,
34279.0,
30117.0,
35460.0,
36069.0,
36785.0
31473.0,
37753.0,
38332.0,
36348.0,
33270.0,
34310.0,
30284.0,
35432.0,
36356.0,
37109.0
]
},
"iteration-time": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,37 @@
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.80392, 10.86451, 10.86407, 10.80254, 10.71523, 10.64479, 10.21223, 10.32267, 10.22495, 9.93003]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [31227.0, 37874.0, 37773.0, 35936.0, 33255.0, 34279.0, 30117.0, 35460.0, 36069.0, 36785.0]}, "iteration_timing_avg": 0.21900323529411767}
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.8029,
10.86149,
10.86819,
10.80829,
10.72062,
10.64588,
10.21132,
10.32324,
10.2265,
9.92918
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
31473.0,
37753.0,
38332.0,
36348.0,
33270.0,
34310.0,
30284.0,
35432.0,
36356.0,
37109.0
]
},
"iteration_timing_avg": 0.21900323529411767
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,33 @@
"end_step": 50,
"step_interval": 5,
"values": [
10.83503,
10.88475,
10.87872,
10.81608,
10.69357,
10.60024,
10.08934,
10.21378,
10.10871,
9.78568
10.83445,
10.87978,
10.87924,
10.81567,
10.69374,
10.60333,
10.08824,
10.21471,
10.10778,
9.78309
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
26744.0,
33099.0,
33750.0,
31697.0,
28979.0,
30817.0,
28713.0,
33425.0,
33927.0,
35074.0
26648.0,
32884.0,
33611.0,
31683.0,
28744.0,
30671.0,
28602.0,
33538.0,
34560.0,
35099.0
]
},
"iteration-time": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,37 @@
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.83503, 10.88475, 10.87872, 10.81608, 10.69357, 10.60024, 10.08934, 10.21378, 10.10871, 9.78568]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [26744.0, 33099.0, 33750.0, 31697.0, 28979.0, 30817.0, 28713.0, 33425.0, 33927.0, 35074.0]}, "iteration_timing_avg": 0.28211852941176474}
{
"lm loss": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
10.83445,
10.87978,
10.87924,
10.81567,
10.69374,
10.60333,
10.08824,
10.21471,
10.10778,
9.78309
]
},
"num-zeros": {
"start_step": 0,
"end_step": 50,
"step_interval": 5,
"values": [
26648.0,
32884.0,
33611.0,
31683.0,
28744.0,
30671.0,
28602.0,
33538.0,
34560.0,
35099.0
]
},
"iteration_timing_avg": 0.28211852941176474
}
Loading

0 comments on commit a1fbf86

Please sign in to comment.