Commit 6160140

amaurya authored and committed
Fix whitespaces
Signed-off-by: amaurya <[email protected]>
1 parent: 83b66d9 · commit: 6160140

4 files changed: 12 additions & 1 deletion

deepspeed/datastates/README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# DataStates-LLM checkpointing engine.
+
+This feature is not enabled by default. To enable, set the following options in ds_config.json and download [DataStates-LLM checkpointing library](https://github.com/DataStates/datastates-llm/). A detailed tutorial is available [here](../../docs/_tutorials/datastates-async-checkpointing.md).
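For context (not part of this commit), below is a rough sketch of enabling the feature from Python by passing an equivalent config dict to `deepspeed.initialize` instead of a ds_config.json file. The `datastates_ckpt` section name and the example values are assumptions based on the options described in the checkpoint-engine README below; consult the linked tutorial for the exact keys.

```python
# Hypothetical sketch: enabling DataStates-LLM checkpointing from Python.
# The "datastates_ckpt" key and its values are assumptions; consult the
# DataStates-LLM tutorial linked above for the exact option names.
import deepspeed
import torch

model = torch.nn.Linear(1024, 1024)

ds_config = {
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    "datastates_ckpt": {            # assumed section name
        "host_cache_size": 16,      # pinned host memory reserved for async flushing, in GB
        "parser_threads": 8,        # checkpoint file requests parsed in parallel
    },
}

# deepspeed.initialize accepts the config as a dict instead of a ds_config.json path.
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)
```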

deepspeed/runtime/checkpoint_engine/README.md

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ class CheckpointEngine(object):
 
 ### Asynchronous Lazy Checkpointing using DataStates-LLM
 
-DataStates-LLM is an asynchronous checkpointing approach optimized for LLM pre-training and can be obtained at https://github.com/DataStates/datastates-llm. To enable datastates-llm checkpointing, specify the `host_cache_size` (in gigabytes) which reserves pinned host memory for asynchronous checkpoint flushing, and `parser_threads` to parse multiple checkpoint file requests in parallel using the following lines in config.json supplied during the launch:
+DataStates-LLM is an asynchronous checkpointing approach optimized for LLM pre-training and can be obtained at https://github.com/DataStates/datastates-llm. A detailed tutorial is available [here](../../../docs/_tutorials/datastates-async-checkpointing.md). To enable datastates-llm checkpointing, specify the `host_cache_size` (in gigabytes) which reserves pinned host memory for asynchronous checkpoint flushing, and `parser_threads` to parse multiple checkpoint file requests in parallel using the following lines in config.json supplied during the launch:
 ```
 {
     ... other deepspeed config options,
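The hunk above is truncated mid-snippet; the full config example lives in the README itself. As an illustration of how the engine is exercised once configured (again, not part of this commit), the sketch below continues the previous example. `save_checkpoint` is the standard DeepSpeed engine method; with an asynchronous engine, the intent is that it can return after the GPU-to-pinned-host copy while the flush to persistent storage continues in the background.

```python
# Sketch of a training loop using the model_engine from the previous example.
# The checkpoint directory and tag are placeholders.
import torch

for step in range(3000):
    batch = torch.randn(8, 1024).to(model_engine.device)
    loss = model_engine(batch).sum()
    model_engine.backward(loss)
    model_engine.step()

    if step % 1000 == 0:
        # With an asynchronous engine, save_checkpoint can return once the
        # parameters are staged in pinned host memory; writing them out
        # continues in the background while training proceeds.
        model_engine.save_checkpoint("checkpoints", tag=f"step{step}")
```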

deepspeed/runtime/config.py

Lines changed: 2 additions & 0 deletions
@@ -57,6 +57,7 @@
 from ..profiling.config import DeepSpeedFlopsProfilerConfig
 from ..autotuning.config import DeepSpeedAutotuningConfig
 from ..nebula.config import DeepSpeedNebulaConfig
+from ..datastates.config import DeepSpeedDataStatesConfig
 
 from ..compression.config import get_compression_config, get_quantize_enabled
 from ..compression.constants import *
@@ -908,6 +909,7 @@ def _initialize_params(self, param_dict):
         self.dataloader_drop_last = get_dataloader_drop_last(param_dict)
 
         self.nebula_config = DeepSpeedNebulaConfig(param_dict)
+        self.datastates_config = DeepSpeedDataStatesConfig(param_dict)
 
         self.weight_quantization_config = WeightQuantConfig(
             **param_dict['weight_quantization']) if 'weight_quantization' in param_dict else None
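The new `DeepSpeedDataStatesConfig` is parsed alongside the other per-feature config objects. The actual class lives in `deepspeed/datastates/config.py` and is not shown in this commit; the snippet below is only a hypothetical sketch of the usual DeepSpeed pattern of pulling one section out of the user-supplied `param_dict`, with the `datastates_ckpt` key, field names, and defaults all assumed.

```python
# Hypothetical illustration of the config-object pattern used above: a small
# wrapper that extracts its section from the user-supplied param_dict and
# falls back to defaults when the section is absent. The real class may differ.
class DataStatesConfigSketch:
    def __init__(self, param_dict):
        section = param_dict.get("datastates_ckpt", {})   # assumed section name
        self.enabled = bool(section)                       # feature off unless configured
        self.host_cache_size = section.get("host_cache_size", 0)  # pinned host memory (GB)
        self.parser_threads = section.get("parser_threads", 1)    # parallel request parsers


# Usage mirroring _initialize_params: construct it from the full config dict.
cfg = DataStatesConfigSketch({"datastates_ckpt": {"host_cache_size": 16, "parser_threads": 8}})
print(cfg.enabled, cfg.host_cache_size, cfg.parser_threads)
```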

deepspeed/runtime/engine.py

Lines changed: 6 additions & 0 deletions
@@ -2264,6 +2264,12 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}):
                 # https://nvidia.github.io/apex/advanced.html#gradient-clipping
                 master_params = amp.master_params(self.optimizer)
                 clip_grad_norm_(parameters=master_params, max_norm=self.gradient_clipping(), mpu=self.mpu)
+
+        try:
+            self.checkpoint_engine.wait()
+        except Exception as exc:
+            logger.error(f"Error during optimizer wait step: {exc}")
+
         self.optimizer.step()
 
         if hasattr(self.optimizer, '_global_grad_norm'):
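The point of calling `self.checkpoint_engine.wait()` immediately before `self.optimizer.step()` is ordering: an asynchronous engine may still be reading parameter tensors while it flushes a previous checkpoint, so the optimizer must not overwrite them until that flush completes. The toy sketch below (not DataStates-LLM code; every name in it is made up) illustrates the same ordering with a background thread.

```python
# Toy illustration of waiting for an in-flight asynchronous checkpoint flush
# before updating the parameters it is reading.
import threading
import time


class ToyAsyncCheckpointEngine:
    def __init__(self):
        self._thread = None

    def save(self, tensors, path):
        # Launch the flush in the background and return immediately.
        self._thread = threading.Thread(target=self._flush, args=(tensors, path))
        self._thread.start()

    def _flush(self, tensors, path):
        time.sleep(0.1)  # stand-in for serializing and writing the tensors
        print(f"flushed {len(tensors)} tensors to {path}")

    def wait(self):
        # Block until any in-flight flush has finished; a no-op otherwise.
        if self._thread is not None:
            self._thread.join()
            self._thread = None


engine = ToyAsyncCheckpointEngine()
params = [[0.0] * 4]                       # stand-in for model parameters
engine.save(params, "ckpt-step-0")

engine.wait()                              # mirrors self.checkpoint_engine.wait()
params[0] = [p - 0.01 for p in params[0]]  # safe to update only after the flush is done
```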
