+"""
+A low-overhead, fast, distributed checkpointing module with a unified API for saving and
+loading both local and remote checkpoints. Built on top of `safetensors <https://huggingface.co/docs/safetensors/>`_
+and inspired by :mod:`torch.distributed.checkpoint`, but better suited for handling distributed models and
+optimizer state without unnecessary distributed communication and GPU allocations.
+
+Features
+--------
+
+- Sharded distributed models, such as PyTorch's :class:`~torch.distributed.fsdp.FullyShardedDataParallel`,
+  are supported out-of-the-box.
+- Utilizes `safetensors <https://huggingface.co/docs/safetensors/>`_ under the hood for fast, efficient, and
+  safe serialization/deserialization.
+- Save with one distributed topology, seamlessly load with a different one. For example,
+  with FSDP you can save/load checkpoints with different world sizes or wrapping strategies.
+- Save/load directly to/from a remote object store like S3 or GCS. When loading from a remote object store,
+  each rank only downloads the fraction of the data it needs for its local (potentially sharded) tensors.
+- Checkpoints are always loaded in-place and one tensor at a time to avoid unnecessary allocations.
+  This results in virtually no additional memory overhead.
+
+Overview
+--------
+
+Use :func:`save_model_and_optim_state()` to write a checkpoint with your model and optimizer's state, then
+use :func:`load_model_and_optim_state()` to load the checkpoint in-place. You can also generate unsharded, full
+state dictionaries from a checkpoint with :func:`unshard_model_state()` and :func:`unshard_optim_state()`.
+
+API Reference
+-------------
+"""
+
from __future__ import annotations

import json
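
A minimal usage sketch of the save/load API summarized in the Overview, assuming the module is
importable as ``my_package.distributed.checkpoint`` (a placeholder path)::

    import torch

    from my_package.distributed.checkpoint import (  # placeholder import path
        load_model_and_optim_state,
        save_model_and_optim_state,
    )

    model = torch.nn.Linear(8, 8)
    optim = torch.optim.AdamW(model.parameters())

    # Write the checkpoint; `dir` can also be a remote URL such as "s3://bucket/run1/step100".
    save_model_and_optim_state("/tmp/run1/step100", model, optim, save_overwrite=True)

    # ...later, possibly with a different distributed topology, load it back in-place.
    load_model_and_optim_state("/tmp/run1/step100", model, optim)
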
@@ -51,6 +82,26 @@ def save_model_and_optim_state(
    a different distributed topology through :func:`load_model_and_optim_state()`.

    Returns all of the files created by the current rank.
+
+    .. seealso::
+        - :func:`load_model_and_optim_state()`
+        - :func:`unshard_model_state()`
+        - :func:`unshard_optim_state()`
+
+    .. tip::
+        With :class:`~torch.distributed.fsdp.FullyShardedDataParallel` models it's not necessary
+        to set the state dict type before calling this (or :func:`load_model_and_optim_state()`) via
+        :meth:`~torch.distributed.fsdp.FullyShardedDataParallel.state_dict_type()` or other methods.
+        In fact, those settings will always be ignored.
+
+    .. attention::
+        At the moment :class:`~torch.distributed.fsdp.FullyShardedDataParallel` models must have
+        ``use_orig_params=True``.
+
+    :param dir: Path/URL to save to.
+    :param model: The model to save state from.
+    :param optim: The optimizer to save state from.
+    :param save_overwrite: Overwrite existing files.
    """
    dir = str(dir).rstrip("/")

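
The ``use_orig_params`` requirement called out in the attention block matters in practice. A sketch
of saving from an FSDP-wrapped model, assuming a CUDA device, an initialized process group (e.g.
launched with ``torchrun``), and the same placeholder import path as above::

    import torch
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

    from my_package.distributed.checkpoint import save_model_and_optim_state

    # FSDP models must be constructed with use_orig_params=True (see the attention block above).
    model = FSDP(torch.nn.Linear(8, 8).cuda(), use_orig_params=True)
    optim = torch.optim.AdamW(model.parameters())

    # No need to configure FSDP's state dict type first; those settings are ignored anyway.
    # Returns the files written by this rank.
    files = save_model_and_optim_state("s3://my-bucket/run1/step100", model, optim, save_overwrite=True)
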
@@ -77,10 +128,22 @@ def load_model_and_optim_state(
    """
    Load model and optimizer state in-place from a checkpoint saved via :func:`save_model_and_optim_state()`.
    This method is agnostic to the distributed topology in that it can load checkpoints saved with a different
-    distributed topology.
+    distributed topology (e.g. FSDP vs DDP, or FSDP with a different world size).
+
+    .. seealso::
+        - :func:`save_model_and_optim_state()`
+        - :func:`unshard_model_state()`
+        - :func:`unshard_optim_state()`
+
+    .. tip::
+        Internally this function handles calling :meth:`torch.nn.Module.load_state_dict()` and
+        :meth:`torch.optim.Optimizer.load_state_dict()` for you, hence the return type is ``None``.
+
+    :param dir: Path/URL to the checkpoint saved via :func:`save_model_and_optim_state()`.
+    :param model: The model to load the state into.
+    :param optim: The optimizer to load the state into.
    """
    dir = str(dir).rstrip("/")
-
    checkpointer = Checkpointer()

    # Get model state in-place.
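
As a sketch of the topology-agnostic loading described above (same placeholder import path): a
checkpoint written from an FSDP run can be loaded back into a differently wrapped, or unwrapped,
copy of the same architecture, with the ``load_state_dict()`` calls handled internally::

    import torch

    from my_package.distributed.checkpoint import load_model_and_optim_state

    # Rebuild the same architecture, here without FSDP (or with a different world size / wrapping).
    model = torch.nn.Linear(8, 8)
    optim = torch.optim.AdamW(model.parameters())

    # Loads in-place; there is no return value and no manual load_state_dict() needed.
    load_model_and_optim_state("s3://my-bucket/run1/step100", model, optim)
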
@@ -113,6 +176,51 @@ def load_model_and_optim_state(
    del optim_state_to_load


+@torch.no_grad()
+def unshard_model_state(
+    dir: PathOrStr, device: Optional[torch.device] = None, rank0_only: bool = False, no_dist: bool = False
+) -> Dict[str, torch.Tensor]:
+    """
+    Unshard model state saved via :func:`save_model_and_optim_state()`.
+
+    .. seealso::
+        - :func:`unshard_optim_state()`
+
+    :param dir: Local or remote checkpoint directory.
+    :param device: Device to load the checkpoint onto. Defaults to CPU.
+    :param rank0_only: Set to true if you only want to load the unsharded state to rank 0 in a distributed
+        context. Other ranks will receive an empty dictionary.
+    :param no_dist: Set to true to avoid any distributed communication whatsoever.
+    """
+    dir = str(dir).rstrip("/")
+    checkpointer = Checkpointer()
+    return checkpointer.unshard(f"{dir}/model", device=device, rank0_only=rank0_only, no_dist=no_dist)
+
+
+@torch.no_grad()
+def unshard_optim_state(
+    dir: PathOrStr, device: Optional[torch.device] = None, rank0_only: bool = False, no_dist: bool = False
+) -> OptimStateDict:
+    """
+    Unshard optimizer state saved via :func:`save_model_and_optim_state()`.
+
+    .. seealso::
+        - :func:`unshard_model_state()`
+
+    :param dir: Local or remote checkpoint directory.
+    :param device: Device to load the checkpoint onto. Defaults to CPU.
+    :param rank0_only: Set to true if you only want to load the unsharded state to rank 0 in a distributed
+        context. Other ranks will receive an empty dictionary.
+    :param no_dist: Set to true to avoid any distributed communication whatsoever.
+    """
+    dir = str(dir).rstrip("/")
+    checkpointer = Checkpointer()
+    flat_optim_state = checkpointer.unshard(f"{dir}/optim", device=device, rank0_only=rank0_only, no_dist=no_dist)
+    optim_state = _unflatten_optimizer_state(flat_optim_state)
+    del flat_optim_state
+    return optim_state
+
+
class Checkpointer:
    """
    A distributed checkpointer for saving and loading *non-nested* state dictionaries,
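
For the two unshard helpers added above, a short sketch (same placeholder import path) of
materializing full state dicts from a sharded checkpoint, for example to export or inspect it::

    import torch

    from my_package.distributed.checkpoint import unshard_model_state, unshard_optim_state

    ckpt_dir = "s3://my-bucket/run1/step100"

    # In a distributed context, rank0_only=True populates the dicts on rank 0 only;
    # every other rank receives an empty dictionary.
    model_state = unshard_model_state(ckpt_dir, device=torch.device("cpu"), rank0_only=True)
    optim_state = unshard_optim_state(ckpt_dir, device=torch.device("cpu"), rank0_only=True)

    # model_state maps parameter names to full (unsharded) tensors.
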
@@ -363,6 +471,12 @@ def unshard(

        Alternatively, setting ``no_dist=True`` will return a full state dict from whatever process
        calls this.
+
+        :param dir: Local or remote checkpoint directory.
+        :param device: Device to load the checkpoint onto. Defaults to CPU.
+        :param rank0_only: Set to true if you only want to load the unsharded state to rank 0 in a distributed
+            context. Other ranks will receive an empty dictionary.
+        :param no_dist: Set to true to avoid any distributed communication whatsoever.
        """
        dir = self._normalize_dir(dir)

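
A sketch of how ``no_dist=True`` could be used with :class:`Checkpointer` directly, for example to
read a checkpoint from a plain single-process script. The ``/model`` suffix mirrors what
:func:`unshard_model_state()` passes internally, and the URL and import path are placeholders::

    import torch

    from my_package.distributed.checkpoint import Checkpointer

    checkpointer = Checkpointer()

    # no_dist=True skips all collective communication, so no process group is required.
    full_model_state = checkpointer.unshard(
        "s3://my-bucket/run1/step100/model",
        device=torch.device("cpu"),
        no_dist=True,
    )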