kubeflow · andreyvelich · Jan 13, 2025 · Jan 14, 2025 · Jan 14, 2025 · Jan 14, 2025
diff --git a/content/en/_redirects b/content/en/_redirects
@@ -337,4 +337,22 @@ docs/started/requirements/                    /docs/started/getting-started/
 /docs/components/pipelines/v2/reference/api/kubeflow-pipeline-api-spec/     /docs/components/pipelines/reference/api/kubeflow-pipeline-api-spec/
 /docs/components/pipelines/v2/reference/sdk/                                /docs/components/pipelines/reference/sdk/
 /docs/components/pipelines/v2/run-a-pipeline/                               /docs/components/pipelines/user-guides/core-functions/run-a-pipeline/
-/docs/components/pipelines/v2/version-compatibility/                        /docs/components/pipelines/reference/version-compatibility/
+/docs/components/pipelines/v2/version-compatibility/                        /docs/components/pipelines/reference/version-compatibility/
+
+# Kubeflow Training V2 (https://github.com/kubeflow/training-operator/issues/2214)
+/docs/components/training/installation/                       /docs/components/training/legacy-v1/installation/
+/docs/components/training/explanation/                        /docs/components/training/legacy-v1/explanation/
+/docs/components/training/explanation/fine-tuning/            /docs/components/training/legacy-v1/explanation/fine-tuning/
+/docs/components/training/reference/                          /docs/components/training/legacy-v1/reference/
+/docs/components/training/reference/architecture/             /docs/components/training/legacy-v1/reference/architecture/
+/docs/components/training/reference/distributed-training/     /docs/components/training/legacy-v1/reference/distributed-training/
+/docs/components/training/reference/fine-tuning/              /docs/components/training/legacy-v1/reference/fine-tuning/
+/docs/components/training/user-guides/                        /docs/components/training/legacy-v1/user-guides/
+/docs/components/training/user-guides/fine-tuning/            /docs/components/training/legacy-v1/user-guides/fine-tuning/
+/docs/components/training/user-guides/jax/                    /docs/components/training/legacy-v1/user-guides/jax/
+/docs/components/training/user-guides/job-scheduling/         /docs/components/training/legacy-v1/user-guides/job-scheduling/
+/docs/components/training/user-guides/mpi/                    /docs/components/training/legacy-v1/user-guides/mpi/
+/docs/components/training/user-guides/paddle/                 /docs/components/training/legacy-v1/user-guides/paddle/
+/docs/components/training/user-guides/prometheus/             /docs/components/training/legacy-v1/user-guides/prometheus/
+/docs/components/training/user-guides/tensorflow/             /docs/components/training/legacy-v1/user-guides/tensorflow/
+/docs/components/training/user-guides/xgboost/                /docs/components/training/legacy-v1/user-guides/xgboost/
diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/configure-experiment.md b/content/en/docs/components/katib/user-guides/hp-tuning/configure-experiment.md
@@ -122,7 +122,7 @@ trialSpec:
 ```
 
 If you use `PyTorchJob` or other Training Operator jobs in your Trial template, check
-[here](/docs/components/training/user-guides/tensorflow/#what-is-tfjob) how to set the annotation.
+[here](/docs/components/training/legacy-v1/user-guides/tensorflow/#what-is-tfjob) how to set the annotation.
 
 ## Running the Experiment
 

diff --git a/content/en/docs/components/katib/user-guides/trial-template.md b/content/en/docs/components/katib/user-guides/trial-template.md
@@ -16,13 +16,13 @@ In Katib examples, you can find the following examples for Trial's Workers:
 
 - [Kubernetes `Job`](https://kubernetes.io/docs/concepts/workloads/controllers/job/)
 
-- [Kubeflow `TFJob`](/docs/components/training/user-guides/tensorflow)
+- [Kubeflow `TFJob`](/docs/components/training/legacy-v1/user-guides/tensorflow)
 
-- [Kubeflow `PyTorchJob`](/docs/components/training/user-guides/pytorch/)
+- [Kubeflow `PyTorchJob`](/docs/components/training/legacy-v1/user-guides/pytorch/)
 
-- [Kubeflow `XGBoostJob`](/docs/components/training/user-guides/xgboost)
+- [Kubeflow `XGBoostJob`](/docs/components/training/legacy-v1/user-guides/xgboost)
 
-- [Kubeflow `MPIJob`](/docs/components/training/user-guides/mpi)
+- [Kubeflow `MPIJob`](/docs/components/training/legacy-v1/user-guides/mpi)
 
 - [Tekton `Pipelines`](https://github.com/kubeflow/katib/tree/master/examples/v1beta1/tekton)
 

diff --git a/content/en/docs/components/training/_index.md b/content/en/docs/components/training/_index.md
@@ -1,5 +1,5 @@
 +++
-title = "Training Operator"
-description = "Documentation for Kubeflow Training Operator"
-weight = 70
+title = "Kubeflow Training"
+description = "Documentation for Kubeflow Training"
+weight = 20
 +++
diff --git a/content/en/docs/components/training/contributor-guides/_index.md b/content/en/docs/components/training/contributor-guides/_index.md
@@ -0,0 +1,5 @@
++++
+title = "Contributor Guides"
+description = "Documentation for Kubeflow Training contributors"
+weight = 60
++++
diff --git a/content/en/docs/components/training/contributor-guides/contributing.md b/content/en/docs/components/training/contributor-guides/contributing.md
@@ -0,0 +1,7 @@
++++
+title = "Contributing Guide"
+description = "How to contribute to the Kubeflow Training project"
+weight = 10
++++
+
+This document describes how to contribute to the Kubeflow Training project.
diff --git a/content/en/docs/components/training/getting-started.md b/content/en/docs/components/training/getting-started.md
@@ -1,158 +1,34 @@
 +++
 title = "Getting Started"
-description = "Get started with the Training Operator"
+description = "Get Started with Kubeflow Training"
 weight = 30
 +++
 
-This guide describes how to get started with the Training Operator and run a few simple examples.
+This guide describes how to get started with Kubeflow Training and run distributed training
+with PyTorch.
 
 ## Prerequisites
 
-You need to install the following components to run examples:
+Ensure that you have access to a Kubernetes cluster with the Kubeflow Training
+control plane installed. If it is not set up yet, follow
+[the installation guide](/docs/components/training/operator-guides/installation) to quickly deploy
+Kubeflow Training on a local Kind cluster.
 
-- The Training Operator control plane [installed](/docs/components/training/installation/#installing-the-control-plane).
-- The Training Python SDK [installed](/docs/components/training/installation/#installing-the-python-sdk).
+### Installing the Kubeflow Python SDK
 
-## Getting Started with PyTorchJob
+Install the Kubeflow Python SDK to interact with Kubeflow Training APIs:
 
-You can create your first Training Operator distributed PyTorchJob using the Python SDK. Define the
-training function that implements end-to-end model training. Each Worker will execute this
-function on the appropriate Kubernetes Pod. Usually, this function contains logic to
-download dataset, create model, and train the model.
-
-The Training Operator will automatically set `WORLD_SIZE` and `RANK` for the appropriate PyTorchJob
-worker to perform [PyTorch Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html).
-
-If you install the Training Operator as part of the Kubeflow Platform, you can open a new
-[Kubeflow Notebook](/docs/components/notebooks/quickstart-guide/) to run this script. If you
-install the Training Operator standalone, make sure that you
-[configure local `kubeconfig`](https://kubernetes.io/docs/tasks/access-application-cluster/access-cluster/#programmatic-access-to-the-api)
-to access your Kubernetes cluster where you installed the Training Operator.
-
-```python
-def train_func():
-    import torch
-    import torch.nn.functional as F
-    from torch.utils.data import DistributedSampler
-    from torchvision import datasets, transforms
-    import torch.distributed as dist
-
-    # [1] Setup PyTorch DDP. Distributed environment will be set automatically by Training Operator.
-    dist.init_process_group(backend="nccl")
-    Distributor = torch.nn.parallel.DistributedDataParallel
-    local_rank = int(os.getenv("LOCAL_RANK", 0))
-    print(
-        "Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}".format(
-            dist.get_world_size(),
-            dist.get_rank(),
-            local_rank,
-        )
-    )
-
-    # [2] Create PyTorch CNN Model.
-    class Net(torch.nn.Module):
-        def __init__(self):
-            super(Net, self).__init__()
-            self.conv1 = torch.nn.Conv2d(1, 20, 5, 1)
-            self.conv2 = torch.nn.Conv2d(20, 50, 5, 1)
-            self.fc1 = torch.nn.Linear(4 * 4 * 50, 500)
-            self.fc2 = torch.nn.Linear(500, 10)
-
-        def forward(self, x):
-            x = F.relu(self.conv1(x))
-            x = F.max_pool2d(x, 2, 2)
-            x = F.relu(self.conv2(x))
-            x = F.max_pool2d(x, 2, 2)
-            x = x.view(-1, 4 * 4 * 50)
-            x = F.relu(self.fc1(x))
-            x = self.fc2(x)
-            return F.log_softmax(x, dim=1)
-
-    # [3] Attach model to the correct GPU device and distributor.
-    device = torch.device(f"cuda:{local_rank}")
-    model = Net().to(device)
-    model = Distributor(model)
-    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
-
-    # [4] Setup FashionMNIST dataloader and distribute data across PyTorchJob workers.
-    dataset = datasets.FashionMNIST(
-        "./data",
-        download=True,
-        train=True,
-        transform=transforms.Compose([transforms.ToTensor()]),
-    )
-    train_loader = torch.utils.data.DataLoader(
-        dataset=dataset,
-        batch_size=128,
-        sampler=DistributedSampler(dataset),
-    )
-
-    # [5] Start model Training.
-    for epoch in range(3):
-        for batch_idx, (data, target) in enumerate(train_loader):
-            # Attach Tensors to the device.
-            data = data.to(device)
-            target = target.to(device)
-
-            optimizer.zero_grad()
-            output = model(data)
-            loss = F.nll_loss(output, target)
-            loss.backward()
-            optimizer.step()
-            if batch_idx % 10 == 0 and dist.get_rank() == 0:
-                print(
-                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
-                        epoch,
-                        batch_idx * len(data),
-                        len(train_loader.dataset),
-                        100.0 * batch_idx / len(train_loader),
-                        loss.item(),
-                    )
-                )
-
-
-from kubeflow.training import TrainingClient
-
-# Start PyTorchJob with 3 Workers and 1 GPU per Worker (e.g. multi-node, multi-worker job).
-TrainingClient().create_job(
-    name="pytorch-ddp",
-    train_func=train_func,
-    num_procs_per_worker="auto",
-    num_workers=3,
-    resources_per_worker={"gpu": "1"},
-)
+```bash
+pip install kubeflow
 ```
 
-## Getting Started with TFJob
-
-Similar to the PyTorchJob example, you can use the Python SDK to create your first distributed
-TensorFlow job. Run the following script to create TFJob with pre-created Docker image:
-`docker.io/kubeflow/tf-mnist-with-summaries:latest` that contains
-[distributed TensorFlow code](https://github.com/kubeflow/training-operator/tree/e6b4300f9dfebb5c2a3269641c828add367688ee/examples/tensorflow/mnist_with_summaries):
-
-```python
-from kubeflow.training import TrainingClient
+Alternatively, you can install the latest Kubeflow Python SDK version directly
+from the source repository:
 
-TrainingClient().create_job(
-    name="tensorflow-dist",
-    job_kind="TFJob",
-    base_image="docker.io/kubeflow/tf-mnist-with-summaries:latest",
-    num_workers=3,
-)
+```bash
+pip install git+https://github.com/kubeflow/training-operator.git@master#subdirectory=sdk_v2
 ```
 
-Run the following API to get logs from your TFJob:
-
-```python
-TrainingClient().get_job_logs(
-    name="tensorflow-dist",
-    job_kind="TFJob",
-    follow=True,
-)
-```
-
-## Next steps
-
-- Run the [FashionMNIST example](https://github.com/kubeflow/training-operator/blob/7345e33b333ba5084127efe027774dd7bed8f6e6/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb) with using Training Operator Python SDK.
+## Getting Started with PyTorch
 
-- Learn more about [the PyTorchJob APIs](/docs/components/training/user-guides/pytorch/).
+TODO (andreyvelich): Add example from the Notebook
diff --git a/content/en/docs/components/training/images/ml-lifecycle-training.drawio.svg b/content/en/docs/components/training/images/ml-lifecycle-training.drawio.svg
diff --git a/content/en/docs/components/training/images/user-personas.drawio.svg b/content/en/docs/components/training/images/user-personas.drawio.svg
diff --git a/content/en/docs/components/training/legacy-v1/_index.md b/content/en/docs/components/training/legacy-v1/_index.md
@@ -0,0 +1,11 @@
++++
+title = "Legacy (v1)"
+description = "Kubeflow Training V1 Documentation"
+weight = 999
++++
+
+{{% alert title="Old Version" color="warning" %}}
+This page is about **Kubeflow Training V1**. Please see the [V2 documentation](/docs/components/training) for the latest information.
+
+Please follow [this guide for migrating to Kubeflow Training V2](/docs/components/training/operator-guides/migration).
+{{% /alert %}}
diff --git a/...components/training/explanation/_index.md → .../training/legacy-v1/explanation/_index.md b/...components/training/explanation/_index.md → .../training/legacy-v1/explanation/_index.md
diff --git a/...nents/training/explanation/fine-tuning.md → ...ning/legacy-v1/explanation/fine-tuning.md b/...nents/training/explanation/fine-tuning.md → ...ning/legacy-v1/explanation/fine-tuning.md
@@ -10,7 +10,7 @@ share your experience using the [#kubeflow-training Slack channel](/docs/about/c
 or [Kubeflow Training Operator GitHub](https://github.com/kubeflow/training-operator/issues/new).
 {{% /alert %}}
 
-This page explains how the [Training Operator fine-tuning API](/docs/components/training/user-guides/fine-tuning)
+This page explains how the [Training Operator fine-tuning API](/docs/components/training/legacy-v1/user-guides/fine-tuning)
 fits into the Kubeflow ecosystem.
 
 In the rapidly evolving landscape of machine learning (ML) and artificial intelligence (AI),
@@ -60,4 +60,4 @@ Different user personas can benefit from this feature:
 
 ## Next Steps
 
-- Understand [the architecture behind `train` API](/docs/components/training/reference/fine-tuning).
+- Understand [the architecture behind `train` API](/docs/components/training/legacy-v1/reference/fine-tuning).