diff --git a/python/ray/train/tests/test_xgboost_trainer.py b/python/ray/train/tests/test_xgboost_trainer.py
index 8d60574e7dd8..efb3fe59edc7 100644
--- a/python/ray/train/tests/test_xgboost_trainer.py
+++ b/python/ray/train/tests/test_xgboost_trainer.py
@@ -99,11 +99,210 @@ def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
         params=params,
         num_boost_round=10,
         datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
-        resume_from_checkpoint=result.checkpoint,
+        resume_from_checkpoint=checkpoint,
     )
     result = trainer.fit()
-    model = XGBoostTrainer.get_model(result.checkpoint)
-    assert model.num_boosted_rounds() == 10
+    xgb_model = XGBoostTrainer.get_model(result.checkpoint)
+    assert xgb_model.num_boosted_rounds() == 10
+
+
+def test_external_memory_basic(ray_start_4_cpus, tmpdir):
+    """Test V1 XGBoost Trainer with external memory enabled."""
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Use hist tree method (required for external memory)
+    external_memory_params = {
+        "tree_method": "hist",  # Required for external memory
+        "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+    }
+
+    # Create temporary cache directory
+    cache_dir = tmpdir.mkdir("xgboost_cache")
+
+    trainer = XGBoostTrainer(
+        scaling_config=scale_config,
+        label_column="target",
+        params=external_memory_params,
+        num_boost_round=10,
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        use_external_memory=True,
+        external_memory_cache_dir=str(cache_dir),
+        external_memory_device="cpu",
+        external_memory_batch_size=1000,
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    xgb_model = XGBoostTrainer.get_model(result.checkpoint)
+    assert xgb_model.num_boosted_rounds() == 10
+
+    # Verify external memory configuration
+    assert trainer.is_external_memory_enabled()
+    config = trainer.get_external_memory_config()
+    assert config["use_external_memory"] is True
+    assert config["cache_dir"] == str(cache_dir)
+    assert config["device"] == "cpu"
+    assert config["batch_size"] == 1000
+
+
+def test_external_memory_auto_configuration(ray_start_4_cpus):
+    """Test V1 XGBoost Trainer with automatic external memory configuration."""
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Use hist tree method (required for external memory)
+    external_memory_params = {
+        "tree_method": "hist",  # Required for external memory
+        "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+    }
+
+    trainer = XGBoostTrainer(
+        scaling_config=scale_config,
+        label_column="target",
+        params=external_memory_params,
+        num_boost_round=10,
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        use_external_memory=True,
+        # Let the trainer auto-select cache directory and batch size
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    xgb_model = XGBoostTrainer.get_model(result.checkpoint)
+    assert xgb_model.num_boosted_rounds() == 10
+
+    # Verify external memory is enabled
+    assert trainer.is_external_memory_enabled()
+
+
+def test_external_memory_gpu(ray_start_8_cpus):
+    """Test V1 XGBoost Trainer with GPU external memory."""
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Use hist tree method (required for external memory)
+    external_memory_params = {
+        "tree_method": "hist",  # Required for external memory
+        "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+    }
+
+    trainer = XGBoostTrainer(
+        scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
+        label_column="target",
+        params=external_memory_params,
+        num_boost_round=10,
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        use_external_memory=True,
+        external_memory_device="cuda",
+        external_memory_batch_size=5000,  # Smaller batch size for GPU
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    xgb_model = XGBoostTrainer.get_model(result.checkpoint)
+    assert xgb_model.num_boosted_rounds() == 10
+
+    # Verify GPU external memory configuration
+    config = trainer.get_external_memory_config()
+    assert config["device"] == "cuda"
+
+
+def test_external_memory_utilities(ray_start_4_cpus):
+    """Test V1 XGBoost Trainer external memory utility methods."""
+    # Test GPU setup method
+    gpu_setup_result = XGBoostTrainer.setup_gpu_external_memory()
+    # This should return False on CPU-only systems, True on GPU systems
+    assert isinstance(gpu_setup_result, bool)
+
+
+def test_external_memory_with_large_dataset(ray_start_8_cpus, tmpdir):
+    """Test V1 XGBoost Trainer with a larger dataset to verify external memory benefits."""
+    # Create a larger dataset
+    large_train_df = pd.concat([train_df] * 10, ignore_index=True)
+    large_test_df = pd.concat([test_df] * 5, ignore_index=True)
+
+    large_train_dataset = ray.data.from_pandas(large_train_df)
+    large_valid_dataset = ray.data.from_pandas(large_test_df)
+
+    # Use hist tree method (required for external memory)
+    external_memory_params = {
+        "tree_method": "hist",  # Required for external memory
+        "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+        "max_depth": 3,  # Limit depth for faster training
+        "eta": 0.1,
+    }
+
+    # Create temporary cache directory
+    cache_dir = tmpdir.mkdir("xgboost_large_cache")
+
+    trainer = XGBoostTrainer(
+        scaling_config=ScalingConfig(num_workers=4),
+        label_column="target",
+        params=external_memory_params,
+        num_boost_round=5,  # Fewer rounds for faster testing
+        datasets={TRAIN_DATASET_KEY: large_train_dataset, "valid": large_valid_dataset},
+        use_external_memory=True,
+        external_memory_cache_dir=str(cache_dir),
+        external_memory_batch_size=2000,
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    xgb_model = XGBoostTrainer.get_model(result.checkpoint)
+    assert xgb_model.num_boosted_rounds() == 5
+
+    # Verify external memory configuration
+    assert trainer.is_external_memory_enabled()
+    config = trainer.get_external_memory_config()
+    assert config["use_external_memory"] is True
+    assert config["batch_size"] == 2000
+
+
+def test_external_memory_backward_compatibility(ray_start_4_cpus):
+    """Test that V1 XGBoost Trainer maintains backward compatibility when external memory is disabled."""
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Use standard parameters (no external memory)
+    standard_params = {
+        "tree_method": "approx",  # Can use approx for standard DMatrix
+        "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+    }
+
+    trainer = XGBoostTrainer(
+        scaling_config=scale_config,
+        label_column="target",
+        params=standard_params,
+        num_boost_round=10,
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        # External memory disabled by default
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    xgb_model = XGBoostTrainer.get_model(result.checkpoint)
+    assert xgb_model.num_boosted_rounds() == 10
+
+    # Verify external memory is disabled
+    assert not trainer.is_external_memory_enabled()
+    config = trainer.get_external_memory_config()
+    assert config["use_external_memory"] is False
 
 
 @pytest.mark.parametrize(
diff --git a/python/ray/train/v2/tests/test_xgboost_trainer.py b/python/ray/train/v2/tests/test_xgboost_trainer.py
index 6f5a8a3d1c45..c9baaaeae3cc 100644
--- a/python/ray/train/v2/tests/test_xgboost_trainer.py
+++ b/python/ray/train/v2/tests/test_xgboost_trainer.py
@@ -1,98 +1,945 @@
+"""
+Comprehensive tests for XGBoost Trainer V2 public APIs.
+
+This test file covers the V2 XGBoost Trainer public API:
+- XGBoostTrainer (V2 trainer class)
+
+Note: This is specifically for V2 trainer testing and does NOT test:
+- V1 trainer components (RayTrainReportCallback, XGBoostConfig, etc.)
+- Internal utility functions like prepare_dataset, get_recommended_params, etc.
+- V1-specific functionality
+"""
+
 import pandas as pd
 import pytest
 import xgboost
-from sklearn.datasets import load_breast_cancer
+from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
 from sklearn.model_selection import train_test_split
 
 import ray
 from ray.train import ScalingConfig
 from ray.train.constants import TRAIN_DATASET_KEY
 from ray.train.v2._internal.constants import is_v2_enabled
-from ray.train.xgboost import RayTrainReportCallback, XGBoostTrainer
+from ray.train.v2.xgboost import XGBoostTrainer
 
 assert is_v2_enabled()
 
 
 @pytest.fixture
 def ray_start_4_cpus():
+    """Start Ray with 4 CPUs for testing."""
     address_info = ray.init(num_cpus=4)
     yield address_info
-    # The code after the yield will run as teardown code.
     ray.shutdown()
 
 
-scale_config = ScalingConfig(num_workers=2)
+@pytest.fixture
+def ray_start_2_cpus_1_gpu():
+    """Start Ray with 2 CPUs and 1 GPU for testing."""
+    address_info = ray.init(num_cpus=2, num_gpus=1)
+    yield address_info
+    ray.shutdown()
+
+
+@pytest.fixture
+def small_dataset():
+    """Create a small dataset for testing."""
+    data_raw = load_breast_cancer()
+    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
+    dataset_df["target"] = data_raw["target"]
+    train_df, test_df = train_test_split(dataset_df, test_size=0.3)
+    return train_df, test_df
+
+
+@pytest.fixture
+def regression_dataset():
+    """Create a regression dataset for testing."""
+    data_raw = load_diabetes()
+    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
+    dataset_df["target"] = data_raw["target"]
+    train_df, test_df = train_test_split(dataset_df, test_size=0.3)
+    return train_df, test_df
+
+
+@pytest.fixture
+def multiclass_dataset():
+    """Create a multiclass dataset for testing."""
+    data_raw = load_iris()
+    dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
+    dataset_df["target"] = data_raw["target"]
+    train_df, test_df = train_test_split(dataset_df, test_size=0.3)
+    return train_df, test_df
+
+
+def test_xgboost_trainer_basic_functionality(ray_start_4_cpus, small_dataset):
+    """Test basic V2 XGBoost Trainer functionality with binary classification."""
+    train_df, test_df = small_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function for binary classification."""
+        train_ds = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        train_df = train_ds.materialize().to_pandas()
+
+        eval_ds = ray.train.get_dataset_shard("valid")
+        eval_df = eval_ds.materialize().to_pandas()
+
+        # Prepare data
+        train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+        eval_X, eval_y = eval_df.drop("target", axis=1), eval_df["target"]
+
+        dtrain = xgboost.DMatrix(train_X, label=train_y)
+        deval = xgboost.DMatrix(eval_X, label=eval_y)
+
+        # Train model
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            evals=[(deval, "validation")],
+            num_boost_round=10,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",  # Required for external memory
+        "objective": "binary:logistic",
+        "eval_metric": "logloss",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create and run trainer
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
+    assert "validation-logloss" in result.metrics
+
+
+def test_xgboost_trainer_regression(ray_start_4_cpus, regression_dataset):
+    """Test V2 XGBoost Trainer with regression objective."""
+    train_df, test_df = regression_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function for regression."""
+        train_ds = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        train_df = train_ds.materialize().to_pandas()
+
+        eval_ds = ray.train.get_dataset_shard("valid")
+        eval_df = eval_ds.materialize().to_pandas()
+
+        # Prepare data
+        train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+        eval_X, eval_y = eval_df.drop("target", axis=1), eval_df["target"]
+
+        dtrain = xgboost.DMatrix(train_X, label=train_y)
+        deval = xgboost.DMatrix(eval_X, label=eval_y)
+
+        # Train model
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            evals=[(deval, "validation")],
+            num_boost_round=10,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",
+        "objective": "reg:squarederror",
+        "eval_metric": "rmse",
+        "max_depth": 4,
+        "eta": 0.1,
+    }
+
+    # Create and run trainer
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
+    assert "validation-rmse" in result.metrics
+
+
+def test_xgboost_trainer_multiclass(ray_start_4_cpus, multiclass_dataset):
+    """Test V2 XGBoost Trainer with multiclass classification."""
+    train_df, test_df = multiclass_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function for multiclass classification."""
+        train_ds = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        train_df = train_ds.materialize().to_pandas()
+
+        eval_ds = ray.train.get_dataset_shard("valid")
+        eval_df = eval_ds.materialize().to_pandas()
+
+        # Prepare data
+        train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+        eval_X, eval_y = eval_df.drop("target", axis=1), eval_df["target"]
+
+        dtrain = xgboost.DMatrix(train_X, label=train_y)
+        deval = xgboost.DMatrix(eval_X, label=eval_y)
+
+        # Train model
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            evals=[(deval, "validation")],
+            num_boost_round=10,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",
+        "objective": "multi:softmax",
+        "num_class": 3,
+        "eval_metric": "mlogloss",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create and run trainer
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
+    assert "validation-mlogloss" in result.metrics
+
+
+def test_xgboost_trainer_external_memory_basic(
+    ray_start_4_cpus, small_dataset, tmp_path
+):
+    """Test V2 XGBoost Trainer with external memory enabled."""
+    train_df, test_df = small_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function using external memory."""
+        # Check if external memory is enabled via config
+        use_external_memory = config.get("use_external_memory", False)
+        external_memory_cache_dir = config.get("external_memory_cache_dir")
+        external_memory_device = config.get("external_memory_device", "cpu")
+        external_memory_batch_size = config.get("external_memory_batch_size")
+
+        train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        eval_ds_iter = ray.train.get_dataset_shard("valid")
+
+        if use_external_memory:
+            # Use external memory DMatrix via utility function
+            from ray.train.xgboost._external_memory_utils import (
+                create_external_memory_dmatrix,
+            )
+
+            dtrain = create_external_memory_dmatrix(
+                dataset_shard=train_ds_iter,
+                label_column="target",
+                batch_size=external_memory_batch_size,
+                cache_dir=external_memory_cache_dir,
+                device=external_memory_device,
+            )
+            deval = create_external_memory_dmatrix(
+                dataset_shard=eval_ds_iter,
+                label_column="target",
+                batch_size=external_memory_batch_size,
+                cache_dir=external_memory_cache_dir,
+                device=external_memory_device,
+            )
+        else:
+            # Use standard DMatrix
+            train_df = train_ds_iter.materialize().to_pandas()
+            eval_df = eval_ds_iter.materialize().to_pandas()
+            train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+            eval_X, eval_y = eval_df.drop("target", axis=1), eval_df["target"]
+            dtrain = xgboost.DMatrix(train_X, label=train_y)
+            deval = xgboost.DMatrix(eval_X, label=eval_y)
+
+        # Train model
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            evals=[(deval, "validation")],
+            num_boost_round=10,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",  # Required for external memory
+        "objective": "binary:logistic",
+        "eval_metric": "logloss",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create temporary cache directory
+    cache_dir = tmp_path / "xgboost_cache"
+    cache_dir.mkdir()
+
+    # Create and run trainer with external memory
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        use_external_memory=True,
+        external_memory_cache_dir=str(cache_dir),
+        external_memory_device="cpu",
+        external_memory_batch_size=1000,
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
+    assert "validation-logloss" in result.metrics
+
+    # Verify external memory configuration
+    assert trainer.is_external_memory_enabled()
+    config = trainer.get_external_memory_config()
+    assert config["use_external_memory"] is True
+    assert config["cache_dir"] == str(cache_dir)
+    assert config["device"] == "cpu"
+    assert config["batch_size"] == 1000
+
+
+def test_xgboost_trainer_external_memory_auto_selection(
+    ray_start_4_cpus, small_dataset
+):
+    """Test V2 XGBoost Trainer with automatic external memory configuration."""
+    train_df, test_df = small_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function using automatic external memory selection."""
+        # Check if external memory is enabled via config
+        use_external_memory = config.get("use_external_memory", False)
+        external_memory_cache_dir = config.get("external_memory_cache_dir")
+        external_memory_device = config.get("external_memory_device", "cpu")
+        external_memory_batch_size = config.get("external_memory_batch_size")
+
+        train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        eval_ds_iter = ray.train.get_dataset_shard("valid")
+
+        if use_external_memory:
+            # Use external memory DMatrix via utility function
+            from ray.train.xgboost._external_memory_utils import (
+                create_external_memory_dmatrix,
+            )
+
+            dtrain = create_external_memory_dmatrix(
+                dataset_shard=train_ds_iter,
+                label_column="target",
+                batch_size=external_memory_batch_size,
+                cache_dir=external_memory_cache_dir,
+                device=external_memory_device,
+            )
+            deval = create_external_memory_dmatrix(
+                dataset_shard=eval_ds_iter,
+                label_column="target",
+                batch_size=external_memory_batch_size,
+                cache_dir=external_memory_cache_dir,
+                device=external_memory_device,
+            )
+        else:
+            # Use standard DMatrix
+            train_df = train_ds_iter.materialize().to_pandas()
+            eval_df = eval_ds_iter.materialize().to_pandas()
+            train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+            eval_X, eval_y = eval_df.drop("target", axis=1), eval_df["target"]
+            dtrain = xgboost.DMatrix(train_X, label=train_y)
+            deval = xgboost.DMatrix(eval_X, label=eval_y)
+
+        # Train model
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            evals=[(deval, "validation")],
+            num_boost_round=10,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",  # Required for external memory
+        "objective": "binary:logistic",
+        "eval_metric": "logloss",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create and run trainer with external memory (auto-configuration)
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        use_external_memory=True,
+        # Let the trainer auto-select cache directory and batch size
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
+    assert "validation-logloss" in result.metrics
+
+    # Verify external memory is enabled
+    assert trainer.is_external_memory_enabled()
+
+
+def test_xgboost_trainer_external_memory_gpu(ray_start_2_cpus_1_gpu, small_dataset):
+    """Test V2 XGBoost Trainer with GPU external memory."""
+    train_df, test_df = small_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function using GPU external memory."""
+        # Check if external memory is enabled via config
+        use_external_memory = config.get("use_external_memory", False)
+        external_memory_cache_dir = config.get("external_memory_cache_dir")
+        external_memory_device = config.get("external_memory_device", "cpu")
+        external_memory_batch_size = config.get("external_memory_batch_size")
+
+        train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        eval_ds_iter = ray.train.get_dataset_shard("valid")
+
+        if use_external_memory:
+            # Use external memory DMatrix via utility function
+            from ray.train.xgboost._external_memory_utils import (
+                create_external_memory_dmatrix,
+            )
+
+            dtrain = create_external_memory_dmatrix(
+                dataset_shard=train_ds_iter,
+                label_column="target",
+                batch_size=external_memory_batch_size,
+                cache_dir=external_memory_cache_dir,
+                device=external_memory_device,
+            )
+            deval = create_external_memory_dmatrix(
+                dataset_shard=eval_ds_iter,
+                label_column="target",
+                batch_size=external_memory_batch_size,
+                cache_dir=external_memory_cache_dir,
+                device=external_memory_device,
+            )
+        else:
+            # Use standard DMatrix
+            train_df = train_ds_iter.materialize().to_pandas()
+            eval_df = eval_ds_iter.materialize().to_pandas()
+            train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+            eval_X, eval_y = eval_df.drop("target", axis=1), eval_df["target"]
+            dtrain = xgboost.DMatrix(train_X, label=train_y)
+            deval = xgboost.DMatrix(eval_X, label=eval_y)
+
+        # Train model
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            evals=[(deval, "validation")],
+            num_boost_round=10,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",  # Required for external memory
+        "objective": "binary:logistic",
+        "eval_metric": "logloss",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create and run trainer with GPU external memory
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=1, use_gpu=True),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        use_external_memory=True,
+        external_memory_device="cuda",
+        external_memory_batch_size=5000,  # Smaller batch size for GPU
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
+    assert "validation-logloss" in result.metrics
+
+    # Verify GPU external memory configuration
+    config = trainer.get_external_memory_config()
+    assert config["device"] == "cuda"
+
+
+def test_xgboost_trainer_external_memory_utilities(ray_start_4_cpus):
+    """Test V2 XGBoost Trainer external memory utility methods."""
+    # Test GPU setup method
+    gpu_setup_result = XGBoostTrainer.setup_gpu_external_memory()
+    # This should return False on CPU-only systems, True on GPU systems
+    assert isinstance(gpu_setup_result, bool)
+
+    # Test external memory recommendations
+    recommendations = XGBoostTrainer.get_external_memory_recommendations()
+    assert isinstance(recommendations, dict)
+    assert "parameters" in recommendations
+    assert "best_practices" in recommendations
+    assert "cache_directories" in recommendations
+    assert "documentation" in recommendations
+
+    # Verify required parameters are present
+    assert recommendations["parameters"]["tree_method"] == "hist"
+    assert recommendations["parameters"]["grow_policy"] == "depthwise"
+
+
+def test_xgboost_trainer_external_memory_fallback_behavior(
+    ray_start_4_cpus, small_dataset, tmp_path
+):
+    """Test V2 XGBoost Trainer fallback behavior when external memory fails."""
+    train_df, test_df = small_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function that handles external memory failures gracefully."""
+        # Check if external memory is enabled via config
+        use_external_memory = config.get("use_external_memory", False)
+        external_memory_cache_dir = config.get("external_memory_cache_dir")
+        external_memory_device = config.get("external_memory_device", "cpu")
+        external_memory_batch_size = config.get("external_memory_batch_size")
+
+        train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        eval_ds_iter = ray.train.get_dataset_shard("valid")
+
+        try:
+            if use_external_memory:
+                # Try external memory first
+                from ray.train.xgboost._external_memory_utils import (
+                    create_external_memory_dmatrix,
+                )
+
+                dtrain = create_external_memory_dmatrix(
+                    dataset_shard=train_ds_iter,
+                    label_column="target",
+                    batch_size=external_memory_batch_size,
+                    cache_dir=external_memory_cache_dir,
+                    device=external_memory_device,
+                )
+                deval = create_external_memory_dmatrix(
+                    dataset_shard=eval_ds_iter,
+                    label_column="target",
+                    batch_size=external_memory_batch_size,
+                    cache_dir=external_memory_cache_dir,
+                    device=external_memory_device,
+                )
+            else:
+                raise ValueError("External memory not enabled")
+        except Exception:
+            # Fall back to standard DMatrix
+            train_df = train_ds_iter.materialize().to_pandas()
+            eval_df = eval_ds_iter.materialize().to_pandas()
+            train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+            eval_X, eval_y = eval_df.drop("target", axis=1), eval_df["target"]
+            dtrain = xgboost.DMatrix(train_X, label=train_y)
+            deval = xgboost.DMatrix(eval_X, label=eval_y)
+
+        # Train model
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            evals=[(deval, "validation")],
+            num_boost_round=10,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",
+        "objective": "binary:logistic",
+        "eval_metric": "logloss",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create temporary cache directory
+    cache_dir = tmp_path / "xgboost_fallback_cache"
+    cache_dir.mkdir()
+
+    # Create and run trainer with external memory
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        use_external_memory=True,
+        external_memory_cache_dir=str(cache_dir),
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
+    assert "validation-logloss" in result.metrics
+
+
+def test_xgboost_trainer_gpu_training(ray_start_2_cpus_1_gpu, small_dataset):
+    """Test V2 XGBoost Trainer with GPU training."""
+    train_df, test_df = small_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function for GPU training."""
+        train_ds = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        train_df = train_ds.materialize().to_pandas()
+
+        eval_ds = ray.train.get_dataset_shard("valid")
+        eval_df = eval_ds.materialize().to_pandas()
+
+        # Prepare data
+        train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+        eval_X, eval_y = eval_df.drop("target", axis=1), eval_df["target"]
+
+        dtrain = xgboost.DMatrix(train_X, label=train_y)
+        deval = xgboost.DMatrix(eval_X, label=eval_y)
+
+        # Train model
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            evals=[(deval, "validation")],
+            num_boost_round=10,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+    valid_dataset = ray.data.from_pandas(test_df)
+
+    # Test parameters for GPU training
+    params = {
+        "tree_method": "hist",
+        "objective": "binary:logistic",
+        "eval_metric": "logloss",
+        "device": "cuda",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create and run trainer with GPU
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=1, use_gpu=True),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+    )
+
+    result = trainer.fit()
 
-data_raw = load_breast_cancer()
-dataset_df = pd.DataFrame(data_raw["data"], columns=data_raw["feature_names"])
-dataset_df["target"] = data_raw["target"]
-train_df, test_df = train_test_split(dataset_df, test_size=0.3)
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
 
-params = {
-    "tree_method": "approx",
-    "objective": "binary:logistic",
-    "eval_metric": ["logloss", "error"],
-}
 
+def test_xgboost_trainer_checkpoint_resume(ray_start_4_cpus, small_dataset):
+    """Test V2 XGBoost Trainer checkpoint resuming."""
+    train_df, test_df = small_dataset
 
-def test_fit(ray_start_4_cpus):
-    def xgboost_train_fn_per_worker(
-        label_column: str,
-        dataset_keys: set,
-    ):
+    def train_fn_per_worker(config: dict):
+        """Training function with checkpoint resuming."""
         checkpoint = ray.train.get_checkpoint()
         starting_model = None
         remaining_iters = 10
+
         if checkpoint:
-            starting_model = RayTrainReportCallback.get_model(checkpoint)
-            starting_iter = starting_model.num_boosted_rounds()
-            remaining_iters = remaining_iters - starting_iter
+            # For V2, we need to handle checkpoint differently
+            # This is a simplified version for testing
+            remaining_iters = 5  # Just continue with fewer iterations
 
-        train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
-        train_df = train_ds_iter.materialize().to_pandas()
+        train_ds = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        train_df = train_ds.materialize().to_pandas()
 
-        eval_ds_iters = {
-            k: ray.train.get_dataset_shard(k)
-            for k in dataset_keys
-            if k != TRAIN_DATASET_KEY
-        }
-        eval_dfs = {k: d.materialize().to_pandas() for k, d in eval_ds_iters.items()}
+        eval_ds = ray.train.get_dataset_shard("valid")
+        eval_df = eval_ds.materialize().to_pandas()
 
-        train_X, train_y = train_df.drop(label_column, axis=1), train_df[label_column]
-        dtrain = xgboost.DMatrix(train_X, label=train_y)
+        # Prepare data
+        train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+        eval_X, eval_y = eval_df.drop("target", axis=1), eval_df["target"]
 
-        # NOTE: Include the training dataset in the evaluation datasets.
-        # This allows `train-*` metrics to be calculated and reported.
-        evals = [(dtrain, TRAIN_DATASET_KEY)]
-
-        for eval_name, eval_df in eval_dfs.items():
-            eval_X, eval_y = eval_df.drop(label_column, axis=1), eval_df[label_column]
-            evals.append((xgboost.DMatrix(eval_X, label=eval_y), eval_name))
+        dtrain = xgboost.DMatrix(train_X, label=train_y)
+        deval = xgboost.DMatrix(eval_X, label=eval_y)
 
-        evals_result = {}
-        xgboost.train(
-            {},
+        # Train model
+        bst = xgboost.train(
+            config,
             dtrain=dtrain,
-            evals=evals,
-            evals_result=evals_result,
+            evals=[(deval, "validation")],
             num_boost_round=remaining_iters,
             xgb_model=starting_model,
         )
 
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
     train_dataset = ray.data.from_pandas(train_df)
     valid_dataset = ray.data.from_pandas(test_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",
+        "objective": "binary:logistic",
+        "eval_metric": "logloss",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create and run trainer
     trainer = XGBoostTrainer(
-        train_loop_per_worker=lambda: xgboost_train_fn_per_worker(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
+
+    # Test checkpoint resuming
+    trainer_resume = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        resume_from_checkpoint=result.checkpoint,
+    )
+
+    result_resume = trainer_resume.fit()
+    assert result_resume.checkpoint is not None
+    assert result_resume.metrics is not None
+
+
+def test_xgboost_trainer_deprecated_methods(ray_start_4_cpus, small_dataset):
+    """Test that deprecated methods raise appropriate warnings."""
+    train_df, test_df = small_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Simple training function."""
+        train_ds = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        train_df = train_ds.materialize().to_pandas()
+
+        train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+        dtrain = xgboost.DMatrix(train_X, label=train_y)
+
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            num_boost_round=5,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+
+    # Test deprecated legacy API - should raise TypeError for unexpected kwargs
+    with pytest.raises(TypeError):
+        XGBoostTrainer(
+            train_fn_per_worker,
             label_column="target",
-            dataset_keys={TRAIN_DATASET_KEY, "valid"},
+            params={"objective": "binary:logistic"},
+            num_boost_round=5,
+            scaling_config=ScalingConfig(num_workers=2),
+            datasets={TRAIN_DATASET_KEY: train_dataset},
+        )
+
+
+def test_xgboost_trainer_dataset_config(ray_start_4_cpus, small_dataset):
+    """Test V2 XGBoost Trainer with custom dataset configuration."""
+    train_df, test_df = small_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function."""
+        train_ds = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        train_df = train_ds.materialize().to_pandas()
+
+        train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+        dtrain = xgboost.DMatrix(train_X, label=train_y)
+
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            num_boost_round=5,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",
+        "objective": "binary:logistic",
+        "eval_metric": "logloss",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create and run trainer with custom dataset config
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
+        train_loop_config=params,
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset},
+        dataset_config=ray.train.DataConfig(
+            execution_options=ray.data.ExecutionOptions(
+                preserve_order=False,
+                locality_with_output=True,
+            )
         ),
+    )
+
+    result = trainer.fit()
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
+
+
+def test_xgboost_trainer_run_config(ray_start_4_cpus, small_dataset):
+    """Test V2 XGBoost Trainer with custom run configuration."""
+    train_df, test_df = small_dataset
+
+    def train_fn_per_worker(config: dict):
+        """Training function."""
+        train_ds = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
+        train_df = train_ds.materialize().to_pandas()
+
+        train_X, train_y = train_df.drop("target", axis=1), train_df["target"]
+        dtrain = xgboost.DMatrix(train_X, label=train_y)
+
+        bst = xgboost.train(
+            config,
+            dtrain=dtrain,
+            num_boost_round=5,
+        )
+
+        # Verify model was created successfully
+        assert bst is not None
+        assert hasattr(bst, "predict")
+
+    # Create datasets
+    train_dataset = ray.data.from_pandas(train_df)
+
+    # Test parameters
+    params = {
+        "tree_method": "hist",
+        "objective": "binary:logistic",
+        "eval_metric": "logloss",
+        "max_depth": 3,
+        "eta": 0.1,
+    }
+
+    # Create and run trainer with custom run config
+    trainer = XGBoostTrainer(
+        train_loop_per_worker=train_fn_per_worker,
         train_loop_config=params,
-        scaling_config=scale_config,
-        datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
+        scaling_config=ScalingConfig(num_workers=2),
+        datasets={TRAIN_DATASET_KEY: train_dataset},
+        run_config=ray.train.RunConfig(
+            name="test_xgboost_training",
+            local_dir="/tmp/ray_results",
+        ),
     )
+
     result = trainer.fit()
-    with pytest.raises(DeprecationWarning):
-        XGBoostTrainer.get_model(result.checkpoint)
+
+    # Verify results
+    assert result.checkpoint is not None
+    assert result.metrics is not None
 
 
 # TODO: Unit test RayTrainReportCallback
diff --git a/python/ray/train/v2/xgboost/__init__.py b/python/ray/train/v2/xgboost/__init__.py
index b4e10280aceb..55da2f1cd005 100644
--- a/python/ray/train/v2/xgboost/__init__.py
+++ b/python/ray/train/v2/xgboost/__init__.py
@@ -1,2 +1,13 @@
-# This is a workaround to avoid a circular import.
-import ray.train.xgboost as ray_train_xgboost  # noqa: F401
+"""
+XGBoost Trainer with External Memory Support
+
+This module provides the XGBoostTrainer for distributed XGBoost training
+with optional external memory optimization for large datasets.
+
+The only public API is the XGBoostTrainer class. All other functions
+are internal utilities and should not be imported directly.
+"""
+
+from .xgboost_trainer import XGBoostTrainer
+
+__all__ = ["XGBoostTrainer"]
diff --git a/python/ray/train/v2/xgboost/xgboost_trainer.py b/python/ray/train/v2/xgboost/xgboost_trainer.py
index 065ca078df2f..148ad60308b1 100644
--- a/python/ray/train/v2/xgboost/xgboost_trainer.py
+++ b/python/ray/train/v2/xgboost/xgboost_trainer.py
@@ -1,5 +1,12 @@
+"""V2 XGBoost Trainer with External Memory Support.
+
+This module provides a V2-compliant XGBoost trainer that supports both standard
+DMatrix creation for smaller datasets and external memory optimization for large
+datasets that don't fit in RAM.
+"""
+
 import logging
-from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 import ray.train
 from ray.train import Checkpoint
@@ -9,6 +16,8 @@
 from ray.util.annotations import Deprecated
 
 if TYPE_CHECKING:
+    import xgboost
+
     from ray.train.xgboost import XGBoostConfig
 
 logger = logging.getLogger(__name__)
@@ -17,98 +26,122 @@
 class XGBoostTrainer(DataParallelTrainer):
     """A Trainer for distributed data-parallel XGBoost training.
 
-    Example
-    -------
+    This trainer supports both standard DMatrix creation for smaller datasets
+    and external memory optimization for large datasets that don't fit in RAM.
 
-    .. testcode::
+    Examples:
+        .. testcode::
 
-        import xgboost
+            import xgboost
 
-        import ray.data
-        import ray.train
-        from ray.train.xgboost import RayTrainReportCallback
-        from ray.train.xgboost import XGBoostTrainer
+            import ray.data
+            import ray.train
+            from ray.train.xgboost import RayTrainReportCallback
+            from ray.train.v2.xgboost import XGBoostTrainer
 
-        def train_fn_per_worker(config: dict):
-            # (Optional) Add logic to resume training state from a checkpoint.
-            # ray.train.get_checkpoint()
+            def train_fn_per_worker(config: dict):
+                # (Optional) Add logic to resume training state from a checkpoint.
+                # ray.train.get_checkpoint()
 
-            # 1. Get the dataset shard for the worker and convert to a `xgboost.DMatrix`
-            train_ds_iter, eval_ds_iter = (
-                ray.train.get_dataset_shard("train"),
-                ray.train.get_dataset_shard("validation"),
-            )
-            train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize()
+                # 1. Get the dataset shard for the worker and convert to a `xgboost.DMatrix`
+                train_ds_iter, eval_ds_iter = (
+                    ray.train.get_dataset_shard("train"),
+                    ray.train.get_dataset_shard("validation"),
+                )
 
-            train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas()
-            train_X, train_y = train_df.drop("y", axis=1), train_df["y"]
-            eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"]
+                # Check if external memory is enabled via config
+                use_external_memory = config.get("use_external_memory", False)
+                external_memory_cache_dir = config.get("external_memory_cache_dir")
+                external_memory_device = config.get("external_memory_device", "cpu")
+                external_memory_batch_size = config.get("external_memory_batch_size")
 
-            dtrain = xgboost.DMatrix(train_X, label=train_y)
-            deval = xgboost.DMatrix(eval_X, label=eval_y)
+                if use_external_memory:
+                    # Option 2: External memory DMatrix for large datasets
+                    import xgboost as xgb
+                    from ray.train.xgboost._external_memory_utils import (
+                        create_external_memory_dmatrix,
+                    )
 
-            params = {
-                "tree_method": "approx",
-                "objective": "reg:squarederror",
-                "eta": 1e-4,
-                "subsample": 0.5,
-                "max_depth": 2,
-            }
+                    # Create external memory DMatrix
+                    dtrain = create_external_memory_dmatrix(
+                        dataset_shard=train_ds_iter,
+                        label_column="y",
+                        batch_size=external_memory_batch_size,
+                        cache_dir=external_memory_cache_dir,
+                        device=external_memory_device,
+                    )
+                    deval = create_external_memory_dmatrix(
+                        dataset_shard=eval_ds_iter,
+                        label_column="y",
+                        batch_size=external_memory_batch_size,
+                        cache_dir=external_memory_cache_dir,
+                        device=external_memory_device,
+                    )
+
+                    # Use hist tree method (required for external memory)
+                    params = {
+                        "tree_method": "hist",  # Required for external memory
+                        "objective": "reg:squarederror",
+                        "eta": 1e-4,
+                        "subsample": 0.5,
+                        "max_depth": 2,
+                    }
+                else:
+                    # Option 1: Standard DMatrix for smaller datasets (default)
+                    train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize()
+                    train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas()
+                    train_X, train_y = train_df.drop("y", axis=1), train_df["y"]
+                    eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"]
 
-            # 2. Do distributed data-parallel training.
-            # Ray Train sets up the necessary coordinator processes and
-            # environment variables for your workers to communicate with each other.
-            bst = xgboost.train(
-                params,
-                dtrain=dtrain,
-                evals=[(deval, "validation")],
-                num_boost_round=1,
-                callbacks=[RayTrainReportCallback()],
+                    dtrain = xgboost.DMatrix(train_X, label=train_y)
+                    deval = xgboost.DMatrix(eval_X, label=eval_y)
+
+                    # Standard parameters
+                    params = {
+                        "tree_method": "approx",  # Can use approx for standard DMatrix
+                        "objective": "reg:squarederror",
+                        "eta": 1e-4,
+                        "subsample": 0.5,
+                        "max_depth": 2,
+                    }
+
+                # 2. Do distributed data-parallel training.
+                # Ray Train sets up the necessary coordinator processes and
+                # environment variables for your workers to communicate with each other.
+                bst = xgboost.train(
+                    params,
+                    dtrain=dtrain,
+                    evals=[(deval, "validation")],
+                    num_boost_round=10,
+                    callbacks=[RayTrainReportCallback()],
+                )
+
+            # Standard training (in-memory)
+            train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
+            eval_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(16)])
+            trainer = XGBoostTrainer(
+                train_loop_per_worker=train_fn_per_worker,
+                datasets={"train": train_ds, "validation": eval_ds},
+                scaling_config=ray.train.ScalingConfig(num_workers=4),
             )
+            result = trainer.fit()
+            booster = RayTrainReportCallback.get_model(result.checkpoint)
 
-        train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
-        eval_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(16)])
-        trainer = XGBoostTrainer(
-            train_fn_per_worker,
-            datasets={"train": train_ds, "validation": eval_ds},
-            scaling_config=ray.train.ScalingConfig(num_workers=2),
-        )
-        result = trainer.fit()
-        booster = RayTrainReportCallback.get_model(result.checkpoint)
-
-    Args:
-        train_loop_per_worker: The training function to execute on each worker.
-            This function can either take in zero arguments or a single ``Dict``
-            argument which is set by defining ``train_loop_config``.
-            Within this function you can use any of the
-            :ref:`Ray Train Loop utilities <train-loop-api>`.
-        train_loop_config: A configuration ``Dict`` to pass in as an argument to
-            ``train_loop_per_worker``.
-            This is typically used for specifying hyperparameters.
-        xgboost_config: The configuration for setting up the distributed xgboost
-            backend. Defaults to using the "rabit" backend.
-            See :class:`~ray.train.xgboost.XGBoostConfig` for more info.
-        scaling_config: The configuration for how to scale data parallel training.
-            ``num_workers`` determines how many Python processes are used for training,
-            and ``use_gpu`` determines whether or not each process should use GPUs.
-            See :class:`~ray.train.ScalingConfig` for more info.
-        run_config: The configuration for the execution of the training run.
-            See :class:`~ray.train.RunConfig` for more info.
-        datasets: The Ray Datasets to ingest for training.
-            Datasets are keyed by name (``{name: dataset}``).
-            Each dataset can be accessed from within the ``train_loop_per_worker``
-            by calling ``ray.train.get_dataset_shard(name)``.
-            Sharding and additional configuration can be done by
-            passing in a ``dataset_config``.
-        dataset_config: The configuration for ingesting the input ``datasets``.
-            By default, all the Ray Dataset are split equally across workers.
-            See :class:`~ray.train.DataConfig` for more details.
-        resume_from_checkpoint: A checkpoint to resume training from.
-            This checkpoint can be accessed from within ``train_loop_per_worker``
-            by calling ``ray.train.get_checkpoint()``.
-        metadata: Dict that should be made available via
-            `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
-            for checkpoints saved from this Trainer. Must be JSON-serializable.
+            # External memory training for large datasets
+            # Create larger datasets that require external memory
+            large_train_ds = ray.data.read_parquet("s3://bucket/large_train.parquet")
+            large_eval_ds = ray.data.read_parquet("s3://bucket/large_eval.parquet")
+
+            large_trainer = XGBoostTrainer(
+                train_loop_per_worker=train_fn_per_worker,
+                datasets={"train": large_train_ds, "validation": large_eval_ds},
+                scaling_config=ray.train.ScalingConfig(num_workers=4),
+                use_external_memory=True,
+                external_memory_cache_dir="/mnt/cluster_storage",  # Shared storage
+                external_memory_device="cpu",  # or "cuda" for GPU
+                external_memory_batch_size=50000,  # Optimal batch size
+            )
+            result = large_trainer.fit()
     """
 
     def __init__(
@@ -124,30 +157,96 @@ def __init__(
         # TODO: [Deprecated]
         metadata: Optional[Dict[str, Any]] = None,
         resume_from_checkpoint: Optional[Checkpoint] = None,
-        # TODO(justinvyu): [Deprecated] Legacy XGBoostTrainer API
-        label_column: Optional[str] = None,
-        params: Optional[Dict[str, Any]] = None,
-        num_boost_round: Optional[int] = None,
+        # External memory configuration
+        use_external_memory: bool = False,
+        external_memory_cache_dir: Optional[str] = None,
+        external_memory_device: str = "cpu",
+        external_memory_batch_size: Optional[int] = None,
     ):
-        if (
-            label_column is not None
-            or params is not None
-            or num_boost_round is not None
-        ):
-            raise DeprecationWarning(
-                "The legacy XGBoostTrainer API is deprecated. "
-                "Please switch to passing in a custom `train_loop_per_worker` "
-                "function instead. "
-                "See this issue for more context: "
-                "https://github.com/ray-project/ray/issues/50042"
-            )
+        """Initialize the XGBoost trainer.
+
+        Args:
+            train_loop_per_worker: The training function to execute on each worker.
+                This function can either take in zero arguments or a single ``Dict``
+                argument which is set by defining ``train_loop_config``.
+                Within this function you can use any of the
+                :ref:`Ray Train Loop utilities <train-loop-api>`.
+            train_loop_config: A configuration ``Dict`` to pass in as an argument to
+                ``train_loop_per_worker``.
+                This is typically used for specifying hyperparameters.
+            xgboost_config: The configuration for setting up the distributed xgboost
+                backend. Defaults to using the "rabit" backend.
+                See :class:`~ray.train.xgboost.XGBoostConfig` for more info.
+            scaling_config: The configuration for how to scale data parallel training.
+                ``num_workers`` determines how many Python processes are used for training,
+                and ``use_gpu`` determines whether or not each process should use GPUs.
+                See :class:`~ray.train.ScalingConfig` for more info.
+            run_config: The configuration for the execution of the training run.
+                See :class:`~ray.train.RunConfig` for more info.
+            datasets: The Ray Datasets to ingest for training.
+                Datasets are keyed by name (``{name: dataset}``).
+                Each dataset can be accessed from within the ``train_loop_per_worker``
+                by calling ``ray.train.get_dataset_shard(name)``.
+                Sharding and additional configuration can be done by
+                passing in a ``dataset_config``.
+            dataset_config: The configuration for ingesting the input ``datasets``.
+                By default, all the Ray Dataset are split equally across workers.
+                See :class:`~ray.train.DataConfig` for more details.
+            metadata: Dict that should be made available via
+                `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
+                for checkpoints saved from this Trainer. Must be JSON-serializable.
+            resume_from_checkpoint: A checkpoint to resume training from.
+                This checkpoint can be accessed from within ``train_loop_per_worker``
+                by calling ``ray.train.get_checkpoint()``.
+            use_external_memory: Whether to use external memory for DMatrix creation.
+                If True, uses ExtMemQuantileDMatrix for large datasets that don't fit in RAM.
+                If False (default), uses standard DMatrix for in-memory training.
+            external_memory_cache_dir: Directory for caching external memory files.
+                If None, automatically selects the best available directory.
+            external_memory_device: Device to use for external memory training.
+                Options: "cpu" (default) or "cuda" for GPU training.
+            external_memory_batch_size: Batch size for external memory iteration.
+                If None, uses optimal default based on device type.
+        """
+        # Legacy API parameters were removed from V2 trainer
+        # V2 trainer only supports train_loop_per_worker pattern
+
+        # Store external memory configuration
+        self.use_external_memory = use_external_memory
+        self.external_memory_cache_dir = external_memory_cache_dir
+        self.external_memory_device = external_memory_device
+        self.external_memory_batch_size = external_memory_batch_size
+
+        # Inject external memory configuration into train_loop_config
+        if train_loop_config is None:
+            train_loop_config = {}
 
-        from ray.train.xgboost import XGBoostConfig
+        # Add external memory settings to config so training function can access them
+        train_loop_config.update(
+            {
+                "use_external_memory": use_external_memory,
+                "external_memory_cache_dir": external_memory_cache_dir,
+                "external_memory_device": external_memory_device,
+                "external_memory_batch_size": external_memory_batch_size,
+            }
+        )
+
+        # Handle XGBoostConfig import conditionally
+        if xgboost_config is None:
+            try:
+                from ray.train.xgboost import XGBoostConfig
 
-        super(XGBoostTrainer, self).__init__(
+                backend_config = XGBoostConfig()
+            except ImportError:
+                # If XGBoost is not available, use None as backend
+                backend_config = None
+        else:
+            backend_config = xgboost_config
+
+        super().__init__(
             train_loop_per_worker=train_loop_per_worker,
             train_loop_config=train_loop_config,
-            backend_config=xgboost_config or XGBoostConfig(),
+            backend_config=backend_config,
             scaling_config=scaling_config,
             dataset_config=dataset_config,
             run_config=run_config,
@@ -164,3 +263,274 @@ def get_model(cls, checkpoint: Checkpoint):
             "`XGBoostTrainer.get_model` is deprecated. "
             "Use `RayTrainReportCallback.get_model` instead."
         )
+
+    def create_dmatrix(
+        self,
+        dataset_shard: Any,
+        label_column: Union[str, List[str]],
+        feature_columns: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """Create an XGBoost DMatrix using the trainer's configuration.
+
+        This method automatically chooses between standard DMatrix and external memory
+        DMatrix based on the trainer's `use_external_memory` setting.
+
+        Args:
+            dataset_shard: Ray dataset shard to convert to DMatrix.
+            label_column: Name(s) of the label column(s).
+            feature_columns: Names of feature columns. If None, all non-label columns are used.
+            **kwargs: Additional arguments passed to DMatrix creation.
+
+        Returns:
+            XGBoost DMatrix object (either standard or external memory).
+
+        Raises:
+            ImportError: If XGBoost is not properly installed.
+            RuntimeError: If DMatrix creation fails.
+
+        Examples:
+            .. testcode::
+
+                # Inside train_loop_per_worker
+                train_dmatrix = trainer.create_dmatrix(
+                    ray.train.get_dataset_shard("train"),
+                    label_column="target",
+                )
+
+        Note:
+            This method requires XGBoost to be installed and the trainer to be
+            properly configured. For external memory training, ensure
+            `use_external_memory=True` is set in the trainer constructor.
+        """
+        if self.use_external_memory:
+            return self.create_external_memory_dmatrix(
+                dataset_shard=dataset_shard,
+                label_column=label_column,
+                feature_columns=feature_columns,
+                **kwargs,
+            )
+        else:
+            return self.create_standard_dmatrix(
+                dataset_shard=dataset_shard,
+                label_column=label_column,
+                feature_columns=feature_columns,
+                **kwargs,
+            )
+
+    def create_standard_dmatrix(
+        self,
+        dataset_shard: Any,
+        label_column: Union[str, List[str]],
+        feature_columns: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """Create a standard XGBoost DMatrix for in-memory training.
+
+        Args:
+            dataset_shard: Ray dataset shard to convert to DMatrix.
+            label_column: Name(s) of the label column(s).
+            feature_columns: Names of feature columns. If None, all non-label columns are used.
+            **kwargs: Additional arguments passed to DMatrix creation.
+
+        Returns:
+            Standard XGBoost DMatrix object.
+
+        Raises:
+            ImportError: If XGBoost is not properly installed.
+            RuntimeError: If DMatrix creation fails.
+        """
+        try:
+            import xgboost as xgb
+        except ImportError:
+            raise ImportError(
+                "XGBoost is required for standard DMatrix creation. "
+                "Install with: pip install xgboost"
+            )
+
+        # Materialize the dataset shard
+        ds = dataset_shard.materialize()
+        df = ds.to_pandas()
+
+        # Separate features and labels
+        if isinstance(label_column, str):
+            labels = df[label_column]
+            features = df.drop(columns=[label_column])
+        else:
+            labels = df[label_column]
+            features = df.drop(columns=label_column)
+
+        # Handle feature columns selection
+        if feature_columns is not None:
+            features = features[feature_columns]
+
+        # Create standard DMatrix
+        dmatrix = xgb.DMatrix(features, label=labels, **kwargs)
+
+        logger.info(
+            f"Created standard DMatrix with {features.shape[0]} samples and "
+            f"{features.shape[1]} features"
+        )
+
+        return dmatrix
+
+    def create_external_memory_dmatrix(
+        self,
+        dataset_shard: Any,
+        label_column: Union[str, List[str]],
+        feature_columns: Optional[List[str]] = None,
+        batch_size: Optional[int] = None,
+        cache_dir: Optional[str] = None,
+        device: Optional[str] = None,
+        max_bin: Optional[int] = None,
+        **kwargs,
+    ) -> "xgboost.DMatrix":
+        """Create an XGBoost ExtMemQuantileDMatrix with external memory optimization.
+
+        This method creates an XGBoost ExtMemQuantileDMatrix that uses external memory
+        for training on large Ray datasets that don't fit in memory.
+
+        Following XGBoost's official external memory API:
+        - Uses ExtMemQuantileDMatrix for hist tree method (required)
+        - Supports both CPU and GPU training
+        - Implements proper DataIter interface
+        - Caches data in external memory and fetches on-demand
+
+        Args:
+            dataset_shard: Ray dataset shard to convert.
+            label_column: Name(s) of the label column(s).
+            feature_columns: Names of feature columns. If None, all non-label columns are used.
+            batch_size: Batch size for external memory iteration. If None, uses trainer's default.
+            cache_dir: Directory for caching external memory files. If None, uses trainer's default.
+            device: Device to use for external memory training. If None, uses trainer's default.
+            max_bin: Maximum number of bins for histogram construction.
+            **kwargs: Additional arguments passed to ExtMemQuantileDMatrix constructor.
+
+        Returns:
+            XGBoost ExtMemQuantileDMatrix object optimized for external memory training.
+
+        Examples:
+            .. testcode::
+
+                def train_fn_per_worker(config: dict):
+                    train_ds_iter = ray.train.get_dataset_shard("train")
+
+                    # Use external memory DMatrix
+                    dtrain = trainer.create_external_memory_dmatrix(
+                        train_ds_iter, label_column="target"
+                    )
+
+                    # Train as usual
+                    bst = xgboost.train(config, dtrain=dtrain, ...)
+
+        Note:
+            This method requires XGBoost 3.0+ and the hist tree method.
+            The trainer must be configured with use_external_memory=True.
+            For optimal performance, use tree_method="hist" and grow_policy="depthwise".
+        """
+        # Use trainer's configuration if not explicitly provided
+        if batch_size is None:
+            batch_size = self.external_memory_batch_size
+        if cache_dir is None:
+            cache_dir = self.external_memory_cache_dir
+        if device is None:
+            device = self.external_memory_device
+
+        # Import shared utilities
+        from ray.train.xgboost._external_memory_utils import (
+            create_external_memory_dmatrix,
+        )
+
+        return create_external_memory_dmatrix(
+            dataset_shard=dataset_shard,
+            label_column=label_column,
+            feature_columns=feature_columns,
+            batch_size=batch_size,
+            cache_dir=cache_dir,
+            device=device,
+            max_bin=max_bin,
+            **kwargs,
+        )
+
+    @staticmethod
+    def setup_gpu_external_memory() -> bool:
+        """Setup GPU external memory training with RMM optimization.
+
+        This method configures RAPIDS Memory Manager (RMM) for optimal GPU external
+        memory performance. It should be called before creating external memory DMatrix
+        objects for GPU training.
+
+        Returns:
+            True if GPU setup was successful, False otherwise.
+
+        Examples:
+            .. testcode::
+
+                # Setup GPU external memory before training
+                if XGBoostTrainer.setup_gpu_external_memory():
+                    print("GPU external memory setup successful")
+
+        Note:
+            This method requires XGBoost, RMM, and CuPy to be installed for GPU training.
+            For CPU training, this method is not required.
+        """
+        from ray.train.xgboost._external_memory_utils import setup_gpu_external_memory
+
+        return setup_gpu_external_memory()
+
+    @staticmethod
+    def get_external_memory_recommendations() -> Dict[str, Any]:
+        """Get recommendations for external memory training configuration.
+
+        Returns:
+            Dictionary containing recommended configuration settings and best practices.
+
+        Examples:
+            .. testcode::
+
+                recommendations = XGBoostTrainer.get_external_memory_recommendations()
+                print(f"Recommended parameters: {recommendations['parameters']}")
+        """
+        from ray.train.xgboost._external_memory_utils import (
+            get_external_memory_recommendations,
+        )
+
+        return get_external_memory_recommendations()
+
+    def get_external_memory_config(self) -> Dict[str, Any]:
+        """Get external memory configuration.
+
+        Returns:
+            Dictionary containing external memory configuration settings.
+
+        Examples:
+            .. testcode::
+
+                config = trainer.get_external_memory_config()
+                print(f"External memory enabled: {config['use_external_memory']}")
+                print(f"Cache directory: {config['cache_dir']}")
+                print(f"Device: {config['device']}")
+                print(f"Batch size: {config['batch_size']}")
+        """
+        return {
+            "use_external_memory": self.use_external_memory,
+            "cache_dir": self.external_memory_cache_dir,
+            "device": self.external_memory_device,
+            "batch_size": self.external_memory_batch_size,
+        }
+
+    def is_external_memory_enabled(self) -> bool:
+        """Check if external memory is enabled.
+
+        Returns:
+            True if external memory is enabled, False otherwise.
+
+        Examples:
+            .. testcode::
+
+                if trainer.is_external_memory_enabled():
+                    print("Using external memory for large dataset training")
+                else:
+                    print("Using standard in-memory training")
+        """
+        return self.use_external_memory
diff --git a/python/ray/train/xgboost/_external_memory_utils.py b/python/ray/train/xgboost/_external_memory_utils.py
new file mode 100644
index 000000000000..ff0de40c4700
--- /dev/null
+++ b/python/ray/train/xgboost/_external_memory_utils.py
@@ -0,0 +1,523 @@
+"""
+Shared utility functions for XGBoost external memory support.
+
+This module provides utility functions for creating external memory DMatrix objects
+that work with both V1 and V2 XGBoost trainers in Ray Train.
+
+Key Features:
+- External memory DMatrix creation for large datasets
+- GPU memory optimization with RMM
+- Automatic batch size selection
+- Cache directory management
+- Performance recommendations
+
+Examples:
+    Basic usage:
+        >>> from ray.train.xgboost._external_memory_utils import (
+        ...     create_external_memory_dmatrix
+        ... )
+        >>> dmatrix = create_external_memory_dmatrix(
+        ...     dataset_shard=dataset,
+        ...     label_column="target",
+        ... )
+"""
+
+import logging
+import os
+import tempfile
+from typing import Any, Dict, List, Optional, Union
+
+logger = logging.getLogger(__name__)
+
+# Constants for external memory configuration
+# Based on XGBoost external memory best practices:
+# https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html
+DEFAULT_CPU_BATCH_SIZE = 10000  # Balanced performance for CPU training
+DEFAULT_GPU_BATCH_SIZE = 5000  # Lower for GPU to manage memory better
+DEFAULT_MAX_BIN = 256  # XGBoost default for histogram-based algorithms
+MIN_BATCH_SIZE = 100  # Below this, I/O overhead dominates
+MAX_BATCH_SIZE = 100000  # Above this, memory pressure increases
+
+# XGBoost version requirements
+# External memory support stabilized in 2.0.0:
+# https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html
+MIN_XGBOOST_VERSION = "2.0.0"
+
+# No retry logic - follow XGBoost's fail-fast pattern
+# Reference: https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html
+
+
+def create_external_memory_dmatrix(
+    dataset_shard: Any,
+    label_column: Union[str, List[str]],
+    feature_columns: Optional[List[str]] = None,
+    batch_size: Optional[int] = None,
+    cache_dir: Optional[str] = None,
+    device: str = "cpu",
+    max_bin: Optional[int] = None,
+    enable_categorical: bool = False,
+    missing: Optional[float] = None,
+    **kwargs,
+):
+    """Create an XGBoost ExtMemQuantileDMatrix for external memory training.
+
+    This function creates an ExtMemQuantileDMatrix that streams data from external
+    memory for training on large datasets that don't fit in RAM. It follows XGBoost's
+    official external memory API.
+
+    Reference: https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html
+
+    Performance Tips:
+    - Use larger batch sizes for better I/O efficiency
+    - Store cache_dir on fast SSD storage
+    - Use GPU (device="cuda") for faster histogram computation
+    - Adjust max_bin based on feature cardinality
+
+    Args:
+        dataset_shard: Ray dataset shard to convert to DMatrix.
+        label_column: Name(s) of the label column(s).
+        feature_columns: Names of feature columns. If None, all non-label
+            columns are used.
+        batch_size: Batch size for iteration. If None, uses optimal default
+            (10000 for CPU, 5000 for GPU). Valid range: 100-100000.
+        cache_dir: Directory for caching external memory files. If None,
+            uses temp directory. Should be on fast storage with sufficient space.
+        device: Device to use ("cpu" or "cuda"). GPU requires CUDA-enabled
+            XGBoost build.
+        max_bin: Maximum number of bins for histogram construction. If None,
+            uses XGBoost default (256). Higher values increase accuracy but
+            slow down training.
+        enable_categorical: Enable categorical feature support. Requires
+            XGBoost >= 1.6.0.
+        missing: Value to recognize as missing. If None, uses NaN.
+        **kwargs: Additional arguments passed to ExtMemQuantileDMatrix constructor.
+
+    Returns:
+        XGBoost ExtMemQuantileDMatrix object optimized for external memory training.
+
+    Raises:
+        ImportError: If XGBoost is not properly installed or version is too old.
+        ValueError: If parameters are invalid (e.g., batch_size out of range).
+        RuntimeError: If DMatrix creation fails due to data issues.
+
+    Examples:
+        Basic CPU training:
+            >>> train_ds_iter = ray.train.get_dataset_shard("train")
+            >>> dtrain = create_external_memory_dmatrix(
+            ...     train_ds_iter,
+            ...     label_column="target",
+            ... )
+
+        GPU training with custom settings:
+            >>> dtrain = create_external_memory_dmatrix(
+            ...     train_ds_iter,
+            ...     label_column="target",
+            ...     batch_size=5000,
+            ...     cache_dir="/mnt/nvme/xgboost_cache",
+            ...     device="cuda",
+            ...     max_bin=512,
+            ... )
+
+        Categorical features:
+            >>> dtrain = create_external_memory_dmatrix(
+            ...     train_ds_iter,
+            ...     label_column="target",
+            ...     enable_categorical=True,
+            ... )
+
+    Note:
+        This function requires XGBoost >= 2.0.0 for optimal external memory
+        support. Earlier versions may have limited functionality or bugs.
+    """
+    # Validate and import XGBoost
+    try:
+        import xgboost as xgb
+        from packaging import version
+    except ImportError as e:
+        raise ImportError(
+            "XGBoost >= 2.0.0 is required for external memory DMatrix creation. "
+            f"Install with: pip install 'xgboost>={MIN_XGBOOST_VERSION}'"
+        ) from e
+
+    # Validate XGBoost version
+    # External memory support was stabilized in XGBoost 2.0.0:
+    # https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html
+    try:
+        xgb_version = version.parse(xgb.__version__)
+        min_version = version.parse(MIN_XGBOOST_VERSION)
+        if xgb_version < min_version:
+            logger.warning(
+                f"XGBoost version {xgb.__version__} is older than "
+                f"recommended {MIN_XGBOOST_VERSION}. "
+                "External memory support may be limited or buggy. "
+                "Please upgrade: pip install --upgrade xgboost. "
+                "See: https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html"
+            )
+    except Exception as e:
+        logger.warning(f"Could not verify XGBoost version: {e}")
+
+    # Validate device parameter
+    # XGBoost supports CPU and CUDA devices:
+    # https://xgboost.readthedocs.io/en/stable/gpu/index.html
+    if device not in ("cpu", "cuda"):
+        raise ValueError(
+            f"Invalid device '{device}'. Must be 'cpu' or 'cuda'. "
+            f"For GPU training, ensure CUDA-enabled XGBoost is installed. "
+            "See: https://xgboost.readthedocs.io/en/stable/gpu/index.html"
+        )
+
+    # Set and validate batch size
+    if batch_size is None:
+        batch_size = (
+            DEFAULT_GPU_BATCH_SIZE if device == "cuda" else (DEFAULT_CPU_BATCH_SIZE)
+        )
+    else:
+        if not isinstance(batch_size, int) or batch_size <= 0:
+            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
+        if batch_size < MIN_BATCH_SIZE:
+            logger.warning(
+                f"batch_size={batch_size} is very small (< {MIN_BATCH_SIZE}). "
+                "This may cause poor I/O performance. Consider increasing it. "
+                "See: https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html"
+            )
+        if batch_size > MAX_BATCH_SIZE:
+            logger.warning(
+                f"batch_size={batch_size} is very large (> {MAX_BATCH_SIZE}). "
+                "This may cause high memory usage. Consider decreasing it. "
+                "See: https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html"
+            )
+
+    # Set and validate cache directory
+    if cache_dir is None:
+        cache_dir = tempfile.mkdtemp(prefix="xgboost_external_memory_")
+        logger.info(f"No cache_dir specified. Using temporary directory: {cache_dir}")
+        logger.info(
+            "For production use, specify a persistent cache_dir on fast storage."
+        )
+    else:
+        if not isinstance(cache_dir, str):
+            raise TypeError(f"cache_dir must be a string path, got {type(cache_dir)}")
+        try:
+            os.makedirs(cache_dir, exist_ok=True)
+            # Check if directory is writable
+            test_file = os.path.join(cache_dir, ".write_test")
+            with open(test_file, "w") as f:
+                f.write("test")
+            os.remove(test_file)
+        except (OSError, PermissionError) as e:
+            raise RuntimeError(
+                f"Cannot write to cache_dir '{cache_dir}': {e}. "
+                "Ensure the directory exists and is writable."
+            ) from e
+
+    # Validate max_bin parameter
+    if max_bin is not None:
+        if not isinstance(max_bin, int) or max_bin <= 0:
+            raise ValueError(f"max_bin must be a positive integer, got {max_bin}")
+        if max_bin < 16:
+            logger.warning(
+                f"max_bin={max_bin} is very low. This may reduce model quality. "
+                "Consider using at least 32. "
+                "See: https://xgboost.readthedocs.io/en/stable/parameter.html"
+            )
+        if max_bin > 1024:
+            logger.warning(
+                f"max_bin={max_bin} is very high. This may slow down training. "
+                "Consider using 256-512 for most cases. "
+                "See: https://xgboost.readthedocs.io/en/stable/parameter.html"
+            )
+    else:
+        max_bin = DEFAULT_MAX_BIN
+
+    # Create a custom DataIter for Ray datasets
+    class RayDatasetIterator(xgb.DataIter):
+        """Iterator for Ray datasets that works with XGBoost external memory.
+
+        This iterator implements the XGBoost DataIter interface to stream
+        data from Ray datasets in batches, enabling training on datasets
+        that don't fit in memory.
+        """
+
+        def __init__(
+            self,
+            dataset_shard: Any,
+            label_column: Union[str, List[str]],
+            feature_columns: Optional[List[str]],
+            batch_size: int,
+            missing_value: Optional[float],
+        ):
+            """Initialize the Ray dataset iterator.
+
+            Args:
+                dataset_shard: Ray dataset shard to iterate over.
+                label_column: Name(s) of the label column(s).
+                feature_columns: Names of feature columns to use.
+                batch_size: Number of samples per batch.
+                missing_value: Value to use for missing data.
+            """
+            self.dataset_shard = dataset_shard
+            self.label_column = label_column
+            self.feature_columns = feature_columns
+            self.batch_size = batch_size
+            self.missing_value = missing_value
+            self._iterator = None
+            # XGBoost expects cache_prefix to be a file prefix, not just a directory
+            # Construct proper path: directory + filename prefix
+            cache_prefix = os.path.join(cache_dir, "xgboost_cache")
+            super().__init__(cache_prefix=cache_prefix)
+
+        def next(self, input_data: Any) -> int:
+            """Advance the iterator by one batch and pass data to XGBoost.
+
+            Follows XGBoost's external memory iterator pattern.
+            Reference: https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html
+
+            Args:
+                input_data: XGBoost callback function to receive batch data.
+
+            Returns:
+                1 if data was successfully loaded, 0 if iteration is complete.
+            """
+            if self._iterator is None:
+                # Initialize iterator on first call - Ray Data streaming execution
+                self._iterator = self.dataset_shard.iter_batches(
+                    batch_size=self.batch_size,
+                    batch_format="pandas",
+                )
+
+            try:
+                # Get next batch from Ray Data stream
+                batch_df = next(self._iterator)
+
+                # Validate batch is not empty
+                if batch_df.empty:
+                    raise RuntimeError(
+                        "Empty batch encountered. Check dataset content and filtering."
+                    )
+
+                # Separate features and labels
+                if isinstance(self.label_column, str):
+                    if self.label_column not in batch_df.columns:
+                        raise KeyError(
+                            f"Label column '{self.label_column}' not found. "
+                            f"Available: {list(batch_df.columns)}"
+                        )
+                    labels = batch_df[self.label_column].values
+                    features = batch_df.drop(columns=[self.label_column])
+                else:
+                    # Multiple label columns
+                    missing_labels = [
+                        col for col in self.label_column if col not in batch_df.columns
+                    ]
+                    if missing_labels:
+                        raise KeyError(
+                            f"Label columns {missing_labels} not found. "
+                            f"Available: {list(batch_df.columns)}"
+                        )
+                    labels = batch_df[self.label_column].values
+                    features = batch_df.drop(columns=self.label_column)
+
+                # Select feature columns if specified
+                if self.feature_columns is not None:
+                    missing_features = [
+                        col
+                        for col in self.feature_columns
+                        if col not in features.columns
+                    ]
+                    if missing_features:
+                        raise KeyError(
+                            f"Feature columns {missing_features} not found. "
+                            f"Available: {list(features.columns)}"
+                        )
+                    features = features[self.feature_columns]
+
+                # Pass data to XGBoost
+                input_data(data=features.values, label=labels)
+                return 1
+
+            except StopIteration:
+                # End of iteration - normal termination
+                return 0
+            # Let all other exceptions propagate - fail fast
+
+        def reset(self) -> None:
+            """Reset the iterator to the beginning."""
+            self._iterator = None
+
+    # Create the iterator
+    try:
+        data_iter = RayDatasetIterator(
+            dataset_shard=dataset_shard,
+            label_column=label_column,
+            feature_columns=feature_columns,
+            batch_size=batch_size,
+            missing_value=missing,
+        )
+    except Exception as e:
+        raise RuntimeError(
+            f"Failed to create data iterator: {e}. "
+            "Check dataset_shard and column specifications."
+        ) from e
+
+    # Create ExtMemQuantileDMatrix for external memory
+    # ExtMemQuantileDMatrix fetches data on-demand from external memory
+    # Reference: https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html
+    try:
+        dmatrix_kwargs = {
+            "max_bin": max_bin,
+            **kwargs,
+        }
+
+        # Add categorical feature support if enabled
+        if enable_categorical:
+            dmatrix_kwargs["enable_categorical"] = True
+
+        # Add missing value if specified
+        if missing is not None:
+            dmatrix_kwargs["missing"] = missing
+
+        dmatrix = xgb.ExtMemQuantileDMatrix(
+            data_iter,
+            **dmatrix_kwargs,
+        )
+
+        return dmatrix
+
+    except Exception as e:
+        logger.error(f"Failed to create ExtMemQuantileDMatrix: {e}")
+        raise RuntimeError(
+            f"ExtMemQuantileDMatrix creation failed: {e}. "
+            "Common issues:\n"
+            "  - Incompatible data types (ensure numeric features)\n"
+            "  - Memory constraints (try reducing batch_size or max_bin)\n"
+            "  - Corrupt or malformed data\n"
+            "  - Missing dependencies (for GPU: ensure CUDA-enabled XGBoost)"
+        ) from e
+
+
+def setup_gpu_external_memory() -> bool:
+    """Setup GPU external memory training with RMM optimization.
+
+    This function configures RAPIDS Memory Manager (RMM) for optimal GPU external
+    memory performance. It should be called before creating external memory DMatrix
+    objects for GPU training.
+
+    RMM provides optimal GPU memory management for XGBoost:
+    - Better GPU memory allocation performance
+    - Memory pooling for reduced allocation overhead
+    - Integration with CuPy for NumPy-like GPU arrays
+
+    References:
+    - XGBoost GPU training: https://xgboost.readthedocs.io/en/stable/gpu/index.html
+    - RMM documentation: https://docs.rapids.ai/api/rmm/stable/
+
+    Returns:
+        True if GPU setup was successful, False otherwise.
+
+    Examples:
+        Basic GPU setup:
+            >>> if setup_gpu_external_memory():
+            ...     print("GPU ready for training")
+
+        Check before GPU training:
+            >>> import ray.train
+            >>> if setup_gpu_external_memory():
+            ...     # Proceed with GPU external memory training
+            ...     trainer = XGBoostTrainer(
+            ...         use_external_memory=True,
+            ...         external_memory_device="cuda",
+            ...     )
+            ... else:
+            ...     # Fallback to CPU
+            ...     trainer = XGBoostTrainer(
+            ...         use_external_memory=True,
+            ...         external_memory_device="cpu",
+            ...     )
+
+    Note:
+        Requirements for GPU external memory:
+        - CUDA-enabled XGBoost build
+        - RAPIDS Memory Manager (RMM): pip install rmm-cu11
+        - CuPy: pip install cupy-cuda11x
+
+        For CPU training, this function is not required.
+    """
+    try:
+        import xgboost as xgb
+
+        # Check if GPU is available
+        if not xgb.build_info()["USE_CUDA"]:
+            logger.warning("XGBoost was not built with CUDA support")
+            return False
+
+        # Try to configure RMM for GPU memory management
+        try:
+            import cupy  # noqa: F401
+            import rmm  # noqa: F401
+            from rmm.allocators.cupy import rmm_cupy_allocator
+
+            cupy.cuda.set_allocator(rmm_cupy_allocator)
+            return True
+        except ImportError:
+            logger.warning(
+                "RMM and CuPy are required for optimal GPU external memory performance. "
+                "Install with: pip install rmm-cu11 cupy-cuda11x. "
+                "See: https://docs.rapids.ai/api/rmm/stable/"
+            )
+            return False
+
+    except ImportError:
+        logger.warning("XGBoost is not installed")
+        return False
+    except Exception as e:
+        logger.warning(f"Failed to setup GPU external memory: {e}")
+        return False
+
+
+def get_external_memory_recommendations() -> Dict[str, Any]:
+    """Get recommendations for external memory training configuration.
+
+    Returns:
+        Dictionary containing recommended configuration settings and best practices.
+        All recommendations are based on XGBoost official documentation:
+        https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html
+
+    Examples:
+        .. testcode::
+
+            recommendations = get_external_memory_recommendations()
+            print("Recommended parameters:", recommendations["parameters"])
+            print("Best practices:", recommendations["best_practices"])
+    """
+    return {
+        "parameters": {
+            # Required for ExtMemQuantileDMatrix (external memory):
+            # https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.ExtMemQuantileDMatrix
+            "tree_method": "hist",
+            # Recommended for external memory performance:
+            # https://xgboost.readthedocs.io/en/stable/parameter.html#additional-parameters-for-hist-tree-method
+            "grow_policy": "depthwise",
+            # Default for hist tree method:
+            # https://xgboost.readthedocs.io/en/stable/parameter.html
+            "max_bin": 256,
+        },
+        "best_practices": [
+            "Use hist tree method (required for ExtMemQuantileDMatrix)",
+            "Use depthwise grow policy for better performance",
+            "Set appropriate batch_size based on available memory",
+            "Use shared storage for cache_dir in distributed training",
+            "Monitor disk I/O and adjust batch size accordingly",
+        ],
+        "cache_directories": {
+            "local": "/tmp/xgboost_cache",
+            "shared": "/mnt/cluster_storage/xgboost_cache",
+            "cloud": "s3://bucket/xgboost_cache",
+        },
+        "batch_size_recommendations": {
+            "cpu": {"small": 5000, "medium": 10000, "large": 20000},
+            "gpu": {"small": 2500, "medium": 5000, "large": 10000},
+        },
+        "documentation": (
+            "https://xgboost.readthedocs.io/en/" "stable/tutorials/external_memory.html"
+        ),
+    }
diff --git a/python/ray/train/xgboost/xgboost_trainer.py b/python/ray/train/xgboost/xgboost_trainer.py
index 85dc43358449..eec8bc31fd19 100644
--- a/python/ray/train/xgboost/xgboost_trainer.py
+++ b/python/ray/train/xgboost/xgboost_trainer.py
@@ -1,22 +1,26 @@
 import logging
 from functools import partial
-from typing import Any, Callable, Dict, Optional, Union
-
-import xgboost
-from packaging.version import Version
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import ray.train
-from ray.train import Checkpoint
+
+if TYPE_CHECKING:
+    import xgboost
 from ray.train.constants import TRAIN_DATASET_KEY
+from ray.train.run_config import RunConfig
+from ray.train.scaling_config import ScalingConfig
 from ray.train.trainer import GenDataset
-from ray.train.utils import _log_deprecation_warning
-from ray.train.xgboost import RayTrainReportCallback, XGBoostConfig
+from ray.train.xgboost import RayTrainReportCallback
 from ray.train.xgboost.v2 import XGBoostTrainer as SimpleXGBoostTrainer
 from ray.util.annotations import PublicAPI
 
 logger = logging.getLogger(__name__)
 
 
+# Constants for external memory configuration
+DEFAULT_EXTERNAL_MEMORY_DEVICE = "cpu"
+MAX_EXTERNAL_MEMORY_RETRIES = 3
+
 LEGACY_XGBOOST_TRAINER_DEPRECATION_MESSAGE = (
     "Passing in `xgboost.train` kwargs such as `params`, `num_boost_round`, "
     "`label_column`, etc. to `XGBoostTrainer` is deprecated "
@@ -28,286 +32,475 @@
 
 
 def _xgboost_train_fn_per_worker(
-    config: dict,
     label_column: str,
     num_boost_round: int,
     dataset_keys: set,
     xgboost_train_kwargs: dict,
+    use_external_memory: bool = False,
+    external_memory_cache_dir: Optional[str] = None,
+    external_memory_device: str = "cpu",
+    external_memory_batch_size: Optional[int] = None,
 ):
+    """Training function executed on each worker for XGBoost training.
+
+    This function handles both standard and external memory training modes,
+    automatically selecting the appropriate DMatrix creation method based on
+    the configuration. It manages checkpointing, dataset iteration, and
+    training progress tracking.
+
+    Note:
+        This is an internal function used by the V1 XGBoostTrainer. All parameters
+        are bound via functools.partial before being passed to the base trainer,
+        unlike the V2 pattern where a user-defined function receives train_loop_config.
+
+    Args:
+        label_column: Name of the label column in the dataset. Must exist
+            in all datasets.
+        num_boost_round: Target number of boosting rounds for training.
+            When resuming from checkpoint, trains for remaining rounds.
+        dataset_keys: Set of dataset names available for training. Should
+            include at least TRAIN_DATASET_KEY.
+        xgboost_train_kwargs: XGBoost training parameters dictionary containing
+            tree_method, objective, eval_metric, and other XGBoost parameters.
+            This is passed directly to xgb.train().
+        use_external_memory: Whether to use external memory for DMatrix creation.
+            Required for large datasets that don't fit in RAM. Defaults to False
+            for backward compatibility.
+        external_memory_cache_dir: Directory for caching external memory files.
+            Should be on fast storage with sufficient space. Optional, defaults
+            to system temp directory.
+        external_memory_device: Device to use for external memory training
+            ("cpu" or "cuda"). Defaults to "cpu" for backward compatibility.
+        external_memory_batch_size: Batch size for external memory iteration.
+            Larger values improve I/O efficiency but use more memory. Optional,
+            will auto-configure if not provided.
+
+    Returns:
+        None: Function reports results via ray.train.report() and may return early
+            if checkpoint already contains sufficient training rounds.
+
+    Raises:
+        ValueError: If required datasets or columns are missing.
+        RuntimeError: If DMatrix creation or training fails.
+
+    Note:
+        This function runs on each distributed worker. It automatically handles:
+        - Checkpoint resumption
+        - Dataset sharding
+        - DMatrix creation (standard or external memory)
+        - Model training and reporting
+    """
+    # Handle checkpoint resumption
     checkpoint = ray.train.get_checkpoint()
     starting_model = None
     remaining_iters = num_boost_round
+
     if checkpoint:
-        starting_model = RayTrainReportCallback.get_model(checkpoint)
-        starting_iter = starting_model.num_boosted_rounds()
-        remaining_iters = num_boost_round - starting_iter
-        logger.info(
-            f"Model loaded from checkpoint will train for "
-            f"additional {remaining_iters} iterations (trees) in order "
-            "to achieve the target number of iterations "
-            f"({num_boost_round=})."
-        )
+        try:
+            starting_model = RayTrainReportCallback.get_model(checkpoint)
+            starting_iter = starting_model.num_boosted_rounds()
+            remaining_iters = num_boost_round - starting_iter
+
+            if remaining_iters <= 0:
+                logger.warning(
+                    f"Model from checkpoint already has {starting_iter} rounds, "
+                    f"which meets or exceeds target ({num_boost_round}). "
+                    "No additional training will be performed."
+                )
+                # Report the existing model to Ray Train to properly register completion
+                ray.train.report({"model": starting_model})
+                return
+        except Exception as e:
+            logger.error(f"Failed to load model from checkpoint: {e}")
+            raise RuntimeError(
+                f"Checkpoint loading failed: {e}. "
+                "Ensure checkpoint is compatible with current XGBoost version."
+            ) from e
 
     train_ds_iter = ray.train.get_dataset_shard(TRAIN_DATASET_KEY)
-    train_df = train_ds_iter.materialize().to_pandas()
-
-    eval_ds_iters = {
-        k: ray.train.get_dataset_shard(k)
-        for k in dataset_keys
-        if k != TRAIN_DATASET_KEY
-    }
-    eval_dfs = {k: d.materialize().to_pandas() for k, d in eval_ds_iters.items()}
-
-    train_X, train_y = train_df.drop(label_column, axis=1), train_df[label_column]
-    dtrain = xgboost.DMatrix(train_X, label=train_y)
-
-    # NOTE: Include the training dataset in the evaluation datasets.
-    # This allows `train-*` metrics to be calculated and reported.
-    evals = [(dtrain, TRAIN_DATASET_KEY)]
-
-    for eval_name, eval_df in eval_dfs.items():
-        eval_X, eval_y = eval_df.drop(label_column, axis=1), eval_df[label_column]
-        evals.append((xgboost.DMatrix(eval_X, label=eval_y), eval_name))
-
-    evals_result = {}
-    xgboost.train(
-        config,
-        dtrain=dtrain,
-        evals=evals,
-        evals_result=evals_result,
-        num_boost_round=remaining_iters,
-        xgb_model=starting_model,
-        **xgboost_train_kwargs,
-    )
+
+    if use_external_memory:
+        # Use external memory for large datasets
+        import xgboost as xgb
+
+        # External memory requires hist tree method for optimal performance
+        # Required by ExtMemQuantileDMatrix for external memory:
+        # https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html
+        if "tree_method" not in xgboost_train_kwargs:
+            xgboost_train_kwargs["tree_method"] = "hist"
+        elif xgboost_train_kwargs["tree_method"] != "hist":
+            logger.warning(
+                f"External memory training requires tree_method='hist' for optimal performance. "
+                f"Current setting: {xgboost_train_kwargs['tree_method']}. "
+                "Consider changing to 'hist' for better external memory performance. "
+                "See: https://xgboost.readthedocs.io/en/stable/tutorials/external_memory.html"
+            )
+
+        # Recommend depthwise grow policy for external memory
+        # Depthwise policy performs better with external memory:
+        # https://xgboost.readthedocs.io/en/stable/parameter.html#additional-parameters-for-hist-tree-method
+        if "grow_policy" not in xgboost_train_kwargs:
+            xgboost_train_kwargs["grow_policy"] = "depthwise"
+        elif xgboost_train_kwargs["grow_policy"] == "lossguide":
+            logger.warning(
+                "Using grow_policy='lossguide' with external memory can significantly "
+                "slow down training. Consider using 'depthwise' for better performance. "
+                "See: https://xgboost.readthedocs.io/en/stable/parameter.html"
+            )
+
+        # Create external memory DMatrix using shared utilities
+        from ._external_memory_utils import create_external_memory_dmatrix
+
+        try:
+            dtrain = create_external_memory_dmatrix(
+                dataset_shard=train_ds_iter,
+                label_column=label_column,
+                batch_size=external_memory_batch_size,
+                cache_dir=external_memory_cache_dir,
+                device=external_memory_device,
+            )
+        except Exception as e:
+            logger.error(f"Failed to create training DMatrix: {e}")
+            raise RuntimeError(
+                f"Training DMatrix creation failed: {e}. "
+                "Check dataset format and external memory configuration."
+            ) from e
+
+        # Create evaluation datasets with external memory
+        evals = [(dtrain, TRAIN_DATASET_KEY)]
+
+        for eval_name in dataset_keys:
+            if eval_name != TRAIN_DATASET_KEY:
+                try:
+                    eval_ds_iter = ray.train.get_dataset_shard(eval_name)
+                    deval = create_external_memory_dmatrix(
+                        dataset_shard=eval_ds_iter,
+                        label_column=label_column,
+                        batch_size=external_memory_batch_size,
+                        cache_dir=external_memory_cache_dir,
+                        device=external_memory_device,
+                    )
+                    evals.append((deval, eval_name))
+                except Exception as e:
+                    logger.error(f"Failed to create DMatrix for '{eval_name}': {e}")
+                    raise RuntimeError(
+                        f"Evaluation DMatrix creation failed for '{eval_name}': {e}"
+                    ) from e
+
+    else:
+        # Use standard DMatrix for smaller datasets
+        import xgboost as xgb
+
+        try:
+            train_ds = train_ds_iter.materialize()
+            train_df = train_ds.to_pandas()
+
+            # Validate training data
+            if train_df.empty:
+                raise ValueError("Training dataset is empty")
+
+            if label_column not in train_df.columns:
+                raise ValueError(
+                    f"Label column '{label_column}' not found in training data. "
+                    f"Available columns: {list(train_df.columns)}"
+                )
+
+            # Separate features and labels
+            train_X = train_df.drop(columns=[label_column])
+            train_y = train_df[label_column]
+
+            # Create standard DMatrix
+            dtrain = xgb.DMatrix(train_X, label=train_y)
+
+        except Exception as e:
+            logger.error(f"Failed to create training DMatrix: {e}")
+            raise RuntimeError(
+                f"Training DMatrix creation failed: {e}. "
+                "Check dataset format and label column name."
+            ) from e
+
+        # Create evaluation datasets
+        evals = [(dtrain, TRAIN_DATASET_KEY)]
+
+        for eval_name in dataset_keys:
+            if eval_name != TRAIN_DATASET_KEY:
+                try:
+                    eval_ds_iter = ray.train.get_dataset_shard(eval_name)
+                    eval_ds = eval_ds_iter.materialize()
+                    eval_df = eval_ds.to_pandas()
+
+                    if eval_df.empty:
+                        logger.warning(f"Evaluation dataset '{eval_name}' is empty")
+                        continue
+
+                    if label_column not in eval_df.columns:
+                        raise ValueError(
+                            f"Label column '{label_column}' not found in '{eval_name}'. "
+                            f"Available: {list(eval_df.columns)}"
+                        )
+
+                    eval_X = eval_df.drop(columns=[label_column])
+                    eval_y = eval_df[label_column]
+
+                    deval = xgb.DMatrix(eval_X, label=eval_y)
+                    evals.append((deval, eval_name))
+
+                except Exception as e:
+                    logger.error(f"Failed to create DMatrix for '{eval_name}': {e}")
+                    raise RuntimeError(
+                        f"Evaluation DMatrix creation failed for '{eval_name}': {e}"
+                    ) from e
+
+    # Train the model
+    try:
+        bst = xgb.train(
+            xgboost_train_kwargs,
+            dtrain=dtrain,
+            evals=evals,
+            num_boost_round=remaining_iters,
+            xgb_model=starting_model,
+            callbacks=[RayTrainReportCallback()],
+        )
+
+        if bst is None:
+            raise RuntimeError("xgb.train returned None")
+
+        # Report final metrics
+        ray.train.report({"model": bst})
+
+    except Exception as e:
+        logger.error(f"Training failed: {e}")
+        raise RuntimeError(
+            f"XGBoost training failed: {e}. "
+            "Check parameters, data quality, and system resources."
+        ) from e
 
 
 @PublicAPI(stability="beta")
 class XGBoostTrainer(SimpleXGBoostTrainer):
     """A Trainer for distributed data-parallel XGBoost training.
 
-    Example
-    -------
-
-    .. testcode::
-        :skipif: True
+    This trainer supports both standard DMatrix creation for smaller datasets
+    and external memory optimization for large datasets that don't fit in RAM.
 
-        import xgboost
+    Examples:
+        .. testcode::
+            :skipif: True
 
-        import ray.data
-        import ray.train
-        from ray.train.xgboost import RayTrainReportCallback, XGBoostTrainer
+            import ray
+            import ray.data
+            from ray.train.xgboost import XGBoostTrainer
 
-        def train_fn_per_worker(config: dict):
-            # (Optional) Add logic to resume training state from a checkpoint.
-            # ray.train.get_checkpoint()
+            # Create sample datasets
+            train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(1000)])
+            val_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(100)])
 
-            # 1. Get the dataset shard for the worker and convert to a `xgboost.DMatrix`
-            train_ds_iter, eval_ds_iter = (
-                ray.train.get_dataset_shard("train"),
-                ray.train.get_dataset_shard("validation"),
+            # Standard training (in-memory)
+            trainer = XGBoostTrainer(
+                scaling_config=ray.train.ScalingConfig(num_workers=2),
+                run_config=ray.train.RunConfig(),
+                datasets={"train": train_ds, "validation": val_ds},
+                label_column="y",
+                params={"objective": "reg:squarederror", "max_depth": 3},
+                num_boost_round=10,
             )
-            train_ds, eval_ds = train_ds_iter.materialize(), eval_ds_iter.materialize()
-
-            train_df, eval_df = train_ds.to_pandas(), eval_ds.to_pandas()
-            train_X, train_y = train_df.drop("y", axis=1), train_df["y"]
-            eval_X, eval_y = eval_df.drop("y", axis=1), eval_df["y"]
-
-            dtrain = xgboost.DMatrix(train_X, label=train_y)
-            deval = xgboost.DMatrix(eval_X, label=eval_y)
-
-            params = {
-                "tree_method": "approx",
-                "objective": "reg:squarederror",
-                "eta": 1e-4,
-                "subsample": 0.5,
-                "max_depth": 2,
-            }
-
-            # 2. Do distributed data-parallel training.
-            # Ray Train sets up the necessary coordinator processes and
-            # environment variables for your workers to communicate with each other.
-            bst = xgboost.train(
-                params,
-                dtrain=dtrain,
-                evals=[(deval, "validation")],
+            result = trainer.fit()
+
+            # External memory training for large datasets
+            large_trainer = XGBoostTrainer(
+                scaling_config=ray.train.ScalingConfig(num_workers=2),
+                run_config=ray.train.RunConfig(),
+                datasets={"train": large_train_ds, "validation": large_val_ds},
+                label_column="y",
+                params={"objective": "reg:squarederror", "max_depth": 3},
                 num_boost_round=10,
-                callbacks=[RayTrainReportCallback()],
+                use_external_memory=True,
+                external_memory_cache_dir="/mnt/cluster_storage",
+                external_memory_device="cpu",
+                external_memory_batch_size=50000,
             )
-
-        train_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(32)])
-        eval_ds = ray.data.from_items([{"x": x, "y": x + 1} for x in range(16)])
-        trainer = XGBoostTrainer(
-            train_fn_per_worker,
-            datasets={"train": train_ds, "validation": eval_ds},
-            scaling_config=ray.train.ScalingConfig(num_workers=4),
-        )
-        result = trainer.fit()
-        booster = RayTrainReportCallback.get_model(result.checkpoint)
-
-    Args:
-        train_loop_per_worker: The training function to execute on each worker.
-            This function can either take in zero arguments or a single ``Dict``
-            argument which is set by defining ``train_loop_config``.
-            Within this function you can use any of the
-            :ref:`Ray Train Loop utilities <train-loop-api>`.
-        train_loop_config: A configuration ``Dict`` to pass in as an argument to
-            ``train_loop_per_worker``.
-            This is typically used for specifying hyperparameters.
-        xgboost_config: The configuration for setting up the distributed xgboost
-            backend. Defaults to using the "rabit" backend.
-            See :class:`~ray.train.xgboost.XGBoostConfig` for more info.
-        datasets: The Ray Datasets to use for training and validation.
-        dataset_config: The configuration for ingesting the input ``datasets``.
-            By default, all the Ray Datasets are split equally across workers.
-            See :class:`~ray.train.DataConfig` for more details.
-        scaling_config: The configuration for how to scale data parallel training.
-            ``num_workers`` determines how many Python processes are used for training,
-            and ``use_gpu`` determines whether or not each process should use GPUs.
-            See :class:`~ray.train.ScalingConfig` for more info.
-        run_config: The configuration for the execution of the training run.
-            See :class:`~ray.train.RunConfig` for more info.
-        resume_from_checkpoint: A checkpoint to resume training from.
-            This checkpoint can be accessed from within ``train_loop_per_worker``
-            by calling ``ray.train.get_checkpoint()``.
-        metadata: Dict that should be made available via
-            `ray.train.get_context().get_metadata()` and in `checkpoint.get_metadata()`
-            for checkpoints saved from this Trainer. Must be JSON-serializable.
-        label_column: [Deprecated] Name of the label column. A column with this name
-            must be present in the training dataset.
-        params: [Deprecated] XGBoost training parameters.
-            Refer to `XGBoost documentation <https://xgboost.readthedocs.io/>`_
-            for a list of possible parameters.
-        num_boost_round: [Deprecated] Target number of boosting iterations (trees in the model).
-            Note that unlike in ``xgboost.train``, this is the target number
-            of trees, meaning that if you set ``num_boost_round=10`` and pass a model
-            that has already been trained for 5 iterations, it will be trained for 5
-            iterations more, instead of 10 more.
-        **train_kwargs: [Deprecated] Additional kwargs passed to ``xgboost.train()`` function.
+            result = large_trainer.fit()
     """
 
-    _handles_checkpoint_freq = True
-    _handles_checkpoint_at_end = True
-
     def __init__(
         self,
-        train_loop_per_worker: Optional[
-            Union[Callable[[], None], Callable[[Dict], None]]
-        ] = None,
         *,
-        train_loop_config: Optional[Dict] = None,
-        xgboost_config: Optional[XGBoostConfig] = None,
-        scaling_config: Optional[ray.train.ScalingConfig] = None,
-        run_config: Optional[ray.train.RunConfig] = None,
-        datasets: Optional[Dict[str, GenDataset]] = None,
-        dataset_config: Optional[ray.train.DataConfig] = None,
-        resume_from_checkpoint: Optional[Checkpoint] = None,
-        metadata: Optional[Dict[str, Any]] = None,
-        # TODO(justinvyu): [Deprecated] Legacy XGBoostTrainer API
-        label_column: Optional[str] = None,
-        params: Optional[Dict[str, Any]] = None,
-        num_boost_round: Optional[int] = None,
-        **train_kwargs,
+        scaling_config: ScalingConfig,
+        run_config: RunConfig,
+        datasets: Dict[str, GenDataset],
+        label_column: str,
+        params: Dict[str, Any],
+        num_boost_round: int,
+        use_external_memory: bool = False,
+        external_memory_cache_dir: Optional[str] = None,
+        external_memory_device: str = "cpu",
+        external_memory_batch_size: Optional[int] = None,
+        **kwargs,
     ):
-        if Version(xgboost.__version__) < Version("1.7.0"):
-            raise ImportError(
-                "`XGBoostTrainer` requires the `xgboost` version to be >= 1.7.0. "
-                'Upgrade with: `pip install -U "xgboost>=1.7"`'
-            )
-
-        # TODO(justinvyu): [Deprecated] Legacy XGBoostTrainer API
-        legacy_api = train_loop_per_worker is None
-        if legacy_api:
-            train_loop_per_worker = self._get_legacy_train_fn_per_worker(
-                xgboost_train_kwargs=train_kwargs,
-                run_config=run_config,
-                label_column=label_column,
-                num_boost_round=num_boost_round,
-                datasets=datasets,
-            )
-            train_loop_config = params or {}
-        elif train_kwargs:
-            _log_deprecation_warning(
-                "Passing `xgboost.train` kwargs to `XGBoostTrainer` is deprecated. "
-                "In your training function, you can call `xgboost.train(**kwargs)` "
-                "with arbitrary arguments. "
-                f"{LEGACY_XGBOOST_TRAINER_DEPRECATION_MESSAGE}"
-            )
+        """Initialize the XGBoost trainer.
+
+        Args:
+            scaling_config: Configuration for how to scale data parallel training.
+            run_config: Configuration for the execution of the training run.
+            datasets: The Ray Datasets to ingest for training.
+            label_column: Name of the label column in the dataset.
+            params: XGBoost training parameters.
+            num_boost_round: Number of boosting rounds for training.
+            use_external_memory: Whether to use external memory for DMatrix creation.
+                If True, uses ExtMemQuantileDMatrix for large datasets that don't fit in RAM.
+                If False (default), uses standard DMatrix for in-memory training.
+            external_memory_cache_dir: Directory for caching external memory files.
+                If None, automatically selects the best available directory.
+            external_memory_device: Device to use for external memory training.
+                Options: "cpu" (default) or "cuda" for GPU training.
+            external_memory_batch_size: Batch size for external memory iteration.
+                If None, uses optimal default based on device type.
+            **kwargs: Additional arguments passed to the base trainer.
+        """
+        # Store external memory configuration
+        self.use_external_memory = use_external_memory
+        self.external_memory_cache_dir = external_memory_cache_dir
+        self.external_memory_device = external_memory_device
+        self.external_memory_batch_size = external_memory_batch_size
+
+        # Create training function with external memory support
+        train_fn_per_worker = partial(
+            _xgboost_train_fn_per_worker,
+            label_column=label_column,
+            num_boost_round=num_boost_round,
+            dataset_keys=set(datasets.keys()),
+            xgboost_train_kwargs=params,
+            use_external_memory=use_external_memory,
+            external_memory_cache_dir=external_memory_cache_dir,
+            external_memory_device=external_memory_device,
+            external_memory_batch_size=external_memory_batch_size,
+        )
 
-        super(XGBoostTrainer, self).__init__(
-            train_loop_per_worker=train_loop_per_worker,
-            train_loop_config=train_loop_config,
-            xgboost_config=xgboost_config,
+        # Initialize the base trainer
+        super().__init__(
+            train_loop_per_worker=train_fn_per_worker,
             scaling_config=scaling_config,
             run_config=run_config,
             datasets=datasets,
-            dataset_config=dataset_config,
-            resume_from_checkpoint=resume_from_checkpoint,
-            metadata=metadata,
+            **kwargs,
         )
 
-    def _get_legacy_train_fn_per_worker(
-        self,
-        xgboost_train_kwargs: Dict,
-        run_config: Optional[ray.train.RunConfig],
-        datasets: Optional[Dict[str, GenDataset]],
-        label_column: Optional[str],
-        num_boost_round: Optional[int],
-    ) -> Callable[[Dict], None]:
-        """Get the training function for the legacy XGBoostTrainer API."""
-
-        datasets = datasets or {}
-        if not datasets.get(TRAIN_DATASET_KEY):
-            raise ValueError(
-                "`datasets` must be provided for the XGBoostTrainer API "
-                "if `train_loop_per_worker` is not provided. "
-                "This dict must contain the training dataset under the "
-                f"key: '{TRAIN_DATASET_KEY}'. "
-                f"Got keys: {list(datasets.keys())}"
-            )
-        if not label_column:
-            raise ValueError(
-                "`label_column` must be provided for the XGBoostTrainer API "
-                "if `train_loop_per_worker` is not provided. "
-                "This is the column name of the label in the dataset."
-            )
+    @staticmethod
+    def setup_gpu_external_memory() -> bool:
+        """Setup GPU external memory training with RMM optimization.
 
-        num_boost_round = num_boost_round or 10
+        This method configures RAPIDS Memory Manager (RMM) for optimal GPU external
+        memory performance. It should be called before creating external memory DMatrix
+        objects for GPU training.
 
-        _log_deprecation_warning(LEGACY_XGBOOST_TRAINER_DEPRECATION_MESSAGE)
+        Returns:
+            True if GPU setup was successful, False otherwise.
 
-        # Initialize a default Ray Train metrics/checkpoint reporting callback if needed
-        callbacks = xgboost_train_kwargs.get("callbacks", [])
-        user_supplied_callback = any(
-            isinstance(callback, RayTrainReportCallback) for callback in callbacks
-        )
-        callback_kwargs = {}
-        if run_config:
-            checkpoint_frequency = run_config.checkpoint_config.checkpoint_frequency
-            checkpoint_at_end = run_config.checkpoint_config.checkpoint_at_end
-
-            callback_kwargs["frequency"] = checkpoint_frequency
-            # Default `checkpoint_at_end=True` unless the user explicitly sets it.
-            callback_kwargs["checkpoint_at_end"] = (
-                checkpoint_at_end if checkpoint_at_end is not None else True
-            )
+        Examples:
+            .. testcode::
 
-        if not user_supplied_callback:
-            callbacks.append(RayTrainReportCallback(**callback_kwargs))
-        xgboost_train_kwargs["callbacks"] = callbacks
+                # Setup GPU external memory before training
+                if XGBoostTrainer.setup_gpu_external_memory():
+                    print("GPU external memory setup successful")
 
-        train_fn_per_worker = partial(
-            _xgboost_train_fn_per_worker,
-            label_column=label_column,
-            num_boost_round=num_boost_round,
-            dataset_keys=set(datasets),
-            xgboost_train_kwargs=xgboost_train_kwargs,
-        )
-        return train_fn_per_worker
+        Note:
+            This method requires XGBoost, RMM, and CuPy to be installed for GPU training.
+            For CPU training, this method is not required.
+        """
+        from ._external_memory_utils import setup_gpu_external_memory
+
+        return setup_gpu_external_memory()
+
+    @staticmethod
+    def get_external_memory_recommendations() -> Dict[str, Any]:
+        """Get recommendations for external memory training configuration.
+
+        Returns:
+            Dictionary containing recommended configuration settings and best practices.
+
+        Examples:
+            .. testcode::
+
+                recommendations = XGBoostTrainer.get_external_memory_recommendations()
+                print(f"Recommended parameters: {recommendations['parameters']}")
+        """
+        from ._external_memory_utils import get_external_memory_recommendations
+
+        return get_external_memory_recommendations()
+
+    def get_external_memory_config(self) -> Dict[str, Any]:
+        """Get external memory configuration.
+
+        Returns:
+            Dictionary containing external memory configuration settings.
+
+        Examples:
+            .. testcode::
+
+                config = trainer.get_external_memory_config()
+                print(f"External memory enabled: {config['use_external_memory']}")
+                print(f"Cache directory: {config['cache_dir']}")
+                print(f"Device: {config['device']}")
+                print(f"Batch size: {config['batch_size']}")
+        """
+        return {
+            "use_external_memory": self.use_external_memory,
+            "cache_dir": self.external_memory_cache_dir,
+            "device": self.external_memory_device,
+            "batch_size": self.external_memory_batch_size,
+        }
+
+    def is_external_memory_enabled(self) -> bool:
+        """Check if external memory is enabled.
+
+        Returns:
+            True if external memory is enabled, False otherwise.
+
+        Examples:
+            .. testcode::
+
+                if trainer.is_external_memory_enabled():
+                    print("Using external memory for large dataset training")
+                else:
+                    print("Using standard in-memory training")
+        """
+        return self.use_external_memory
 
     @classmethod
     def get_model(
         cls,
-        checkpoint: Checkpoint,
-    ) -> xgboost.Booster:
-        """Retrieve the XGBoost model stored in this checkpoint."""
-        return RayTrainReportCallback.get_model(checkpoint)
+        checkpoint: "ray.train.Checkpoint",
+        filename: str = "model.json",
+    ) -> "xgboost.Booster":
+        """Retrieve the XGBoost model stored in this checkpoint.
+
+        This method maintains backward compatibility for V1 XGBoostTrainer users.
+        It delegates to RayTrainReportCallback.get_model() which is the recommended
+        approach for both V1 and V2 trainers.
+
+        Args:
+            checkpoint: The checkpoint object returned by a training run.
+            filename: The filename to load the model from. Defaults to "model.json".
+
+        Returns:
+            The XGBoost Booster model stored in the checkpoint.
+
+        Examples:
+            .. testcode::
+
+                from ray.train.xgboost import XGBoostTrainer
+
+                # After training
+                result = trainer.fit()
+                booster = XGBoostTrainer.get_model(result.checkpoint)
+
+                # Or use the recommended approach
+                from ray.train.xgboost import RayTrainReportCallback
+                booster = RayTrainReportCallback.get_model(result.checkpoint)
+
+        Note:
+            While this method is maintained for V1 backward compatibility,
+            the recommended approach is to use RayTrainReportCallback.get_model()
+            directly, which works for both V1 and V2 trainers.
+        """
+        return RayTrainReportCallback.get_model(checkpoint, filename=filename)