Changes from all commits

20 commits
c0e81a4
initial commit
soffer-anyscale Aug 12, 2025
efb8c39
updated based on feedback
soffer-anyscale Aug 13, 2025
ac8d58e
made scalability improvements and expanded testing
soffer-anyscale Aug 13, 2025
5535fa3
simplified the updates to core functionality and added documentation
soffer-anyscale Aug 15, 2025
7745db6
updated to fix lint
soffer-anyscale Aug 15, 2025
83c1283
updated external dataset API
soffer-anyscale Aug 28, 2025
1620d4f
Implement XGBoost external memory training support
soffer-anyscale Oct 9, 2025
a31763f
Address code review feedback for XGBoost external memory
soffer-anyscale Oct 9, 2025
5308ba1
Fix infinite recursion bug in external memory iterator
soffer-anyscale Oct 9, 2025
6edd088
Clean up logging and add documentation URLs
soffer-anyscale Oct 9, 2025
be11044
Fix critical bug and simplify external memory implementation
soffer-anyscale Oct 9, 2025
da2ccdd
Fix linter error: remove unused trainer variable in test
soffer-anyscale Oct 9, 2025
36fbc6b
Maintain V1/V2 API backward compatibility in XGBoost trainer
soffer-anyscale Oct 12, 2025
f121c39
Fix critical bugs in XGBoost external memory and checkpointing
soffer-anyscale Oct 12, 2025
0d9deb3
fixed training bugs and lint issues
soffer-anyscale Oct 13, 2025
d0389f6
Fix pydoclint violations in XGBoost trainer instead of adding to base…
soffer-anyscale Oct 13, 2025
66cd798
Merge origin/master into train_xgboost_scale
soffer-anyscale Oct 13, 2025
56cc79c
Fix docstring example: use direct import instead of trainer.method
soffer-anyscale Oct 13, 2025
9155420
Fix pydoclint violations: add type hints and move Args to __init__
soffer-anyscale Oct 13, 2025
5254a98
Move Args from V2 trainer class docstring to __init__
soffer-anyscale Oct 13, 2025
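
For orientation before the diff: the external-memory path these commits add streams training data through an iterator so XGBoost can spill quantized data pages to a disk cache instead of materializing a single in-memory DMatrix. Only the tests appear in this diff, so the following is a minimal sketch of the underlying mechanism using XGBoost's public DataIter protocol (available in recent XGBoost releases), not this PR's implementation; the BatchIterator class and the sample data are invented here for illustration.

import os
import tempfile

import numpy as np
import xgboost


class BatchIterator(xgboost.DataIter):
    """Feed (X, y) batches to XGBoost; quantized pages are cached on disk."""

    def __init__(self, batches, cache_dir):
        self._batches = batches  # list of (features, labels) numpy pairs
        self._position = 0
        # cache_prefix tells XGBoost where to write its external-memory pages.
        super().__init__(cache_prefix=os.path.join(cache_dir, "cache"))

    def next(self, input_data):
        # Return 1 while another batch is available, 0 when exhausted.
        if self._position == len(self._batches):
            return 0
        X, y = self._batches[self._position]
        input_data(data=X, label=y)
        self._position += 1
        return 1

    def reset(self):
        # Called by XGBoost before each re-iteration over the data.
        self._position = 0


rng = np.random.default_rng(0)
batches = [
    (rng.normal(size=(1_000, 4)), rng.integers(0, 2, size=1_000))
    for _ in range(3)
]
with tempfile.TemporaryDirectory() as cache_dir:
    # Passing a DataIter (rather than arrays) selects external-memory mode.
    dtrain = xgboost.DMatrix(BatchIterator(batches, cache_dir))
    booster = xgboost.train(
        {"tree_method": "hist", "objective": "binary:logistic"},
        dtrain,
        num_boost_round=10,
    )

The trainer-level knobs exercised in the tests below (use_external_memory, external_memory_cache_dir, external_memory_batch_size, external_memory_device) plausibly map onto the cache location, batch granularity, and device of such an iterator, and the tests' "hist" tree-method comments match the same constraint XGBoost places on external-memory training.
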
205 changes: 202 additions & 3 deletions python/ray/train/tests/test_xgboost_trainer.py
@@ -99,11 +99,210 @@ def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
params=params,
num_boost_round=10,
datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
-        resume_from_checkpoint=result.checkpoint,
+        resume_from_checkpoint=checkpoint,
)
result = trainer.fit()
-    model = XGBoostTrainer.get_model(result.checkpoint)
-    assert model.num_boosted_rounds() == 10
+    xgb_model = XGBoostTrainer.get_model(result.checkpoint)
+    assert xgb_model.num_boosted_rounds() == 10


def test_external_memory_basic(ray_start_4_cpus, tmpdir):
"""Test V1 XGBoost Trainer with external memory enabled."""
train_dataset = ray.data.from_pandas(train_df)
valid_dataset = ray.data.from_pandas(test_df)

# Use hist tree method (required for external memory)
external_memory_params = {
"tree_method": "hist", # Required for external memory
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
}

# Create temporary cache directory
cache_dir = tmpdir.mkdir("xgboost_cache")

trainer = XGBoostTrainer(
scaling_config=scale_config,
label_column="target",
params=external_memory_params,
num_boost_round=10,
datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
use_external_memory=True,
external_memory_cache_dir=str(cache_dir),
external_memory_device="cpu",
external_memory_batch_size=1000,
)

result = trainer.fit()

# Verify results
assert result.checkpoint is not None
xgb_model = XGBoostTrainer.get_model(result.checkpoint)
assert xgb_model.num_boosted_rounds() == 10

# Verify external memory configuration
assert trainer.is_external_memory_enabled()
config = trainer.get_external_memory_config()
assert config["use_external_memory"] is True
assert config["cache_dir"] == str(cache_dir)
assert config["device"] == "cpu"
assert config["batch_size"] == 1000


def test_external_memory_auto_configuration(ray_start_4_cpus):
"""Test V1 XGBoost Trainer with automatic external memory configuration."""
train_dataset = ray.data.from_pandas(train_df)
valid_dataset = ray.data.from_pandas(test_df)

# Use hist tree method (required for external memory)
external_memory_params = {
"tree_method": "hist", # Required for external memory
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
}

trainer = XGBoostTrainer(
scaling_config=scale_config,
label_column="target",
params=external_memory_params,
num_boost_round=10,
datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
use_external_memory=True,
# Let the trainer auto-select cache directory and batch size
)

result = trainer.fit()

# Verify results
assert result.checkpoint is not None
xgb_model = XGBoostTrainer.get_model(result.checkpoint)
assert xgb_model.num_boosted_rounds() == 10

# Verify external memory is enabled
assert trainer.is_external_memory_enabled()


def test_external_memory_gpu(ray_start_8_cpus):
"""Test V1 XGBoost Trainer with GPU external memory."""
train_dataset = ray.data.from_pandas(train_df)
valid_dataset = ray.data.from_pandas(test_df)

# Use hist tree method (required for external memory)
external_memory_params = {
"tree_method": "hist", # Required for external memory
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
}

trainer = XGBoostTrainer(
scaling_config=ScalingConfig(num_workers=2, use_gpu=True),
label_column="target",
params=external_memory_params,
num_boost_round=10,
datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
use_external_memory=True,
external_memory_device="cuda",
external_memory_batch_size=5000, # Smaller batch size for GPU
)

result = trainer.fit()

# Verify results
assert result.checkpoint is not None
xgb_model = XGBoostTrainer.get_model(result.checkpoint)
assert xgb_model.num_boosted_rounds() == 10

# Verify GPU external memory configuration
config = trainer.get_external_memory_config()
assert config["device"] == "cuda"


def test_external_memory_utilities(ray_start_4_cpus):
"""Test V1 XGBoost Trainer external memory utility methods."""
# Test GPU setup method
gpu_setup_result = XGBoostTrainer.setup_gpu_external_memory()
# This should return False on CPU-only systems, True on GPU systems
assert isinstance(gpu_setup_result, bool)


def test_external_memory_with_large_dataset(ray_start_8_cpus, tmpdir):
"""Test V1 XGBoost Trainer with a larger dataset to verify external memory benefits."""
# Create a larger dataset
large_train_df = pd.concat([train_df] * 10, ignore_index=True)
large_test_df = pd.concat([test_df] * 5, ignore_index=True)

large_train_dataset = ray.data.from_pandas(large_train_df)
large_valid_dataset = ray.data.from_pandas(large_test_df)

# Use hist tree method (required for external memory)
external_memory_params = {
"tree_method": "hist", # Required for external memory
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
"max_depth": 3, # Limit depth for faster training
"eta": 0.1,
}

# Create temporary cache directory
cache_dir = tmpdir.mkdir("xgboost_large_cache")

trainer = XGBoostTrainer(
scaling_config=ScalingConfig(num_workers=4),
label_column="target",
params=external_memory_params,
num_boost_round=5, # Fewer rounds for faster testing
datasets={TRAIN_DATASET_KEY: large_train_dataset, "valid": large_valid_dataset},
use_external_memory=True,
external_memory_cache_dir=str(cache_dir),
external_memory_batch_size=2000,
)

result = trainer.fit()

# Verify results
assert result.checkpoint is not None
xgb_model = XGBoostTrainer.get_model(result.checkpoint)
assert xgb_model.num_boosted_rounds() == 5

# Verify external memory configuration
assert trainer.is_external_memory_enabled()
config = trainer.get_external_memory_config()
assert config["use_external_memory"] is True
assert config["batch_size"] == 2000


def test_external_memory_backward_compatibility(ray_start_4_cpus):
"""Test that V1 XGBoost Trainer maintains backward compatibility when external memory is disabled."""
train_dataset = ray.data.from_pandas(train_df)
valid_dataset = ray.data.from_pandas(test_df)

# Use standard parameters (no external memory)
standard_params = {
"tree_method": "approx", # Can use approx for standard DMatrix
"objective": "binary:logistic",
"eval_metric": ["logloss", "error"],
}

trainer = XGBoostTrainer(
scaling_config=scale_config,
label_column="target",
params=standard_params,
num_boost_round=10,
datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
# External memory disabled by default
)

result = trainer.fit()

# Verify results
assert result.checkpoint is not None
xgb_model = XGBoostTrainer.get_model(result.checkpoint)
assert xgb_model.num_boosted_rounds() == 10

# Verify external memory is disabled
assert not trainer.is_external_memory_enabled()
config = trainer.get_external_memory_config()
assert config["use_external_memory"] is False


@pytest.mark.parametrize(