Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add support for legate-sparse CSR in Tree model #191

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions ci/run_pytests_cpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ set -e -E -u -o pipefail
cd legateboost/test

legate \
--gpus 0 \
--sysmem 28000 \
--module pytest \
. \
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-122.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies:
- cuda-version>=12.2
- cupynumeric==25.01.*,>=0.0.0.dev0
- hypothesis>=6
- legate-sparse
- legate==25.01.*,>=0.0.0.dev0
- libcublas-dev
- llvm-openmp
Expand Down
1 change: 1 addition & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,4 @@ dependencies:
- pytest>=7,<8
- seaborn>=0.13
- xgboost>=2.0
- legate-sparse
3 changes: 3 additions & 0 deletions examples/sparse/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Sparse data

This example trains a YouTube comment spam classifier on a sparse dataset. The raw comment strings are converted to a sparse matrix of word counts using the `CountVectorizer` from scikit-learn.
47 changes: 47 additions & 0 deletions examples/sparse/sparse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import legateboost as lb

# Alberto, T. & Lochter, J. (2015). YouTube Spam Collection [Dataset].
# UCI Machine Learning Repository. https://doi.org/10.24432/C58885.
# OpenML names of the individual YouTube spam collections to combine.
dataset_names = [
    "youtube-spam-psy",
    "youtube-spam-shakira",
    "youtube-spam-lmfao",
    "youtube-spam-eminem",
    "youtube-spam-katyperry",
]

# Fetch each collection as a DataFrame and stack them into a single table.
frames = [fetch_openml(name=name, as_frame=True).data for name in dataset_names]
X = pd.concat(frames)

# The label lives in the "CLASS" column of the fetched frame.
y = X["CLASS"]

# Hold out 30% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the vocabulary on the training comments only, then reuse it to
# encode the held-out comments as word counts.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train["CONTENT"])
X_test_vectorized = vectorizer.transform(X_test["CONTENT"])

# Train on the vectorized comments, passing the held-out split as eval_set.
classifier = lb.LBClassifier()
model = classifier.fit(
    X_train_vectorized,
    y_train,
    eval_set=[(X_test_vectorized, y_test)],
)


def evaluate_comment(comment):
    """Print a comment followed by the model's spam probability for it.

    The comment is encoded with the module-level ``vectorizer`` and scored
    with the module-level ``model``; column 1 of ``predict_proba`` for the
    single encoded row is reported as the probability of spam.
    """
    print("Comment: {}".format(comment))
    encoded = vectorizer.transform([comment])
    spam_probability = model.predict_proba(encoded)[0, 1]
    print("Probability of spam: {}".format(spam_probability))


# Score two held-out comments, then a placeholder string to replace with
# your own text.
for row in (15, 3):
    evaluate_comment(X_test.iloc[row]["CONTENT"])
evaluate_comment("Your text here")
39 changes: 21 additions & 18 deletions legateboost/input_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
import numpy as np
import scipy.sparse as sp

try:
from legate_sparse import csr_matrix
except ImportError:
csr_matrix = None

import cupynumeric as cn

__all__: List[str] = []
Expand All @@ -29,31 +34,29 @@ def check_sample_weight(sample_weight: Any, n: int) -> cn.ndarray:

def check_array(x: Any) -> cn.ndarray:
    """Validate an input container and convert it to a Legate-backed type.

    SciPy sparse matrices are converted to a legate-sparse ``csr_matrix``,
    inputs that already are legate-sparse CSR matrices are passed through
    unchanged, and anything else is wrapped with ``cn.array(x, copy=False)``.

    Parameters
    ----------
    x :
        The data to validate.

    Returns
    -------
    The validated data: a legate-sparse ``csr_matrix`` for sparse input,
    otherwise a cupynumeric array.

    Raises
    ------
    ValueError
        If sparse input is given but legate-sparse is not installed, if the
        first dimension is empty, if a 2-or-more dimensional input has any
        zero-length dimension, if the data is complex, or if floating point
        data contains NaN or inf.
    """
    # legate-sparse is optional: the module-level import falls back to
    # ``csr_matrix = None``. Both ``csr_matrix(x)`` and
    # ``isinstance(x, csr_matrix)`` raise TypeError on None, so guard first.
    sparse_available = csr_matrix is not None

    if sp.issparse(x):
        if not sparse_available:
            raise ValueError(
                "Sparse matrix not allowed: legate-sparse is not installed."
            )
        x = csr_matrix(x)
    elif not (sparse_available and isinstance(x, csr_matrix)):
        x = cn.array(x, copy=False)

    if x.shape[0] <= 0:
        raise ValueError(
            "Found array with %d sample(s) (shape=%s) while a"
            " minimum of %d is required." % (x.shape[0], x.shape, 1)
        )
    if len(x.shape) >= 2 and 0 in x.shape:
        raise ValueError(
            "Found array with %d feature(s) (shape=%s) while"
            " a minimum of %d is required." % (x.shape[1], x.shape, 1)
        )

    if cn.iscomplexobj(x):
        raise ValueError("Complex data not supported.")
    # note: taking sum first then checking finiteness uses less memory
    if np.issubdtype(x.dtype, np.floating) and not cn.isfinite(x.sum()):
        raise ValueError("Input contains NaN or inf")

    return x


Expand Down
10 changes: 10 additions & 0 deletions legateboost/models/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,16 @@ def predict(self, X: cn.ndarray) -> cn.ndarray:
"""
pass

def supports_csr(self) -> bool:
    """Report whether this model can consume CSR matrix input.

    The default implementation answers ``False``.

    Returns
    -------
    bool
        ``True`` when CSR matrix input is supported, ``False`` otherwise.
    """
    return False

@abstractmethod
def __str__(self) -> str:
pass
Expand Down
166 changes: 155 additions & 11 deletions legateboost/models/tree.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
import copy
from enum import IntEnum
from typing import Any
from typing import Any, Union

import cupynumeric as cn
from legate.core import TaskTarget, get_legate_runtime, types
from legate.core import (
ImageComputationHint,
TaskTarget,
get_legate_runtime,
image,
types,
)

try:
from legate_sparse import csr_matrix
except ImportError:
csr_matrix = None

from ..library import user_context, user_lib
from ..utils import get_store
Expand All @@ -12,7 +23,9 @@

class LegateBoostOpCode(IntEnum):
    """Task opcodes for the tree model, read from ``user_lib.cffi``.

    Each member is passed to ``create_auto_task`` to select which library
    task is launched.
    """

    BUILD_TREE = user_lib.cffi.BUILD_TREE
    # NOTE(review): dense prediction launches PREDICT_TREE; PREDICT looks like
    # the pre-rename opcode - confirm user_lib.cffi still exports it.
    PREDICT = user_lib.cffi.PREDICT
    BUILD_TREE_CSR = user_lib.cffi.BUILD_TREE_CSR
    PREDICT_TREE = user_lib.cffi.PREDICT_TREE
    PREDICT_TREE_CSR = user_lib.cffi.PREDICT_TREE_CSR
    UPDATE_TREE = user_lib.cffi.UPDATE_TREE


Expand Down Expand Up @@ -54,12 +67,7 @@ def __init__(
self.split_samples = split_samples
self.alpha = alpha

def fit(
self,
X: cn.ndarray,
g: cn.ndarray,
h: cn.ndarray,
) -> "Tree":
def fit_dense(self, X: cn.ndarray, g: cn.ndarray, h: cn.ndarray) -> "Tree":
num_outputs = g.shape[1]

task = get_legate_runtime().create_auto_task(
Expand Down Expand Up @@ -120,6 +128,90 @@ def fit(
self.hessian = cn.array(hessian, copy=False)
return self

def fit_csr(self, X: csr_matrix, g: cn.ndarray, h: cn.ndarray) -> "Tree":
    """Build the tree from CSR input by launching the ``BUILD_TREE_CSR`` task.

    Parameters
    ----------
    X :
        CSR matrix; its ``vals``, ``crd`` and ``pos`` stores are passed to
        the task.
    g, h :
        Two-dimensional arrays; ``g.shape[1]`` is used as the number of
        outputs.

    Returns
    -------
    Tree
        ``self``, with ``leaf_value``, ``feature``, ``split_value``, ``gain``
        and ``hessian`` replaced by the task outputs.
    """
    runtime = get_legate_runtime()
    n_outputs = g.shape[1]
    node_capacity = 2 ** (self.max_depth + 1)

    task = runtime.create_auto_task(user_context, LegateBoostOpCode.BUILD_TREE_CSR)

    # Promote g/h to 3d so their shapes match the dense version, which makes
    # code reuse easier on the C++ side.
    grad_store = get_store(g).promote(1, 1)
    hess_store = get_store(h).promote(1, 1)

    # Scalar arguments, in the order they are registered with the task.
    scalar_args = (
        (self.max_depth, types.int32),
        (node_capacity, types.int32),
        (self.alpha, types.float64),
        (self.split_samples, types.int32),
        (self.random_state.randint(0, 2**31), types.int32),
        (X.shape[0], types.int64),
        (X.shape[1], types.int64),
    )
    for value, dtype in scalar_args:
        task.add_scalar_arg(value, dtype)

    # Inputs: the three CSR stores, then the promoted g/h stores.
    vals_part = task.add_input(X.vals)
    crd_part = task.add_input(X.crd)
    pos_part = task.add_input(X.pos)
    task.add_input(grad_store)
    task.add_input(hess_store)
    # This promoted view of ``pos`` is not needed as data; it is added as an
    # input only so that it can take part in the alignment below.
    pos_aligner = X.pos.promote(1, n_outputs).promote(1, 1)
    task.add_input(pos_aligner)

    task.add_alignment(grad_store, hess_store)
    task.add_alignment(grad_store, pos_aligner)
    # Image constraints from ``pos`` onto ``crd`` and then ``vals``.
    for dependent in (crd_part, vals_part):
        task.add_constraint(
            image(pos_part, dependent, hint=ImageComputationHint.FIRST_LAST)
        )

    # Outputs: one store per tree attribute, each broadcast.
    leaf_value = runtime.create_store(types.float64, (node_capacity, n_outputs))
    feature = runtime.create_store(types.int32, (node_capacity,))
    split_value = runtime.create_store(types.float64, (node_capacity,))
    gain = runtime.create_store(types.float64, (node_capacity,))
    hessian = runtime.create_store(types.float64, (node_capacity, n_outputs))
    tree_stores = (leaf_value, feature, split_value, gain, hessian)
    for store in tree_stores:
        task.add_output(store)
    for store in tree_stores:
        task.add_broadcast(store)

    # Attach a communicator only when more than one processor takes part.
    if runtime.machine.count(TaskTarget.GPU) > 1:
        task.add_nccl_communicator()
    elif runtime.machine.count() > 1:
        task.add_cpu_communicator()
    task.execute()

    self.leaf_value = cn.array(leaf_value, copy=False)
    self.feature = cn.array(feature, copy=False)
    self.split_value = cn.array(split_value, copy=False)
    self.gain = cn.array(gain, copy=False)
    self.hessian = cn.array(hessian, copy=False)
    return self

def fit(
    self,
    X: Union[cn.ndarray, csr_matrix],
    g: cn.ndarray,
    h: cn.ndarray,
) -> "Tree":
    """Fit the tree, dispatching on the container type of ``X``.

    Parameters
    ----------
    X :
        Either a dense array or a legate-sparse ``csr_matrix``.
    g, h :
        Arrays forwarded unchanged to the selected implementation.

    Returns
    -------
    Tree
        The result of ``fit_csr`` for CSR input, otherwise of ``fit_dense``.
    """
    # ``csr_matrix`` is None when the optional legate-sparse import failed.
    # ``isinstance(X, None)`` raises TypeError, so check availability first
    # and fall through to the dense path when sparse support is absent.
    if csr_matrix is not None and isinstance(X, csr_matrix):
        return self.fit_csr(X, g, h)
    return self.fit_dense(X, g, h)

def clear(self) -> None:
self.leaf_value.fill(0)
self.hessian.fill(0)
Expand Down Expand Up @@ -168,12 +260,12 @@ def update(
self.hessian = cn.array(hessian, copy=False)
return self

def predict(self, X: cn.ndarray) -> cn.ndarray:
def predict_dense(self, X: cn.ndarray) -> cn.ndarray:
n_rows = X.shape[0]
n_features = X.shape[1]
n_outputs = self.leaf_value.shape[1]
task = get_legate_runtime().create_auto_task(
user_context, LegateBoostOpCode.PREDICT
user_context, LegateBoostOpCode.PREDICT_TREE
)

pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs))
Expand All @@ -197,9 +289,58 @@ def predict(self, X: cn.ndarray) -> cn.ndarray:

task.add_alignment(X_, pred_)
task.execute()
return cn.array(pred, copy=False)

def predict_csr(self, X: csr_matrix) -> cn.ndarray:
    """Predict for CSR input by launching the ``PREDICT_TREE_CSR`` task.

    Parameters
    ----------
    X :
        CSR matrix; its ``vals``, ``crd`` and ``pos`` stores are passed to
        the task.

    Returns
    -------
    cn.ndarray
        Array of shape ``(X.shape[0], self.leaf_value.shape[1])`` wrapping
        the task's prediction store.
    """
    runtime = get_legate_runtime()
    row_count = X.shape[0]
    output_count = self.leaf_value.shape[1]
    task = runtime.create_auto_task(user_context, LegateBoostOpCode.PREDICT_TREE_CSR)

    pred = runtime.create_store(types.float64, (row_count, output_count))

    # Inputs: the three CSR stores, with image constraints from ``pos`` onto
    # ``crd`` and then ``vals``.
    vals_part = task.add_input(X.vals)
    crd_part = task.add_input(X.crd)
    pos_part = task.add_input(X.pos)
    for dependent in (crd_part, vals_part):
        task.add_constraint(
            image(pos_part, dependent, hint=ImageComputationHint.FIRST_LAST)
        )
    pos_aligner = X.pos.promote(1, output_count)
    task.add_alignment(pos_aligner, pred)

    # Scalars
    task.add_scalar_arg(X.shape[1], types.int32)

    # Outputs: the promoted view gains one dimension so it has the same
    # dimension as the dense version.
    task.add_output(pred.promote(1, 1))
    task.add_output(pred)  # only here for alignment, not used

    # Broadcast the tree structure.
    tree_stores = (
        get_store(self.leaf_value),
        get_store(self.feature),
        get_store(self.split_value),
    )
    for store in tree_stores:
        task.add_input(store)
    for store in tree_stores:
        task.add_broadcast(store)

    task.add_input(pos_aligner)  # used only for alignment
    task.execute()
    return cn.array(pred, copy=False)

def predict(self, X: Union[cn.ndarray, csr_matrix]) -> cn.ndarray:
    """Predict with the tree, dispatching on the container type of ``X``.

    Parameters
    ----------
    X :
        Either a dense array or a legate-sparse ``csr_matrix``.

    Returns
    -------
    cn.ndarray
        The result of ``predict_csr`` for CSR input, otherwise of
        ``predict_dense``.
    """
    # ``csr_matrix`` is None when the optional legate-sparse import failed.
    # ``isinstance(X, None)`` raises TypeError, so check availability first
    # and fall through to the dense path when sparse support is absent.
    if csr_matrix is not None and isinstance(X, csr_matrix):
        return self.predict_csr(X)
    return self.predict_dense(X)

def is_leaf(self, id: int) -> Any:
return self.feature[id] == -1

Expand Down Expand Up @@ -245,3 +386,6 @@ def __mul__(self, scalar: Any) -> "Tree":
new = copy.deepcopy(self)
new.leaf_value *= scalar
return new

def supports_csr(self) -> bool:
    """Report that this model accepts CSR matrix input.

    Returns
    -------
    bool
        Always ``True``.
    """
    return True
18 changes: 18 additions & 0 deletions legateboost/test/models/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,21 @@ def test_alpha():
)
model.fit(X, y)
assert np.isclose(model.predict(X)[0], y.sum() / (y.size + alpha))


def test_csr():
    """Fit a tree on a small 2x3 CSR matrix and check its predictions.

    The test is skipped when ``legate_sparse`` cannot be imported. With
    ``alpha=0.0`` the predictions are expected to equal ``-g / h``.
    """
    legate_sparse = pytest.importorskip("legate_sparse")

    # CSR triplet passed as (data, indices, indptr) for a matrix of shape (2, 3).
    csr_parts = (
        cn.array([1.0, 2.0, 3.0]),
        cn.array([0, 1, 2]),
        cn.array([0, 2, 3]),
    )
    X = legate_sparse.csr_matrix(csr_parts, shape=(2, 3))
    g = cn.array([[1.0], [-1.0]])
    h = cn.array([[1.0], [1.0]])

    tree = lb.models.Tree(alpha=0.0)
    tree = tree.set_random_state(np.random.RandomState(2))
    model = tree.fit(X, g, h)
    print(model)

    expected = -g / h
    assert np.allclose(model.predict(X), expected)
Loading
Loading