rapidsai · RAMitchell · Dec 4, 2024 · Dec 4, 2024 · Dec 4, 2024 · Dec 5, 2024
diff --git a/ci/run_pytests_cpu.sh b/ci/run_pytests_cpu.sh
@@ -18,6 +18,7 @@ set -e -E -u -o pipefail
 cd legateboost/test
 
 legate \
+    --gpus 0 \
     --sysmem 28000 \
     --module pytest \
     . \

diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml
@@ -15,6 +15,7 @@ dependencies:
 - cuda-version>=12.2
 - cupynumeric==25.01.*,>=0.0.0.dev0
 - hypothesis>=6
+- legate-sparse
 - legate==25.01.*,>=0.0.0.dev0
 - libcublas-dev
 - llvm-openmp

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -173,3 +173,4 @@ dependencies:
           - pytest>=7,<8
           - seaborn>=0.13
           - xgboost>=2.0
+          - legate-sparse
diff --git a/examples/sparse/README.md b/examples/sparse/README.md
@@ -0,0 +1,3 @@
+# Sparse data
+
+This example trains a YouTube comment spam classifier on a sparse dataset. The raw comment strings are converted to a sparse matrix of word counts using scikit-learn's `CountVectorizer`.
diff --git a/examples/sparse/sparse.py b/examples/sparse/sparse.py
@@ -0,0 +1,47 @@
+import pandas as pd
+from sklearn.datasets import fetch_openml
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+
+import legateboost as lb
+
+# Alberto, T. & Lochter, J. (2015). YouTube Spam Collection [Dataset].
+# UCI Machine Learning Repository. https://doi.org/10.24432/C58885.
+dataset_names = [
+    "youtube-spam-psy",
+    "youtube-spam-shakira",
+    "youtube-spam-lmfao",
+    "youtube-spam-eminem",
+    "youtube-spam-katyperry",
+]
+
+# Fetch every collection from OpenML and stack the frames into one table.
+X = pd.concat(
+    [
+        fetch_openml(name=dataset_name, as_frame=True).data
+        for dataset_name in dataset_names
+    ]
+)
+y = X["CLASS"]
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=42
+)
+
+# The vocabulary is learned from the training comments only; the test
+# comments are encoded with that same vocabulary.
+vectorizer = CountVectorizer()
+X_train_vectorized = vectorizer.fit_transform(X_train["CONTENT"])
+X_test_vectorized = vectorizer.transform(X_test["CONTENT"])
+
+validation_set = (X_test_vectorized, y_test)
+model = lb.LBClassifier().fit(
+    X_train_vectorized, y_train, eval_set=[validation_set]
+)
+
+
+def evaluate_comment(comment):
+    """Print ``comment`` followed by the model's score for it.
+
+    The comment is encoded with the module-level ``vectorizer`` and the
+    printed value is entry ``[0, 1]`` of ``model.predict_proba``.
+    """
+    print("Comment: {}".format(comment))
+    encoded = vectorizer.transform([comment])
+    spam_probability = model.predict_proba(encoded)[0, 1]
+    print("Probability of spam: {}".format(spam_probability))
+
+
+for sample in (X_test.iloc[15]["CONTENT"], X_test.iloc[3]["CONTENT"]):
+    evaluate_comment(sample)
+evaluate_comment("Your text here")
diff --git a/legateboost/input_validation.py b/legateboost/input_validation.py
@@ -3,6 +3,11 @@
 import numpy as np
 import scipy.sparse as sp
 
+try:
+    from legate_sparse import csr_matrix
+except ImportError:
+    csr_matrix = None
+
 import cupynumeric as cn
 
 __all__: List[str] = []
@@ -29,31 +34,29 @@ def check_sample_weight(sample_weight: Any, n: int) -> cn.ndarray:
 
 def check_array(x: Any) -> cn.ndarray:
+    """Validate ``x`` and convert it to an array type usable by the library.
+
+    SciPy sparse input is converted to a ``legate_sparse`` ``csr_matrix``,
+    an existing ``csr_matrix`` is passed through unchanged, and anything
+    else is wrapped with ``cn.array(x, copy=False)``.
+
+    Raises
+    ------
+    ValueError
+        If sparse input is given but ``legate_sparse`` could not be
+        imported, if ``x`` has no samples or a zero-sized dimension, if it
+        is complex, or if it is floating point and its sum is not finite.
+    """
     if sp.issparse(x):
-        raise ValueError("Sparse matrix not allowed.")
-
-    if not hasattr(x, "__legate_data_interface__"):
-        x = cn.array(np.require(x, requirements=["C", "A"]))
-    if hasattr(x, "__array_interface__"):
-        shape = x.__array_interface__["shape"]
-        if shape[0] <= 0:
-            raise ValueError(
-                "Found array with %d sample(s) (shape=%s) while a"
-                " minimum of %d is required." % (shape[0], shape, 1)
-            )
-        if len(shape) >= 2 and 0 in shape:
-            raise ValueError(
-                "Found array with %d feature(s) (shape=%s) while"
-                " a minimum of %d is required." % (shape[1], shape, 1)
-            )
+        # csr_matrix is None when the optional legate_sparse import failed;
+        # calling it would raise an opaque "'NoneType' is not callable".
+        if csr_matrix is None:
+            raise ValueError(
+                "Sparse matrix input requires the legate-sparse package,"
+                " which could not be imported."
+            )
+        x = csr_matrix(x)
+    elif csr_matrix is None or not isinstance(x, csr_matrix):
+        # isinstance() must not be reached with csr_matrix=None: it raises
+        # TypeError, which would reject every dense input.
+        x = cn.array(x, copy=False)
+
+    if x.shape[0] <= 0:
+        raise ValueError(
+            "Found array with %d sample(s) (shape=%s) while a"
+            " minimum of %d is required." % (x.shape[0], x.shape, 1)
+        )
+    if len(x.shape) >= 2 and 0 in x.shape:
+        raise ValueError(
+            "Found array with %d feature(s) (shape=%s) while"
+            " a minimum of %d is required." % (x.shape[1], x.shape, 1)
+        )
 
     if cn.iscomplexobj(x):
         raise ValueError("Complex data not supported.")
     # note: taking sum first then checking finiteness uses less memory
     if np.issubdtype(x.dtype, np.floating) and not cn.isfinite(x.sum()):
         raise ValueError("Input contains NaN or inf")
 
-    x = cn.array(x, copy=False)
-
     return x
 
 

diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py
@@ -91,6 +91,16 @@ def predict(self, X: cn.ndarray) -> cn.ndarray:
         """
         pass
 
+    def supports_csr(self) -> bool:
+        """Report whether this model accepts CSR matrix input.
+
+        This implementation always answers ``False``.
+
+        Returns
+        -------
+        bool
+            True if the model supports CSR matrix input, False otherwise.
+        """
+        return False
+
     @abstractmethod
     def __str__(self) -> str:
         pass

diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py
@@ -1,9 +1,20 @@
 import copy
 from enum import IntEnum
-from typing import Any
+from typing import Any, Union
 
 import cupynumeric as cn
-from legate.core import TaskTarget, get_legate_runtime, types
+from legate.core import (
+    ImageComputationHint,
+    TaskTarget,
+    get_legate_runtime,
+    image,
+    types,
+)
+
+try:
+    from legate_sparse import csr_matrix
+except ImportError:
+    csr_matrix = None
 
 from ..library import user_context, user_lib
 from ..utils import get_store
@@ -12,7 +23,9 @@
 
 class LegateBoostOpCode(IntEnum):
     BUILD_TREE = user_lib.cffi.BUILD_TREE
-    PREDICT = user_lib.cffi.PREDICT
+    BUILD_TREE_CSR = user_lib.cffi.BUILD_TREE_CSR
+    PREDICT_TREE = user_lib.cffi.PREDICT_TREE
+    PREDICT_TREE_CSR = user_lib.cffi.PREDICT_TREE_CSR
     UPDATE_TREE = user_lib.cffi.UPDATE_TREE
 
 
@@ -54,12 +67,7 @@ def __init__(
         self.split_samples = split_samples
         self.alpha = alpha
 
-    def fit(
-        self,
-        X: cn.ndarray,
-        g: cn.ndarray,
-        h: cn.ndarray,
-    ) -> "Tree":
+    def fit_dense(self, X: cn.ndarray, g: cn.ndarray, h: cn.ndarray) -> "Tree":
         num_outputs = g.shape[1]
 
         task = get_legate_runtime().create_auto_task(
@@ -120,6 +128,90 @@ def fit(
         self.hessian = cn.array(hessian, copy=False)
         return self
 
+    def fit_csr(self, X: csr_matrix, g: cn.ndarray, h: cn.ndarray) -> "Tree":
+        """Build the tree from CSR input by launching the BUILD_TREE_CSR task.
+
+        Parameters
+        ----------
+        X : csr_matrix
+            Sparse feature matrix; its ``vals``, ``crd`` and ``pos`` stores
+            are passed to the task as inputs.
+        g : cn.ndarray
+            Gradients; ``g.shape[1]`` is used as the number of outputs.
+        h : cn.ndarray
+            Hessians; aligned with ``g`` in the task.
+
+        Returns
+        -------
+        Tree
+            ``self``, with ``leaf_value``, ``feature``, ``split_value``,
+            ``gain`` and ``hessian`` set from the task outputs.
+        """
+        num_outputs = g.shape[1]
+
+        task = get_legate_runtime().create_auto_task(
+            user_context, LegateBoostOpCode.BUILD_TREE_CSR
+        )
+
+        # promote these to 3d. When the g/h shapes match those of the dense version,
+        # it makes code reuse easier on the C++ side
+        g_ = get_store(g).promote(1, 1)
+        h_ = get_store(h).promote(1, 1)
+
+        # scalar arguments
+        # NOTE(review): presumably read positionally by the C++ task, so the
+        # order below must stay in sync with it - confirm.
+        task.add_scalar_arg(self.max_depth, types.int32)
+        # capacity of the per-node output stores created below
+        max_nodes = 2 ** (self.max_depth + 1)
+        task.add_scalar_arg(max_nodes, types.int32)
+        task.add_scalar_arg(self.alpha, types.float64)
+        task.add_scalar_arg(self.split_samples, types.int32)
+        task.add_scalar_arg(self.random_state.randint(0, 2**31), types.int32)
+        task.add_scalar_arg(X.shape[0], types.int64)
+        task.add_scalar_arg(X.shape[1], types.int64)
+
+        # inputs
+        val_var = task.add_input(X.vals)
+        crd_var = task.add_input(X.crd)
+        pos_var = task.add_input(X.pos)
+        task.add_input(g_)
+        task.add_input(h_)
+        pos_promoted = X.pos.promote(1, g.shape[1]).promote(1, 1)
+        # we don't need this input but use it for alignment
+        task.add_input(pos_promoted)
+
+        task.add_alignment(g_, h_)
+        task.add_alignment(g_, pos_promoted)
+        # NOTE(review): the image constraints tie the partitioning of crd and
+        # vals to that of pos - presumably pos holds each row's range into
+        # crd/vals; confirm against legate_sparse.
+        task.add_constraint(
+            image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+        task.add_constraint(
+            image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+
+        # outputs
+        leaf_value = get_legate_runtime().create_store(
+            types.float64, (max_nodes, num_outputs)
+        )
+        feature = get_legate_runtime().create_store(types.int32, (max_nodes,))
+        split_value = get_legate_runtime().create_store(types.float64, (max_nodes,))
+        gain = get_legate_runtime().create_store(types.float64, (max_nodes,))
+        hessian = get_legate_runtime().create_store(
+            types.float64, (max_nodes, num_outputs)
+        )
+        task.add_output(leaf_value)
+        task.add_output(feature)
+        task.add_output(split_value)
+        task.add_output(gain)
+        task.add_output(hessian)
+        # a broadcast constraint is requested for every output store
+        task.add_broadcast(leaf_value)
+        task.add_broadcast(feature)
+        task.add_broadcast(split_value)
+        task.add_broadcast(gain)
+        task.add_broadcast(hessian)
+
+        # attach a communicator only when machine.count reports more than one
+        # GPU (NCCL) or, failing that, more than one processor (CPU)
+        if get_legate_runtime().machine.count(TaskTarget.GPU) > 1:
+            task.add_nccl_communicator()
+        elif get_legate_runtime().machine.count() > 1:
+            task.add_cpu_communicator()
+        task.execute()
+
+        # wrap the output stores as cupynumeric arrays (copy=False)
+        self.leaf_value = cn.array(leaf_value, copy=False)
+        self.feature = cn.array(feature, copy=False)
+        self.split_value = cn.array(split_value, copy=False)
+        self.gain = cn.array(gain, copy=False)
+        self.hessian = cn.array(hessian, copy=False)
+        return self
+
+    def fit(
+        self,
+        X: Union[cn.ndarray, csr_matrix],
+        g: cn.ndarray,
+        h: cn.ndarray,
+    ) -> "Tree":
+        """Fit the tree, dispatching on the type of ``X``.
+
+        Parameters
+        ----------
+        X : cn.ndarray or csr_matrix
+            Feature matrix. A ``csr_matrix`` is handled by ``fit_csr``;
+            anything else is handled by ``fit_dense``.
+        g : cn.ndarray
+            Gradients, forwarded unchanged.
+        h : cn.ndarray
+            Hessians, forwarded unchanged.
+
+        Returns
+        -------
+        Tree
+            Whatever the selected fit routine returns.
+        """
+        # csr_matrix is None when the optional legate_sparse import failed.
+        # isinstance(X, None) raises TypeError, so guard it to keep the dense
+        # path usable without legate_sparse installed.
+        if csr_matrix is not None and isinstance(X, csr_matrix):
+            return self.fit_csr(X, g, h)
+        return self.fit_dense(X, g, h)
+
     def clear(self) -> None:
         self.leaf_value.fill(0)
         self.hessian.fill(0)
@@ -168,12 +260,12 @@ def update(
         self.hessian = cn.array(hessian, copy=False)
         return self
 
-    def predict(self, X: cn.ndarray) -> cn.ndarray:
+    def predict_dense(self, X: cn.ndarray) -> cn.ndarray:
         n_rows = X.shape[0]
         n_features = X.shape[1]
         n_outputs = self.leaf_value.shape[1]
         task = get_legate_runtime().create_auto_task(
-            user_context, LegateBoostOpCode.PREDICT
+            user_context, LegateBoostOpCode.PREDICT_TREE
         )
 
         pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs))
@@ -197,9 +289,58 @@ def predict(self, X: cn.ndarray) -> cn.ndarray:
 
         task.add_alignment(X_, pred_)
         task.execute()
+        return cn.array(pred, copy=False)
 
+    def predict_csr(self, X: csr_matrix) -> cn.ndarray:
+        """Predict for CSR input by launching the PREDICT_TREE_CSR task.
+
+        Parameters
+        ----------
+        X : csr_matrix
+            Sparse feature matrix; its ``vals``, ``crd`` and ``pos`` stores
+            are passed to the task as inputs.
+
+        Returns
+        -------
+        cn.ndarray
+            Float64 array of shape ``(X.shape[0], self.leaf_value.shape[1])``
+            wrapping the store written by the task.
+        """
+        n_rows = X.shape[0]
+        n_outputs = self.leaf_value.shape[1]
+        task = get_legate_runtime().create_auto_task(
+            user_context, LegateBoostOpCode.PREDICT_TREE_CSR
+        )
+
+        pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs))
+        # inputs
+        val_var = task.add_input(X.vals)
+        crd_var = task.add_input(X.crd)
+        pos_var = task.add_input(X.pos)
+        # NOTE(review): the image constraints tie the partitioning of crd and
+        # vals to that of pos - presumably pos holds each row's range into
+        # crd/vals; confirm against legate_sparse.
+        task.add_constraint(
+            image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+        task.add_constraint(
+            image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+        # pos promoted to 2d with n_outputs columns so it can be aligned with pred
+        pos_var_broadcast = X.pos.promote(1, n_outputs)
+        task.add_alignment(pos_var_broadcast, pred)
+
+        # scalars
+        task.add_scalar_arg(X.shape[1], types.int32)
+
+        # output
+        task.add_output(
+            pred.promote(1, 1)
+        )  # add 1 dimension so it has the same dimension as dense version
+        task.add_output(pred)  # only here for alignment, not used
+
+        # broadcast the tree structure
+        leaf_value_ = get_store(self.leaf_value)
+        feature_ = get_store(self.feature)
+        split_value_ = get_store(self.split_value)
+        task.add_input(leaf_value_)
+        task.add_input(feature_)
+        task.add_input(split_value_)
+        task.add_broadcast(leaf_value_)
+        task.add_broadcast(feature_)
+        task.add_broadcast(split_value_)
+
+        task.add_input(pos_var_broadcast)  # used only for alignment
+        task.execute()
+        return cn.array(pred, copy=False)
 
+    def predict(self, X: Union[cn.ndarray, csr_matrix]) -> cn.ndarray:
+        """Predict for ``X``, dispatching on its type.
+
+        A ``csr_matrix`` is handled by ``predict_csr``; anything else is
+        handled by ``predict_dense``. The selected routine's result is
+        returned unchanged.
+        """
+        # csr_matrix is None when the optional legate_sparse import failed.
+        # isinstance(X, None) raises TypeError, so guard it to keep the dense
+        # path usable without legate_sparse installed.
+        if csr_matrix is not None and isinstance(X, csr_matrix):
+            return self.predict_csr(X)
+        return self.predict_dense(X)
+
     def is_leaf(self, id: int) -> Any:
         return self.feature[id] == -1
 
@@ -245,3 +386,6 @@ def __mul__(self, scalar: Any) -> "Tree":
         new = copy.deepcopy(self)
         new.leaf_value *= scalar
         return new
+
+    def supports_csr(self) -> bool:
+        """Report CSR matrix support; this implementation returns ``True``."""
+        return True
diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py
@@ -72,3 +72,21 @@ def test_alpha():
     )
     model.fit(X, y)
     assert np.isclose(model.predict(X)[0], y.sum() / (y.size + alpha))
+
+
+def test_csr():
+    """Fit one tree on a 2x3 CSR matrix and check predictions equal -g / h."""
+    legate_sparse = pytest.importorskip("legate_sparse")
+
+    # Constructor triple, passed in the same order as before.
+    csr_parts = (
+        cn.array([1.0, 2.0, 3.0]),
+        cn.array([0, 1, 2]),
+        cn.array([0, 2, 3]),
+    )
+    X = legate_sparse.csr_matrix(csr_parts, shape=(2, 3))
+    g = cn.array([[1.0], [-1.0]])
+    h = cn.array([[1.0], [1.0]])
+
+    rng = np.random.RandomState(2)
+    model = lb.models.Tree(alpha=0.0).set_random_state(rng).fit(X, g, h)
+    print(model)
+
+    expected = -g / h
+    assert np.allclose(model.predict(X), expected)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Sparse data

		This example trains a YouTube comment spam classifier on a sparse dataset. The raw comment strings are converted to a sparse matrix of word counts using scikit-learn's `CountVectorizer`.