From a4e25559674744f551fb53390edcecbd8b5cbe32 Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Wed, 4 Dec 2024 04:48:20 -0800
Subject: [PATCH 01/10] Add support for legate-sparse CSR in Tree model

---
 legateboost/models/tree.py           | 107 +++++++++++-
 legateboost/test/models/test_tree.py |  13 ++
 src/legateboost.h                    |   7 +-
 src/models/tree/build_tree.cc        |   3 +
 src/models/tree/build_tree.cu        | 250 +++++++++++++++++----------
 src/models/tree/build_tree.h         |  70 ++++++++
 6 files changed, 347 insertions(+), 103 deletions(-)

diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py
index a3aa8c37..ba64b10b 100644
--- a/legateboost/models/tree.py
+++ b/legateboost/models/tree.py
@@ -1,9 +1,20 @@
 import copy
 from enum import IntEnum
-from typing import Any
+from typing import Any, Union
 
 import cupynumeric as cn
-from legate.core import TaskTarget, get_legate_runtime, types
+from legate.core import (
+    ImageComputationHint,
+    TaskTarget,
+    get_legate_runtime,
+    image,
+    types,
+)
+
+try:
+    from legate_sparse import csr_matrix
+except ImportError:
+    csr_matrix = None
 
 from ..library import user_context, user_lib
 from ..utils import get_store
@@ -12,6 +23,7 @@
 
 class LegateBoostOpCode(IntEnum):
     BUILD_TREE = user_lib.cffi.BUILD_TREE
+    BUILD_TREE_CSR = user_lib.cffi.BUILD_TREE_CSR
     PREDICT = user_lib.cffi.PREDICT
     UPDATE_TREE = user_lib.cffi.UPDATE_TREE
 
@@ -54,12 +66,7 @@ def __init__(
         self.split_samples = split_samples
         self.alpha = alpha
 
-    def fit(
-        self,
-        X: cn.ndarray,
-        g: cn.ndarray,
-        h: cn.ndarray,
-    ) -> "Tree":
+    def fit_dense(self, X: cn.ndarray, g: cn.ndarray, h: cn.ndarray) -> "Tree":
         num_outputs = g.shape[1]
 
         task = get_legate_runtime().create_auto_task(
@@ -120,6 +127,90 @@ def fit(
         self.hessian = cn.array(hessian, copy=False)
         return self
 
+    def fit_csr(self, X: csr_matrix, g: cn.ndarray, h: cn.ndarray) -> "Tree":
+        num_outputs = g.shape[1]
+
+        task = get_legate_runtime().create_auto_task(
+            user_context, LegateBoostOpCode.BUILD_TREE_CSR
+        )
+
+        # promote these to 3d. When the g/h shapes match those of the dense version,
+        # it makes code reuse easier on the C++ side
+        g_ = get_store(g).promote(1, 1)
+        h_ = get_store(h).promote(1, 1)
+
+        task.add_scalar_arg(self.max_depth, types.int32)
+        max_nodes = 2 ** (self.max_depth + 1)
+        task.add_scalar_arg(max_nodes, types.int32)
+        task.add_scalar_arg(self.alpha, types.float64)
+        task.add_scalar_arg(self.split_samples, types.int32)
+        task.add_scalar_arg(self.random_state.randint(0, 2**31), types.int32)
+        task.add_scalar_arg(X.shape[0], types.int64)
+        task.add_scalar_arg(X.shape[1], types.int64)
+
+        # inputs
+        val_var = task.add_input(X.vals)
+        crd_var = task.add_input(X.crd)
+        pos_var = task.add_input(X.pos)
+        task.add_input(g_)
+        task.add_input(h_)
+        pos_promoted = X.pos.promote(1, g.shape[1]).promote(1, 1)
+        # we don't need this input but use it for alignment
+        task.add_input(pos_promoted)
+
+        task.add_alignment(g_, h_)
+        task.add_alignment(g_, pos_promoted)
+        task.add_constraint(
+            image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+        task.add_constraint(
+            image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+
+        # outputs
+        leaf_value = get_legate_runtime().create_store(
+            types.float64, (max_nodes, num_outputs)
+        )
+        feature = get_legate_runtime().create_store(types.int32, (max_nodes,))
+        split_value = get_legate_runtime().create_store(types.float64, (max_nodes,))
+        gain = get_legate_runtime().create_store(types.float64, (max_nodes,))
+        hessian = get_legate_runtime().create_store(
+            types.float64, (max_nodes, num_outputs)
+        )
+        task.add_output(leaf_value)
+        task.add_output(feature)
+        task.add_output(split_value)
+        task.add_output(gain)
+        task.add_output(hessian)
+        task.add_broadcast(leaf_value)
+        task.add_broadcast(feature)
+        task.add_broadcast(split_value)
+        task.add_broadcast(gain)
+        task.add_broadcast(hessian)
+
+        if get_legate_runtime().machine.count(TaskTarget.GPU) > 1:
+            task.add_nccl_communicator()
+        elif get_legate_runtime().machine.count() > 1:
+            task.add_cpu_communicator()
+        task.execute()
+
+        self.leaf_value = cn.array(leaf_value, copy=False)
+        self.feature = cn.array(feature, copy=False)
+        self.split_value = cn.array(split_value, copy=False)
+        self.gain = cn.array(gain, copy=False)
+        self.hessian = cn.array(hessian, copy=False)
+        return self
+
+    def fit(
+        self,
+        X: Union[cn.ndarray, csr_matrix],
+        g: cn.ndarray,
+        h: cn.ndarray,
+    ) -> "Tree":
+        if isinstance(X, csr_matrix):
+            return self.fit_csr(X, g, h)
+        return self.fit_dense(X, g, h)
+
     def clear(self) -> None:
         self.leaf_value.fill(0)
         self.hessian.fill(0)
diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py
index 8502eef3..e3a2f8c6 100644
--- a/legateboost/test/models/test_tree.py
+++ b/legateboost/test/models/test_tree.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+from legate_sparse import csr_matrix
 
 import cupynumeric as cn
 import legateboost as lb
@@ -72,3 +73,15 @@ def test_alpha():
     )
     model.fit(X, y)
     assert np.isclose(model.predict(X)[0], y.sum() / (y.size + alpha))
+
+
+def test_sparse():
+    num_outputs = 1
+    rs = cn.random.RandomState(0)
+    X = csr_matrix(
+        (cn.array([1.0, 2.0, 3.0]), cn.array([0, 1, 2]), cn.array([0, 2, 3])),
+        shape=(2, 3),
+    )
+    g = cn.array(rs.normal(size=(2, num_outputs)))
+    h = cn.array(rs.random((2, 1)) + 0.1)
+    lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h)
diff --git a/src/legateboost.h b/src/legateboost.h
index c37fd733..fe5a1d84 100644
--- a/src/legateboost.h
+++ b/src/legateboost.h
@@ -29,9 +29,10 @@ enum LegateBoostOpCode {  // NOLINT(performance-enum-size)
   DIGAMMA = 7,
   ZETA    = 8,
   /**/
-  GATHER   = 9,
-  RBF      = 10,
-  BUILD_NN = 11,
+  GATHER         = 9,
+  RBF            = 10,
+  BUILD_NN       = 11,
+  BUILD_TREE_CSR = 12,
 };
 
 #endif  // SRC_LEGATEBOOST_H_
diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc
index 7b39791e..0fbe0b5b 100644
--- a/src/models/tree/build_tree.cc
+++ b/src/models/tree/build_tree.cc
@@ -489,6 +489,8 @@ struct build_tree_fn {
   legateboost::type_dispatch_float(X.code(), build_tree_fn(), context);
 }
 
+/*static*/ void BuildTreeCSRTask::cpu_variant(legate::TaskContext context) {}
+
 }  // namespace legateboost
 
 namespace  // unnamed
@@ -496,5 +498,6 @@ namespace  // unnamed
 static void __attribute__((constructor)) register_tasks(void)
 {
   legateboost::BuildTreeTask::register_variants();
+  legateboost::BuildTreeCSRTask::register_variants();
 }
 }  // namespace
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index 21808bef..8b2295ba 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -191,11 +191,12 @@ using SharedMemoryHistogramType = GPairBase<int32_t>;
 const int kMaxSharedBins = 2048;  // 16KB shared memory. More is not helpful and creates more cache
                                   // misses for binary search in split_proposals.
 
-template <typename T,
+template <typename MatrixT,
           int kBlockThreads,
           int kItemsPerThread,
           int kItemsPerTile = kBlockThreads * kItemsPerThread>
 struct HistogramAgent {
+  using T                      = typename MatrixT::value_type;
   static const int kImpureTile = -1;  // Special value for a tile that is not pure (contains
                                       // multiple nodes)
   struct SharedMemoryHistogram {
@@ -245,8 +246,7 @@ struct HistogramAgent {
     }
   };
 
-  const legate::AccessorRO<T, 3> X;
-  const int64_t sample_offset;
+  const MatrixT X;
   const legate::AccessorRO<double, 3> g;
   const legate::AccessorRO<double, 3> h;
   const size_t n_outputs;
@@ -262,8 +262,7 @@ struct HistogramAgent {
   int feature_stride;
   SharedMemoryHistogram shared_histogram;
 
-  __device__ HistogramAgent(const legate::AccessorRO<T, 3>& X,
-                            int64_t sample_offset,
+  __device__ HistogramAgent(const MatrixT& X,
                             const legate::AccessorRO<double, 3>& g,
                             const legate::AccessorRO<double, 3>& h,
                             size_t n_outputs,
@@ -276,7 +275,6 @@ struct HistogramAgent {
                             int64_t seed,
                             SharedMemoryHistogramType* shared_memory)
     : X(X),
-      sample_offset(sample_offset),
       g(g),
       h(h),
       n_outputs(n_outputs),
@@ -311,11 +309,10 @@ struct HistogramAgent {
           sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node)));
       if (!computeHistogram) continue;
 
-      auto x = X[{sample_offset + local_sample_idx, feature, 0}];
-      // int bin_idx = shared_split_proposals.FindBin(x, feature);
+      auto x      = X.Get(X.RowRange().lo[0] + local_sample_idx, feature);
       int bin_idx = split_proposals.FindBin(x, feature);
 
-      legate::Point<3> p = {sample_offset + local_sample_idx, 0, output};
+      legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx, 0, output};
       auto gpair_quantised =
         quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2]));
       auto* addPosition = reinterpret_cast<typename IntegerGPair::value_type*>(
@@ -355,7 +352,7 @@ struct HistogramAgent {
     T x[kItemsPerThread];
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
-      x[i] = X[{sample_offset + local_sample_idx[i], feature[i], 0}];
+      x[i] = X.Get(X.RowRange().lo[0] + local_sample_idx[i], feature[i]);
     }
 
     int bin_idx[kItemsPerThread];
@@ -366,7 +363,7 @@ struct HistogramAgent {
     IntegerGPair gpair[kItemsPerThread];
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
-      legate::Point<3> p = {sample_offset + local_sample_idx[i], 0, output};
+      legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx[i], 0, output};
       gpair[i] =
         bin_idx[i] != SparseSplitProposals<T>::NOT_FOUND
           ? quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2]))
@@ -415,14 +412,13 @@ struct HistogramAgent {
 };
 
 // NOLINTBEGIN(performance-unnecessary-value-param)
-template <typename T, int kBlockThreads, int kItemsPerThread>
+template <typename MatrixT, int kBlockThreads, int kItemsPerThread>
 __global__ static void __launch_bounds__(kBlockThreads)
-  fill_histogram_shared(legate::AccessorRO<T, 3> X,
-                        int64_t sample_offset,
+  fill_histogram_shared(MatrixT X,
                         legate::AccessorRO<double, 3> g,
                         legate::AccessorRO<double, 3> h,
                         size_t n_outputs,
-                        SparseSplitProposals<T> split_proposals,
+                        SparseSplitProposals<typename MatrixT::value_type> split_proposals,
                         NodeBatch batch,
                         Histogram<IntegerGPair> histogram,
                         legate::Buffer<IntegerGPair, 2> node_sums,
@@ -433,26 +429,26 @@ __global__ static void __launch_bounds__(kBlockThreads)
   __shared__ char shared_char[kMaxSharedBins * sizeof(SharedMemoryHistogramType)];
   SharedMemoryHistogramType* shared_memory =
     reinterpret_cast<SharedMemoryHistogramType*>(shared_char);
-  HistogramAgent<T, kBlockThreads, kItemsPerThread> agent(X,
-                                                          sample_offset,
-                                                          g,
-                                                          h,
-                                                          n_outputs,
-                                                          split_proposals,
-                                                          batch,
-                                                          histogram,
-                                                          node_sums,
-                                                          quantiser,
-                                                          feature_groups,
-                                                          seed,
-                                                          shared_memory);
+  HistogramAgent<MatrixT, kBlockThreads, kItemsPerThread> agent(X,
+                                                                g,
+                                                                h,
+                                                                n_outputs,
+                                                                split_proposals,
+                                                                batch,
+                                                                histogram,
+                                                                node_sums,
+                                                                quantiser,
+                                                                feature_groups,
+                                                                seed,
+                                                                shared_memory);
   agent.BuildHistogram();
 }
 // NOLINTEND(performance-unnecessary-value-param)
 
 // Manage the launch parameters for histogram kernel
-template <typename T, std::int32_t kBlockThreads = 1024, std::int32_t kItemsPerThread = 4>
+template <typename MatrixT, std::int32_t kBlockThreads = 1024, std::int32_t kItemsPerThread = 4>
 struct HistogramKernel {
+  using T                          = typename MatrixT::value_type;
   const std::int32_t kItemsPerTile = kBlockThreads * kItemsPerThread;
   legate::Buffer<int> feature_groups;
   int num_groups;
@@ -466,7 +462,7 @@ struct HistogramKernel {
     std::int32_t n_blocks_per_mp = 0;
     CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
       &n_blocks_per_mp,
-      fill_histogram_shared<T, kBlockThreads, kItemsPerThread>,
+      fill_histogram_shared<MatrixT, kBlockThreads, kItemsPerThread>,
       kBlockThreads,
       0));
     this->maximum_blocks_for_occupancy = n_blocks_per_mp * n_mps;
@@ -507,8 +503,7 @@ struct HistogramKernel {
                                stream));
   }
 
-  void BuildHistogram(const legate::AccessorRO<T, 3>& X,
-                      int64_t sample_offset,
+  void BuildHistogram(const MatrixT& X,
                       const legate::AccessorRO<double, 3>& g,
                       const legate::AccessorRO<double, 3>& h,
                       size_t n_outputs,
@@ -525,9 +520,8 @@ struct HistogramKernel {
     auto min_blocks  = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile;
     auto x_grid_size = std::min(static_cast<uint64_t>(maximum_blocks_for_occupancy), min_blocks);
     // Launch the kernel
-    fill_histogram_shared<T, kBlockThreads, kItemsPerThread>
+    fill_histogram_shared<MatrixT, kBlockThreads, kItemsPerThread>
       <<<dim3(x_grid_size, num_groups, n_outputs), kBlockThreads, 0, stream>>>(X,
-                                                                               sample_offset,
                                                                                g,
                                                                                h,
                                                                                n_outputs,
@@ -818,10 +812,9 @@ struct Tree {
 // Use nccl to share the samples with all workers
 // Remove any duplicates
 // Return sparse matrix of split samples for each feature
-template <typename T>
+template <typename T, template <typename> class XMatrix>
 SparseSplitProposals<T> SelectSplitSamples(legate::TaskContext context,
-                                           const legate::AccessorRO<T, 3>& X,
-                                           legate::Rect<3> X_shape,
+                                           const XMatrix<T>& X,
                                            int split_samples,
                                            int seed,
                                            int64_t dataset_rows,
@@ -829,7 +822,6 @@ SparseSplitProposals<T> SelectSplitSamples(legate::TaskContext context,
 {
   auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM);
   auto policy       = DEFAULT_POLICY(thrust_alloc).on(stream);
-  int num_features  = X_shape.hi[1] - X_shape.lo[1] + 1;
   // Randomly choose split_samples rows
   auto row_samples = legate::create_buffer<int64_t, 1>(split_samples);
   auto counting    = thrust::make_counting_iterator(0);
@@ -840,51 +832,54 @@ SparseSplitProposals<T> SelectSplitSamples(legate::TaskContext context,
       eng.discard(idx);
       return dist(eng);
     });
-  auto draft_proposals = legate::create_buffer<T, 2>({num_features, split_samples});
+  auto draft_proposals = legate::create_buffer<T, 2>({X.NumFeatures(), split_samples});
 
   // fill with local data
-  LaunchN(num_features * split_samples, stream, [=] __device__(auto idx) {
-    auto i                  = idx / num_features;
-    auto j                  = idx % num_features;
+  LaunchN(X.NumFeatures() * split_samples, stream, [=] __device__(auto idx) {
+    auto i                  = idx / X.NumFeatures();
+    auto j                  = idx % X.NumFeatures();
     auto row                = row_samples[i];
-    bool has_data           = row >= X_shape.lo[0] && row <= X_shape.hi[0];
-    draft_proposals[{j, i}] = has_data ? X[{row, j, 0}] : T(0);
+    bool has_data           = X.RowRange().contains(row);
+    draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0);
   });
 
   // Sum reduce over all workers
-  SumAllReduce(context, draft_proposals.ptr({0, 0}), num_features * split_samples, stream);
+  SumAllReduce(context, draft_proposals.ptr({0, 0}), X.NumFeatures() * split_samples, stream);
 
   CHECK_CUDA_STREAM(stream);
 
   // Condense split samples to unique values
   // First sort the samples
-  auto keys = legate::create_buffer<int32_t, 1>(num_features * split_samples);
-  thrust::transform(
-    policy, counting, counting + num_features * split_samples, keys.ptr(0), [=] __device__(int i) {
-      return i / split_samples;
-    });
+  auto keys = legate::create_buffer<int32_t, 1>(X.NumFeatures() * split_samples);
+  thrust::transform(policy,
+                    counting,
+                    counting + X.NumFeatures() * split_samples,
+                    keys.ptr(0),
+                    [=] __device__(int i) { return i / split_samples; });
 
   // Segmented sort
   auto begin =
     thrust::make_zip_iterator(thrust::make_tuple(keys.ptr(0), draft_proposals.ptr({0, 0})));
-  thrust::sort(policy, begin, begin + num_features * split_samples, [] __device__(auto a, auto b) {
-    if (thrust::get<0>(a) != thrust::get<0>(b)) { return thrust::get<0>(a) < thrust::get<0>(b); }
-    return thrust::get<1>(a) < thrust::get<1>(b);
-  });
+  thrust::sort(
+    policy, begin, begin + X.NumFeatures() * split_samples, [] __device__(auto a, auto b) {
+      if (thrust::get<0>(a) != thrust::get<0>(b)) { return thrust::get<0>(a) < thrust::get<0>(b); }
+      return thrust::get<1>(a) < thrust::get<1>(b);
+    });
 
   // Extract the unique values
-  auto out_keys        = legate::create_buffer<int32_t, 1>(num_features * split_samples);
-  auto split_proposals = legate::create_buffer<T, 1>(num_features * split_samples);
+  auto out_keys        = legate::create_buffer<int32_t, 1>(X.NumFeatures() * split_samples);
+  auto split_proposals = legate::create_buffer<T, 1>(X.NumFeatures() * split_samples);
   auto key_val =
     thrust::make_zip_iterator(thrust::make_tuple(keys.ptr(0), draft_proposals.ptr({0, 0})));
   auto out_iter =
     thrust::make_zip_iterator(thrust::make_tuple(out_keys.ptr(0), split_proposals.ptr(0)));
   auto result =
-    thrust::unique_copy(policy, key_val, key_val + num_features * split_samples, out_iter);
+    thrust::unique_copy(policy, key_val, key_val + X.NumFeatures() * split_samples, out_iter);
   auto n_unique = thrust::distance(out_iter, result);
   // Count the unique values for each feature
-  auto row_pointers = legate::create_buffer<int32_t, 1>(num_features + 1);
-  CHECK_CUDA(cudaMemsetAsync(row_pointers.ptr(0), 0, (num_features + 1) * sizeof(int32_t), stream));
+  auto row_pointers = legate::create_buffer<int32_t, 1>(X.NumFeatures() + 1);
+  CHECK_CUDA(
+    cudaMemsetAsync(row_pointers.ptr(0), 0, (X.NumFeatures() + 1) * sizeof(int32_t), stream));
 
   thrust::reduce_by_key(policy,
                         out_keys.ptr(0),
@@ -894,13 +889,13 @@ SparseSplitProposals<T> SelectSplitSamples(legate::TaskContext context,
                         row_pointers.ptr(1));
   // Scan the counts to get the row pointers for a CSR matrix
   thrust::inclusive_scan(
-    policy, row_pointers.ptr(1), row_pointers.ptr(1) + num_features, row_pointers.ptr(1));
+    policy, row_pointers.ptr(1), row_pointers.ptr(1) + X.NumFeatures(), row_pointers.ptr(1));
 
   CHECK_CUDA(cudaStreamSynchronize(stream));
   row_samples.destroy();
   draft_proposals.destroy();
   out_keys.destroy();
-  return SparseSplitProposals<T>(split_proposals, row_pointers, num_features, n_unique);
+  return SparseSplitProposals<T>(split_proposals, row_pointers, X.NumFeatures(), n_unique);
 }
 
 // Can't put a device lambda in constructor so make this a function
@@ -913,8 +908,9 @@ void FillPositions(const legate::Buffer<cuda::std::tuple<int32_t, int32_t>>& sor
   });
 }
 
-template <typename T>
+template <typename MatrixT>
 struct TreeBuilder {
+  using T = typename MatrixT::value_type;
   TreeBuilder(int32_t num_rows,
               int32_t num_features,
               int32_t num_outputs,
@@ -952,8 +948,7 @@ struct TreeBuilder {
     max_batch_size = max_histogram_nodes;
   }
 
-  template <typename TYPE>
-  void UpdatePositions(Tree& tree, const legate::AccessorRO<TYPE, 3>& X, legate::Rect<3> X_shape)
+  void UpdatePositions(Tree& tree, const MatrixT X)
   {
     auto tree_split_value_ptr = tree.split_value.ptr(0);
     auto tree_feature_ptr     = tree.feature.ptr(0);
@@ -967,9 +962,11 @@ struct TreeBuilder {
           sorted_positions[idx] = cuda::std::make_tuple(-1, row);
           return;
         }
-        double x_value = X[{X_shape.lo[0] + static_cast<int64_t>(row), tree_feature_ptr[pos], 0}];
-        bool left      = x_value <= tree_split_value_ptr[pos];
-        pos            = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
+
+        double x_value =
+          X.Get(X.RowRange().lo[0] + static_cast<int64_t>(row), tree_feature_ptr[pos]);
+        bool left             = x_value <= tree_split_value_ptr[pos];
+        pos                   = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
         sorted_positions[idx] = cuda::std::make_tuple(pos, row);
       });
     CHECK_CUDA_STREAM(stream);
@@ -1005,12 +1002,10 @@ struct TreeBuilder {
                                stream));
   }
 
-  template <typename TYPE>
   void ComputeHistogram(Histogram<IntegerGPair> histogram,
                         legate::TaskContext context,
                         Tree& tree,
-                        const legate::AccessorRO<TYPE, 3>& X,
-                        legate::Rect<3> X_shape,
+                        const MatrixT X,
                         const legate::AccessorRO<double, 3>& g,
                         const legate::AccessorRO<double, 3>& h,
                         NodeBatch batch,
@@ -1018,7 +1013,6 @@ struct TreeBuilder {
                         int depth)
   {
     histogram_kernel.BuildHistogram(X,
-                                    X_shape.lo[0],
                                     g,
                                     h,
                                     num_outputs,
@@ -1184,7 +1178,7 @@ struct TreeBuilder {
   Histogram<IntegerGPair> histogram;
   int max_batch_size;
   GradientQuantiser quantiser;
-  HistogramKernel<T> histogram_kernel;
+  HistogramKernel<MatrixT> histogram_kernel;
 
   cudaStream_t stream;
 };
@@ -1197,6 +1191,8 @@ struct build_tree_fn {
     auto [g, g_shape, g_accessor] = GetInputStore<double, 3>(context.input(1).data());
     auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(2).data());
 
+    DenseXMatrix X_matrix(X_accessor, X_shape);
+
     EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape);
     auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1;
     auto num_rows     = std::max<int64_t>(X_shape.hi[0] - X_shape.lo[0] + 1, 0);
@@ -1221,19 +1217,19 @@ struct build_tree_fn {
     Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy);
 
     SparseSplitProposals<T> split_proposals =
-      SelectSplitSamples(context, X_accessor, X_shape, split_samples, seed, dataset_rows, stream);
+      SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream);
 
     GradientQuantiser quantiser(context, g_accessor, h_accessor, g_shape, stream);
 
     // Begin building the tree
-    TreeBuilder<T> builder(num_rows,
-                           num_features,
-                           num_outputs,
-                           stream,
-                           tree.max_nodes,
-                           max_depth,
-                           split_proposals,
-                           quantiser);
+    TreeBuilder<DenseXMatrix<T>> builder(num_rows,
+                                         num_features,
+                                         num_outputs,
+                                         stream,
+                                         tree.max_nodes,
+                                         max_depth,
+                                         split_proposals,
+                                         quantiser);
 
     builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed);
 
@@ -1242,22 +1238,86 @@ struct build_tree_fn {
       for (auto batch : batches) {
         auto histogram = builder.GetHistogram(batch);
 
-        builder.ComputeHistogram(histogram,
-                                 context,
-                                 tree,
-                                 X_accessor,
-                                 X_shape,
-                                 g_accessor,
-                                 h_accessor,
-                                 batch,
-                                 seed,
-                                 depth);
+        builder.ComputeHistogram(
+          histogram, context, tree, X_matrix, g_accessor, h_accessor, batch, seed, depth);
 
         builder.PerformBestSplit(tree, histogram, alpha, batch);
       }
       // Update position of entire level
       // Don't bother updating positions for the last level
-      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_accessor, X_shape); }
+      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); }
+    }
+
+    tree.WriteTreeOutput(context, thrust_exec_policy, quantiser);
+
+    CHECK_CUDA(cudaStreamSynchronize(stream));
+    CHECK_CUDA_STREAM(stream);
+  }
+};
+
+struct build_tree_csr_fn {
+  template <typename T>
+  void operator()(legate::TaskContext context)
+  {
+    auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore<T, 1>(context.input(0).data());
+    auto [X_coords, X_coords_shape, X_coords_accessor] =
+      GetInputStore<int64_t, 1>(context.input(1).data());
+    auto [X_offsets, X_offsets_shape, X_offsets_accessor] =
+      GetInputStore<legate::Rect<1>, 1>(context.input(2).data());
+    auto [g, g_shape, g_accessor] = GetInputStore<double, 3>(context.input(3).data());
+    auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(4).data());
+
+    auto num_rows    = std::max<int64_t>(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0);
+    auto num_outputs = g_shape.hi[1] - g_shape.lo[1] + 1;
+    EXPECT(g_shape.lo[1] == 0, "Outputs should not be split between workers.");
+
+    // Scalars
+    auto max_depth     = context.scalars().at(0).value<int>();
+    auto max_nodes     = context.scalars().at(1).value<int>();
+    auto alpha         = context.scalars().at(2).value<double>();
+    auto split_samples = context.scalars().at(3).value<int>();
+    auto seed          = context.scalars().at(4).value<int>();
+    auto dataset_rows  = context.scalars().at(5).value<int64_t>();
+    auto num_features  = context.scalars().at(6).value<int64_t>();
+
+    auto stream             = context.get_task_stream();
+    auto thrust_alloc       = ThrustAllocator(legate::Memory::GPU_FB_MEM);
+    auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream);
+
+    Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy);
+
+    CSRXMatrix<T> X_matrix(
+      X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
+    SparseSplitProposals<T> split_proposals =
+      SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream);
+
+    GradientQuantiser quantiser(context, g_accessor, h_accessor, g_shape, stream);
+
+    // Begin building the tree
+    TreeBuilder<CSRXMatrix<T>> builder(num_rows,
+                                       num_features,
+                                       num_outputs,
+                                       stream,
+                                       tree.max_nodes,
+                                       max_depth,
+                                       split_proposals,
+                                       quantiser);
+
+    builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed);
+
+    for (int depth = 0; depth < max_depth; ++depth) {
+      auto batches = builder.PrepareBatches(depth, thrust_exec_policy);
+      for (auto batch : batches) {
+        auto histogram = builder.GetHistogram(batch);
+
+        builder.ComputeHistogram(
+          histogram, context, tree, X_matrix, g_accessor, h_accessor, batch, seed, depth);
+
+        builder.PerformBestSplit(tree, histogram, alpha, batch);
+      }
+      // Update position of entire level
+      // Don't bother updating positions for the last level
+      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); }
     }
 
     tree.WriteTreeOutput(context, thrust_exec_policy, quantiser);
@@ -1273,4 +1333,10 @@ struct build_tree_fn {
   type_dispatch_float(X.code(), build_tree_fn(), context);
 }
 
+/*static*/ void BuildTreeCSRTask::gpu_variant(legate::TaskContext context)
+{
+  const auto& X = context.input(0).data();
+  type_dispatch_float(X.code(), build_tree_csr_fn(), context);
+}
+
 }  // namespace legateboost
diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h
index 0e8cc0b4..0c2072d1 100644
--- a/src/models/tree/build_tree.h
+++ b/src/models/tree/build_tree.h
@@ -101,6 +101,68 @@ __host__ __device__ inline double CalculateLeafValue(double G, double H, double
   return -G / (H + alpha);
 }
 
+// Create a uniform interface to two matrix formats
+// Dense and CSR
+template <typename T>
+class DenseXMatrix {
+ public:
+  using value_type = T;
+
+ private:
+  legate::AccessorRO<T, 3> x;
+  legate::Rect<3> shape;
+
+ public:
+  DenseXMatrix(legate::AccessorRO<T, 3> x, legate::Rect<3> shape) : x(x), shape(shape) {}
+  __host__ __device__ T Get(uint32_t i, uint32_t j) const { return x[legate::Point<3>{i, j, 0}]; }
+  __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; }
+  __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const
+  {
+    return {shape.lo[0], shape.hi[0]};
+  }
+};
+
+template <typename T>
+class CSRXMatrix {
+ public:
+  using value_type = T;
+
+ private:
+  legate::AccessorRO<T, 1> values;
+  legate::AccessorRO<int64_t, 1> column_indices;
+  legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges;
+  legate::Rect<1, legate::coord_t> row_ranges_shape;
+  int num_features;
+
+ public:
+  CSRXMatrix(legate::AccessorRO<T, 1> values,
+             legate::AccessorRO<int64_t, 1> column_indices,
+             legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges,
+             legate::Rect<1, legate::coord_t> row_ranges_shape,
+             int num_features)
+    : values(values),
+      column_indices(column_indices),
+      row_ranges(row_ranges),
+      num_features(num_features),
+      row_ranges_shape(row_ranges_shape)
+  {
+  }
+
+  // Slower than dense due to search for column index
+  __host__ __device__ T Get(uint32_t i, uint32_t j) const
+  {
+    auto row_range = row_ranges[i];
+    // TODO(Rory): Binary search?
+    for (int64_t k = row_range.lo; k <= row_range.hi; k++) {
+      if (column_indices[k] == j) return values[k];
+      if (column_indices[k] > j) return 0;
+    }
+    return 0;
+  }
+  __host__ __device__ int NumFeatures() const { return num_features; }
+  __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; }
+};
+
 // Container for the CSR matrix containing the split proposals
 template <typename T>
 class SparseSplitProposals {
@@ -243,4 +305,12 @@ class BuildTreeTask : public Task<BuildTreeTask, BUILD_TREE> {
 #endif
 };
 
+class BuildTreeCSRTask : public Task<BuildTreeCSRTask, BUILD_TREE_CSR> {
+ public:
+  static void cpu_variant(legate::TaskContext context);
+#ifdef LEGATEBOOST_USE_CUDA
+  static void gpu_variant(legate::TaskContext context);
+#endif
+};
+
 }  // namespace legateboost

From 27d03a0584e86b28262bed9b08d851e6a0fddcc1 Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Wed, 4 Dec 2024 08:13:12 -0800
Subject: [PATCH 02/10] Add gpu prediction

---
 legateboost/models/tree.py           |  56 ++++++++++++++-
 legateboost/test/models/test_tree.py |   4 +-
 src/legateboost.h                    |  11 +--
 src/models/tree/build_tree.cu        |   1 +
 src/models/tree/build_tree.h         |  62 ----------------
 src/models/tree/matrix_types.h       |  80 +++++++++++++++++++++
 src/models/tree/predict.cc           |  11 ++-
 src/models/tree/predict.cu           | 102 ++++++++++++++++++++-------
 src/models/tree/predict.h            |  10 ++-
 9 files changed, 237 insertions(+), 100 deletions(-)
 create mode 100644 src/models/tree/matrix_types.h

diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py
index ba64b10b..025aa03e 100644
--- a/legateboost/models/tree.py
+++ b/legateboost/models/tree.py
@@ -24,7 +24,8 @@
 class LegateBoostOpCode(IntEnum):
     BUILD_TREE = user_lib.cffi.BUILD_TREE
     BUILD_TREE_CSR = user_lib.cffi.BUILD_TREE_CSR
-    PREDICT = user_lib.cffi.PREDICT
+    PREDICT_TREE = user_lib.cffi.PREDICT_TREE
+    PREDICT_TREE_CSR = user_lib.cffi.PREDICT_TREE_CSR
     UPDATE_TREE = user_lib.cffi.UPDATE_TREE
 
 
@@ -259,12 +260,12 @@ def update(
         self.hessian = cn.array(hessian, copy=False)
         return self
 
-    def predict(self, X: cn.ndarray) -> cn.ndarray:
+    def predict_dense(self, X: cn.ndarray) -> cn.ndarray:
         n_rows = X.shape[0]
         n_features = X.shape[1]
         n_outputs = self.leaf_value.shape[1]
         task = get_legate_runtime().create_auto_task(
-            user_context, LegateBoostOpCode.PREDICT
+            user_context, LegateBoostOpCode.PREDICT_TREE
         )
 
         pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs))
@@ -288,9 +289,58 @@ def predict(self, X: cn.ndarray) -> cn.ndarray:
 
         task.add_alignment(X_, pred_)
         task.execute()
+        return cn.array(pred, copy=False)
+
+    def predict_csr(self, X: csr_matrix) -> cn.ndarray:
+        n_rows = X.shape[0]
+        n_outputs = self.leaf_value.shape[1]
+        task = get_legate_runtime().create_auto_task(
+            user_context, LegateBoostOpCode.PREDICT_TREE_CSR
+        )
+
+        pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs))
+        # inputs
+        val_var = task.add_input(X.vals)
+        crd_var = task.add_input(X.crd)
+        pos_var = task.add_input(X.pos)
+        task.add_constraint(
+            image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+        task.add_constraint(
+            image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST)
+        )
+        pos_var_broadcast = X.pos.promote(1, n_outputs)
+        task.add_alignment(pos_var_broadcast, pred)
+
+        # scalars
+        task.add_scalar_arg(X.shape[1], types.int32)
+
+        # output
+        task.add_output(
+            pred.promote(1, 1)
+        )  # add 1 dimension so it has the same dimension as dense version
+        task.add_output(pred)  # only here for alignment, no used
 
+        # broadcast the tree structure
+        leaf_value_ = get_store(self.leaf_value)
+        feature_ = get_store(self.feature)
+        split_value_ = get_store(self.split_value)
+        task.add_input(leaf_value_)
+        task.add_input(feature_)
+        task.add_input(split_value_)
+        task.add_broadcast(leaf_value_)
+        task.add_broadcast(feature_)
+        task.add_broadcast(split_value_)
+
+        task.add_input(pos_var_broadcast)  # used only for alignment
+        task.execute()
         return cn.array(pred, copy=False)
 
+    def predict(self, X: Union[cn.ndarray, csr_matrix]) -> cn.ndarray:
+        if isinstance(X, csr_matrix):
+            return self.predict_csr(X)
+        return self.predict_dense(X)
+
     def is_leaf(self, id: int) -> Any:
         return self.feature[id] == -1
 
diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py
index e3a2f8c6..c88ca011 100644
--- a/legateboost/test/models/test_tree.py
+++ b/legateboost/test/models/test_tree.py
@@ -84,4 +84,6 @@ def test_sparse():
     )
     g = cn.array(rs.normal(size=(2, num_outputs)))
     h = cn.array(rs.random((2, 1)) + 0.1)
-    lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h)
+
+    model = lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h)
+    model.predict(X)
diff --git a/src/legateboost.h b/src/legateboost.h
index b13aa4af..892c274a 100644
--- a/src/legateboost.h
+++ b/src/legateboost.h
@@ -20,7 +20,7 @@
 enum LegateBoostOpCode {  // NOLINT(performance-enum-size)
   OP_CODE_BASE = 0,
   BUILD_TREE   = 1,
-  PREDICT      = 2,
+  PREDICT_TREE = 2,
   UPDATE_TREE  = 3,
   /* special */
   ERF     = 4,
@@ -29,10 +29,11 @@ enum LegateBoostOpCode {  // NOLINT(performance-enum-size)
   DIGAMMA = 7,
   ZETA    = 8,
   /**/
-  GATHER         = 9,
-  RBF            = 10,
-  BUILD_NN       = 11,
-  BUILD_TREE_CSR = 12,
+  GATHER           = 9,
+  RBF              = 10,
+  BUILD_NN         = 11,
+  BUILD_TREE_CSR   = 12,
+  PREDICT_TREE_CSR = 13,
 };
 
 #endif  // SRC_LEGATEBOOST_H_
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index a3fc489d..8945205b 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -32,6 +32,7 @@
 #include "../../cpp_utils/cpp_utils.cuh"
 #include "legate/comm/coll.h"
 #include "build_tree.h"
+#include "matrix_types.h"
 
 namespace legateboost {
 
diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h
index 56125ecf..897ccad4 100644
--- a/src/models/tree/build_tree.h
+++ b/src/models/tree/build_tree.h
@@ -106,68 +106,6 @@ __host__ __device__ inline auto CalculateLeafValue(double G, double H, double al
   return -G / (H + alpha);
 }
 
-// Create a uniform interface to two matrix formats
-// Dense and CSR
-template <typename T>
-class DenseXMatrix {
- public:
-  using value_type = T;
-
- private:
-  legate::AccessorRO<T, 3> x;
-  legate::Rect<3> shape;
-
- public:
-  DenseXMatrix(legate::AccessorRO<T, 3> x, legate::Rect<3> shape) : x(x), shape(shape) {}
-  __host__ __device__ T Get(uint32_t i, uint32_t j) const { return x[legate::Point<3>{i, j, 0}]; }
-  __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; }
-  __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const
-  {
-    return {shape.lo[0], shape.hi[0]};
-  }
-};
-
-template <typename T>
-class CSRXMatrix {
- public:
-  using value_type = T;
-
- private:
-  legate::AccessorRO<T, 1> values;
-  legate::AccessorRO<int64_t, 1> column_indices;
-  legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges;
-  legate::Rect<1, legate::coord_t> row_ranges_shape;
-  int num_features;
-
- public:
-  CSRXMatrix(legate::AccessorRO<T, 1> values,
-             legate::AccessorRO<int64_t, 1> column_indices,
-             legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges,
-             legate::Rect<1, legate::coord_t> row_ranges_shape,
-             int num_features)
-    : values(values),
-      column_indices(column_indices),
-      row_ranges(row_ranges),
-      num_features(num_features),
-      row_ranges_shape(row_ranges_shape)
-  {
-  }
-
-  // Slower than dense due to search for column index
-  __host__ __device__ T Get(uint32_t i, uint32_t j) const
-  {
-    auto row_range = row_ranges[i];
-    // TODO(Rory): Binary search?
-    for (int64_t k = row_range.lo; k <= row_range.hi; k++) {
-      if (column_indices[k] == j) return values[k];
-      if (column_indices[k] > j) return 0;
-    }
-    return 0;
-  }
-  __host__ __device__ int NumFeatures() const { return num_features; }
-  __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; }
-};
-
 // Container for the CSR matrix containing the split proposals
 template <typename T>
 class SparseSplitProposals {
diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h
new file mode 100644
index 00000000..508b351f
--- /dev/null
+++ b/src/models/tree/matrix_types.h
@@ -0,0 +1,80 @@
+/* Copyright 2024 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+#pragma once
+#include <legate.h>
+#include <cstdint>
+
+// Create a uniform interface to two matrix formats
+// Dense and CSR
+template <typename T>
+class DenseXMatrix {
+ public:
+  using value_type = T;
+
+ private:
+  legate::AccessorRO<T, 3> x;
+  legate::Rect<3> shape;
+
+ public:
+  DenseXMatrix(legate::AccessorRO<T, 3> x, legate::Rect<3> shape) : x(x), shape(shape) {}
+  __host__ __device__ T Get(uint32_t i, uint32_t j) const { return x[legate::Point<3>{i, j, 0}]; }
+  __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; }
+  __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const
+  {
+    return {shape.lo[0], shape.hi[0]};
+  }
+};
+
+template <typename T>
+class CSRXMatrix {
+ public:
+  using value_type = T;
+
+ private:
+  legate::AccessorRO<T, 1> values;
+  legate::AccessorRO<int64_t, 1> column_indices;
+  legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges;
+  legate::Rect<1, legate::coord_t> row_ranges_shape;
+  int num_features;
+
+ public:
+  CSRXMatrix(legate::AccessorRO<T, 1> values,
+             legate::AccessorRO<int64_t, 1> column_indices,
+             legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges,
+             legate::Rect<1, legate::coord_t> row_ranges_shape,
+             int num_features)
+    : values(values),
+      column_indices(column_indices),
+      row_ranges(row_ranges),
+      num_features(num_features),
+      row_ranges_shape(row_ranges_shape)
+  {
+  }
+
+  // Slower than dense due to search for column index
+  __host__ __device__ T Get(uint32_t i, uint32_t j) const
+  {
+    auto row_range = row_ranges[i];
+    // TODO(Rory): Binary search?
+    for (int64_t k = row_range.lo; k <= row_range.hi; k++) {
+      if (column_indices[k] == j) return values[k];
+      if (column_indices[k] > j) return 0;
+    }
+    return 0;
+  }
+  __host__ __device__ int NumFeatures() const { return num_features; }
+  __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; }
+};
diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc
index a87d5044..5feeded9 100644
--- a/src/models/tree/predict.cc
+++ b/src/models/tree/predict.cc
@@ -62,7 +62,13 @@ struct predict_fn {
 };
 }  // namespace
 
-/*static*/ void PredictTask::cpu_variant(legate::TaskContext context)
+/*static*/ void PredictTreeTask::cpu_variant(legate::TaskContext context)
+{
+  const auto& X = context.input(0).data();
+  type_dispatch_float(X.code(), predict_fn(), context);
+}
+
+/*static*/ void PredictTreeCSRTask::cpu_variant(legate::TaskContext context)
 {
   const auto& X = context.input(0).data();
   type_dispatch_float(X.code(), predict_fn(), context);
@@ -74,6 +80,7 @@ namespace  // unnamed
 {
 void __attribute__((constructor)) register_tasks()
 {
-  legateboost::PredictTask::register_variants();
+  legateboost::PredictTreeTask::register_variants();
+  legateboost::PredictTreeCSRTask::register_variants();
 }
 }  // namespace
diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu
index 65837619..181070f3 100644
--- a/src/models/tree/predict.cu
+++ b/src/models/tree/predict.cu
@@ -19,11 +19,42 @@
 #include "../../cpp_utils/cpp_utils.cuh"
 #include "../../cpp_utils/cpp_utils.h"
 #include "predict.h"
+#include "matrix_types.h"
 
 namespace legateboost {
 
 namespace {
-struct predict_fn {
+
+template <typename MatrixT>
+void PredictRows(const MatrixT& X,
+                 legate::AccessorWO<double, 3> pred_accessor,
+                 legate::Rect<3, legate::coord_t> pred_shape,
+                 legate::AccessorRO<double, 1> split_value,
+                 legate::AccessorRO<int32_t, 1> feature,
+                 legate::AccessorRO<double, 2> leaf_value,
+                 cudaStream_t stream)
+{
+  // rowwise kernel
+  auto prediction_lambda = [=] __device__(size_t idx) {
+    int64_t pos         = 0;
+    auto global_row_idx = X.RowRange().lo + idx;
+    // Use a max depth of 100 to avoid infinite loops
+    const int max_depth = 100;
+    for (int depth = 0; depth < max_depth; depth++) {
+      if (feature[pos] == -1) { break; }
+      double const X_val = X.Get(global_row_idx, feature[pos]);
+      pos                = X_val <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2;
+    }
+    for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) {
+      pred_accessor[{global_row_idx, 0, j}] = leaf_value[{pos, j}];
+    }
+  };  // NOLINT(readability/braces)
+
+  LaunchN(X.RowRange().volume(), stream, prediction_lambda);
+  CHECK_CUDA_STREAM(stream);
+}
+
+struct predict_dense_fn {
   template <typename T>
   void operator()(legate::TaskContext context)
   {
@@ -39,7 +70,6 @@ struct predict_fn {
     auto pred          = context.output(0).data();
     auto pred_shape    = pred.shape<3>();
     auto pred_accessor = pred.write_accessor<double, 3>();
-    auto n_outputs     = pred_shape.hi[2] - pred_shape.lo[2] + 1;
 
     EXPECT(pred_shape.lo[2] == 0, "Expect all outputs to be present");
     // We should have one output prediction per row of X
@@ -50,36 +80,56 @@ struct predict_fn {
     EXPECT_IS_BROADCAST(context.input(2).data().shape<1>());
     EXPECT_IS_BROADCAST(context.input(3).data().shape<1>());
 
-    // rowwise kernel
-    auto prediction_lambda = [=] __device__(size_t idx) {
-      int64_t pos              = 0;
-      legate::Point<3> x_point = {X_shape.lo[0] + static_cast<int64_t>(idx), 0, 0};
-
-      // Use a max depth of 100 to avoid infinite loops
-      const int max_depth = 100;
-      for (int depth = 0; depth < max_depth; depth++) {
-        if (feature[pos] == -1) { break; }
-        x_point[1]         = feature[pos];
-        double const X_val = X_accessor[x_point];
-        pos                = X_val <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2;
-      }
-      for (int64_t j = 0; j < n_outputs; j++) {
-        pred_accessor[{X_shape.lo[0] + static_cast<int64_t>(idx), 0, j}] = leaf_value[{pos, j}];
-      }
-    };  // NOLINT(readability/braces)
-
-    auto* stream = context.get_task_stream();
-    LaunchN(X_shape.hi[0] - X_shape.lo[0] + 1, stream, prediction_lambda);
-
-    CHECK_CUDA_STREAM(stream);
+    PredictRows(DenseXMatrix<T>(X_accessor, X_shape),
+                pred_accessor,
+                pred_shape,
+                split_value,
+                feature,
+                leaf_value,
+                context.get_task_stream());
+  }
+};
+
+struct predict_csr_fn {
+  template <typename T>
+  void operator()(legate::TaskContext context)
+  {
+    auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore<T, 1>(context.input(0).data());
+    auto [X_coords, X_coords_shape, X_coords_accessor] =
+      GetInputStore<int64_t, 1>(context.input(1).data());
+    auto [X_offsets, X_offsets_shape, X_offsets_accessor] =
+      GetInputStore<legate::Rect<1, legate::coord_t>, 1>(context.input(2).data());
+
+    auto leaf_value  = context.input(3).data().read_accessor<double, 2>();
+    auto feature     = context.input(4).data().read_accessor<int32_t, 1>();
+    auto split_value = context.input(5).data().read_accessor<double, 1>();
+
+    auto pred          = context.output(0).data();
+    auto pred_shape    = pred.shape<3>();
+    auto pred_accessor = pred.write_accessor<double, 3>();
+
+    auto num_features = context.scalars().at(0).value<int32_t>();
+    CSRXMatrix<T> X(
+      X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
+
+    EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape);
+
+    PredictRows(
+      X, pred_accessor, pred_shape, split_value, feature, leaf_value, context.get_task_stream());
   }
 };
 }  // namespace
 
-/*static*/ void PredictTask::gpu_variant(legate::TaskContext context)
+/*static*/ void PredictTreeTask::gpu_variant(legate::TaskContext context)
+{
+  auto X = context.input(0).data();
+  type_dispatch_float(X.code(), predict_dense_fn(), context);
+}
+
+/*static*/ void PredictTreeCSRTask::gpu_variant(legate::TaskContext context)
 {
   auto X = context.input(0).data();
-  type_dispatch_float(X.code(), predict_fn(), context);
+  type_dispatch_float(X.code(), predict_csr_fn(), context);
 }
 
 }  // namespace legateboost
diff --git a/src/models/tree/predict.h b/src/models/tree/predict.h
index 1d98e6b5..c55efd42 100644
--- a/src/models/tree/predict.h
+++ b/src/models/tree/predict.h
@@ -20,7 +20,15 @@
 
 namespace legateboost {
 
-class PredictTask : public Task<PredictTask, PREDICT> {
+class PredictTreeTask : public Task<PredictTreeTask, PREDICT_TREE> {
+ public:
+  static void cpu_variant(legate::TaskContext context);
+#ifdef LEGATEBOOST_USE_CUDA
+  static void gpu_variant(legate::TaskContext context);
+#endif
+};
+
+class PredictTreeCSRTask : public Task<PredictTreeCSRTask, PREDICT_TREE_CSR> {
  public:
   static void cpu_variant(legate::TaskContext context);
 #ifdef LEGATEBOOST_USE_CUDA

From ed290d060e949b22965bd7dfccde7c924b92e2b9 Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Thu, 5 Dec 2024 03:31:54 -0800
Subject: [PATCH 03/10] Accept csr input in estimator

---
 conda/environments/all_cuda-122.yaml |  1 +
 dependencies.yaml                    |  1 +
 legateboost/input_validation.py      | 39 +++++++++++++++-------------
 legateboost/legateboost.py           |  1 +
 legateboost/models/base_model.py     | 10 +++++++
 legateboost/models/tree.py           |  3 +++
 legateboost/test/models/test_tree.py |  4 +--
 legateboost/test/test_estimator.py   | 23 +++++++++++++++-
 pyproject.toml                       |  1 +
 9 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml
index ae2e3713..3f383c17 100644
--- a/conda/environments/all_cuda-122.yaml
+++ b/conda/environments/all_cuda-122.yaml
@@ -15,6 +15,7 @@ dependencies:
 - cuda-version>=12.2
 - cupynumeric==25.01.*,>=0.0.0.dev0
 - hypothesis>=6
+- legate-sparse
 - legate==25.01.*,>=0.0.0.dev0
 - libcublas-dev
 - make
diff --git a/dependencies.yaml b/dependencies.yaml
index b04de179..850d6549 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -152,3 +152,4 @@ dependencies:
           - pytest>=7,<8
           - seaborn>=0.13
           - xgboost>=2.0
+          - legate-sparse
diff --git a/legateboost/input_validation.py b/legateboost/input_validation.py
index 970328ed..296ef061 100644
--- a/legateboost/input_validation.py
+++ b/legateboost/input_validation.py
@@ -3,6 +3,11 @@
 import numpy as np
 import scipy.sparse as sp
 
+try:
+    from legate_sparse import csr_matrix
+except ImportError:
+    csr_matrix = None
+
 import cupynumeric as cn
 
 __all__: List[str] = []
@@ -29,22 +34,22 @@ def check_sample_weight(sample_weight: Any, n: int) -> cn.ndarray:
 
 def check_array(x: Any) -> cn.ndarray:
     if sp.issparse(x):
-        raise ValueError("Sparse matrix not allowed.")
-
-    if not hasattr(x, "__legate_data_interface__"):
-        x = cn.array(np.require(x, requirements=["C", "A"]))
-    if hasattr(x, "__array_interface__"):
-        shape = x.__array_interface__["shape"]
-        if shape[0] <= 0:
-            raise ValueError(
-                "Found array with %d sample(s) (shape=%s) while a"
-                " minimum of %d is required." % (shape[0], shape, 1)
-            )
-        if len(shape) >= 2 and 0 in shape:
-            raise ValueError(
-                "Found array with %d feature(s) (shape=%s) while"
-                " a minimum of %d is required." % (shape[1], shape, 1)
-            )
+        x = csr_matrix(x)
+    elif isinstance(x, csr_matrix):
+        pass
+    else:
+        x = cn.array(x, copy=False)
+
+    if x.shape[0] <= 0:
+        raise ValueError(
+            "Found array with %d sample(s) (shape=%s) while a"
+            " minimum of %d is required." % (x.shape[0], x.shape, 1)
+        )
+    if len(x.shape) >= 2 and 0 in x.shape:
+        raise ValueError(
+            "Found array with %d feature(s) (shape=%s) while"
+            " a minimum of %d is required." % (x.shape[1], x.shape, 1)
+        )
 
     if cn.iscomplexobj(x):
         raise ValueError("Complex data not supported.")
@@ -52,8 +57,6 @@ def check_array(x: Any) -> cn.ndarray:
     if np.issubdtype(x.dtype, np.floating) and not cn.isfinite(x.sum()):
         raise ValueError("Input contains NaN or inf")
 
-    x = cn.array(x, copy=False)
-
     return x
 
 
diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py
index 3bda0d5d..b171497b 100644
--- a/legateboost/legateboost.py
+++ b/legateboost/legateboost.py
@@ -91,6 +91,7 @@ def _more_tags(self) -> Any:
                 ),
                 "check_dtype_object": ("object type data not supported."),
             },
+            "X_types": ["2darray", "sparse"],
         }
 
     def _setup_metrics(self) -> list[BaseMetric]:
diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py
index 07408559..1b23cdcb 100644
--- a/legateboost/models/base_model.py
+++ b/legateboost/models/base_model.py
@@ -91,6 +91,16 @@ def predict(self, X: cn.ndarray) -> cn.ndarray:
         """
         pass
 
+    def supports_csr(self) -> bool:
+        """Whether the model supports CSR matrix input.
+
+        Returns
+        -------
+        bool
+            True if the model supports CSR matrix input, False otherwise.
+        """
+        return False
+
     @abstractmethod
     def __str__(self) -> str:
         pass
diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py
index 025aa03e..ff4079d3 100644
--- a/legateboost/models/tree.py
+++ b/legateboost/models/tree.py
@@ -386,3 +386,6 @@ def __mul__(self, scalar: Any) -> "Tree":
         new = copy.deepcopy(self)
         new.leaf_value *= scalar
         return new
+
+    def supports_csr(self) -> bool:
+        return True
diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py
index c88ca011..9c6f7fe5 100644
--- a/legateboost/test/models/test_tree.py
+++ b/legateboost/test/models/test_tree.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pytest
-from legate_sparse import csr_matrix
 
 import cupynumeric as cn
 import legateboost as lb
@@ -75,7 +74,8 @@ def test_alpha():
     assert np.isclose(model.predict(X)[0], y.sum() / (y.size + alpha))
 
 
-def test_sparse():
+def test_csr():
+    csr_matrix = pytest.importorskip("legate_sparse.csr_matrix")
     num_outputs = 1
     rs = cn.random.RandomState(0)
     X = csr_matrix(
diff --git a/legateboost/test/test_estimator.py b/legateboost/test/test_estimator.py
index 360a3634..db206ab6 100644
--- a/legateboost/test/test_estimator.py
+++ b/legateboost/test/test_estimator.py
@@ -1,12 +1,17 @@
 import numpy as np
 import pytest
+import scipy
 from sklearn.datasets import make_regression
 from sklearn.model_selection import train_test_split
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
 import cupynumeric as cn
 import legateboost as lb
-from legateboost.testing.utils import non_increasing, sanity_check_models
+from legateboost.testing.utils import (
+    all_base_models,
+    non_increasing,
+    sanity_check_models,
+)
 
 
 def test_init():
@@ -215,3 +220,19 @@ def test_iterator_methods():
     assert list(model) == list(model.models_)
     for i, est in enumerate(model):
         assert est == model[i]
+
+
+@pytest.mark.parametrize(
+    "base_model", filter(lambda m: m.supports_csr(), all_base_models()), ids=type
+)
+def test_csr_input(base_model):
+    csr_matrix = pytest.importorskip("legate_sparse").csr_matrix
+    X_scipy = scipy.sparse.csr_matrix([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]])
+    X_legate_sparse = csr_matrix(X_scipy)
+    y = cn.array([1.0, 2.0])
+    model = lb.LBRegressor(
+        n_estimators=1,
+        base_models=(base_model,),
+    )
+    model.fit(X_scipy, y)
+    model.fit(X_legate_sparse, y)
diff --git a/pyproject.toml b/pyproject.toml
index 9f3a177d..c09a83d4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,6 +37,7 @@ requires-python = ">=3.10"
 [project.optional-dependencies]
 test = [
     "hypothesis>=6",
+    "legate-sparse",
     "matplotlib>=3.9",
     "mypy>=1.13",
     "nbconvert>=7.16",

From 9f07b1af8b00df7c0a1ce4ca337a9466415fbcbf Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Fri, 6 Dec 2024 04:51:14 -0800
Subject: [PATCH 04/10] Add example

---
 examples/sparse/README.md     |  3 +++
 examples/sparse/sparse.py     | 47 +++++++++++++++++++++++++++++++++++
 src/models/tree/build_tree.cu |  4 +--
 3 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 examples/sparse/README.md
 create mode 100644 examples/sparse/sparse.py

diff --git a/examples/sparse/README.md b/examples/sparse/README.md
new file mode 100644
index 00000000..c0cd3efc
--- /dev/null
+++ b/examples/sparse/README.md
@@ -0,0 +1,3 @@
+# Sparse data
+
+This example trains a youtube comment spam classifier on a sparse dataset. The comments as raw strings are converted to a sparse matrix of word counts using the `CountVectorizer` from scikit-learn.
diff --git a/examples/sparse/sparse.py b/examples/sparse/sparse.py
new file mode 100644
index 00000000..a189beff
--- /dev/null
+++ b/examples/sparse/sparse.py
@@ -0,0 +1,47 @@
+import pandas as pd
+from sklearn.datasets import fetch_openml
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+
+import legateboost as lb
+
+# Alberto, T. & Lochter, J. (2015). YouTube Spam Collection [Dataset].
+# UCI Machine Learning Repository. https://doi.org/10.24432/C58885.
+dataset_names = [
+    "youtube-spam-psy",
+    "youtube-spam-shakira",
+    "youtube-spam-lmfao",
+    "youtube-spam-eminem",
+    "youtube-spam-katyperry",
+]
+X = []
+for dataset_name in dataset_names:
+    dataset = fetch_openml(name=dataset_name, as_frame=True)
+    X.append(dataset.data)
+
+X = pd.concat(X)
+y = X["CLASS"]
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, random_state=42
+)
+vectorizer = CountVectorizer()
+X_train_vectorized = vectorizer.fit_transform(X_train["CONTENT"])
+X_test_vectorized = vectorizer.transform(X_test["CONTENT"])
+
+model = lb.LBClassifier().fit(
+    X_train_vectorized, y_train, eval_set=[(X_test_vectorized, y_test)]
+)
+
+
+def evaluate_comment(comment):
+    print("Comment: {}".format(comment))
+    print(
+        "Probability of spam: {}".format(
+            model.predict_proba(vectorizer.transform([comment]))[0, 1]
+        )
+    )
+
+
+evaluate_comment(X_test.iloc[15]["CONTENT"])
+evaluate_comment(X_test.iloc[3]["CONTENT"])
+evaluate_comment("Your text here")
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index 8945205b..c2f224fa 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -1339,8 +1339,8 @@ struct build_tree_csr_fn {
     auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(4).data());
 
     auto num_rows    = std::max<int64_t>(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0);
-    auto num_outputs = g_shape.hi[1] - g_shape.lo[1] + 1;
-    EXPECT(g_shape.lo[1] == 0, "Outputs should not be split between workers.");
+    auto num_outputs = g_shape.hi[2] - g_shape.lo[2] + 1;
+    EXPECT(g_shape.lo[2] == 0, "Outputs should not be split between workers.");
 
     // Scalars
     auto max_depth     = context.scalars().at(0).value<int>();

From 271878ac7c62861953bf2033dd48b22eb4aceb35 Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Fri, 6 Dec 2024 05:32:58 -0800
Subject: [PATCH 05/10] Add cpu csr prediction

---
 ci/run_pytests_cpu.sh                |  1 +
 legateboost/test/models/test_tree.py | 10 ++--
 src/models/tree/build_tree.cc        | 51 ++++++++++++-------
 src/models/tree/build_tree.cu        |  2 +-
 src/models/tree/build_tree.h         |  2 +-
 src/models/tree/predict.cc           | 76 ++++++++++++++++++++++------
 6 files changed, 99 insertions(+), 43 deletions(-)

diff --git a/ci/run_pytests_cpu.sh b/ci/run_pytests_cpu.sh
index 50e138ac..16a74640 100755
--- a/ci/run_pytests_cpu.sh
+++ b/ci/run_pytests_cpu.sh
@@ -18,6 +18,7 @@ set -e -E -u -o pipefail
 cd legateboost/test
 
 legate \
+    --gpus 0 \
     --sysmem 28000 \
     --module pytest \
     . \
diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py
index 9c6f7fe5..99f82171 100644
--- a/legateboost/test/models/test_tree.py
+++ b/legateboost/test/models/test_tree.py
@@ -75,15 +75,13 @@ def test_alpha():
 
 
 def test_csr():
-    csr_matrix = pytest.importorskip("legate_sparse.csr_matrix")
-    num_outputs = 1
-    rs = cn.random.RandomState(0)
+    csr_matrix = pytest.importorskip("legate_sparse").csr_matrix
     X = csr_matrix(
         (cn.array([1.0, 2.0, 3.0]), cn.array([0, 1, 2]), cn.array([0, 2, 3])),
         shape=(2, 3),
     )
-    g = cn.array(rs.normal(size=(2, num_outputs)))
-    h = cn.array(rs.random((2, 1)) + 0.1)
+    g = cn.array([[1.0], [-1.0]])
+    h = cn.array([[1.0], [1.0]])
 
     model = lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h)
-    model.predict(X)
+    assert np.allclose(model.predict(X), -g / h)
diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc
index 16ce0177..e7e08083 100644
--- a/src/models/tree/build_tree.cc
+++ b/src/models/tree/build_tree.cc
@@ -24,6 +24,7 @@
 #include "legate_library.h"
 #include "legateboost.h"
 #include "cpp_utils/cpp_utils.h"
+#include "matrix_types.h"
 
 namespace legateboost {
 namespace {
@@ -127,10 +128,9 @@ void WriteTreeOutput(legate::TaskContext context, const Tree& tree)
 // Share the samples with all workers
 // Remove any duplicates
 // Return sparse matrix of split samples for each feature
-template <typename T>
+template <typename T, template <typename> class XMatrix>
 auto SelectSplitSamples(legate::TaskContext context,
-                        const legate::AccessorRO<T, 3>& X,
-                        legate::Rect<3> X_shape,
+                        const XMatrix<T>& X,
                         int split_samples,
                         int seed,
                         int64_t dataset_rows) -> SparseSplitProposals<T>
@@ -143,23 +143,22 @@ auto SelectSplitSamples(legate::TaskContext context,
     return dist(eng);
   });
 
-  auto num_features    = X_shape.hi[1] - X_shape.lo[1] + 1;
-  auto draft_proposals = legate::create_buffer<T, 2>({num_features, split_samples});
+  auto draft_proposals = legate::create_buffer<T, 2>({X.NumFeatures(), split_samples});
   for (int i = 0; i < split_samples; i++) {
     auto row            = row_samples[i];
-    bool const has_data = row >= X_shape.lo[0] && row <= X_shape.hi[0];
-    for (int j = 0; j < num_features; j++) {
-      draft_proposals[{j, i}] = has_data ? X[{row, j, 0}] : T(0);
+    const bool has_data = X.RowRange().contains(row);
+    for (int j = 0; j < X.NumFeatures(); j++) {
+      draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0);
     }
   }
-  SumAllReduce(context, tcb::span<T>(draft_proposals.ptr({0, 0}), num_features * split_samples));
+  SumAllReduce(context, tcb::span<T>(draft_proposals.ptr({0, 0}), X.NumFeatures() * split_samples));
 
   // Sort samples
   std::vector<T> split_proposals_tmp;
-  split_proposals_tmp.reserve(num_features * split_samples);
-  auto row_pointers = legate::create_buffer<int32_t, 1>(num_features + 1);
+  split_proposals_tmp.reserve(X.NumFeatures() * split_samples);
+  auto row_pointers = legate::create_buffer<int32_t, 1>(X.NumFeatures() + 1);
   row_pointers[0]   = 0;
-  for (int j = 0; j < num_features; j++) {
+  for (int j = 0; j < X.NumFeatures(); j++) {
     auto ptr = draft_proposals.ptr({j, 0});
     tcb::span<T> const feature_proposals(draft_proposals.ptr({j, 0}), split_samples);
     std::set<T> const unique(feature_proposals.begin(), feature_proposals.end());
@@ -170,7 +169,7 @@ auto SelectSplitSamples(legate::TaskContext context,
   auto split_proposals = legate::create_buffer<T, 1>(split_proposals_tmp.size());
   std::copy(split_proposals_tmp.begin(), split_proposals_tmp.end(), split_proposals.ptr(0));
   return SparseSplitProposals<T>(
-    split_proposals, row_pointers, num_features, split_proposals_tmp.size());
+    split_proposals, row_pointers, X.NumFeatures(), split_proposals_tmp.size());
 }
 
 template <typename T>
@@ -446,13 +445,14 @@ struct TreeBuilder {
   Histogram<GPair> histogram;
 };
 
-struct build_tree_fn {
+struct build_tree_dense_fn {
   template <typename T>
   void operator()(legate::TaskContext context)
   {
     auto [X, X_shape, X_accessor] = GetInputStore<T, 3>(context.input(0).data());
     auto [g, g_shape, g_accessor] = GetInputStore<double, 3>(context.input(1).data());
     auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(2).data());
+
     EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape);
     auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1;
     auto num_rows     = std::max<int64_t>(X_shape.hi[0] - X_shape.lo[0] + 1, 0);
@@ -471,8 +471,11 @@ struct build_tree_fn {
     auto dataset_rows  = context.scalars().at(5).value<int64_t>();
 
     Tree tree(max_nodes, narrow<int>(num_outputs));
+
+    DenseXMatrix<T> X_matrix(X_accessor, X_shape);
+
     SparseSplitProposals<T> const split_proposals =
-      SelectSplitSamples(context, X_accessor, X_shape, split_samples, seed, dataset_rows);
+      SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
 
     // Begin building the tree
     TreeBuilder<T> builder(
@@ -497,15 +500,25 @@ struct build_tree_fn {
   }
 };
 
+struct build_tree_csr_fn {
+  template <typename T>
+  void operator()(legate::TaskContext context)
+  {
+  }
+};
 }  // namespace
 
-/*static*/ void BuildTreeTask::cpu_variant(legate::TaskContext context)
+/*static*/ void BuildTreeDenseTask::cpu_variant(legate::TaskContext context)
 {
   const auto& X = context.input(0).data();
-  legateboost::type_dispatch_float(X.code(), build_tree_fn(), context);
+  legateboost::type_dispatch_float(X.code(), build_tree_dense_fn(), context);
 }
 
-/*static*/ void BuildTreeCSRTask::cpu_variant(legate::TaskContext context) {}
+/*static*/ void BuildTreeCSRTask::cpu_variant(legate::TaskContext context)
+{
+  const auto& X = context.input(0).data();
+  legateboost::type_dispatch_float(X.code(), build_tree_csr_fn(), context);
+}
 
 }  // namespace legateboost
 
@@ -513,7 +526,7 @@ namespace  // unnamed
 {
 void __attribute__((constructor)) register_tasks()
 {
-  legateboost::BuildTreeTask::register_variants();
+  legateboost::BuildTreeDenseTask::register_variants();
   legateboost::BuildTreeCSRTask::register_variants();
 }
 }  // namespace
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index c2f224fa..d8652037 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -1398,7 +1398,7 @@ struct build_tree_csr_fn {
   }
 };
 
-/*static*/ void BuildTreeTask::gpu_variant(legate::TaskContext context)
+/*static*/ void BuildTreeDenseTask::gpu_variant(legate::TaskContext context)
 {
   const auto& X = context.input(0).data();
   type_dispatch_float(X.code(), build_tree_fn(), context);
diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h
index 897ccad4..ace0eef3 100644
--- a/src/models/tree/build_tree.h
+++ b/src/models/tree/build_tree.h
@@ -243,7 +243,7 @@ class Histogram {
   }
 };
 
-class BuildTreeTask : public Task<BuildTreeTask, BUILD_TREE> {
+class BuildTreeDenseTask : public Task<BuildTreeDenseTask, BUILD_TREE> {
  public:
   static void cpu_variant(legate::TaskContext context);
 #ifdef LEGATEBOOST_USE_CUDA
diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc
index 5feeded9..d24eaf1f 100644
--- a/src/models/tree/predict.cc
+++ b/src/models/tree/predict.cc
@@ -16,11 +16,34 @@
 #include "predict.h"
 #include <cstdint>
 #include "../../cpp_utils/cpp_utils.h"
+#include "matrix_types.h"
 
 namespace legateboost {
 
 namespace {
-struct predict_fn {
+template <typename MatrixT>
+void PredictRows(const MatrixT& X,
+                 legate::AccessorWO<double, 3> pred_accessor,
+                 legate::Rect<3, legate::coord_t> pred_shape,
+                 legate::AccessorRO<double, 1> split_value,
+                 legate::AccessorRO<int32_t, 1> feature,
+                 legate::AccessorRO<double, 2> leaf_value)
+{
+  for (int64_t i = X.RowRange().lo[0]; i <= X.RowRange().hi[0]; i++) {
+    int pos = 0;
+    // Use a max depth of 100 to avoid infinite loops
+    const int max_depth = 100;
+    for (int depth = 0; depth < max_depth; depth++) {
+      if (feature[pos] == -1) { break; }
+      auto x = X.Get(i, feature[pos]);
+      pos    = x <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2;
+    }
+    for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) {
+      pred_accessor[{i, 0, j}] = leaf_value[{pos, j}];
+    }
+  }
+}
+struct predict_dense_fn {
   template <typename T>
   void operator()(legate::TaskContext context)
   {
@@ -45,19 +68,40 @@ struct predict_fn {
     EXPECT_IS_BROADCAST(context.input(2).data().shape<1>());
     EXPECT_IS_BROADCAST(context.input(3).data().shape<1>());
 
-    for (int64_t i = X_shape.lo[0]; i <= X_shape.hi[0]; i++) {
-      int pos = 0;
-      // Use a max depth of 100 to avoid infinite loops
-      const int max_depth = 100;
-      for (int depth = 0; depth < max_depth; depth++) {
-        if (feature[pos] == -1) { break; }
-        auto x = X_accessor[{i, feature[pos], 0}];
-        pos    = x <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2;
-      }
-      for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) {
-        pred_accessor[{i, 0, j}] = leaf_value[{pos, j}];
-      }
-    }
+    PredictRows(DenseXMatrix<T>(X_accessor, X_shape),
+                pred_accessor,
+                pred_shape,
+                split_value,
+                feature,
+                leaf_value);
+  }
+};
+
+struct predict_csr_fn {
+  template <typename T>
+  void operator()(legate::TaskContext context)
+  {
+    auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore<T, 1>(context.input(0).data());
+    auto [X_coords, X_coords_shape, X_coords_accessor] =
+      GetInputStore<int64_t, 1>(context.input(1).data());
+    auto [X_offsets, X_offsets_shape, X_offsets_accessor] =
+      GetInputStore<legate::Rect<1, legate::coord_t>, 1>(context.input(2).data());
+
+    auto leaf_value  = context.input(3).data().read_accessor<double, 2>();
+    auto feature     = context.input(4).data().read_accessor<int32_t, 1>();
+    auto split_value = context.input(5).data().read_accessor<double, 1>();
+
+    auto pred          = context.output(0).data();
+    auto pred_shape    = pred.shape<3>();
+    auto pred_accessor = pred.write_accessor<double, 3>();
+
+    auto num_features = context.scalars().at(0).value<int32_t>();
+    CSRXMatrix<T> X(
+      X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
+
+    EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape);
+
+    PredictRows(X, pred_accessor, pred_shape, split_value, feature, leaf_value);
   }
 };
 }  // namespace
@@ -65,13 +109,13 @@ struct predict_fn {
 /*static*/ void PredictTreeTask::cpu_variant(legate::TaskContext context)
 {
   const auto& X = context.input(0).data();
-  type_dispatch_float(X.code(), predict_fn(), context);
+  type_dispatch_float(X.code(), predict_dense_fn(), context);
 }
 
 /*static*/ void PredictTreeCSRTask::cpu_variant(legate::TaskContext context)
 {
   const auto& X = context.input(0).data();
-  type_dispatch_float(X.code(), predict_fn(), context);
+  type_dispatch_float(X.code(), predict_csr_fn(), context);
 }
 
 }  // namespace legateboost

From 937b7b71750b9e7745f18d344810f25df6d803d7 Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Fri, 6 Dec 2024 05:45:30 -0800
Subject: [PATCH 06/10] Add cpu tree builder

---
 legateboost/test/models/test_tree.py |  6 ++-
 src/models/tree/build_tree.cc        | 70 ++++++++++++++++++++++------
 2 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py
index 99f82171..7d5f2e04 100644
--- a/legateboost/test/models/test_tree.py
+++ b/legateboost/test/models/test_tree.py
@@ -83,5 +83,9 @@ def test_csr():
     g = cn.array([[1.0], [-1.0]])
     h = cn.array([[1.0], [1.0]])
 
-    model = lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h)
+    model = (
+        lb.models.Tree(alpha=0.0)
+        .set_random_state(np.random.RandomState(2))
+        .fit(X, g, h)
+    )
     assert np.allclose(model.predict(X), -g / h)
diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc
index e7e08083..5e804ccb 100644
--- a/src/models/tree/build_tree.cc
+++ b/src/models/tree/build_tree.cc
@@ -172,8 +172,9 @@ auto SelectSplitSamples(legate::TaskContext context,
     split_proposals, row_pointers, X.NumFeatures(), split_proposals_tmp.size());
 }
 
-template <typename T>
+template <typename MatrixT>
 struct TreeBuilder {
+  using T = typename MatrixT::value_type;
   TreeBuilder(int32_t num_rows,
               int32_t num_features,
               int32_t num_outputs,
@@ -201,24 +202,22 @@ struct TreeBuilder {
                                  split_proposals.histogram_size);
     max_batch_size = max_histogram_nodes;
   }
-  template <typename TYPE>
   void ComputeHistogram(Histogram<GPair> histogram,
                         legate::TaskContext context,
                         Tree& tree,
-                        const legate::AccessorRO<TYPE, 3>& X,
-                        legate::Rect<3> X_shape,
+                        const MatrixT& X,
                         const legate::AccessorRO<double, 3>& g,
                         const legate::AccessorRO<double, 3>& h,
                         NodeBatch batch)
   {
     // Build the histogram
     for (auto [position, index_local] : batch) {
-      auto index_global  = index_local + X_shape.lo[0];
+      auto index_global  = index_local + X.RowRange().lo[0];
       bool const compute = ComputeHistogramBin(
         position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position)));
       if (position < 0 || !compute) { continue; }
       for (int64_t j = 0; j < num_features; j++) {
-        auto x_value      = X[{index_global, j, 0}];
+        auto x_value      = X.Get(index_global, j);
         int const bin_idx = split_proposals.FindBin(x_value, j);
 
         if (bin_idx != SparseSplitProposals<T>::NOT_FOUND) {
@@ -343,8 +342,7 @@ struct TreeBuilder {
       }
     }
   }
-  template <typename TYPE>
-  void UpdatePositions(Tree& tree, const legate::AccessorRO<TYPE, 3>& X, legate::Rect<3> X_shape)
+  void UpdatePositions(Tree& tree, const MatrixT& X)
   {
     // Update the positions
     for (int i = 0; i < num_rows; i++) {
@@ -353,7 +351,7 @@ struct TreeBuilder {
         sorted_positions[i] = {-1, index_local};
         continue;
       }
-      auto x              = X[{X_shape.lo[0] + index_local, tree.feature[pos], 0}];
+      auto x              = X.Get(X.RowRange().lo[0] + index_local, tree.feature[pos]);
       bool const left     = x <= tree.split_value[pos];
       pos                 = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
       sorted_positions[i] = {pos, index_local};
@@ -478,7 +476,7 @@ struct build_tree_dense_fn {
       SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
 
     // Begin building the tree
-    TreeBuilder<T> builder(
+    TreeBuilder<DenseXMatrix<T>> builder(
       num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals);
 
     builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha);
@@ -487,14 +485,13 @@ struct build_tree_dense_fn {
       for (auto batch : batches) {
         auto histogram = builder.GetHistogram(batch);
 
-        builder.ComputeHistogram(
-          histogram, context, tree, X_accessor, X_shape, g_accessor, h_accessor, batch);
+        builder.ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch);
 
         builder.PerformBestSplit(tree, histogram, alpha, batch);
       }
       // Update position of entire level
       // Don't bother updating positions for the last level
-      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_accessor, X_shape); }
+      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); }
     }
     WriteTreeOutput(context, tree);
   }
@@ -504,6 +501,53 @@ struct build_tree_csr_fn {
   template <typename T>
   void operator()(legate::TaskContext context)
   {
+    auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore<T, 1>(context.input(0).data());
+    auto [X_coords, X_coords_shape, X_coords_accessor] =
+      GetInputStore<int64_t, 1>(context.input(1).data());
+    auto [X_offsets, X_offsets_shape, X_offsets_accessor] =
+      GetInputStore<legate::Rect<1>, 1>(context.input(2).data());
+    auto [g, g_shape, g_accessor] = GetInputStore<double, 3>(context.input(3).data());
+    auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(4).data());
+
+    auto num_rows    = std::max<int64_t>(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0);
+    auto num_outputs = g_shape.hi[2] - g_shape.lo[2] + 1;
+    EXPECT(g_shape.lo[2] == 0, "Outputs should not be split between workers.");
+
+    // Scalars
+    auto max_depth     = context.scalars().at(0).value<int>();
+    auto max_nodes     = context.scalars().at(1).value<int>();
+    auto alpha         = context.scalars().at(2).value<double>();
+    auto split_samples = context.scalars().at(3).value<int>();
+    auto seed          = context.scalars().at(4).value<int>();
+    auto dataset_rows  = context.scalars().at(5).value<int64_t>();
+    auto num_features  = context.scalars().at(6).value<int64_t>();
+
+    Tree tree(max_nodes, num_outputs);
+
+    CSRXMatrix<T> X_matrix(
+      X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
+    const SparseSplitProposals<T> split_proposals =
+      SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
+
+    // Begin building the tree
+    TreeBuilder<CSRXMatrix<T>> builder(
+      num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals);
+
+    builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha);
+    for (int depth = 0; depth < max_depth; ++depth) {
+      auto batches = builder.PrepareBatches(depth);
+      for (auto batch : batches) {
+        auto histogram = builder.GetHistogram(batch);
+
+        builder.ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch);
+
+        builder.PerformBestSplit(tree, histogram, alpha, batch);
+      }
+      // Update position of entire level
+      // Don't bother updating positions for the last level
+      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); }
+    }
+    WriteTreeOutput(context, tree);
   }
 };
 }  // namespace

From e11d81cd8e112b36ade6564731b98da9e727651a Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Fri, 6 Dec 2024 06:45:33 -0800
Subject: [PATCH 07/10] Refactor

---
 src/models/tree/build_tree.cc |  74 +++++++++--------
 src/models/tree/build_tree.cu | 144 +++++++++++++++++-----------------
 2 files changed, 107 insertions(+), 111 deletions(-)

diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc
index 5e804ccb..a99d5e20 100644
--- a/src/models/tree/build_tree.cc
+++ b/src/models/tree/build_tree.cc
@@ -185,6 +185,7 @@ struct TreeBuilder {
       num_features(num_features),
       num_outputs(num_outputs),
       max_nodes(max_nodes),
+      max_depth(max_depth),
       split_proposals(split_proposals)
   {
     sorted_positions = legate::create_buffer<std::tuple<int32_t, int32_t>>(num_rows);
@@ -202,6 +203,33 @@ struct TreeBuilder {
                                  split_proposals.histogram_size);
     max_batch_size = max_histogram_nodes;
   }
+
+  Tree Build(legate::TaskContext context,
+             const MatrixT& X_matrix,
+             legate::AccessorRO<double, 3> g_accessor,
+             legate::AccessorRO<double, 3> h_accessor,
+             legate::Rect<3> g_shape,
+             double alpha)
+  {
+    // Begin building the tree
+    Tree tree(max_nodes, narrow<int>(num_outputs));
+    this->InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha);
+    for (int depth = 0; depth < max_depth; ++depth) {
+      auto batches = this->PrepareBatches(depth);
+      for (auto batch : batches) {
+        auto histogram = this->GetHistogram(batch);
+
+        this->ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch);
+
+        this->PerformBestSplit(tree, histogram, alpha, batch);
+      }
+      // Update position of entire level
+      // Don't bother updating positions for the last level
+      if (depth < max_depth - 1) { this->UpdatePositions(tree, X_matrix); }
+    }
+    return tree;
+  }
+
   void ComputeHistogram(Histogram<GPair> histogram,
                         legate::TaskContext context,
                         Tree& tree,
@@ -438,6 +466,7 @@ struct TreeBuilder {
   int32_t num_features;
   int32_t num_outputs;
   int32_t max_nodes;
+  int32_t max_depth;
   int max_batch_size;
   SparseSplitProposals<T> split_proposals;
   Histogram<GPair> histogram;
@@ -468,31 +497,16 @@ struct build_tree_dense_fn {
     auto seed          = context.scalars().at(4).value<int>();
     auto dataset_rows  = context.scalars().at(5).value<int64_t>();
 
-    Tree tree(max_nodes, narrow<int>(num_outputs));
-
     DenseXMatrix<T> X_matrix(X_accessor, X_shape);
 
     SparseSplitProposals<T> const split_proposals =
       SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
 
-    // Begin building the tree
-    TreeBuilder<DenseXMatrix<T>> builder(
-      num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals);
+    // Dispatch the tree building algorithm templated on the matrix type
+    auto tree = TreeBuilder<DenseXMatrix<T>>(
+                  num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals)
+                  .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha);
 
-    builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha);
-    for (int depth = 0; depth < max_depth; ++depth) {
-      auto batches = builder.PrepareBatches(depth);
-      for (auto batch : batches) {
-        auto histogram = builder.GetHistogram(batch);
-
-        builder.ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch);
-
-        builder.PerformBestSplit(tree, histogram, alpha, batch);
-      }
-      // Update position of entire level
-      // Don't bother updating positions for the last level
-      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); }
-    }
     WriteTreeOutput(context, tree);
   }
 };
@@ -522,31 +536,15 @@ struct build_tree_csr_fn {
     auto dataset_rows  = context.scalars().at(5).value<int64_t>();
     auto num_features  = context.scalars().at(6).value<int64_t>();
 
-    Tree tree(max_nodes, num_outputs);
-
     CSRXMatrix<T> X_matrix(
       X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
     const SparseSplitProposals<T> split_proposals =
       SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
 
-    // Begin building the tree
-    TreeBuilder<CSRXMatrix<T>> builder(
-      num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals);
+    auto tree = TreeBuilder<CSRXMatrix<T>>(
+                  num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals)
+                  .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha);
 
-    builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha);
-    for (int depth = 0; depth < max_depth; ++depth) {
-      auto batches = builder.PrepareBatches(depth);
-      for (auto batch : batches) {
-        auto histogram = builder.GetHistogram(batch);
-
-        builder.ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch);
-
-        builder.PerformBestSplit(tree, histogram, alpha, batch);
-      }
-      // Update position of entire level
-      // Don't bother updating positions for the last level
-      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); }
-    }
     WriteTreeOutput(context, tree);
   }
 };
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index d8652037..f50565e8 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -819,11 +819,12 @@ struct Tree {
                        [=] __device__(const legate::Point<DIM>& p) { out_acc[p] = x[p]; });
   }
 
-  template <typename ThrustPolicyT>
-  void WriteTreeOutput(legate::TaskContext context,
-                       const ThrustPolicyT& policy,
-                       GradientQuantiser quantiser)
+  void WriteTreeOutput(legate::TaskContext context, GradientQuantiser quantiser)
   {
+    auto stream       = context.get_task_stream();
+    auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM);
+    auto policy       = DEFAULT_POLICY(thrust_alloc).on(stream);
+
     WriteOutput(context.output(0).data(), leaf_value, policy);
     WriteOutput(context.output(1).data(), feature, policy);
     WriteOutput(context.output(2).data(), split_value, policy);
@@ -843,7 +844,7 @@ struct Tree {
 
   ~Tree()                              = default;
   Tree(const Tree&)                    = delete;
-  Tree(Tree&&)                         = delete;
+  Tree(Tree&&)                         = default;
   auto operator=(const Tree&) -> Tree& = delete;
   auto operator=(Tree&&) -> Tree&      = delete;
 
@@ -975,15 +976,18 @@ struct TreeBuilder {
               int32_t max_nodes,
               int32_t max_depth,
               const SparseSplitProposals<T>& split_proposals,
-              GradientQuantiser quantiser)
+              GradientQuantiser quantiser,
+              int64_t seed)
     : num_rows(num_rows),
       num_features(num_features),
       num_outputs(num_outputs),
       stream(stream),
       max_nodes(max_nodes),
+      max_depth(max_depth),
       split_proposals(split_proposals),
       quantiser(quantiser),
-      histogram_kernel(split_proposals, stream)
+      histogram_kernel(split_proposals, stream),
+      seed(seed)
   {
     sorted_positions = legate::create_buffer<cuda::std::tuple<int32_t, int32_t>>(num_rows);
     FillPositions(sorted_positions, num_rows, stream);
@@ -1007,6 +1011,38 @@ struct TreeBuilder {
     max_batch_size = max_histogram_nodes;
   }
 
+  Tree Build(legate::TaskContext context,
+             const MatrixT& X_matrix,
+             legate::AccessorRO<double, 3> g_accessor,
+             legate::AccessorRO<double, 3> h_accessor,
+             legate::Rect<3> g_shape,
+             double alpha)
+  {
+    auto stream             = context.get_task_stream();
+    auto thrust_alloc       = ThrustAllocator(legate::Memory::GPU_FB_MEM);
+    auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream);
+
+    Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy);
+
+    this->InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha);
+
+    for (int depth = 0; depth < max_depth; ++depth) {
+      auto batches = this->PrepareBatches(depth);
+      for (auto batch : batches) {
+        auto histogram = this->GetHistogram(batch);
+
+        this->ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch);
+
+        this->PerformBestSplit(tree, histogram, alpha, batch);
+      }
+      // Update position of entire level
+      // Don't bother updating positions for the last level
+      if (depth < max_depth - 1) { this->UpdatePositions(tree, X_matrix); }
+    }
+
+    return tree;
+  }
+
   TreeBuilder(const TreeBuilder&)                    = delete;
   TreeBuilder(TreeBuilder&&)                         = delete;
   auto operator=(const TreeBuilder&) -> TreeBuilder& = delete;
@@ -1075,8 +1111,7 @@ struct TreeBuilder {
                         const MatrixT X,
                         const legate::AccessorRO<double, 3>& g,
                         const legate::AccessorRO<double, 3>& h,
-                        NodeBatch batch,
-                        int64_t seed)
+                        NodeBatch batch)
   {
     histogram_kernel.BuildHistogram(X,
                                     g,
@@ -1137,8 +1172,7 @@ struct TreeBuilder {
                       const legate::AccessorRO<double, 3>& g,
                       const legate::AccessorRO<double, 3>& h,
                       legate::Rect<3> g_shape,
-                      double alpha,
-                      int64_t seed)
+                      double alpha)
   {
     const int kBlockThreads = 256;
     const size_t blocks     = (num_rows + kBlockThreads - 1) / kBlockThreads;
@@ -1245,6 +1279,8 @@ struct TreeBuilder {
   const int32_t num_features;
   const int32_t num_outputs;
   const int32_t max_nodes;
+  const int32_t max_depth;
+  const int64_t seed;
   SparseSplitProposals<T> split_proposals;
   Histogram<IntegerGPair> histogram;
   int max_batch_size;
@@ -1285,41 +1321,23 @@ struct build_tree_fn {
     auto thrust_alloc       = ThrustAllocator(legate::Memory::GPU_FB_MEM);
     auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream);
 
-    Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy);
-
     const SparseSplitProposals<T> split_proposals =
       SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream);
 
     GradientQuantiser const quantiser(context, g_accessor, h_accessor, g_shape, stream);
 
-    // Begin building the tree
-    TreeBuilder<DenseXMatrix<T>> builder(num_rows,
-                                         num_features,
-                                         num_outputs,
-                                         stream,
-                                         tree.max_nodes,
-                                         max_depth,
-                                         split_proposals,
-                                         quantiser);
+    auto tree = TreeBuilder<DenseXMatrix<T>>(num_rows,
+                                             num_features,
+                                             num_outputs,
+                                             stream,
+                                             max_nodes,
+                                             max_depth,
+                                             split_proposals,
+                                             quantiser,
+                                             seed)
+                  .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha);
 
-    builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed);
-
-    for (int depth = 0; depth < max_depth; ++depth) {
-      auto batches = builder.PrepareBatches(depth);
-      for (auto batch : batches) {
-        auto histogram = builder.GetHistogram(batch);
-
-        builder.ComputeHistogram(
-          histogram, context, tree, X_matrix, g_accessor, h_accessor, batch, seed);
-
-        builder.PerformBestSplit(tree, histogram, alpha, batch);
-      }
-      // Update position of entire level
-      // Don't bother updating positions for the last level
-      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); }
-    }
-
-    tree.WriteTreeOutput(context, thrust_exec_policy, quantiser);
+    tree.WriteTreeOutput(context, quantiser);
 
     CHECK_CUDA(cudaStreamSynchronize(stream));
     CHECK_CUDA_STREAM(stream);
@@ -1351,12 +1369,7 @@ struct build_tree_csr_fn {
     auto dataset_rows  = context.scalars().at(5).value<int64_t>();
     auto num_features  = context.scalars().at(6).value<int64_t>();
 
-    auto stream             = context.get_task_stream();
-    auto thrust_alloc       = ThrustAllocator(legate::Memory::GPU_FB_MEM);
-    auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream);
-
-    Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy);
-
+    auto* stream = context.get_task_stream();
     CSRXMatrix<T> X_matrix(
       X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
     const SparseSplitProposals<T> split_proposals =
@@ -1365,33 +1378,18 @@ struct build_tree_csr_fn {
     GradientQuantiser quantiser(context, g_accessor, h_accessor, g_shape, stream);
 
     // Begin building the tree
-    TreeBuilder<CSRXMatrix<T>> builder(num_rows,
-                                       num_features,
-                                       num_outputs,
-                                       stream,
-                                       tree.max_nodes,
-                                       max_depth,
-                                       split_proposals,
-                                       quantiser);
-
-    builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed);
-
-    for (int depth = 0; depth < max_depth; ++depth) {
-      auto batches = builder.PrepareBatches(depth);
-      for (auto batch : batches) {
-        auto histogram = builder.GetHistogram(batch);
-
-        builder.ComputeHistogram(
-          histogram, context, tree, X_matrix, g_accessor, h_accessor, batch, seed);
-
-        builder.PerformBestSplit(tree, histogram, alpha, batch);
-      }
-      // Update position of entire level
-      // Don't bother updating positions for the last level
-      if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); }
-    }
-
-    tree.WriteTreeOutput(context, thrust_exec_policy, quantiser);
+    auto tree = TreeBuilder<CSRXMatrix<T>>(num_rows,
+                                           num_features,
+                                           num_outputs,
+                                           stream,
+                                           max_nodes,
+                                           max_depth,
+                                           split_proposals,
+                                           quantiser,
+                                           seed)
+                  .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha);
+
+    tree.WriteTreeOutput(context, quantiser);
 
     CHECK_CUDA(cudaStreamSynchronize(stream));
     CHECK_CUDA_STREAM(stream);

From 50ccc8a41af7772f530444ef71e6dc5e2bcfc7fb Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Wed, 11 Dec 2024 04:05:18 -0800
Subject: [PATCH 08/10] Add custom CSR kernel for GPU

---
 legateboost/test/test_estimator.py |  38 ++++-
 src/models/tree/build_tree.cc      |   9 +-
 src/models/tree/build_tree.cu      | 216 +++++++++++++++++++++--------
 src/models/tree/build_tree.h       |   6 +
 src/models/tree/matrix_types.h     |  54 ++++++--
 src/models/tree/predict.cc         |   9 +-
 src/models/tree/predict.cu         |   9 +-
 7 files changed, 260 insertions(+), 81 deletions(-)

diff --git a/legateboost/test/test_estimator.py b/legateboost/test/test_estimator.py
index db206ab6..17affbbf 100644
--- a/legateboost/test/test_estimator.py
+++ b/legateboost/test/test_estimator.py
@@ -227,12 +227,44 @@ def test_iterator_methods():
 )
 def test_csr_input(base_model):
     csr_matrix = pytest.importorskip("legate_sparse").csr_matrix
-    X_scipy = scipy.sparse.csr_matrix([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]])
+    X_dense = cn.array([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]])
+    X_scipy = scipy.sparse.csr_matrix(X_dense)
     X_legate_sparse = csr_matrix(X_scipy)
     y = cn.array([1.0, 2.0])
     model = lb.LBRegressor(
+        init=None,
         n_estimators=1,
         base_models=(base_model,),
+        learning_rate=1.0,
     )
-    model.fit(X_scipy, y)
-    model.fit(X_legate_sparse, y)
+    sparse_pred = model.fit(X_scipy, y).predict(X_scipy)
+    legate_sparse_pred = model.fit(X_legate_sparse, y).predict(X_legate_sparse)
+    dense_pred = model.fit(X_dense, y).predict(X_dense)
+    assert cn.allclose(sparse_pred, legate_sparse_pred)
+    assert cn.allclose(sparse_pred, dense_pred)
+
+    # Generate a sparse dataset and check that dense and sparse
+    # input give a bitwise equal result
+    rng = np.random.RandomState(0)
+    X = rng.binomial(1, 0.1, (100, 100)).astype(np.float32)
+    y = rng.randint(0, 5, 100)
+    X_scipy = scipy.sparse.csr_matrix(X)
+
+    # regression
+    params = {"base_models": (base_model,), "n_estimators": 5, "random_state": 0}
+    sparse_model = lb.LBRegressor(**params)
+    dense_model = lb.LBRegressor(**params)
+    dense_pred = dense_model.fit(X, y).predict(X)
+    X_csr = csr_matrix(X)
+    assert X_csr.nnz < X.size
+    sparse_pred = sparse_model.fit(X_csr, y).predict(X_csr)
+    assert cn.all(dense_pred == sparse_pred)
+    sanity_check_models(sparse_model)
+
+    # classification
+    sparse_model = lb.LBClassifier(**params)
+    dense_model = lb.LBClassifier(**params)
+    dense_pred = dense_model.fit(X, y).predict_proba(X)
+    sparse_pred = sparse_model.fit(X_csr, y).predict_proba(X_csr)
+    assert cn.all(dense_pred == sparse_pred)
+    sanity_check_models(sparse_model)
diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc
index a99d5e20..624b3da1 100644
--- a/src/models/tree/build_tree.cc
+++ b/src/models/tree/build_tree.cc
@@ -536,8 +536,13 @@ struct build_tree_csr_fn {
     auto dataset_rows  = context.scalars().at(5).value<int64_t>();
     auto num_features  = context.scalars().at(6).value<int64_t>();
 
-    CSRXMatrix<T> X_matrix(
-      X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
+    CSRXMatrix<T> X_matrix(X_vals_accessor,
+                           X_coords_accessor,
+                           X_offsets_accessor,
+                           X_vals_shape,
+                           X_offsets_shape,
+                           num_features,
+                           X_vals_shape.volume());
     const SparseSplitProposals<T> split_proposals =
       SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
 
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index f50565e8..51be29ff 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -210,12 +210,11 @@ using SharedMemoryHistogramType = GPairBase<int32_t>;
 const int kMaxSharedBins = 2048;  // 16KB shared memory. More is not helpful and creates more cache
                                   // misses for binary search in split_proposals.
 
-template <typename MatrixT,
+template <typename T,
           int kBlockThreads,
           int kItemsPerThread,
           int kItemsPerTile = kBlockThreads * kItemsPerThread>
 struct HistogramAgent {
-  using T                      = typename MatrixT::value_type;
   static const int kImpureTile = -1;  // Special value for a tile that is not pure (contains
                                       // multiple nodes)
   struct SharedMemoryHistogram {
@@ -273,7 +272,7 @@ struct HistogramAgent {
     }
   };
 
-  const MatrixT& X;
+  const DenseXMatrix<T>& X;
   const legate::AccessorRO<double, 3>& g;
   const legate::AccessorRO<double, 3>& h;
   const size_t& n_outputs;
@@ -289,7 +288,7 @@ struct HistogramAgent {
   int feature_stride;
   SharedMemoryHistogram shared_histogram;
 
-  __device__ HistogramAgent(const MatrixT& X,
+  __device__ HistogramAgent(const DenseXMatrix<T>& X,
                             const legate::AccessorRO<double, 3>& g,
                             const legate::AccessorRO<double, 3>& h,
                             const size_t& n_outputs,
@@ -342,8 +341,7 @@ struct HistogramAgent {
           sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node)));
       if (!computeHistogram) { continue; }
 
-      auto x = X.Get(X.RowRange().lo[0] + local_sample_idx, feature);
-      // int bin_idx = shared_split_proposals.FindBin(x, feature);
+      auto x            = X.Get(X.RowRange().lo[0] + local_sample_idx, feature);
       int const bin_idx = split_proposals.FindBin(x, feature);
 
       legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx, 0, output};
@@ -454,19 +452,19 @@ struct HistogramAgent {
 };
 
 // NOLINTBEGIN(performance-unnecessary-value-param)
-template <typename MatrixT, int kBlockThreads, int kItemsPerThread>
+template <typename T, int kBlockThreads, int kItemsPerThread>
 __global__ void __launch_bounds__(kBlockThreads)
-  fill_histogram_shared(MatrixT X,
-                        legate::AccessorRO<double, 3> g,
-                        legate::AccessorRO<double, 3> h,
-                        size_t n_outputs,
-                        SparseSplitProposals<typename MatrixT::value_type> split_proposals,
-                        NodeBatch batch,
-                        Histogram<IntegerGPair> histogram,
-                        legate::Buffer<IntegerGPair, 2> node_sums,
-                        GradientQuantiser quantiser,
-                        legate::Buffer<int> feature_groups,
-                        int64_t seed)
+  fill_histogram_dense(DenseXMatrix<T> X,
+                       legate::AccessorRO<double, 3> g,
+                       legate::AccessorRO<double, 3> h,
+                       size_t n_outputs,
+                       SparseSplitProposals<T> split_proposals,
+                       NodeBatch batch,
+                       Histogram<IntegerGPair> histogram,
+                       legate::Buffer<IntegerGPair, 2> node_sums,
+                       GradientQuantiser quantiser,
+                       legate::Buffer<int> feature_groups,
+                       int64_t seed)
 {
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,hicpp-avoid-c-arrays)
   __shared__ char shared_char[kMaxSharedBins * sizeof(SharedMemoryHistogramType)];
@@ -474,22 +472,69 @@ __global__ void __launch_bounds__(kBlockThreads)
   auto* shared_memory =
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
     reinterpret_cast<SharedMemoryHistogramType*>(shared_char);
-  HistogramAgent<MatrixT, kBlockThreads, kItemsPerThread> agent(X,
-                                                                g,
-                                                                h,
-                                                                n_outputs,
-                                                                split_proposals,
-                                                                batch,
-                                                                histogram,
-                                                                node_sums,
-                                                                quantiser,
-                                                                feature_groups,
-                                                                seed,
-                                                                shared_memory);
+  HistogramAgent<T, kBlockThreads, kItemsPerThread> agent(X,
+                                                          g,
+                                                          h,
+                                                          n_outputs,
+                                                          split_proposals,
+                                                          batch,
+                                                          histogram,
+                                                          node_sums,
+                                                          quantiser,
+                                                          feature_groups,
+                                                          seed,
+                                                          shared_memory);
   agent.BuildHistogram();
 }
 // NOLINTEND(performance-unnecessary-value-param)
 
+// NOLINTBEGIN(performance-unnecessary-value-param)
+template <typename T, int kBlockThreads>
+__global__ void __launch_bounds__(kBlockThreads)
+  fill_histogram_csr(CSRXMatrix<T> X,
+                     legate::AccessorRO<double, 3> g,
+                     legate::AccessorRO<double, 3> h,
+                     size_t n_outputs,
+                     SparseSplitProposals<T> split_proposals,
+                     NodeBatch batch,
+                     Histogram<IntegerGPair> histogram,
+                     legate::Buffer<IntegerGPair, 2> node_sums,
+                     GradientQuantiser quantiser,
+                     int64_t seed)
+{
+  // Grid stride loop over rows
+  for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < batch.InstancesInBatch();
+       idx += blockDim.x * gridDim.x) {
+    auto [sample_node, local_sample_idx] = batch.instances[idx];
+    // If we don't need to compute this node, skip
+    if (!ComputeHistogramBin(
+          sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node)))) {
+      continue;
+    }
+    // Which matrix elements belong to this row?
+    std::size_t const global_sample_idx = X.RowRange().lo[0] + local_sample_idx;
+    auto elements                       = X.row_ranges[global_sample_idx];
+    for (std::size_t element = elements.lo[0]; element <= elements.hi[0]; element++) {
+      auto feature      = X.column_indices[X.vals_shape.lo[0] + element];
+      auto x            = X.values[X.vals_shape.lo[0] + element];
+      int const bin_idx = split_proposals.FindBin(x, feature);
+      if (bin_idx == SparseSplitProposals<T>::NOT_FOUND) continue;
+      for (int output = 0; output < n_outputs; output++) {
+        legate::Point<3> p = {global_sample_idx, 0, output};
+        auto gpair_quantised =
+          quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2]));
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
+        auto* addPosition = reinterpret_cast<Histogram<IntegerGPair>::atomic_add_type*>(
+          &histogram[{sample_node, output, bin_idx}]);
+        atomicAdd(addPosition, gpair_quantised.grad);
+        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+        atomicAdd(addPosition + 1, gpair_quantised.hess);
+      }
+    }
+  }
+}
+// NOLINTEND(performance-unnecessary-value-param)
+
 // Manage the launch parameters for histogram kernel
 template <typename MatrixT, std::int32_t kBlockThreads = 1024, std::int32_t kItemsPerThread = 4>
 struct HistogramKernel {
@@ -506,10 +551,7 @@ struct HistogramKernel {
     CHECK_CUDA(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
     std::int32_t n_blocks_per_mp = 0;
     CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-      &n_blocks_per_mp,
-      fill_histogram_shared<MatrixT, kBlockThreads, kItemsPerThread>,
-      kBlockThreads,
-      0));
+      &n_blocks_per_mp, fill_histogram_dense<T, kBlockThreads, kItemsPerThread>, kBlockThreads, 0));
     this->maximum_blocks_for_occupancy = n_blocks_per_mp * n_mps;
     FindFeatureGroups(split_proposals, stream);
   }
@@ -561,24 +603,35 @@ struct HistogramKernel {
                       int64_t seed,
                       cudaStream_t stream)
   {
-    int const average_features_per_group = split_proposals.num_features / num_groups;
-    std::size_t const average_elements_per_group =
-      batch.InstancesInBatch() * average_features_per_group;
-    auto min_blocks  = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile;
-    auto x_grid_size = std::min(static_cast<uint64_t>(maximum_blocks_for_occupancy), min_blocks);
-    // Launch the kernel
-    fill_histogram_shared<MatrixT, kBlockThreads, kItemsPerThread>
-      <<<dim3(x_grid_size, num_groups, n_outputs), kBlockThreads, 0, stream>>>(X,
-                                                                               g,
-                                                                               h,
-                                                                               n_outputs,
-                                                                               split_proposals,
-                                                                               batch,
-                                                                               histogram,
-                                                                               node_sums,
-                                                                               quantiser,
-                                                                               feature_groups,
-                                                                               seed);
+    if constexpr (std::is_same_v<MatrixT, DenseXMatrix<T>>) {
+      int const average_features_per_group = split_proposals.num_features / num_groups;
+      std::size_t const average_elements_per_group =
+        batch.InstancesInBatch() * average_features_per_group;
+      auto min_blocks  = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile;
+      auto x_grid_size = std::min(static_cast<uint64_t>(maximum_blocks_for_occupancy), min_blocks);
+      // Launch the kernel
+      fill_histogram_dense<T, kBlockThreads, kItemsPerThread>
+        <<<dim3(x_grid_size, num_groups, n_outputs), kBlockThreads, 0, stream>>>(X,
+                                                                                 g,
+                                                                                 h,
+                                                                                 n_outputs,
+                                                                                 split_proposals,
+                                                                                 batch,
+                                                                                 histogram,
+                                                                                 node_sums,
+                                                                                 quantiser,
+                                                                                 feature_groups,
+                                                                                 seed);
+    } else {
+      // For sparse data we don't currently make use of feature groups or shared memory
+      // Use 1 thread per row for lack of a better option currently
+      // Other methods might involve complicated load balancing
+      auto min_blocks  = (batch.InstancesInBatch() + kItemsPerTile - 1) / kItemsPerTile;
+      auto x_grid_size = std::min(static_cast<uint64_t>(maximum_blocks_for_occupancy), min_blocks);
+      // Launch the kernel
+      fill_histogram_csr<T, kBlockThreads><<<dim3(x_grid_size), kBlockThreads, 0, stream>>>(
+        X, g, h, n_outputs, split_proposals, batch, histogram, node_sums, quantiser, seed);
+    }
   }
 };
 
@@ -684,6 +737,36 @@ struct GainFeaturePair {
   }
 };
 
+// In the case where we have a sparse matrix, gradients for 0's have not been accumulated in the
+// histogram We can infer the gradients at the matrix zeroes by subtracting the sum of the gradients
+// at the last bin (which contains gradients from every non-zero element)
+// from the sum of the gradients in the node (this sum always includes gradients for every element
+// in that node)
+template <typename T>
+__device__ auto GetSparseSum(Histogram<IntegerGPair>& histogram,
+                             const SparseSplitProposals<T>& split_proposals,
+                             const IntegerGPair& node_sum,
+                             int node_id,
+                             int output,
+                             int bin_idx)
+{
+  auto left_sum                     = vectorised_load(&histogram[{node_id, output, bin_idx}]);
+  auto right_sum                    = node_sum - left_sum;
+  auto feature                      = split_proposals.FindFeature(bin_idx);
+  auto [feature_begin, feature_end] = split_proposals.FeatureRange(feature);
+  auto scan_sum   = vectorised_load(&histogram[{node_id, output, feature_end - 1}]);
+  auto zero_bin   = split_proposals.FindBin(0.0, feature);
+  auto sparse_sum = node_sum - scan_sum;
+  if (zero_bin == SparseSplitProposals<T>::NOT_FOUND || bin_idx < zero_bin) {
+    // Do nothing, this amount is already on the right
+  } else {
+    // Move it to the left
+    left_sum += sparse_sum;
+    right_sum -= sparse_sum;
+  }
+  return std::make_tuple(left_sum, right_sum);
+}
+
 // NOLINTBEGIN(performance-unnecessary-value-param)
 template <typename TYPE, int BLOCK_THREADS>
 __global__ void __launch_bounds__(BLOCK_THREADS)
@@ -718,9 +801,13 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
        bin_idx += BLOCK_THREADS) {
     double gain = 0;
     for (int output = 0; output < n_outputs; ++output) {
-      auto node_sum  = vectorised_load(&node_sums[{node_id, output}]);
-      auto left_sum  = vectorised_load(&histogram[{node_id, output, bin_idx}]);
-      auto right_sum = node_sum - left_sum;
+      auto node_sum = vectorised_load(&node_sums[{node_id, output}]);
+
+      auto [left_sum, right_sum] =
+        GetSparseSum(histogram, split_proposals, node_sum, node_id, output, bin_idx);
+      // printf("node %d , bin %d,  left sum %ld %ld right sum %ld %ld \n", node_id ,bin_idx,
+      // left_sum.grad, left_sum.hess, right_sum.grad, right_sum.hess);
+
       if (left_sum.hess <= 0 || right_sum.hess <= 0) {
         gain = 0;
         break;
@@ -755,9 +842,11 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
   if (node_best_gain > eps) {
     int const node_best_feature = split_proposals.FindFeature(node_best_bin_idx);
     for (int output = narrow_cast<int>(threadIdx.x); output < n_outputs; output += BLOCK_THREADS) {
-      auto node_sum  = vectorised_load(&node_sums[{node_id, output}]);
-      auto left_sum  = vectorised_load(&histogram[{node_id, output, node_best_bin_idx}]);
-      auto right_sum = node_sum - left_sum;
+      auto node_sum = vectorised_load(&node_sums[{node_id, output}]);
+
+      auto [left_sum, right_sum] =
+        GetSparseSum(histogram, split_proposals, node_sum, node_id, output, node_best_bin_idx);
+
       node_sums[{BinaryTree::LeftChild(node_id), output}]  = left_sum;
       node_sums[{BinaryTree::RightChild(node_id), output}] = right_sum;
 
@@ -1069,6 +1158,8 @@ struct TreeBuilder {
                 X.Get(X.RowRange().lo[0] + static_cast<int64_t>(row), tree_feature_span[pos]);
               bool left = x_value <= tree_split_value_span[pos];
               pos       = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
+              // printf("Row %d, feature %d, value %f pos %d\n", row, tree_feature_span[pos],
+              // x_value, pos);
               sorted_positions[idx] = cuda::std::make_tuple(pos, row);
             });
     CHECK_CUDA_STREAM(stream);
@@ -1370,8 +1461,13 @@ struct build_tree_csr_fn {
     auto num_features  = context.scalars().at(6).value<int64_t>();
 
     auto* stream = context.get_task_stream();
-    CSRXMatrix<T> X_matrix(
-      X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
+    CSRXMatrix<T> X_matrix(X_vals_accessor,
+                           X_coords_accessor,
+                           X_offsets_accessor,
+                           X_vals_shape,
+                           X_offsets_shape,
+                           num_features,
+                           X_vals_shape.volume());
     const SparseSplitProposals<T> split_proposals =
       SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream);
 
diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h
index ace0eef3..a0026b70 100644
--- a/src/models/tree/build_tree.h
+++ b/src/models/tree/build_tree.h
@@ -40,6 +40,12 @@ struct GPairBase {
     this->hess += b.hess;
     return *this;
   }
+  __host__ __device__ auto operator-=(const GPairBase<T>& b) -> GPairBase<T>&
+  {
+    this->grad -= b.grad;
+    this->hess -= b.hess;
+    return *this;
+  }
 };
 
 template <typename T>
diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h
index 508b351f..10891e02 100644
--- a/src/models/tree/matrix_types.h
+++ b/src/models/tree/matrix_types.h
@@ -16,6 +16,11 @@
 #pragma once
 #include <legate.h>
 #include <cstdint>
+#ifdef __CUDACC__
+#include <thrust/binary_search.h>
+#else
+#include <algorithm>
+#endif
 
 // Create a uniform interface to two matrix formats
 // Dense and CSR
@@ -30,7 +35,12 @@ class DenseXMatrix {
 
  public:
   DenseXMatrix(legate::AccessorRO<T, 3> x, legate::Rect<3> shape) : x(x), shape(shape) {}
-  __host__ __device__ T Get(uint32_t i, uint32_t j) const { return x[legate::Point<3>{i, j, 0}]; }
+  // Global row index refers to the index across partitions
+  // For features, each worker has every feature so the global is the same as the local index
+  __host__ __device__ T Get(std::size_t global_row_idx, uint32_t feature_idx) const
+  {
+    return x[legate::Point<3>{global_row_idx, feature_idx, 0}];
+  }
   __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; }
   __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const
   {
@@ -43,38 +53,58 @@ class CSRXMatrix {
  public:
   using value_type = T;
 
- private:
   legate::AccessorRO<T, 1> values;
+  legate::Rect<1, legate::coord_t> vals_shape;
   legate::AccessorRO<int64_t, 1> column_indices;
   legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges;
   legate::Rect<1, legate::coord_t> row_ranges_shape;
   int num_features;
+  std::size_t nnz;  // The number of nnz in ths local partition
 
- public:
   CSRXMatrix(legate::AccessorRO<T, 1> values,
              legate::AccessorRO<int64_t, 1> column_indices,
              legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges,
+             legate::Rect<1, legate::coord_t> vals_shape,
              legate::Rect<1, legate::coord_t> row_ranges_shape,
-             int num_features)
+             int num_features,
+             std::size_t nnz)
     : values(values),
       column_indices(column_indices),
       row_ranges(row_ranges),
       num_features(num_features),
-      row_ranges_shape(row_ranges_shape)
+      vals_shape(vals_shape),
+      row_ranges_shape(row_ranges_shape),
+      nnz(nnz)
   {
   }
 
-  // Slower than dense due to search for column index
-  __host__ __device__ T Get(uint32_t i, uint32_t j) const
+  // Global row index refers to the index across partitions
+  // For features, each worker has every feature so the global is the same as the local index
+  // This method is less efficient than its Dense counterpart due to the need to search for the
+  // feature
+  __host__ __device__ T Get(std::size_t global_row_idx, uint32_t feature_idx) const
   {
-    auto row_range = row_ranges[i];
-    // TODO(Rory): Binary search?
-    for (int64_t k = row_range.lo; k <= row_range.hi; k++) {
-      if (column_indices[k] == j) return values[k];
-      if (column_indices[k] > j) return 0;
+    auto row_range = row_ranges[global_row_idx];
+
+    tcb::span<const int64_t> column_indices_span(column_indices.ptr(row_range.lo),
+                                                 row_range.volume());
+
+#ifdef __CUDACC__
+    auto result = thrust::lower_bound(
+      thrust::seq, column_indices_span.begin(), column_indices_span.end(), feature_idx);
+#else
+    auto result =
+      std::lower_bound(column_indices_span.begin(), column_indices_span.end(), feature_idx);
+#endif
+
+    if (result != column_indices_span.end() && *result == feature_idx) {
+      return values[row_range.lo + (result - column_indices_span.begin())];
     }
     return 0;
   }
+
+  auto NNZ() const { return nnz; }
+
   __host__ __device__ int NumFeatures() const { return num_features; }
   __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; }
 };
diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc
index d24eaf1f..19f80bc2 100644
--- a/src/models/tree/predict.cc
+++ b/src/models/tree/predict.cc
@@ -96,8 +96,13 @@ struct predict_csr_fn {
     auto pred_accessor = pred.write_accessor<double, 3>();
 
     auto num_features = context.scalars().at(0).value<int32_t>();
-    CSRXMatrix<T> X(
-      X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
+    CSRXMatrix<T> X(X_vals_accessor,
+                    X_coords_accessor,
+                    X_offsets_accessor,
+                    X_vals_shape,
+                    X_offsets_shape,
+                    num_features,
+                    X_vals_shape.volume());
 
     EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape);
 
diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu
index 181070f3..c1c94770 100644
--- a/src/models/tree/predict.cu
+++ b/src/models/tree/predict.cu
@@ -109,8 +109,13 @@ struct predict_csr_fn {
     auto pred_accessor = pred.write_accessor<double, 3>();
 
     auto num_features = context.scalars().at(0).value<int32_t>();
-    CSRXMatrix<T> X(
-      X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features);
+    CSRXMatrix<T> X(X_vals_accessor,
+                    X_coords_accessor,
+                    X_offsets_accessor,
+                    X_vals_shape,
+                    X_offsets_shape,
+                    num_features,
+                    X_vals_shape.volume());
 
     EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape);
 

From ea45f872609563b48c1e3f7a16443743b97819ec Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Thu, 12 Dec 2024 03:21:10 -0800
Subject: [PATCH 09/10] Add CPU sparse algorithm

---
 legateboost/test/models/test_tree.py |   1 +
 legateboost/test/test_estimator.py   |  11 ++-
 src/models/tree/build_tree.cc        | 119 ++++++++++++++++++++-------
 src/models/tree/build_tree.cu        |  71 ++++++----------
 src/models/tree/build_tree.h         |  34 ++++++++
 src/models/tree/matrix_types.h       |   7 +-
 src/models/tree/predict.cc           |   2 +-
 src/models/tree/predict.cu           |   4 +-
 8 files changed, 162 insertions(+), 87 deletions(-)

diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py
index 7d5f2e04..bb9fa9aa 100644
--- a/legateboost/test/models/test_tree.py
+++ b/legateboost/test/models/test_tree.py
@@ -88,4 +88,5 @@ def test_csr():
         .set_random_state(np.random.RandomState(2))
         .fit(X, g, h)
     )
+    print(model)
     assert np.allclose(model.predict(X), -g / h)
diff --git a/legateboost/test/test_estimator.py b/legateboost/test/test_estimator.py
index 17affbbf..5af10a73 100644
--- a/legateboost/test/test_estimator.py
+++ b/legateboost/test/test_estimator.py
@@ -244,7 +244,10 @@ def test_csr_input(base_model):
     assert cn.allclose(sparse_pred, dense_pred)
 
     # Generate a sparse dataset and check that dense and sparse
-    # input give a bitwise equal result
+    # input give equivalent results
+    # unfortunately we can't test that they are bitwise identical
+    # the changing order of floating point sums can lead to different results
+    # so instead assert that sparse models are roughly as accurate as dense ones
     rng = np.random.RandomState(0)
     X = rng.binomial(1, 0.1, (100, 100)).astype(np.float32)
     y = rng.randint(0, 5, 100)
@@ -258,7 +261,8 @@ def test_csr_input(base_model):
     X_csr = csr_matrix(X)
     assert X_csr.nnz < X.size
     sparse_pred = sparse_model.fit(X_csr, y).predict(X_csr)
-    assert cn.all(dense_pred == sparse_pred)
+    assert sparse_model.score(X, y) > 0.4
+    assert dense_model.score(X, y) > 0.4
     sanity_check_models(sparse_model)
 
     # classification
@@ -266,5 +270,6 @@ def test_csr_input(base_model):
     dense_model = lb.LBClassifier(**params)
     dense_pred = dense_model.fit(X, y).predict_proba(X)
     sparse_pred = sparse_model.fit(X_csr, y).predict_proba(X_csr)
-    assert cn.all(dense_pred == sparse_pred)
+    assert sparse_model.score(X, y) > 0.8
+    assert dense_model.score(X, y) > 0.8
     sanity_check_models(sparse_model)
diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc
index 624b3da1..55957ecc 100644
--- a/src/models/tree/build_tree.cc
+++ b/src/models/tree/build_tree.cc
@@ -146,7 +146,7 @@ auto SelectSplitSamples(legate::TaskContext context,
   auto draft_proposals = legate::create_buffer<T, 2>({X.NumFeatures(), split_samples});
   for (int i = 0; i < split_samples; i++) {
     auto row            = row_samples[i];
-    const bool has_data = X.RowRange().contains(row);
+    const bool has_data = X.RowSubset().contains(row);
     for (int j = 0; j < X.NumFeatures(); j++) {
       draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0);
     }
@@ -230,24 +230,51 @@ struct TreeBuilder {
     return tree;
   }
 
-  void ComputeHistogram(Histogram<GPair> histogram,
-                        legate::TaskContext context,
-                        Tree& tree,
-                        const MatrixT& X,
-                        const legate::AccessorRO<double, 3>& g,
-                        const legate::AccessorRO<double, 3>& h,
-                        NodeBatch batch)
+  void DenseHistogramKernel(const Tree& tree,
+                            Histogram<GPair>& histogram,
+                            const DenseXMatrix<T>& X,
+                            legate::AccessorRO<double, 3> g,
+                            legate::AccessorRO<double, 3> h,
+                            NodeBatch batch)
   {
     // Build the histogram
     for (auto [position, index_local] : batch) {
-      auto index_global  = index_local + X.RowRange().lo[0];
+      auto index_global  = index_local + X.RowSubset().lo[0];
       bool const compute = ComputeHistogramBin(
         position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position)));
       if (position < 0 || !compute) { continue; }
       for (int64_t j = 0; j < num_features; j++) {
         auto x_value      = X.Get(index_global, j);
         int const bin_idx = split_proposals.FindBin(x_value, j);
+        if (bin_idx != SparseSplitProposals<T>::NOT_FOUND) {
+          for (int64_t k = 0; k < num_outputs; ++k) {
+            histogram[{position, k, bin_idx}] +=
+              GPair{g[{index_global, 0, k}], h[{index_global, 0, k}]};
+          }
+        }
+      }
+    }
+  }
 
+  // Kernel specialised to iterate only over the non-zero elements of the sparse matrix
+  void CSRHistogramKernel(const Tree& tree,
+                          Histogram<GPair>& histogram,
+                          const CSRXMatrix<T>& X,
+                          legate::AccessorRO<double, 3> g,
+                          legate::AccessorRO<double, 3> h,
+                          NodeBatch batch)
+  {
+    // Build the histogram
+    for (auto [position, index_local] : batch) {
+      auto index_global  = index_local + X.RowSubset().lo[0];
+      bool const compute = ComputeHistogramBin(
+        position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position)));
+      if (position < 0 || !compute) { continue; }
+      auto row_range = X.row_ranges[index_global];
+      for (auto element_idx = row_range.lo[0]; element_idx <= row_range.hi[0]; element_idx++) {
+        auto feature      = X.column_indices[element_idx];
+        auto x            = X.values[element_idx];
+        int const bin_idx = split_proposals.FindBin(x, feature);
         if (bin_idx != SparseSplitProposals<T>::NOT_FOUND) {
           for (int64_t k = 0; k < num_outputs; ++k) {
             histogram[{position, k, bin_idx}] +=
@@ -256,6 +283,21 @@ struct TreeBuilder {
         }
       }
     }
+  }
+
+  void ComputeHistogram(Histogram<GPair> histogram,
+                        legate::TaskContext context,
+                        const Tree& tree,
+                        const MatrixT& X,
+                        const legate::AccessorRO<double, 3>& g,
+                        const legate::AccessorRO<double, 3>& h,
+                        NodeBatch batch)
+  {
+    if constexpr (std::is_same_v<MatrixT, DenseXMatrix<T>>) {
+      this->DenseHistogramKernel(tree, histogram, X, g, h, batch);
+    } else {
+      this->CSRHistogramKernel(tree, histogram, X, g, h, batch);
+    }
 
     // NCCL cannot allreduce custom types, need to reinterpret as double
     SumAllReduce(
@@ -266,7 +308,7 @@ struct TreeBuilder {
     this->Scan(histogram, batch, tree);
   }
 
-  void Scan(Histogram<GPair> histogram, NodeBatch batch, Tree& tree)
+  void Scan(Histogram<GPair> histogram, NodeBatch batch, const Tree& tree)
   {
     auto scan_node_histogram = [&](int node_idx) {
       for (int feature = 0; feature < num_features; feature++) {
@@ -319,6 +361,7 @@ struct TreeBuilder {
   }
   void PerformBestSplit(Tree& tree, Histogram<GPair> histogram, double alpha, NodeBatch batch)
   {
+    const bool is_sparse_matrix = std::is_same_v<MatrixT, CSRXMatrix<T>>;
     for (int node_id = batch.node_idx_begin; node_id < batch.node_idx_end; node_id++) {
       double best_gain = 0;
       int best_feature = -1;
@@ -328,10 +371,18 @@ struct TreeBuilder {
         for (int bin_idx = feature_begin; bin_idx < feature_end; bin_idx++) {
           double gain = 0;
           for (int output = 0; output < num_outputs; ++output) {
-            auto [G_L, H_L]  = histogram[{node_id, output, bin_idx}];
-            auto [G, H]      = tree.node_sums[{node_id, output}];
-            auto G_R         = G - G_L;
-            auto H_R         = H - H_L;
+            auto [left_sum, right_sum] = InferSplitSums(histogram,
+                                                        split_proposals,
+                                                        tree.node_sums[{node_id, output}],
+                                                        node_id,
+                                                        output,
+                                                        bin_idx,
+                                                        feature,
+                                                        is_sparse_matrix);
+            auto [G_L, H_L]            = left_sum;
+            auto [G_R, H_R]            = right_sum;
+            auto [G, H]                = tree.node_sums[{node_id, output}];
+
             double const reg = std::max(eps, alpha);  // Regularisation term
             gain +=
               0.5 * ((G_L * G_L) / (H_L + reg) + (G_R * G_R) / (H_R + reg) - (G * G) / (H + reg));
@@ -344,29 +395,33 @@ struct TreeBuilder {
         }
       }
       if (best_gain > eps) {
-        std::vector<double> left_leaf(num_outputs);
-        std::vector<double> right_leaf(num_outputs);
-        std::vector<GPair> left_sum(num_outputs);
-        std::vector<GPair> right_sum(num_outputs);
+        std::vector<double> left_leaves(num_outputs);
+        std::vector<double> right_leaves(num_outputs);
+        std::vector<GPair> left_sums(num_outputs);
+        std::vector<GPair> right_sums(num_outputs);
         for (int output = 0; output < num_outputs; ++output) {
-          auto [G_L, H_L]    = histogram[{node_id, output, best_bin}];
-          auto [G, H]        = tree.node_sums[{node_id, output}];
-          auto G_R           = G - G_L;
-          auto H_R           = H - H_L;
-          left_leaf[output]  = CalculateLeafValue(G_L, H_L, alpha);
-          right_leaf[output] = CalculateLeafValue(G_R, H_R, alpha);
-          left_sum[output]   = {G_L, H_L};
-          right_sum[output]  = {G_R, H_R};
+          auto [left_sum, right_sum] = InferSplitSums(histogram,
+                                                      split_proposals,
+                                                      tree.node_sums[{node_id, output}],
+                                                      node_id,
+                                                      output,
+                                                      best_bin,
+                                                      best_feature,
+                                                      is_sparse_matrix);
+          left_leaves[output]        = CalculateLeafValue(left_sum.grad, left_sum.hess, alpha);
+          right_leaves[output]       = CalculateLeafValue(right_sum.grad, right_sum.hess, alpha);
+          left_sums[output]          = left_sum;
+          right_sums[output]         = right_sum;
         }
-        if (left_sum[0].hess <= 0.0 || right_sum[0].hess <= 0.0) { continue; }
+        if (left_sums[0].hess <= 0.0 || right_sums[0].hess <= 0.0) { continue; }
         tree.AddSplit(node_id,
                       best_feature,
                       split_proposals.split_proposals[legate::coord_t{best_bin}],
-                      left_leaf,
-                      right_leaf,
+                      left_leaves,
+                      right_leaves,
                       best_gain,
-                      left_sum,
-                      right_sum);
+                      left_sums,
+                      right_sums);
       }
     }
   }
@@ -379,7 +434,7 @@ struct TreeBuilder {
         sorted_positions[i] = {-1, index_local};
         continue;
       }
-      auto x              = X.Get(X.RowRange().lo[0] + index_local, tree.feature[pos]);
+      auto x              = X.Get(X.RowSubset().lo[0] + index_local, tree.feature[pos]);
       bool const left     = x <= tree.split_value[pos];
       pos                 = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
       sorted_positions[i] = {pos, index_local};
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index 51be29ff..884f945c 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -341,10 +341,10 @@ struct HistogramAgent {
           sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node)));
       if (!computeHistogram) { continue; }
 
-      auto x            = X.Get(X.RowRange().lo[0] + local_sample_idx, feature);
+      auto x            = X.Get(X.RowSubset().lo[0] + local_sample_idx, feature);
       int const bin_idx = split_proposals.FindBin(x, feature);
 
-      legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx, 0, output};
+      legate::Point<3> p = {X.RowSubset().lo[0] + local_sample_idx, 0, output};
       auto gpair_quantised =
         quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2]));
       // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
@@ -391,7 +391,7 @@ struct HistogramAgent {
     std::array<T, kItemsPerThread> x{};
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
-      x[i] = X.Get(X.RowRange().lo[0] + local_sample_idx[i], feature[i]);
+      x[i] = X.Get(X.RowSubset().lo[0] + local_sample_idx[i], feature[i]);
     }
 
     std::array<int, kItemsPerThread> bin_idx{};
@@ -402,7 +402,7 @@ struct HistogramAgent {
     std::array<IntegerGPair, kItemsPerThread> gpair{};
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
-      legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx[i], 0, output};
+      legate::Point<3> p = {X.RowSubset().lo[0] + local_sample_idx[i], 0, output};
       gpair[i] =
         bin_idx[i] != SparseSplitProposals<T>::NOT_FOUND
           ? quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2]))
@@ -512,7 +512,7 @@ __global__ void __launch_bounds__(kBlockThreads)
       continue;
     }
     // Which matrix elements belong to this row?
-    std::size_t const global_sample_idx = X.RowRange().lo[0] + local_sample_idx;
+    std::size_t const global_sample_idx = X.RowSubset().lo[0] + local_sample_idx;
     auto elements                       = X.row_ranges[global_sample_idx];
     for (std::size_t element = elements.lo[0]; element <= elements.hi[0]; element++) {
       auto feature      = X.column_indices[X.vals_shape.lo[0] + element];
@@ -737,42 +737,12 @@ struct GainFeaturePair {
   }
 };
 
-// In the case where we have a sparse matrix, gradients for 0's have not been accumulated in the
-// histogram We can infer the gradients at the matrix zeroes by subtracting the sum of the gradients
-// at the last bin (which contains gradients from every non-zero element)
-// from the sum of the gradients in the node (this sum always includes gradients for every element
-// in that node)
-template <typename T>
-__device__ auto GetSparseSum(Histogram<IntegerGPair>& histogram,
-                             const SparseSplitProposals<T>& split_proposals,
-                             const IntegerGPair& node_sum,
-                             int node_id,
-                             int output,
-                             int bin_idx)
-{
-  auto left_sum                     = vectorised_load(&histogram[{node_id, output, bin_idx}]);
-  auto right_sum                    = node_sum - left_sum;
-  auto feature                      = split_proposals.FindFeature(bin_idx);
-  auto [feature_begin, feature_end] = split_proposals.FeatureRange(feature);
-  auto scan_sum   = vectorised_load(&histogram[{node_id, output, feature_end - 1}]);
-  auto zero_bin   = split_proposals.FindBin(0.0, feature);
-  auto sparse_sum = node_sum - scan_sum;
-  if (zero_bin == SparseSplitProposals<T>::NOT_FOUND || bin_idx < zero_bin) {
-    // Do nothing, this amount is already on the right
-  } else {
-    // Move it to the left
-    left_sum += sparse_sum;
-    right_sum -= sparse_sum;
-  }
-  return std::make_tuple(left_sum, right_sum);
-}
-
 // NOLINTBEGIN(performance-unnecessary-value-param)
-template <typename TYPE, int BLOCK_THREADS>
+template <typename T, int BLOCK_THREADS>
 __global__ void __launch_bounds__(BLOCK_THREADS)
   perform_best_split(Histogram<IntegerGPair> histogram,
                      size_t n_outputs,
-                     SparseSplitProposals<TYPE> split_proposals,
+                     SparseSplitProposals<T> split_proposals,
                      double eps,
                      double alpha,
                      legate::Buffer<double, 2> tree_leaf_value,
@@ -781,7 +751,8 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
                      legate::Buffer<double, 1> tree_split_value,
                      legate::Buffer<double, 1> tree_gain,
                      NodeBatch batch,
-                     GradientQuantiser quantiser)
+                     GradientQuantiser quantiser,
+                     bool is_sparse)
 {
   // using one block per (level) node to have blockwise reductions
   int const node_id = narrow<int>(batch.node_idx_begin + blockIdx.x);
@@ -803,10 +774,9 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
     for (int output = 0; output < n_outputs; ++output) {
       auto node_sum = vectorised_load(&node_sums[{node_id, output}]);
 
-      auto [left_sum, right_sum] =
-        GetSparseSum(histogram, split_proposals, node_sum, node_id, output, bin_idx);
-      // printf("node %d , bin %d,  left sum %ld %ld right sum %ld %ld \n", node_id ,bin_idx,
-      // left_sum.grad, left_sum.hess, right_sum.grad, right_sum.hess);
+      auto feature               = split_proposals.FindFeature(bin_idx);
+      auto [left_sum, right_sum] = InferSplitSums(
+        histogram, split_proposals, node_sum, node_id, output, bin_idx, feature, is_sparse);
 
       if (left_sum.hess <= 0 || right_sum.hess <= 0) {
         gain = 0;
@@ -844,8 +814,14 @@ __global__ void __launch_bounds__(BLOCK_THREADS)
     for (int output = narrow_cast<int>(threadIdx.x); output < n_outputs; output += BLOCK_THREADS) {
       auto node_sum = vectorised_load(&node_sums[{node_id, output}]);
 
-      auto [left_sum, right_sum] =
-        GetSparseSum(histogram, split_proposals, node_sum, node_id, output, node_best_bin_idx);
+      auto [left_sum, right_sum] = InferSplitSums(histogram,
+                                                  split_proposals,
+                                                  node_sum,
+                                                  node_id,
+                                                  output,
+                                                  node_best_bin_idx,
+                                                  node_best_feature,
+                                                  is_sparse);
 
       node_sums[{BinaryTree::LeftChild(node_id), output}]  = left_sum;
       node_sums[{BinaryTree::RightChild(node_id), output}] = right_sum;
@@ -978,7 +954,7 @@ SparseSplitProposals<T> SelectSplitSamples(legate::TaskContext context,
     auto i                  = idx / X.NumFeatures();
     auto j                  = idx % X.NumFeatures();
     auto row                = row_samples[i];
-    bool has_data           = X.RowRange().contains(row);
+    bool has_data           = X.RowSubset().contains(row);
     draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0);
   });
 
@@ -1155,7 +1131,7 @@ struct TreeBuilder {
               }
 
               double x_value =
-                X.Get(X.RowRange().lo[0] + static_cast<int64_t>(row), tree_feature_span[pos]);
+                X.Get(X.RowSubset().lo[0] + static_cast<int64_t>(row), tree_feature_span[pos]);
               bool left = x_value <= tree_split_value_span[pos];
               pos       = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
               // printf("Row %d, feature %d, value %f pos %d\n", row, tree_feature_span[pos],
@@ -1255,7 +1231,8 @@ struct TreeBuilder {
                                                            tree.split_value,
                                                            tree.gain,
                                                            batch,
-                                                           quantiser);
+                                                           quantiser,
+                                                           std::is_same_v<MatrixT, CSRXMatrix<T>>);
     CHECK_CUDA_STREAM(stream);
   }
   void InitialiseRoot(legate::TaskContext context,
diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h
index a0026b70..9c3f7e12 100644
--- a/src/models/tree/build_tree.h
+++ b/src/models/tree/build_tree.h
@@ -249,6 +249,40 @@ class Histogram {
   }
 };
 
+// From the scanned histogram gradients and the node sums, infer the gradients for the left and
+// right partitions For a dense matrix the left sum is the scanned histogram and the right sum is
+// the node sum minus the left sum In the case where we have a sparse matrix, gradients for 0's have
+// not been accumulated in the histogram. We can infer the gradients at the matrix zeroes by
+// subtracting the sum of the gradients at the last bin (which contains gradients from every
+// non-zero element) from the sum of the gradients in the node (this sum always includes gradients
+// for every element in that node)
+template <typename T, typename GPairT>
+__host__ __device__ auto InferSplitSums(Histogram<GPairT>& scanned_histogram,
+                                        const SparseSplitProposals<T>& split_proposals,
+                                        const GPairT& node_sum,
+                                        int node_id,
+                                        int output,
+                                        int bin_idx,
+                                        int feature,
+                                        bool is_sparse) -> std::tuple<GPairT, GPairT>
+{
+  auto left_sum  = scanned_histogram[{node_id, output, bin_idx}];
+  auto right_sum = node_sum - left_sum;
+  if (!is_sparse) { return std::make_tuple(left_sum, right_sum); }
+  auto [feature_begin, feature_end] = split_proposals.FeatureRange(feature);
+  auto scan_sum                     = scanned_histogram[{node_id, output, feature_end - 1}];
+  auto zero_bin                     = split_proposals.FindBin(0.0, feature);
+  auto sparse_sum                   = node_sum - scan_sum;
+  if (zero_bin == SparseSplitProposals<T>::NOT_FOUND || bin_idx < zero_bin) {
+    // Do nothing, this amount is already on the right
+  } else {
+    // Move it to the left
+    left_sum += sparse_sum;
+    right_sum -= sparse_sum;
+  }
+  return std::make_tuple(left_sum, right_sum);
+}
+
 class BuildTreeDenseTask : public Task<BuildTreeDenseTask, BUILD_TREE> {
  public:
   static void cpu_variant(legate::TaskContext context);
diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h
index 10891e02..a7c21839 100644
--- a/src/models/tree/matrix_types.h
+++ b/src/models/tree/matrix_types.h
@@ -42,7 +42,7 @@ class DenseXMatrix {
     return x[legate::Point<3>{global_row_idx, feature_idx, 0}];
   }
   __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; }
-  __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const
+  __host__ __device__ legate::Rect<1, legate::coord_t> RowSubset() const
   {
     return {shape.lo[0], shape.hi[0]};
   }
@@ -106,5 +106,8 @@ class CSRXMatrix {
   auto NNZ() const { return nnz; }
 
   __host__ __device__ int NumFeatures() const { return num_features; }
-  __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; }
+  __host__ __device__ legate::Rect<1, legate::coord_t> RowSubset() const
+  {
+    return row_ranges_shape;
+  }
 };
diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc
index 19f80bc2..0e5cb88e 100644
--- a/src/models/tree/predict.cc
+++ b/src/models/tree/predict.cc
@@ -29,7 +29,7 @@ void PredictRows(const MatrixT& X,
                  legate::AccessorRO<int32_t, 1> feature,
                  legate::AccessorRO<double, 2> leaf_value)
 {
-  for (int64_t i = X.RowRange().lo[0]; i <= X.RowRange().hi[0]; i++) {
+  for (int64_t i = X.RowSubset().lo[0]; i <= X.RowSubset().hi[0]; i++) {
     int pos = 0;
     // Use a max depth of 100 to avoid infinite loops
     const int max_depth = 100;
diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu
index c1c94770..3b573c5d 100644
--- a/src/models/tree/predict.cu
+++ b/src/models/tree/predict.cu
@@ -37,7 +37,7 @@ void PredictRows(const MatrixT& X,
   // rowwise kernel
   auto prediction_lambda = [=] __device__(size_t idx) {
     int64_t pos         = 0;
-    auto global_row_idx = X.RowRange().lo + idx;
+    auto global_row_idx = X.RowSubset().lo + idx;
     // Use a max depth of 100 to avoid infinite loops
     const int max_depth = 100;
     for (int depth = 0; depth < max_depth; depth++) {
@@ -50,7 +50,7 @@ void PredictRows(const MatrixT& X,
     }
   };  // NOLINT(readability/braces)
 
-  LaunchN(X.RowRange().volume(), stream, prediction_lambda);
+  LaunchN(X.RowSubset().volume(), stream, prediction_lambda);
   CHECK_CUDA_STREAM(stream);
 }
 

From 936570e8edd957c95a7045d6a3a4ea3832b91334 Mon Sep 17 00:00:00 2001
From: Rory Mitchell <r.a.mitchell.nz@gmail.com>
Date: Thu, 9 Jan 2025 01:04:47 -0800
Subject: [PATCH 10/10] clang-tidy and other fixes

---
 src/models/tree/build_tree.cc  | 47 +++++++++++------------
 src/models/tree/build_tree.cu  | 69 +++++++++++++++++-----------------
 src/models/tree/build_tree.h   | 10 ++++-
 src/models/tree/matrix_types.h | 45 +++++++++++++---------
 src/models/tree/predict.cc     | 22 +++++------
 src/models/tree/predict.cu     | 22 +++++------
 6 files changed, 116 insertions(+), 99 deletions(-)

diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc
index de3a0f39..0d14872a 100644
--- a/src/models/tree/build_tree.cc
+++ b/src/models/tree/build_tree.cc
@@ -172,7 +172,7 @@ auto SelectSplitSamples(legate::TaskContext context,
 
   // Set the largest split sample to +inf such that an element must belong to one of the bins
   // i.e. we cannot go off the end when searching for a bin
-  for (int feature = 0; feature < num_features; feature++) {
+  for (int feature = 0; feature < X.NumFeatures(); feature++) {
     auto end                 = row_pointers[feature + 1];
     split_proposals[end - 1] = std::numeric_limits<T>::infinity();
   }
@@ -213,12 +213,12 @@ struct TreeBuilder {
     max_batch_size = max_histogram_nodes;
   }
 
-  Tree Build(legate::TaskContext context,
+  auto Build(legate::TaskContext context,
              const MatrixT& X_matrix,
-             legate::AccessorRO<double, 3> g_accessor,
-             legate::AccessorRO<double, 3> h_accessor,
+             const legate::AccessorRO<double, 3>& g_accessor,
+             const legate::AccessorRO<double, 3>& h_accessor,
              legate::Rect<3> g_shape,
-             double alpha)
+             double alpha) -> Tree
   {
     // Begin building the tree
     Tree tree(max_nodes, narrow<int>(num_outputs));
@@ -242,8 +242,8 @@ struct TreeBuilder {
   void DenseHistogramKernel(const Tree& tree,
                             Histogram<GPair>& histogram,
                             const DenseXMatrix<T>& X,
-                            legate::AccessorRO<double, 3> g,
-                            legate::AccessorRO<double, 3> h,
+                            const legate::AccessorRO<double, 3>& g,
+                            const legate::AccessorRO<double, 3>& h,
                             NodeBatch batch)
   {
     // Build the histogram
@@ -268,8 +268,8 @@ struct TreeBuilder {
   void CSRHistogramKernel(const Tree& tree,
                           Histogram<GPair>& histogram,
                           const CSRXMatrix<T>& X,
-                          legate::AccessorRO<double, 3> g,
-                          legate::AccessorRO<double, 3> h,
+                          const legate::AccessorRO<double, 3>& g,
+                          const legate::AccessorRO<double, 3>& h,
                           NodeBatch batch)
   {
     // Build the histogram
@@ -283,11 +283,9 @@ struct TreeBuilder {
         auto feature      = X.column_indices[element_idx];
         auto x            = X.values[element_idx];
         int const bin_idx = split_proposals.FindBin(x, feature);
-        if (bin_idx != SparseSplitProposals<T>::NOT_FOUND) {
-          for (int64_t k = 0; k < num_outputs; ++k) {
-            histogram[{position, k, bin_idx}] +=
-              GPair{g[{index_global, 0, k}], h[{index_global, 0, k}]};
-          }
+        for (int64_t k = 0; k < num_outputs; ++k) {
+          histogram[{position, k, bin_idx}] +=
+            GPair{g[{index_global, 0, k}], h[{index_global, 0, k}]};
         }
       }
     }
@@ -367,7 +365,10 @@ struct TreeBuilder {
       }
     }
   }
-  void PerformBestSplit(Tree& tree, Histogram<GPair> histogram, double alpha, NodeBatch batch)
+  void PerformBestSplit(Tree& tree,
+                        const Histogram<GPair>& histogram,
+                        double alpha,
+                        NodeBatch batch)
   {
     const bool is_sparse_matrix = std::is_same_v<MatrixT, CSRXMatrix<T>>;
     for (int node_id = batch.node_idx_begin; node_id < batch.node_idx_end; node_id++) {
@@ -560,7 +561,7 @@ struct build_tree_dense_fn {
     auto seed          = context.scalars().at(4).value<int>();
     auto dataset_rows  = context.scalars().at(5).value<int64_t>();
 
-    DenseXMatrix<T> X_matrix(X_accessor, X_shape);
+    DenseXMatrix<T> const X_matrix(X_accessor, X_shape);
 
     SparseSplitProposals<T> const split_proposals =
       SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
@@ -599,13 +600,13 @@ struct build_tree_csr_fn {
     auto dataset_rows  = context.scalars().at(5).value<int64_t>();
     auto num_features  = context.scalars().at(6).value<int64_t>();
 
-    CSRXMatrix<T> X_matrix(X_vals_accessor,
-                           X_coords_accessor,
-                           X_offsets_accessor,
-                           X_vals_shape,
-                           X_offsets_shape,
-                           num_features,
-                           X_vals_shape.volume());
+    CSRXMatrix<T> const X_matrix(X_vals_accessor,
+                                 X_coords_accessor,
+                                 X_offsets_accessor,
+                                 X_vals_shape,
+                                 X_offsets_shape,
+                                 num_features,
+                                 X_vals_shape.volume());
     const SparseSplitProposals<T> split_proposals =
       SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows);
 
diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu
index c4b528fe..e248b655 100644
--- a/src/models/tree/build_tree.cu
+++ b/src/models/tree/build_tree.cu
@@ -21,11 +21,11 @@
 #include <thrust/sort.h>
 #include <thrust/random.h>
 #include <thrust/unique.h>
-#include <cstddef>
 #include <numeric>
 #include <limits>
 #include <vector>
 #include <algorithm>
+#include <cstddef>
 #include "legate_library.h"
 #include "legateboost.h"
 #include "../../cpp_utils/cpp_utils.h"
@@ -395,7 +395,7 @@ struct HistogramAgent {
     std::array<IntegerGPair, kItemsPerThread> gpair{};
 #pragma unroll
     for (int i = 0; i < kItemsPerThread; i++) {
-      legate::Point<3> p = {sample_offset + local_sample_idx[i], 0, output};
+      legate::Point<3> p = {sample_node[i] + local_sample_idx[i], 0, output};
       gpair[i] = quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2]));
     }
 #pragma unroll
@@ -489,8 +489,8 @@ __global__ void __launch_bounds__(kBlockThreads)
                      int64_t seed)
 {
   // Grid stride loop over rows
-  for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < batch.InstancesInBatch();
-       idx += blockDim.x * gridDim.x) {
+  for (std::size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x; idx < batch.InstancesInBatch();
+       idx += static_cast<std::size_t>(blockDim.x * gridDim.x)) {
     auto [sample_node, local_sample_idx] = batch.instances[idx];
     // If we don't need to compute this node, skip
     if (!ComputeHistogramBin(
@@ -504,7 +504,6 @@ __global__ void __launch_bounds__(kBlockThreads)
       auto feature      = X.column_indices[X.vals_shape.lo[0] + element];
       auto x            = X.values[X.vals_shape.lo[0] + element];
       int const bin_idx = split_proposals.FindBin(x, feature);
-      if (bin_idx == SparseSplitProposals<T>::NOT_FOUND) continue;
       for (int output = 0; output < n_outputs; output++) {
         legate::Point<3> p = {global_sample_idx, 0, output};
         auto gpair_quantised =
@@ -872,7 +871,7 @@ struct Tree {
 
   void WriteTreeOutput(legate::TaskContext context, GradientQuantiser quantiser)
   {
-    auto stream       = context.get_task_stream();
+    auto* stream      = context.get_task_stream();
     auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM);
     auto policy       = DEFAULT_POLICY(thrust_alloc).on(stream);
 
@@ -904,8 +903,8 @@ struct Tree {
   legate::Buffer<double, 1> split_value;
   legate::Buffer<double, 1> gain;
   legate::Buffer<IntegerGPair, 2> node_sums;
-  const int num_outputs;
-  const int max_nodes;
+  int num_outputs;
+  int max_nodes;
   cudaStream_t stream;
 };
 
@@ -914,12 +913,12 @@ struct Tree {
 // Remove any duplicates
 // Return sparse matrix of split samples for each feature
 template <typename T, template <typename> class XMatrix>
-SparseSplitProposals<T> SelectSplitSamples(legate::TaskContext context,
-                                           const XMatrix<T>& X,
-                                           int split_samples,
-                                           int seed,
-                                           int64_t dataset_rows,
-                                           cudaStream_t stream)
+auto SelectSplitSamples(legate::TaskContext context,
+                        const XMatrix<T>& X,
+                        int split_samples,
+                        int seed,
+                        int64_t dataset_rows,
+                        cudaStream_t stream) -> SparseSplitProposals<T>
 {
   auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM);
   auto policy       = DEFAULT_POLICY(thrust_alloc).on(stream);
@@ -940,7 +939,7 @@ SparseSplitProposals<T> SelectSplitSamples(legate::TaskContext context,
     auto i                  = idx / X.NumFeatures();
     auto j                  = idx % X.NumFeatures();
     auto row                = row_samples[i];
-    bool has_data           = X.RowSubset().contains(row);
+    bool const has_data     = X.RowSubset().contains(row);
     draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0);
   });
 
@@ -1000,7 +999,7 @@ SparseSplitProposals<T> SelectSplitSamples(legate::TaskContext context,
 
   // Set the largest split sample to +inf such that an element must belong to one of the bins
   // i.e. we cannot go off the end when searching for a bin
-  LaunchN(num_features, stream, [=] __device__(int i) {
+  LaunchN(X.NumFeatures(), stream, [=] __device__(int i) {
     auto end                 = row_pointers_span[i + 1];
     split_proposals[end - 1] = std::numeric_limits<T>::infinity();
   });
@@ -1069,14 +1068,14 @@ struct TreeBuilder {
     max_batch_size = max_histogram_nodes;
   }
 
-  Tree Build(legate::TaskContext context,
+  auto Build(legate::TaskContext context,
              const MatrixT& X_matrix,
-             legate::AccessorRO<double, 3> g_accessor,
-             legate::AccessorRO<double, 3> h_accessor,
+             const legate::AccessorRO<double, 3>& g_accessor,
+             const legate::AccessorRO<double, 3>& h_accessor,
              legate::Rect<3> g_shape,
-             double alpha)
+             double alpha) -> Tree
   {
-    auto stream             = context.get_task_stream();
+    auto* stream            = context.get_task_stream();
     auto thrust_alloc       = ThrustAllocator(legate::Memory::GPU_FB_MEM);
     auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream);
 
@@ -1107,7 +1106,7 @@ struct TreeBuilder {
   auto operator=(TreeBuilder&&) -> TreeBuilder&      = delete;
   ~TreeBuilder()                                     = default;
 
-  void UpdatePositions(Tree& tree, const MatrixT X)
+  void UpdatePositions(Tree& tree, const MatrixT& X)
   {
     tcb::span<int32_t> const tree_feature_span(tree.feature.ptr(0), max_nodes);
     tcb::span<double> const tree_split_value_span(tree.split_value.ptr(0), max_nodes);
@@ -1123,10 +1122,10 @@ struct TreeBuilder {
                 return;
               }
 
-              double x_value =
+              double const x_value =
                 X.Get(X.RowSubset().lo[0] + static_cast<int64_t>(row), tree_feature_span[pos]);
-              bool left = x_value <= tree_split_value_span[pos];
-              pos       = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
+              bool const left = x_value <= tree_split_value_span[pos];
+              pos             = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos);
               // printf("Row %d, feature %d, value %f pos %d\n", row, tree_feature_span[pos],
               // x_value, pos);
               sorted_positions[idx] = cuda::std::make_tuple(pos, row);
@@ -1168,7 +1167,7 @@ struct TreeBuilder {
   void ComputeHistogram(Histogram<IntegerGPair> histogram,
                         legate::TaskContext context,
                         Tree& tree,
-                        const MatrixT X,
+                        const MatrixT& X,
                         const legate::AccessorRO<double, 3>& g,
                         const legate::AccessorRO<double, 3>& h,
                         NodeBatch batch)
@@ -1359,7 +1358,7 @@ struct build_tree_fn {
     auto [g, g_shape, g_accessor] = GetInputStore<double, 3>(context.input(1).data());
     auto [h, h_shape, h_accessor] = GetInputStore<double, 3>(context.input(2).data());
 
-    DenseXMatrix X_matrix(X_accessor, X_shape);
+    DenseXMatrix const X_matrix(X_accessor, X_shape);
 
     EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape);
     auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1;
@@ -1431,17 +1430,17 @@ struct build_tree_csr_fn {
     auto num_features  = context.scalars().at(6).value<int64_t>();
 
     auto* stream = context.get_task_stream();
-    CSRXMatrix<T> X_matrix(X_vals_accessor,
-                           X_coords_accessor,
-                           X_offsets_accessor,
-                           X_vals_shape,
-                           X_offsets_shape,
-                           num_features,
-                           X_vals_shape.volume());
+    CSRXMatrix<T> const X_matrix(X_vals_accessor,
+                                 X_coords_accessor,
+                                 X_offsets_accessor,
+                                 X_vals_shape,
+                                 X_offsets_shape,
+                                 num_features,
+                                 X_vals_shape.volume());
     const SparseSplitProposals<T> split_proposals =
       SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream);
 
-    GradientQuantiser quantiser(context, g_accessor, h_accessor, g_shape, stream);
+    GradientQuantiser const quantiser(context, g_accessor, h_accessor, g_shape, stream);
 
     // Begin building the tree
     auto tree = TreeBuilder<CSRXMatrix<T>>(num_rows,
diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h
index 5348bf2f..e45138c7 100644
--- a/src/models/tree/build_tree.h
+++ b/src/models/tree/build_tree.h
@@ -249,6 +249,12 @@ class Histogram {
   {
     return buffer_[{p[0] - node_begin_, p[1], p[2]}];
   }
+
+  // Node, output, bin
+  __host__ __device__ auto operator[](legate::Point<3> p) const -> GPairT
+  {
+    return buffer_[{p[0] - node_begin_, p[1], p[2]}];
+  }
 };
 
 // From the scanned histogram gradients and the node sums, infer the gradients for the left and
@@ -259,7 +265,7 @@ class Histogram {
 // non-zero element) from the sum of the gradients in the node (this sum always includes gradients
 // for every element in that node)
 template <typename T, typename GPairT>
-__host__ __device__ auto InferSplitSums(Histogram<GPairT>& scanned_histogram,
+__host__ __device__ auto InferSplitSums(const Histogram<GPairT>& scanned_histogram,
                                         const SparseSplitProposals<T>& split_proposals,
                                         const GPairT& node_sum,
                                         int node_id,
@@ -275,7 +281,7 @@ __host__ __device__ auto InferSplitSums(Histogram<GPairT>& scanned_histogram,
   auto scan_sum                     = scanned_histogram[{node_id, output, feature_end - 1}];
   auto zero_bin                     = split_proposals.FindBin(0.0, feature);
   auto sparse_sum                   = node_sum - scan_sum;
-  if (zero_bin == SparseSplitProposals<T>::NOT_FOUND || bin_idx < zero_bin) {
+  if (bin_idx < zero_bin) {
     // Do nothing, this amount is already on the right
   } else {
     // Move it to the left
diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h
index a7c21839..32eb62ab 100644
--- a/src/models/tree/matrix_types.h
+++ b/src/models/tree/matrix_types.h
@@ -16,12 +16,14 @@
 #pragma once
 #include <legate.h>
 #include <cstdint>
+#include <utility>
 #ifdef __CUDACC__
 #include <thrust/binary_search.h>
 #else
 #include <algorithm>
 #endif
 
+namespace legateboost {
 // Create a uniform interface to two matrix formats
 // Dense and CSR
 template <typename T>
@@ -34,15 +36,22 @@ class DenseXMatrix {
   legate::Rect<3> shape;
 
  public:
-  DenseXMatrix(legate::AccessorRO<T, 3> x, legate::Rect<3> shape) : x(x), shape(shape) {}
+  DenseXMatrix(const legate::AccessorRO<T, 3>& x, legate::Rect<3> shape)
+    : x(std::move(x)), shape(shape)
+  {
+  }
   // Global row index refers to the index across partitions
   // For features, each worker has every feature so the global is the same as the local index
-  __host__ __device__ T Get(std::size_t global_row_idx, uint32_t feature_idx) const
+  [[nodiscard]] __host__ __device__ auto Get(std::size_t global_row_idx, uint32_t feature_idx) const
+    -> T
   {
     return x[legate::Point<3>{global_row_idx, feature_idx, 0}];
   }
-  __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; }
-  __host__ __device__ legate::Rect<1, legate::coord_t> RowSubset() const
+  [[nodiscard]] __host__ __device__ auto NumFeatures() const -> int
+  {
+    return shape.hi[1] - shape.lo[1] + 1;
+  }
+  [[nodiscard]] __host__ __device__ auto RowSubset() const -> legate::Rect<1, legate::coord_t>
   {
     return {shape.lo[0], shape.hi[0]};
   }
@@ -61,14 +70,14 @@ class CSRXMatrix {
   int num_features;
   std::size_t nnz;  // The number of nnz in ths local partition
 
-  CSRXMatrix(legate::AccessorRO<T, 1> values,
-             legate::AccessorRO<int64_t, 1> column_indices,
-             legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1> row_ranges,
+  CSRXMatrix(const legate::AccessorRO<T, 1>& values,
+             const legate::AccessorRO<int64_t, 1>& column_indices,
+             const legate::AccessorRO<legate::Rect<1, legate::coord_t>, 1>& row_ranges,
              legate::Rect<1, legate::coord_t> vals_shape,
              legate::Rect<1, legate::coord_t> row_ranges_shape,
              int num_features,
              std::size_t nnz)
-    : values(values),
+    : values(std::move(values)),
       column_indices(column_indices),
       row_ranges(row_ranges),
       num_features(num_features),
@@ -82,18 +91,19 @@ class CSRXMatrix {
   // For features, each worker has every feature so the global is the same as the local index
   // This method is less efficient than its Dense counterpart due to the need to search for the
   // feature
-  __host__ __device__ T Get(std::size_t global_row_idx, uint32_t feature_idx) const
+  [[nodiscard]] __host__ __device__ auto Get(std::size_t global_row_idx, uint32_t feature_idx) const
+    -> T
   {
-    auto row_range = row_ranges[global_row_idx];
+    auto row_range = row_ranges[narrow<int64_t>(global_row_idx)];
 
-    tcb::span<const int64_t> column_indices_span(column_indices.ptr(row_range.lo),
-                                                 row_range.volume());
+    tcb::span<const int64_t> const column_indices_span(column_indices.ptr(row_range.lo),
+                                                       row_range.volume());
 
 #ifdef __CUDACC__
-    auto result = thrust::lower_bound(
+    const auto* result = thrust::lower_bound(
       thrust::seq, column_indices_span.begin(), column_indices_span.end(), feature_idx);
 #else
-    auto result =
+    const auto* result =
       std::lower_bound(column_indices_span.begin(), column_indices_span.end(), feature_idx);
 #endif
 
@@ -103,11 +113,12 @@ class CSRXMatrix {
     return 0;
   }
 
-  auto NNZ() const { return nnz; }
+  [[nodiscard]] auto NNZ() const { return nnz; }
 
-  __host__ __device__ int NumFeatures() const { return num_features; }
-  __host__ __device__ legate::Rect<1, legate::coord_t> RowSubset() const
+  [[nodiscard]] __host__ __device__ auto NumFeatures() const -> int { return num_features; }
+  [[nodiscard]] __host__ __device__ auto RowSubset() const -> legate::Rect<1, legate::coord_t>
   {
     return row_ranges_shape;
   }
 };
+}  // namespace legateboost
diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc
index 0e5cb88e..13e36f0c 100644
--- a/src/models/tree/predict.cc
+++ b/src/models/tree/predict.cc
@@ -23,11 +23,11 @@ namespace legateboost {
 namespace {
 template <typename MatrixT>
 void PredictRows(const MatrixT& X,
-                 legate::AccessorWO<double, 3> pred_accessor,
+                 const legate::AccessorWO<double, 3>& pred_accessor,
                  legate::Rect<3, legate::coord_t> pred_shape,
-                 legate::AccessorRO<double, 1> split_value,
-                 legate::AccessorRO<int32_t, 1> feature,
-                 legate::AccessorRO<double, 2> leaf_value)
+                 const legate::AccessorRO<double, 1>& split_value,
+                 const legate::AccessorRO<int32_t, 1>& feature,
+                 const legate::AccessorRO<double, 2>& leaf_value)
 {
   for (int64_t i = X.RowSubset().lo[0]; i <= X.RowSubset().hi[0]; i++) {
     int pos = 0;
@@ -96,13 +96,13 @@ struct predict_csr_fn {
     auto pred_accessor = pred.write_accessor<double, 3>();
 
     auto num_features = context.scalars().at(0).value<int32_t>();
-    CSRXMatrix<T> X(X_vals_accessor,
-                    X_coords_accessor,
-                    X_offsets_accessor,
-                    X_vals_shape,
-                    X_offsets_shape,
-                    num_features,
-                    X_vals_shape.volume());
+    CSRXMatrix<T> const X(X_vals_accessor,
+                          X_coords_accessor,
+                          X_offsets_accessor,
+                          X_vals_shape,
+                          X_offsets_shape,
+                          num_features,
+                          X_vals_shape.volume());
 
     EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape);
 
diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu
index 3b573c5d..7c93b0f6 100644
--- a/src/models/tree/predict.cu
+++ b/src/models/tree/predict.cu
@@ -27,11 +27,11 @@ namespace {
 
 template <typename MatrixT>
 void PredictRows(const MatrixT& X,
-                 legate::AccessorWO<double, 3> pred_accessor,
+                 const legate::AccessorWO<double, 3>& pred_accessor,
                  legate::Rect<3, legate::coord_t> pred_shape,
-                 legate::AccessorRO<double, 1> split_value,
-                 legate::AccessorRO<int32_t, 1> feature,
-                 legate::AccessorRO<double, 2> leaf_value,
+                 const legate::AccessorRO<double, 1>& split_value,
+                 const legate::AccessorRO<int32_t, 1>& feature,
+                 const legate::AccessorRO<double, 2>& leaf_value,
                  cudaStream_t stream)
 {
   // rowwise kernel
@@ -109,13 +109,13 @@ struct predict_csr_fn {
     auto pred_accessor = pred.write_accessor<double, 3>();
 
     auto num_features = context.scalars().at(0).value<int32_t>();
-    CSRXMatrix<T> X(X_vals_accessor,
-                    X_coords_accessor,
-                    X_offsets_accessor,
-                    X_vals_shape,
-                    X_offsets_shape,
-                    num_features,
-                    X_vals_shape.volume());
+    CSRXMatrix<T> const X(X_vals_accessor,
+                          X_coords_accessor,
+                          X_offsets_accessor,
+                          X_vals_shape,
+                          X_offsets_shape,
+                          num_features,
+                          X_vals_shape.volume());
 
     EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape);