From a4e25559674744f551fb53390edcecbd8b5cbe32 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 4 Dec 2024 04:48:20 -0800 Subject: [PATCH 01/10] Add support for legate-sparse CSR in Tree model --- legateboost/models/tree.py | 107 +++++++++++- legateboost/test/models/test_tree.py | 13 ++ src/legateboost.h | 7 +- src/models/tree/build_tree.cc | 3 + src/models/tree/build_tree.cu | 250 +++++++++++++++++---------- src/models/tree/build_tree.h | 70 ++++++++ 6 files changed, 347 insertions(+), 103 deletions(-) diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index a3aa8c37..ba64b10b 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -1,9 +1,20 @@ import copy from enum import IntEnum -from typing import Any +from typing import Any, Union import cupynumeric as cn -from legate.core import TaskTarget, get_legate_runtime, types +from legate.core import ( + ImageComputationHint, + TaskTarget, + get_legate_runtime, + image, + types, +) + +try: + from legate_sparse import csr_matrix +except ImportError: + csr_matrix = None from ..library import user_context, user_lib from ..utils import get_store @@ -12,6 +23,7 @@ class LegateBoostOpCode(IntEnum): BUILD_TREE = user_lib.cffi.BUILD_TREE + BUILD_TREE_CSR = user_lib.cffi.BUILD_TREE_CSR PREDICT = user_lib.cffi.PREDICT UPDATE_TREE = user_lib.cffi.UPDATE_TREE @@ -54,12 +66,7 @@ def __init__( self.split_samples = split_samples self.alpha = alpha - def fit( - self, - X: cn.ndarray, - g: cn.ndarray, - h: cn.ndarray, - ) -> "Tree": + def fit_dense(self, X: cn.ndarray, g: cn.ndarray, h: cn.ndarray) -> "Tree": num_outputs = g.shape[1] task = get_legate_runtime().create_auto_task( @@ -120,6 +127,90 @@ def fit( self.hessian = cn.array(hessian, copy=False) return self + def fit_csr(self, X: csr_matrix, g: cn.ndarray, h: cn.ndarray) -> "Tree": + num_outputs = g.shape[1] + + task = get_legate_runtime().create_auto_task( + user_context, LegateBoostOpCode.BUILD_TREE_CSR + ) + + # promote these to 3d. When the g/h shapes match those of the dense version, + # it makes code reuse easier on the C++ side + g_ = get_store(g).promote(1, 1) + h_ = get_store(h).promote(1, 1) + + task.add_scalar_arg(self.max_depth, types.int32) + max_nodes = 2 ** (self.max_depth + 1) + task.add_scalar_arg(max_nodes, types.int32) + task.add_scalar_arg(self.alpha, types.float64) + task.add_scalar_arg(self.split_samples, types.int32) + task.add_scalar_arg(self.random_state.randint(0, 2**31), types.int32) + task.add_scalar_arg(X.shape[0], types.int64) + task.add_scalar_arg(X.shape[1], types.int64) + + # inputs + val_var = task.add_input(X.vals) + crd_var = task.add_input(X.crd) + pos_var = task.add_input(X.pos) + task.add_input(g_) + task.add_input(h_) + pos_promoted = X.pos.promote(1, g.shape[1]).promote(1, 1) + # we don't need this input but use it for alignment + task.add_input(pos_promoted) + + task.add_alignment(g_, h_) + task.add_alignment(g_, pos_promoted) + task.add_constraint( + image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST) + ) + task.add_constraint( + image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST) + ) + + # outputs + leaf_value = get_legate_runtime().create_store( + types.float64, (max_nodes, num_outputs) + ) + feature = get_legate_runtime().create_store(types.int32, (max_nodes,)) + split_value = get_legate_runtime().create_store(types.float64, (max_nodes,)) + gain = get_legate_runtime().create_store(types.float64, (max_nodes,)) + hessian = get_legate_runtime().create_store( + types.float64, (max_nodes, num_outputs) + ) + task.add_output(leaf_value) + task.add_output(feature) + task.add_output(split_value) + task.add_output(gain) + task.add_output(hessian) + task.add_broadcast(leaf_value) + task.add_broadcast(feature) + task.add_broadcast(split_value) + task.add_broadcast(gain) + task.add_broadcast(hessian) + + if get_legate_runtime().machine.count(TaskTarget.GPU) > 1: + task.add_nccl_communicator() + elif get_legate_runtime().machine.count() > 1: + task.add_cpu_communicator() + task.execute() + + self.leaf_value = cn.array(leaf_value, copy=False) + self.feature = cn.array(feature, copy=False) + self.split_value = cn.array(split_value, copy=False) + self.gain = cn.array(gain, copy=False) + self.hessian = cn.array(hessian, copy=False) + return self + + def fit( + self, + X: Union[cn.ndarray, csr_matrix], + g: cn.ndarray, + h: cn.ndarray, + ) -> "Tree": + if isinstance(X, csr_matrix): + return self.fit_csr(X, g, h) + return self.fit_dense(X, g, h) + def clear(self) -> None: self.leaf_value.fill(0) self.hessian.fill(0) diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py index 8502eef3..e3a2f8c6 100644 --- a/legateboost/test/models/test_tree.py +++ b/legateboost/test/models/test_tree.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from legate_sparse import csr_matrix import cupynumeric as cn import legateboost as lb @@ -72,3 +73,15 @@ def test_alpha(): ) model.fit(X, y) assert np.isclose(model.predict(X)[0], y.sum() / (y.size + alpha)) + + +def test_sparse(): + num_outputs = 1 + rs = cn.random.RandomState(0) + X = csr_matrix( + (cn.array([1.0, 2.0, 3.0]), cn.array([0, 1, 2]), cn.array([0, 2, 3])), + shape=(2, 3), + ) + g = cn.array(rs.normal(size=(2, num_outputs))) + h = cn.array(rs.random((2, 1)) + 0.1) + lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h) diff --git a/src/legateboost.h b/src/legateboost.h index c37fd733..fe5a1d84 100644 --- a/src/legateboost.h +++ b/src/legateboost.h @@ -29,9 +29,10 @@ enum LegateBoostOpCode { // NOLINT(performance-enum-size) DIGAMMA = 7, ZETA = 8, /**/ - GATHER = 9, - RBF = 10, - BUILD_NN = 11, + GATHER = 9, + RBF = 10, + BUILD_NN = 11, + BUILD_TREE_CSR = 12, }; #endif // SRC_LEGATEBOOST_H_ diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc index 7b39791e..0fbe0b5b 100644 --- a/src/models/tree/build_tree.cc +++ b/src/models/tree/build_tree.cc @@ -489,6 +489,8 @@ struct build_tree_fn { legateboost::type_dispatch_float(X.code(), build_tree_fn(), context); } +/*static*/ void BuildTreeCSRTask::cpu_variant(legate::TaskContext context) {} + } // namespace legateboost namespace // unnamed @@ -496,5 +498,6 @@ namespace // unnamed static void __attribute__((constructor)) register_tasks(void) { legateboost::BuildTreeTask::register_variants(); + legateboost::BuildTreeCSRTask::register_variants(); } } // namespace diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu index 21808bef..8b2295ba 100644 --- a/src/models/tree/build_tree.cu +++ b/src/models/tree/build_tree.cu @@ -191,11 +191,12 @@ using SharedMemoryHistogramType = GPairBase; const int kMaxSharedBins = 2048; // 16KB shared memory. More is not helpful and creates more cache // misses for binary search in split_proposals. -template struct HistogramAgent { + using T = typename MatrixT::value_type; static const int kImpureTile = -1; // Special value for a tile that is not pure (contains // multiple nodes) struct SharedMemoryHistogram { @@ -245,8 +246,7 @@ struct HistogramAgent { } }; - const legate::AccessorRO X; - const int64_t sample_offset; + const MatrixT X; const legate::AccessorRO g; const legate::AccessorRO h; const size_t n_outputs; @@ -262,8 +262,7 @@ struct HistogramAgent { int feature_stride; SharedMemoryHistogram shared_histogram; - __device__ HistogramAgent(const legate::AccessorRO& X, - int64_t sample_offset, + __device__ HistogramAgent(const MatrixT& X, const legate::AccessorRO& g, const legate::AccessorRO& h, size_t n_outputs, @@ -276,7 +275,6 @@ struct HistogramAgent { int64_t seed, SharedMemoryHistogramType* shared_memory) : X(X), - sample_offset(sample_offset), g(g), h(h), n_outputs(n_outputs), @@ -311,11 +309,10 @@ struct HistogramAgent { sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node))); if (!computeHistogram) continue; - auto x = X[{sample_offset + local_sample_idx, feature, 0}]; - // int bin_idx = shared_split_proposals.FindBin(x, feature); + auto x = X.Get(X.RowRange().lo[0] + local_sample_idx, feature); int bin_idx = split_proposals.FindBin(x, feature); - legate::Point<3> p = {sample_offset + local_sample_idx, 0, output}; + legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx, 0, output}; auto gpair_quantised = quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2])); auto* addPosition = reinterpret_cast( @@ -355,7 +352,7 @@ struct HistogramAgent { T x[kItemsPerThread]; #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - x[i] = X[{sample_offset + local_sample_idx[i], feature[i], 0}]; + x[i] = X.Get(X.RowRange().lo[0] + local_sample_idx[i], feature[i]); } int bin_idx[kItemsPerThread]; @@ -366,7 +363,7 @@ struct HistogramAgent { IntegerGPair gpair[kItemsPerThread]; #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - legate::Point<3> p = {sample_offset + local_sample_idx[i], 0, output}; + legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx[i], 0, output}; gpair[i] = bin_idx[i] != SparseSplitProposals::NOT_FOUND ? quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2])) @@ -415,14 +412,13 @@ struct HistogramAgent { }; // NOLINTBEGIN(performance-unnecessary-value-param) -template +template __global__ static void __launch_bounds__(kBlockThreads) - fill_histogram_shared(legate::AccessorRO X, - int64_t sample_offset, + fill_histogram_shared(MatrixT X, legate::AccessorRO g, legate::AccessorRO h, size_t n_outputs, - SparseSplitProposals split_proposals, + SparseSplitProposals split_proposals, NodeBatch batch, Histogram histogram, legate::Buffer node_sums, @@ -433,26 +429,26 @@ __global__ static void __launch_bounds__(kBlockThreads) __shared__ char shared_char[kMaxSharedBins * sizeof(SharedMemoryHistogramType)]; SharedMemoryHistogramType* shared_memory = reinterpret_cast(shared_char); - HistogramAgent agent(X, - sample_offset, - g, - h, - n_outputs, - split_proposals, - batch, - histogram, - node_sums, - quantiser, - feature_groups, - seed, - shared_memory); + HistogramAgent agent(X, + g, + h, + n_outputs, + split_proposals, + batch, + histogram, + node_sums, + quantiser, + feature_groups, + seed, + shared_memory); agent.BuildHistogram(); } // NOLINTEND(performance-unnecessary-value-param) // Manage the launch parameters for histogram kernel -template +template struct HistogramKernel { + using T = typename MatrixT::value_type; const std::int32_t kItemsPerTile = kBlockThreads * kItemsPerThread; legate::Buffer feature_groups; int num_groups; @@ -466,7 +462,7 @@ struct HistogramKernel { std::int32_t n_blocks_per_mp = 0; CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &n_blocks_per_mp, - fill_histogram_shared, + fill_histogram_shared, kBlockThreads, 0)); this->maximum_blocks_for_occupancy = n_blocks_per_mp * n_mps; @@ -507,8 +503,7 @@ struct HistogramKernel { stream)); } - void BuildHistogram(const legate::AccessorRO& X, - int64_t sample_offset, + void BuildHistogram(const MatrixT& X, const legate::AccessorRO& g, const legate::AccessorRO& h, size_t n_outputs, @@ -525,9 +520,8 @@ struct HistogramKernel { auto min_blocks = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile; auto x_grid_size = std::min(static_cast(maximum_blocks_for_occupancy), min_blocks); // Launch the kernel - fill_histogram_shared + fill_histogram_shared <<>>(X, - sample_offset, g, h, n_outputs, @@ -818,10 +812,9 @@ struct Tree { // Use nccl to share the samples with all workers // Remove any duplicates // Return sparse matrix of split samples for each feature -template +template class XMatrix> SparseSplitProposals SelectSplitSamples(legate::TaskContext context, - const legate::AccessorRO& X, - legate::Rect<3> X_shape, + const XMatrix& X, int split_samples, int seed, int64_t dataset_rows, @@ -829,7 +822,6 @@ SparseSplitProposals SelectSplitSamples(legate::TaskContext context, { auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); auto policy = DEFAULT_POLICY(thrust_alloc).on(stream); - int num_features = X_shape.hi[1] - X_shape.lo[1] + 1; // Randomly choose split_samples rows auto row_samples = legate::create_buffer(split_samples); auto counting = thrust::make_counting_iterator(0); @@ -840,51 +832,54 @@ SparseSplitProposals SelectSplitSamples(legate::TaskContext context, eng.discard(idx); return dist(eng); }); - auto draft_proposals = legate::create_buffer({num_features, split_samples}); + auto draft_proposals = legate::create_buffer({X.NumFeatures(), split_samples}); // fill with local data - LaunchN(num_features * split_samples, stream, [=] __device__(auto idx) { - auto i = idx / num_features; - auto j = idx % num_features; + LaunchN(X.NumFeatures() * split_samples, stream, [=] __device__(auto idx) { + auto i = idx / X.NumFeatures(); + auto j = idx % X.NumFeatures(); auto row = row_samples[i]; - bool has_data = row >= X_shape.lo[0] && row <= X_shape.hi[0]; - draft_proposals[{j, i}] = has_data ? X[{row, j, 0}] : T(0); + bool has_data = X.RowRange().contains(row); + draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0); }); // Sum reduce over all workers - SumAllReduce(context, draft_proposals.ptr({0, 0}), num_features * split_samples, stream); + SumAllReduce(context, draft_proposals.ptr({0, 0}), X.NumFeatures() * split_samples, stream); CHECK_CUDA_STREAM(stream); // Condense split samples to unique values // First sort the samples - auto keys = legate::create_buffer(num_features * split_samples); - thrust::transform( - policy, counting, counting + num_features * split_samples, keys.ptr(0), [=] __device__(int i) { - return i / split_samples; - }); + auto keys = legate::create_buffer(X.NumFeatures() * split_samples); + thrust::transform(policy, + counting, + counting + X.NumFeatures() * split_samples, + keys.ptr(0), + [=] __device__(int i) { return i / split_samples; }); // Segmented sort auto begin = thrust::make_zip_iterator(thrust::make_tuple(keys.ptr(0), draft_proposals.ptr({0, 0}))); - thrust::sort(policy, begin, begin + num_features * split_samples, [] __device__(auto a, auto b) { - if (thrust::get<0>(a) != thrust::get<0>(b)) { return thrust::get<0>(a) < thrust::get<0>(b); } - return thrust::get<1>(a) < thrust::get<1>(b); - }); + thrust::sort( + policy, begin, begin + X.NumFeatures() * split_samples, [] __device__(auto a, auto b) { + if (thrust::get<0>(a) != thrust::get<0>(b)) { return thrust::get<0>(a) < thrust::get<0>(b); } + return thrust::get<1>(a) < thrust::get<1>(b); + }); // Extract the unique values - auto out_keys = legate::create_buffer(num_features * split_samples); - auto split_proposals = legate::create_buffer(num_features * split_samples); + auto out_keys = legate::create_buffer(X.NumFeatures() * split_samples); + auto split_proposals = legate::create_buffer(X.NumFeatures() * split_samples); auto key_val = thrust::make_zip_iterator(thrust::make_tuple(keys.ptr(0), draft_proposals.ptr({0, 0}))); auto out_iter = thrust::make_zip_iterator(thrust::make_tuple(out_keys.ptr(0), split_proposals.ptr(0))); auto result = - thrust::unique_copy(policy, key_val, key_val + num_features * split_samples, out_iter); + thrust::unique_copy(policy, key_val, key_val + X.NumFeatures() * split_samples, out_iter); auto n_unique = thrust::distance(out_iter, result); // Count the unique values for each feature - auto row_pointers = legate::create_buffer(num_features + 1); - CHECK_CUDA(cudaMemsetAsync(row_pointers.ptr(0), 0, (num_features + 1) * sizeof(int32_t), stream)); + auto row_pointers = legate::create_buffer(X.NumFeatures() + 1); + CHECK_CUDA( + cudaMemsetAsync(row_pointers.ptr(0), 0, (X.NumFeatures() + 1) * sizeof(int32_t), stream)); thrust::reduce_by_key(policy, out_keys.ptr(0), @@ -894,13 +889,13 @@ SparseSplitProposals SelectSplitSamples(legate::TaskContext context, row_pointers.ptr(1)); // Scan the counts to get the row pointers for a CSR matrix thrust::inclusive_scan( - policy, row_pointers.ptr(1), row_pointers.ptr(1) + num_features, row_pointers.ptr(1)); + policy, row_pointers.ptr(1), row_pointers.ptr(1) + X.NumFeatures(), row_pointers.ptr(1)); CHECK_CUDA(cudaStreamSynchronize(stream)); row_samples.destroy(); draft_proposals.destroy(); out_keys.destroy(); - return SparseSplitProposals(split_proposals, row_pointers, num_features, n_unique); + return SparseSplitProposals(split_proposals, row_pointers, X.NumFeatures(), n_unique); } // Can't put a device lambda in constructor so make this a function @@ -913,8 +908,9 @@ void FillPositions(const legate::Buffer>& sor }); } -template +template struct TreeBuilder { + using T = typename MatrixT::value_type; TreeBuilder(int32_t num_rows, int32_t num_features, int32_t num_outputs, @@ -952,8 +948,7 @@ struct TreeBuilder { max_batch_size = max_histogram_nodes; } - template - void UpdatePositions(Tree& tree, const legate::AccessorRO& X, legate::Rect<3> X_shape) + void UpdatePositions(Tree& tree, const MatrixT X) { auto tree_split_value_ptr = tree.split_value.ptr(0); auto tree_feature_ptr = tree.feature.ptr(0); @@ -967,9 +962,11 @@ struct TreeBuilder { sorted_positions[idx] = cuda::std::make_tuple(-1, row); return; } - double x_value = X[{X_shape.lo[0] + static_cast(row), tree_feature_ptr[pos], 0}]; - bool left = x_value <= tree_split_value_ptr[pos]; - pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); + + double x_value = + X.Get(X.RowRange().lo[0] + static_cast(row), tree_feature_ptr[pos]); + bool left = x_value <= tree_split_value_ptr[pos]; + pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); sorted_positions[idx] = cuda::std::make_tuple(pos, row); }); CHECK_CUDA_STREAM(stream); @@ -1005,12 +1002,10 @@ struct TreeBuilder { stream)); } - template void ComputeHistogram(Histogram histogram, legate::TaskContext context, Tree& tree, - const legate::AccessorRO& X, - legate::Rect<3> X_shape, + const MatrixT X, const legate::AccessorRO& g, const legate::AccessorRO& h, NodeBatch batch, @@ -1018,7 +1013,6 @@ struct TreeBuilder { int depth) { histogram_kernel.BuildHistogram(X, - X_shape.lo[0], g, h, num_outputs, @@ -1184,7 +1178,7 @@ struct TreeBuilder { Histogram histogram; int max_batch_size; GradientQuantiser quantiser; - HistogramKernel histogram_kernel; + HistogramKernel histogram_kernel; cudaStream_t stream; }; @@ -1197,6 +1191,8 @@ struct build_tree_fn { auto [g, g_shape, g_accessor] = GetInputStore(context.input(1).data()); auto [h, h_shape, h_accessor] = GetInputStore(context.input(2).data()); + DenseXMatrix X_matrix(X_accessor, X_shape); + EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape); auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1; auto num_rows = std::max(X_shape.hi[0] - X_shape.lo[0] + 1, 0); @@ -1221,19 +1217,19 @@ struct build_tree_fn { Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy); SparseSplitProposals split_proposals = - SelectSplitSamples(context, X_accessor, X_shape, split_samples, seed, dataset_rows, stream); + SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream); GradientQuantiser quantiser(context, g_accessor, h_accessor, g_shape, stream); // Begin building the tree - TreeBuilder builder(num_rows, - num_features, - num_outputs, - stream, - tree.max_nodes, - max_depth, - split_proposals, - quantiser); + TreeBuilder> builder(num_rows, + num_features, + num_outputs, + stream, + tree.max_nodes, + max_depth, + split_proposals, + quantiser); builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed); @@ -1242,22 +1238,86 @@ struct build_tree_fn { for (auto batch : batches) { auto histogram = builder.GetHistogram(batch); - builder.ComputeHistogram(histogram, - context, - tree, - X_accessor, - X_shape, - g_accessor, - h_accessor, - batch, - seed, - depth); + builder.ComputeHistogram( + histogram, context, tree, X_matrix, g_accessor, h_accessor, batch, seed, depth); builder.PerformBestSplit(tree, histogram, alpha, batch); } // Update position of entire level // Don't bother updating positions for the last level - if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_accessor, X_shape); } + if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); } + } + + tree.WriteTreeOutput(context, thrust_exec_policy, quantiser); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA_STREAM(stream); + } +}; + +struct build_tree_csr_fn { + template + void operator()(legate::TaskContext context) + { + auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore(context.input(0).data()); + auto [X_coords, X_coords_shape, X_coords_accessor] = + GetInputStore(context.input(1).data()); + auto [X_offsets, X_offsets_shape, X_offsets_accessor] = + GetInputStore, 1>(context.input(2).data()); + auto [g, g_shape, g_accessor] = GetInputStore(context.input(3).data()); + auto [h, h_shape, h_accessor] = GetInputStore(context.input(4).data()); + + auto num_rows = std::max(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0); + auto num_outputs = g_shape.hi[1] - g_shape.lo[1] + 1; + EXPECT(g_shape.lo[1] == 0, "Outputs should not be split between workers."); + + // Scalars + auto max_depth = context.scalars().at(0).value(); + auto max_nodes = context.scalars().at(1).value(); + auto alpha = context.scalars().at(2).value(); + auto split_samples = context.scalars().at(3).value(); + auto seed = context.scalars().at(4).value(); + auto dataset_rows = context.scalars().at(5).value(); + auto num_features = context.scalars().at(6).value(); + + auto stream = context.get_task_stream(); + auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); + auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream); + + Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy); + + CSRXMatrix X_matrix( + X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); + SparseSplitProposals split_proposals = + SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream); + + GradientQuantiser quantiser(context, g_accessor, h_accessor, g_shape, stream); + + // Begin building the tree + TreeBuilder> builder(num_rows, + num_features, + num_outputs, + stream, + tree.max_nodes, + max_depth, + split_proposals, + quantiser); + + builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed); + + for (int depth = 0; depth < max_depth; ++depth) { + auto batches = builder.PrepareBatches(depth, thrust_exec_policy); + for (auto batch : batches) { + auto histogram = builder.GetHistogram(batch); + + builder.ComputeHistogram( + histogram, context, tree, X_matrix, g_accessor, h_accessor, batch, seed, depth); + + builder.PerformBestSplit(tree, histogram, alpha, batch); + } + // Update position of entire level + // Don't bother updating positions for the last level + if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); } } tree.WriteTreeOutput(context, thrust_exec_policy, quantiser); @@ -1273,4 +1333,10 @@ struct build_tree_fn { type_dispatch_float(X.code(), build_tree_fn(), context); } +/*static*/ void BuildTreeCSRTask::gpu_variant(legate::TaskContext context) +{ + const auto& X = context.input(0).data(); + type_dispatch_float(X.code(), build_tree_csr_fn(), context); +} + } // namespace legateboost diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h index 0e8cc0b4..0c2072d1 100644 --- a/src/models/tree/build_tree.h +++ b/src/models/tree/build_tree.h @@ -101,6 +101,68 @@ __host__ __device__ inline double CalculateLeafValue(double G, double H, double return -G / (H + alpha); } +// Create a uniform interface to two matrix formats +// Dense and CSR +template +class DenseXMatrix { + public: + using value_type = T; + + private: + legate::AccessorRO x; + legate::Rect<3> shape; + + public: + DenseXMatrix(legate::AccessorRO x, legate::Rect<3> shape) : x(x), shape(shape) {} + __host__ __device__ T Get(uint32_t i, uint32_t j) const { return x[legate::Point<3>{i, j, 0}]; } + __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; } + __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const + { + return {shape.lo[0], shape.hi[0]}; + } +}; + +template +class CSRXMatrix { + public: + using value_type = T; + + private: + legate::AccessorRO values; + legate::AccessorRO column_indices; + legate::AccessorRO, 1> row_ranges; + legate::Rect<1, legate::coord_t> row_ranges_shape; + int num_features; + + public: + CSRXMatrix(legate::AccessorRO values, + legate::AccessorRO column_indices, + legate::AccessorRO, 1> row_ranges, + legate::Rect<1, legate::coord_t> row_ranges_shape, + int num_features) + : values(values), + column_indices(column_indices), + row_ranges(row_ranges), + num_features(num_features), + row_ranges_shape(row_ranges_shape) + { + } + + // Slower than dense due to search for column index + __host__ __device__ T Get(uint32_t i, uint32_t j) const + { + auto row_range = row_ranges[i]; + // TODO(Rory): Binary search? + for (int64_t k = row_range.lo; k <= row_range.hi; k++) { + if (column_indices[k] == j) return values[k]; + if (column_indices[k] > j) return 0; + } + return 0; + } + __host__ __device__ int NumFeatures() const { return num_features; } + __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; } +}; + // Container for the CSR matrix containing the split proposals template class SparseSplitProposals { @@ -243,4 +305,12 @@ class BuildTreeTask : public Task { #endif }; +class BuildTreeCSRTask : public Task { + public: + static void cpu_variant(legate::TaskContext context); +#ifdef LEGATEBOOST_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif +}; + } // namespace legateboost From 27d03a0584e86b28262bed9b08d851e6a0fddcc1 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 4 Dec 2024 08:13:12 -0800 Subject: [PATCH 02/10] Add gpu prediction --- legateboost/models/tree.py | 56 ++++++++++++++- legateboost/test/models/test_tree.py | 4 +- src/legateboost.h | 11 +-- src/models/tree/build_tree.cu | 1 + src/models/tree/build_tree.h | 62 ---------------- src/models/tree/matrix_types.h | 80 +++++++++++++++++++++ src/models/tree/predict.cc | 11 ++- src/models/tree/predict.cu | 102 ++++++++++++++++++++------- src/models/tree/predict.h | 10 ++- 9 files changed, 237 insertions(+), 100 deletions(-) create mode 100644 src/models/tree/matrix_types.h diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index ba64b10b..025aa03e 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -24,7 +24,8 @@ class LegateBoostOpCode(IntEnum): BUILD_TREE = user_lib.cffi.BUILD_TREE BUILD_TREE_CSR = user_lib.cffi.BUILD_TREE_CSR - PREDICT = user_lib.cffi.PREDICT + PREDICT_TREE = user_lib.cffi.PREDICT_TREE + PREDICT_TREE_CSR = user_lib.cffi.PREDICT_TREE_CSR UPDATE_TREE = user_lib.cffi.UPDATE_TREE @@ -259,12 +260,12 @@ def update( self.hessian = cn.array(hessian, copy=False) return self - def predict(self, X: cn.ndarray) -> cn.ndarray: + def predict_dense(self, X: cn.ndarray) -> cn.ndarray: n_rows = X.shape[0] n_features = X.shape[1] n_outputs = self.leaf_value.shape[1] task = get_legate_runtime().create_auto_task( - user_context, LegateBoostOpCode.PREDICT + user_context, LegateBoostOpCode.PREDICT_TREE ) pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs)) @@ -288,9 +289,58 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: task.add_alignment(X_, pred_) task.execute() + return cn.array(pred, copy=False) + + def predict_csr(self, X: csr_matrix) -> cn.ndarray: + n_rows = X.shape[0] + n_outputs = self.leaf_value.shape[1] + task = get_legate_runtime().create_auto_task( + user_context, LegateBoostOpCode.PREDICT_TREE_CSR + ) + + pred = get_legate_runtime().create_store(types.float64, (n_rows, n_outputs)) + # inputs + val_var = task.add_input(X.vals) + crd_var = task.add_input(X.crd) + pos_var = task.add_input(X.pos) + task.add_constraint( + image(pos_var, crd_var, hint=ImageComputationHint.FIRST_LAST) + ) + task.add_constraint( + image(pos_var, val_var, hint=ImageComputationHint.FIRST_LAST) + ) + pos_var_broadcast = X.pos.promote(1, n_outputs) + task.add_alignment(pos_var_broadcast, pred) + + # scalars + task.add_scalar_arg(X.shape[1], types.int32) + + # output + task.add_output( + pred.promote(1, 1) + ) # add 1 dimension so it has the same dimension as dense version + task.add_output(pred) # only here for alignment, no used + # broadcast the tree structure + leaf_value_ = get_store(self.leaf_value) + feature_ = get_store(self.feature) + split_value_ = get_store(self.split_value) + task.add_input(leaf_value_) + task.add_input(feature_) + task.add_input(split_value_) + task.add_broadcast(leaf_value_) + task.add_broadcast(feature_) + task.add_broadcast(split_value_) + + task.add_input(pos_var_broadcast) # used only for alignment + task.execute() return cn.array(pred, copy=False) + def predict(self, X: Union[cn.ndarray, csr_matrix]) -> cn.ndarray: + if isinstance(X, csr_matrix): + return self.predict_csr(X) + return self.predict_dense(X) + def is_leaf(self, id: int) -> Any: return self.feature[id] == -1 diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py index e3a2f8c6..c88ca011 100644 --- a/legateboost/test/models/test_tree.py +++ b/legateboost/test/models/test_tree.py @@ -84,4 +84,6 @@ def test_sparse(): ) g = cn.array(rs.normal(size=(2, num_outputs))) h = cn.array(rs.random((2, 1)) + 0.1) - lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h) + + model = lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h) + model.predict(X) diff --git a/src/legateboost.h b/src/legateboost.h index b13aa4af..892c274a 100644 --- a/src/legateboost.h +++ b/src/legateboost.h @@ -20,7 +20,7 @@ enum LegateBoostOpCode { // NOLINT(performance-enum-size) OP_CODE_BASE = 0, BUILD_TREE = 1, - PREDICT = 2, + PREDICT_TREE = 2, UPDATE_TREE = 3, /* special */ ERF = 4, @@ -29,10 +29,11 @@ enum LegateBoostOpCode { // NOLINT(performance-enum-size) DIGAMMA = 7, ZETA = 8, /**/ - GATHER = 9, - RBF = 10, - BUILD_NN = 11, - BUILD_TREE_CSR = 12, + GATHER = 9, + RBF = 10, + BUILD_NN = 11, + BUILD_TREE_CSR = 12, + PREDICT_TREE_CSR = 13, }; #endif // SRC_LEGATEBOOST_H_ diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu index a3fc489d..8945205b 100644 --- a/src/models/tree/build_tree.cu +++ b/src/models/tree/build_tree.cu @@ -32,6 +32,7 @@ #include "../../cpp_utils/cpp_utils.cuh" #include "legate/comm/coll.h" #include "build_tree.h" +#include "matrix_types.h" namespace legateboost { diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h index 56125ecf..897ccad4 100644 --- a/src/models/tree/build_tree.h +++ b/src/models/tree/build_tree.h @@ -106,68 +106,6 @@ __host__ __device__ inline auto CalculateLeafValue(double G, double H, double al return -G / (H + alpha); } -// Create a uniform interface to two matrix formats -// Dense and CSR -template -class DenseXMatrix { - public: - using value_type = T; - - private: - legate::AccessorRO x; - legate::Rect<3> shape; - - public: - DenseXMatrix(legate::AccessorRO x, legate::Rect<3> shape) : x(x), shape(shape) {} - __host__ __device__ T Get(uint32_t i, uint32_t j) const { return x[legate::Point<3>{i, j, 0}]; } - __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; } - __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const - { - return {shape.lo[0], shape.hi[0]}; - } -}; - -template -class CSRXMatrix { - public: - using value_type = T; - - private: - legate::AccessorRO values; - legate::AccessorRO column_indices; - legate::AccessorRO, 1> row_ranges; - legate::Rect<1, legate::coord_t> row_ranges_shape; - int num_features; - - public: - CSRXMatrix(legate::AccessorRO values, - legate::AccessorRO column_indices, - legate::AccessorRO, 1> row_ranges, - legate::Rect<1, legate::coord_t> row_ranges_shape, - int num_features) - : values(values), - column_indices(column_indices), - row_ranges(row_ranges), - num_features(num_features), - row_ranges_shape(row_ranges_shape) - { - } - - // Slower than dense due to search for column index - __host__ __device__ T Get(uint32_t i, uint32_t j) const - { - auto row_range = row_ranges[i]; - // TODO(Rory): Binary search? - for (int64_t k = row_range.lo; k <= row_range.hi; k++) { - if (column_indices[k] == j) return values[k]; - if (column_indices[k] > j) return 0; - } - return 0; - } - __host__ __device__ int NumFeatures() const { return num_features; } - __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; } -}; - // Container for the CSR matrix containing the split proposals template class SparseSplitProposals { diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h new file mode 100644 index 00000000..508b351f --- /dev/null +++ b/src/models/tree/matrix_types.h @@ -0,0 +1,80 @@ +/* Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +#pragma once +#include +#include + +// Create a uniform interface to two matrix formats +// Dense and CSR +template +class DenseXMatrix { + public: + using value_type = T; + + private: + legate::AccessorRO x; + legate::Rect<3> shape; + + public: + DenseXMatrix(legate::AccessorRO x, legate::Rect<3> shape) : x(x), shape(shape) {} + __host__ __device__ T Get(uint32_t i, uint32_t j) const { return x[legate::Point<3>{i, j, 0}]; } + __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; } + __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const + { + return {shape.lo[0], shape.hi[0]}; + } +}; + +template +class CSRXMatrix { + public: + using value_type = T; + + private: + legate::AccessorRO values; + legate::AccessorRO column_indices; + legate::AccessorRO, 1> row_ranges; + legate::Rect<1, legate::coord_t> row_ranges_shape; + int num_features; + + public: + CSRXMatrix(legate::AccessorRO values, + legate::AccessorRO column_indices, + legate::AccessorRO, 1> row_ranges, + legate::Rect<1, legate::coord_t> row_ranges_shape, + int num_features) + : values(values), + column_indices(column_indices), + row_ranges(row_ranges), + num_features(num_features), + row_ranges_shape(row_ranges_shape) + { + } + + // Slower than dense due to search for column index + __host__ __device__ T Get(uint32_t i, uint32_t j) const + { + auto row_range = row_ranges[i]; + // TODO(Rory): Binary search? + for (int64_t k = row_range.lo; k <= row_range.hi; k++) { + if (column_indices[k] == j) return values[k]; + if (column_indices[k] > j) return 0; + } + return 0; + } + __host__ __device__ int NumFeatures() const { return num_features; } + __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; } +}; diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc index a87d5044..5feeded9 100644 --- a/src/models/tree/predict.cc +++ b/src/models/tree/predict.cc @@ -62,7 +62,13 @@ struct predict_fn { }; } // namespace -/*static*/ void PredictTask::cpu_variant(legate::TaskContext context) +/*static*/ void PredictTreeTask::cpu_variant(legate::TaskContext context) +{ + const auto& X = context.input(0).data(); + type_dispatch_float(X.code(), predict_fn(), context); +} + +/*static*/ void PredictTreeCSRTask::cpu_variant(legate::TaskContext context) { const auto& X = context.input(0).data(); type_dispatch_float(X.code(), predict_fn(), context); @@ -74,6 +80,7 @@ namespace // unnamed { void __attribute__((constructor)) register_tasks() { - legateboost::PredictTask::register_variants(); + legateboost::PredictTreeTask::register_variants(); + legateboost::PredictTreeCSRTask::register_variants(); } } // namespace diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu index 65837619..181070f3 100644 --- a/src/models/tree/predict.cu +++ b/src/models/tree/predict.cu @@ -19,11 +19,42 @@ #include "../../cpp_utils/cpp_utils.cuh" #include "../../cpp_utils/cpp_utils.h" #include "predict.h" +#include "matrix_types.h" namespace legateboost { namespace { -struct predict_fn { + +template +void PredictRows(const MatrixT& X, + legate::AccessorWO pred_accessor, + legate::Rect<3, legate::coord_t> pred_shape, + legate::AccessorRO split_value, + legate::AccessorRO feature, + legate::AccessorRO leaf_value, + cudaStream_t stream) +{ + // rowwise kernel + auto prediction_lambda = [=] __device__(size_t idx) { + int64_t pos = 0; + auto global_row_idx = X.RowRange().lo + idx; + // Use a max depth of 100 to avoid infinite loops + const int max_depth = 100; + for (int depth = 0; depth < max_depth; depth++) { + if (feature[pos] == -1) { break; } + double const X_val = X.Get(global_row_idx, feature[pos]); + pos = X_val <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2; + } + for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) { + pred_accessor[{global_row_idx, 0, j}] = leaf_value[{pos, j}]; + } + }; // NOLINT(readability/braces) + + LaunchN(X.RowRange().volume(), stream, prediction_lambda); + CHECK_CUDA_STREAM(stream); +} + +struct predict_dense_fn { template void operator()(legate::TaskContext context) { @@ -39,7 +70,6 @@ struct predict_fn { auto pred = context.output(0).data(); auto pred_shape = pred.shape<3>(); auto pred_accessor = pred.write_accessor(); - auto n_outputs = pred_shape.hi[2] - pred_shape.lo[2] + 1; EXPECT(pred_shape.lo[2] == 0, "Expect all outputs to be present"); // We should have one output prediction per row of X @@ -50,36 +80,56 @@ struct predict_fn { EXPECT_IS_BROADCAST(context.input(2).data().shape<1>()); EXPECT_IS_BROADCAST(context.input(3).data().shape<1>()); - // rowwise kernel - auto prediction_lambda = [=] __device__(size_t idx) { - int64_t pos = 0; - legate::Point<3> x_point = {X_shape.lo[0] + static_cast(idx), 0, 0}; - - // Use a max depth of 100 to avoid infinite loops - const int max_depth = 100; - for (int depth = 0; depth < max_depth; depth++) { - if (feature[pos] == -1) { break; } - x_point[1] = feature[pos]; - double const X_val = X_accessor[x_point]; - pos = X_val <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2; - } - for (int64_t j = 0; j < n_outputs; j++) { - pred_accessor[{X_shape.lo[0] + static_cast(idx), 0, j}] = leaf_value[{pos, j}]; - } - }; // NOLINT(readability/braces) - - auto* stream = context.get_task_stream(); - LaunchN(X_shape.hi[0] - X_shape.lo[0] + 1, stream, prediction_lambda); - - CHECK_CUDA_STREAM(stream); + PredictRows(DenseXMatrix(X_accessor, X_shape), + pred_accessor, + pred_shape, + split_value, + feature, + leaf_value, + context.get_task_stream()); + } +}; + +struct predict_csr_fn { + template + void operator()(legate::TaskContext context) + { + auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore(context.input(0).data()); + auto [X_coords, X_coords_shape, X_coords_accessor] = + GetInputStore(context.input(1).data()); + auto [X_offsets, X_offsets_shape, X_offsets_accessor] = + GetInputStore, 1>(context.input(2).data()); + + auto leaf_value = context.input(3).data().read_accessor(); + auto feature = context.input(4).data().read_accessor(); + auto split_value = context.input(5).data().read_accessor(); + + auto pred = context.output(0).data(); + auto pred_shape = pred.shape<3>(); + auto pred_accessor = pred.write_accessor(); + + auto num_features = context.scalars().at(0).value(); + CSRXMatrix X( + X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); + + EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape); + + PredictRows( + X, pred_accessor, pred_shape, split_value, feature, leaf_value, context.get_task_stream()); } }; } // namespace -/*static*/ void PredictTask::gpu_variant(legate::TaskContext context) +/*static*/ void PredictTreeTask::gpu_variant(legate::TaskContext context) +{ + auto X = context.input(0).data(); + type_dispatch_float(X.code(), predict_dense_fn(), context); +} + +/*static*/ void PredictTreeCSRTask::gpu_variant(legate::TaskContext context) { auto X = context.input(0).data(); - type_dispatch_float(X.code(), predict_fn(), context); + type_dispatch_float(X.code(), predict_csr_fn(), context); } } // namespace legateboost diff --git a/src/models/tree/predict.h b/src/models/tree/predict.h index 1d98e6b5..c55efd42 100644 --- a/src/models/tree/predict.h +++ b/src/models/tree/predict.h @@ -20,7 +20,15 @@ namespace legateboost { -class PredictTask : public Task { +class PredictTreeTask : public Task { + public: + static void cpu_variant(legate::TaskContext context); +#ifdef LEGATEBOOST_USE_CUDA + static void gpu_variant(legate::TaskContext context); +#endif +}; + +class PredictTreeCSRTask : public Task { public: static void cpu_variant(legate::TaskContext context); #ifdef LEGATEBOOST_USE_CUDA From ed290d060e949b22965bd7dfccde7c924b92e2b9 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 5 Dec 2024 03:31:54 -0800 Subject: [PATCH 03/10] Accept csr input in estimator --- conda/environments/all_cuda-122.yaml | 1 + dependencies.yaml | 1 + legateboost/input_validation.py | 39 +++++++++++++++------------- legateboost/legateboost.py | 1 + legateboost/models/base_model.py | 10 +++++++ legateboost/models/tree.py | 3 +++ legateboost/test/models/test_tree.py | 4 +-- legateboost/test/test_estimator.py | 23 +++++++++++++++- pyproject.toml | 1 + 9 files changed, 62 insertions(+), 21 deletions(-) diff --git a/conda/environments/all_cuda-122.yaml b/conda/environments/all_cuda-122.yaml index ae2e3713..3f383c17 100644 --- a/conda/environments/all_cuda-122.yaml +++ b/conda/environments/all_cuda-122.yaml @@ -15,6 +15,7 @@ dependencies: - cuda-version>=12.2 - cupynumeric==25.01.*,>=0.0.0.dev0 - hypothesis>=6 +- legate-sparse - legate==25.01.*,>=0.0.0.dev0 - libcublas-dev - make diff --git a/dependencies.yaml b/dependencies.yaml index b04de179..850d6549 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -152,3 +152,4 @@ dependencies: - pytest>=7,<8 - seaborn>=0.13 - xgboost>=2.0 + - legate-sparse diff --git a/legateboost/input_validation.py b/legateboost/input_validation.py index 970328ed..296ef061 100644 --- a/legateboost/input_validation.py +++ b/legateboost/input_validation.py @@ -3,6 +3,11 @@ import numpy as np import scipy.sparse as sp +try: + from legate_sparse import csr_matrix +except ImportError: + csr_matrix = None + import cupynumeric as cn __all__: List[str] = [] @@ -29,22 +34,22 @@ def check_sample_weight(sample_weight: Any, n: int) -> cn.ndarray: def check_array(x: Any) -> cn.ndarray: if sp.issparse(x): - raise ValueError("Sparse matrix not allowed.") - - if not hasattr(x, "__legate_data_interface__"): - x = cn.array(np.require(x, requirements=["C", "A"])) - if hasattr(x, "__array_interface__"): - shape = x.__array_interface__["shape"] - if shape[0] <= 0: - raise ValueError( - "Found array with %d sample(s) (shape=%s) while a" - " minimum of %d is required." % (shape[0], shape, 1) - ) - if len(shape) >= 2 and 0 in shape: - raise ValueError( - "Found array with %d feature(s) (shape=%s) while" - " a minimum of %d is required." % (shape[1], shape, 1) - ) + x = csr_matrix(x) + elif isinstance(x, csr_matrix): + pass + else: + x = cn.array(x, copy=False) + + if x.shape[0] <= 0: + raise ValueError( + "Found array with %d sample(s) (shape=%s) while a" + " minimum of %d is required." % (x.shape[0], x.shape, 1) + ) + if len(x.shape) >= 2 and 0 in x.shape: + raise ValueError( + "Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required." % (x.shape[1], x.shape, 1) + ) if cn.iscomplexobj(x): raise ValueError("Complex data not supported.") @@ -52,8 +57,6 @@ def check_array(x: Any) -> cn.ndarray: if np.issubdtype(x.dtype, np.floating) and not cn.isfinite(x.sum()): raise ValueError("Input contains NaN or inf") - x = cn.array(x, copy=False) - return x diff --git a/legateboost/legateboost.py b/legateboost/legateboost.py index 3bda0d5d..b171497b 100644 --- a/legateboost/legateboost.py +++ b/legateboost/legateboost.py @@ -91,6 +91,7 @@ def _more_tags(self) -> Any: ), "check_dtype_object": ("object type data not supported."), }, + "X_types": ["2darray", "sparse"], } def _setup_metrics(self) -> list[BaseMetric]: diff --git a/legateboost/models/base_model.py b/legateboost/models/base_model.py index 07408559..1b23cdcb 100644 --- a/legateboost/models/base_model.py +++ b/legateboost/models/base_model.py @@ -91,6 +91,16 @@ def predict(self, X: cn.ndarray) -> cn.ndarray: """ pass + def supports_csr(self) -> bool: + """Whether the model supports CSR matrix input. + + Returns + ------- + bool + True if the model supports CSR matrix input, False otherwise. + """ + return False + @abstractmethod def __str__(self) -> str: pass diff --git a/legateboost/models/tree.py b/legateboost/models/tree.py index 025aa03e..ff4079d3 100644 --- a/legateboost/models/tree.py +++ b/legateboost/models/tree.py @@ -386,3 +386,6 @@ def __mul__(self, scalar: Any) -> "Tree": new = copy.deepcopy(self) new.leaf_value *= scalar return new + + def supports_csr(self) -> bool: + return True diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py index c88ca011..9c6f7fe5 100644 --- a/legateboost/test/models/test_tree.py +++ b/legateboost/test/models/test_tree.py @@ -1,6 +1,5 @@ import numpy as np import pytest -from legate_sparse import csr_matrix import cupynumeric as cn import legateboost as lb @@ -75,7 +74,8 @@ def test_alpha(): assert np.isclose(model.predict(X)[0], y.sum() / (y.size + alpha)) -def test_sparse(): +def test_csr(): + csr_matrix = pytest.importorskip("legate_sparse.csr_matrix") num_outputs = 1 rs = cn.random.RandomState(0) X = csr_matrix( diff --git a/legateboost/test/test_estimator.py b/legateboost/test/test_estimator.py index 360a3634..db206ab6 100644 --- a/legateboost/test/test_estimator.py +++ b/legateboost/test/test_estimator.py @@ -1,12 +1,17 @@ import numpy as np import pytest +import scipy from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.utils.estimator_checks import parametrize_with_checks import cupynumeric as cn import legateboost as lb -from legateboost.testing.utils import non_increasing, sanity_check_models +from legateboost.testing.utils import ( + all_base_models, + non_increasing, + sanity_check_models, +) def test_init(): @@ -215,3 +220,19 @@ def test_iterator_methods(): assert list(model) == list(model.models_) for i, est in enumerate(model): assert est == model[i] + + +@pytest.mark.parametrize( + "base_model", filter(lambda m: m.supports_csr(), all_base_models()), ids=type +) +def test_csr_input(base_model): + csr_matrix = pytest.importorskip("legate_sparse").csr_matrix + X_scipy = scipy.sparse.csr_matrix([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]]) + X_legate_sparse = csr_matrix(X_scipy) + y = cn.array([1.0, 2.0]) + model = lb.LBRegressor( + n_estimators=1, + base_models=(base_model,), + ) + model.fit(X_scipy, y) + model.fit(X_legate_sparse, y) diff --git a/pyproject.toml b/pyproject.toml index 9f3a177d..c09a83d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ requires-python = ">=3.10" [project.optional-dependencies] test = [ "hypothesis>=6", + "legate-sparse", "matplotlib>=3.9", "mypy>=1.13", "nbconvert>=7.16", From 9f07b1af8b00df7c0a1ce4ca337a9466415fbcbf Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 6 Dec 2024 04:51:14 -0800 Subject: [PATCH 04/10] Add example --- examples/sparse/README.md | 3 +++ examples/sparse/sparse.py | 47 +++++++++++++++++++++++++++++++++++ src/models/tree/build_tree.cu | 4 +-- 3 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 examples/sparse/README.md create mode 100644 examples/sparse/sparse.py diff --git a/examples/sparse/README.md b/examples/sparse/README.md new file mode 100644 index 00000000..c0cd3efc --- /dev/null +++ b/examples/sparse/README.md @@ -0,0 +1,3 @@ +# Sparse data + +This example trains a youtube comment spam classifier on a sparse dataset. The comments as raw strings are converted to a sparse matrix of word counts using the `CountVectorizer` from scikit-learn. diff --git a/examples/sparse/sparse.py b/examples/sparse/sparse.py new file mode 100644 index 00000000..a189beff --- /dev/null +++ b/examples/sparse/sparse.py @@ -0,0 +1,47 @@ +import pandas as pd +from sklearn.datasets import fetch_openml +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.model_selection import train_test_split + +import legateboost as lb + +# Alberto, T. & Lochter, J. (2015). YouTube Spam Collection [Dataset]. +# UCI Machine Learning Repository. https://doi.org/10.24432/C58885. +dataset_names = [ + "youtube-spam-psy", + "youtube-spam-shakira", + "youtube-spam-lmfao", + "youtube-spam-eminem", + "youtube-spam-katyperry", +] +X = [] +for dataset_name in dataset_names: + dataset = fetch_openml(name=dataset_name, as_frame=True) + X.append(dataset.data) + +X = pd.concat(X) +y = X["CLASS"] +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 +) +vectorizer = CountVectorizer() +X_train_vectorized = vectorizer.fit_transform(X_train["CONTENT"]) +X_test_vectorized = vectorizer.transform(X_test["CONTENT"]) + +model = lb.LBClassifier().fit( + X_train_vectorized, y_train, eval_set=[(X_test_vectorized, y_test)] +) + + +def evaluate_comment(comment): + print("Comment: {}".format(comment)) + print( + "Probability of spam: {}".format( + model.predict_proba(vectorizer.transform([comment]))[0, 1] + ) + ) + + +evaluate_comment(X_test.iloc[15]["CONTENT"]) +evaluate_comment(X_test.iloc[3]["CONTENT"]) +evaluate_comment("Your text here") diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu index 8945205b..c2f224fa 100644 --- a/src/models/tree/build_tree.cu +++ b/src/models/tree/build_tree.cu @@ -1339,8 +1339,8 @@ struct build_tree_csr_fn { auto [h, h_shape, h_accessor] = GetInputStore(context.input(4).data()); auto num_rows = std::max(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0); - auto num_outputs = g_shape.hi[1] - g_shape.lo[1] + 1; - EXPECT(g_shape.lo[1] == 0, "Outputs should not be split between workers."); + auto num_outputs = g_shape.hi[2] - g_shape.lo[2] + 1; + EXPECT(g_shape.lo[2] == 0, "Outputs should not be split between workers."); // Scalars auto max_depth = context.scalars().at(0).value(); From 271878ac7c62861953bf2033dd48b22eb4aceb35 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 6 Dec 2024 05:32:58 -0800 Subject: [PATCH 05/10] Add cpu csr prediction --- ci/run_pytests_cpu.sh | 1 + legateboost/test/models/test_tree.py | 10 ++-- src/models/tree/build_tree.cc | 51 ++++++++++++------- src/models/tree/build_tree.cu | 2 +- src/models/tree/build_tree.h | 2 +- src/models/tree/predict.cc | 76 ++++++++++++++++++++++------ 6 files changed, 99 insertions(+), 43 deletions(-) diff --git a/ci/run_pytests_cpu.sh b/ci/run_pytests_cpu.sh index 50e138ac..16a74640 100755 --- a/ci/run_pytests_cpu.sh +++ b/ci/run_pytests_cpu.sh @@ -18,6 +18,7 @@ set -e -E -u -o pipefail cd legateboost/test legate \ + --gpus 0 \ --sysmem 28000 \ --module pytest \ . \ diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py index 9c6f7fe5..99f82171 100644 --- a/legateboost/test/models/test_tree.py +++ b/legateboost/test/models/test_tree.py @@ -75,15 +75,13 @@ def test_alpha(): def test_csr(): - csr_matrix = pytest.importorskip("legate_sparse.csr_matrix") - num_outputs = 1 - rs = cn.random.RandomState(0) + csr_matrix = pytest.importorskip("legate_sparse").csr_matrix X = csr_matrix( (cn.array([1.0, 2.0, 3.0]), cn.array([0, 1, 2]), cn.array([0, 2, 3])), shape=(2, 3), ) - g = cn.array(rs.normal(size=(2, num_outputs))) - h = cn.array(rs.random((2, 1)) + 0.1) + g = cn.array([[1.0], [-1.0]]) + h = cn.array([[1.0], [1.0]]) model = lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h) - model.predict(X) + assert np.allclose(model.predict(X), -g / h) diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc index 16ce0177..e7e08083 100644 --- a/src/models/tree/build_tree.cc +++ b/src/models/tree/build_tree.cc @@ -24,6 +24,7 @@ #include "legate_library.h" #include "legateboost.h" #include "cpp_utils/cpp_utils.h" +#include "matrix_types.h" namespace legateboost { namespace { @@ -127,10 +128,9 @@ void WriteTreeOutput(legate::TaskContext context, const Tree& tree) // Share the samples with all workers // Remove any duplicates // Return sparse matrix of split samples for each feature -template +template class XMatrix> auto SelectSplitSamples(legate::TaskContext context, - const legate::AccessorRO& X, - legate::Rect<3> X_shape, + const XMatrix& X, int split_samples, int seed, int64_t dataset_rows) -> SparseSplitProposals @@ -143,23 +143,22 @@ auto SelectSplitSamples(legate::TaskContext context, return dist(eng); }); - auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1; - auto draft_proposals = legate::create_buffer({num_features, split_samples}); + auto draft_proposals = legate::create_buffer({X.NumFeatures(), split_samples}); for (int i = 0; i < split_samples; i++) { auto row = row_samples[i]; - bool const has_data = row >= X_shape.lo[0] && row <= X_shape.hi[0]; - for (int j = 0; j < num_features; j++) { - draft_proposals[{j, i}] = has_data ? X[{row, j, 0}] : T(0); + const bool has_data = X.RowRange().contains(row); + for (int j = 0; j < X.NumFeatures(); j++) { + draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0); } } - SumAllReduce(context, tcb::span(draft_proposals.ptr({0, 0}), num_features * split_samples)); + SumAllReduce(context, tcb::span(draft_proposals.ptr({0, 0}), X.NumFeatures() * split_samples)); // Sort samples std::vector split_proposals_tmp; - split_proposals_tmp.reserve(num_features * split_samples); - auto row_pointers = legate::create_buffer(num_features + 1); + split_proposals_tmp.reserve(X.NumFeatures() * split_samples); + auto row_pointers = legate::create_buffer(X.NumFeatures() + 1); row_pointers[0] = 0; - for (int j = 0; j < num_features; j++) { + for (int j = 0; j < X.NumFeatures(); j++) { auto ptr = draft_proposals.ptr({j, 0}); tcb::span const feature_proposals(draft_proposals.ptr({j, 0}), split_samples); std::set const unique(feature_proposals.begin(), feature_proposals.end()); @@ -170,7 +169,7 @@ auto SelectSplitSamples(legate::TaskContext context, auto split_proposals = legate::create_buffer(split_proposals_tmp.size()); std::copy(split_proposals_tmp.begin(), split_proposals_tmp.end(), split_proposals.ptr(0)); return SparseSplitProposals( - split_proposals, row_pointers, num_features, split_proposals_tmp.size()); + split_proposals, row_pointers, X.NumFeatures(), split_proposals_tmp.size()); } template @@ -446,13 +445,14 @@ struct TreeBuilder { Histogram histogram; }; -struct build_tree_fn { +struct build_tree_dense_fn { template void operator()(legate::TaskContext context) { auto [X, X_shape, X_accessor] = GetInputStore(context.input(0).data()); auto [g, g_shape, g_accessor] = GetInputStore(context.input(1).data()); auto [h, h_shape, h_accessor] = GetInputStore(context.input(2).data()); + EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape); auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1; auto num_rows = std::max(X_shape.hi[0] - X_shape.lo[0] + 1, 0); @@ -471,8 +471,11 @@ struct build_tree_fn { auto dataset_rows = context.scalars().at(5).value(); Tree tree(max_nodes, narrow(num_outputs)); + + DenseXMatrix X_matrix(X_accessor, X_shape); + SparseSplitProposals const split_proposals = - SelectSplitSamples(context, X_accessor, X_shape, split_samples, seed, dataset_rows); + SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); // Begin building the tree TreeBuilder builder( @@ -497,15 +500,25 @@ struct build_tree_fn { } }; +struct build_tree_csr_fn { + template + void operator()(legate::TaskContext context) + { + } +}; } // namespace -/*static*/ void BuildTreeTask::cpu_variant(legate::TaskContext context) +/*static*/ void BuildTreeDenseTask::cpu_variant(legate::TaskContext context) { const auto& X = context.input(0).data(); - legateboost::type_dispatch_float(X.code(), build_tree_fn(), context); + legateboost::type_dispatch_float(X.code(), build_tree_dense_fn(), context); } -/*static*/ void BuildTreeCSRTask::cpu_variant(legate::TaskContext context) {} +/*static*/ void BuildTreeCSRTask::cpu_variant(legate::TaskContext context) +{ + const auto& X = context.input(0).data(); + legateboost::type_dispatch_float(X.code(), build_tree_csr_fn(), context); +} } // namespace legateboost @@ -513,7 +526,7 @@ namespace // unnamed { void __attribute__((constructor)) register_tasks() { - legateboost::BuildTreeTask::register_variants(); + legateboost::BuildTreeDenseTask::register_variants(); legateboost::BuildTreeCSRTask::register_variants(); } } // namespace diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu index c2f224fa..d8652037 100644 --- a/src/models/tree/build_tree.cu +++ b/src/models/tree/build_tree.cu @@ -1398,7 +1398,7 @@ struct build_tree_csr_fn { } }; -/*static*/ void BuildTreeTask::gpu_variant(legate::TaskContext context) +/*static*/ void BuildTreeDenseTask::gpu_variant(legate::TaskContext context) { const auto& X = context.input(0).data(); type_dispatch_float(X.code(), build_tree_fn(), context); diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h index 897ccad4..ace0eef3 100644 --- a/src/models/tree/build_tree.h +++ b/src/models/tree/build_tree.h @@ -243,7 +243,7 @@ class Histogram { } }; -class BuildTreeTask : public Task { +class BuildTreeDenseTask : public Task { public: static void cpu_variant(legate::TaskContext context); #ifdef LEGATEBOOST_USE_CUDA diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc index 5feeded9..d24eaf1f 100644 --- a/src/models/tree/predict.cc +++ b/src/models/tree/predict.cc @@ -16,11 +16,34 @@ #include "predict.h" #include #include "../../cpp_utils/cpp_utils.h" +#include "matrix_types.h" namespace legateboost { namespace { -struct predict_fn { +template +void PredictRows(const MatrixT& X, + legate::AccessorWO pred_accessor, + legate::Rect<3, legate::coord_t> pred_shape, + legate::AccessorRO split_value, + legate::AccessorRO feature, + legate::AccessorRO leaf_value) +{ + for (int64_t i = X.RowRange().lo[0]; i <= X.RowRange().hi[0]; i++) { + int pos = 0; + // Use a max depth of 100 to avoid infinite loops + const int max_depth = 100; + for (int depth = 0; depth < max_depth; depth++) { + if (feature[pos] == -1) { break; } + auto x = X.Get(i, feature[pos]); + pos = x <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2; + } + for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) { + pred_accessor[{i, 0, j}] = leaf_value[{pos, j}]; + } + } +} +struct predict_dense_fn { template void operator()(legate::TaskContext context) { @@ -45,19 +68,40 @@ struct predict_fn { EXPECT_IS_BROADCAST(context.input(2).data().shape<1>()); EXPECT_IS_BROADCAST(context.input(3).data().shape<1>()); - for (int64_t i = X_shape.lo[0]; i <= X_shape.hi[0]; i++) { - int pos = 0; - // Use a max depth of 100 to avoid infinite loops - const int max_depth = 100; - for (int depth = 0; depth < max_depth; depth++) { - if (feature[pos] == -1) { break; } - auto x = X_accessor[{i, feature[pos], 0}]; - pos = x <= split_value[pos] ? (pos * 2) + 1 : (pos * 2) + 2; - } - for (int64_t j = pred_shape.lo[2]; j <= pred_shape.hi[2]; j++) { - pred_accessor[{i, 0, j}] = leaf_value[{pos, j}]; - } - } + PredictRows(DenseXMatrix(X_accessor, X_shape), + pred_accessor, + pred_shape, + split_value, + feature, + leaf_value); + } +}; + +struct predict_csr_fn { + template + void operator()(legate::TaskContext context) + { + auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore(context.input(0).data()); + auto [X_coords, X_coords_shape, X_coords_accessor] = + GetInputStore(context.input(1).data()); + auto [X_offsets, X_offsets_shape, X_offsets_accessor] = + GetInputStore, 1>(context.input(2).data()); + + auto leaf_value = context.input(3).data().read_accessor(); + auto feature = context.input(4).data().read_accessor(); + auto split_value = context.input(5).data().read_accessor(); + + auto pred = context.output(0).data(); + auto pred_shape = pred.shape<3>(); + auto pred_accessor = pred.write_accessor(); + + auto num_features = context.scalars().at(0).value(); + CSRXMatrix X( + X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); + + EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape); + + PredictRows(X, pred_accessor, pred_shape, split_value, feature, leaf_value); } }; } // namespace @@ -65,13 +109,13 @@ struct predict_fn { /*static*/ void PredictTreeTask::cpu_variant(legate::TaskContext context) { const auto& X = context.input(0).data(); - type_dispatch_float(X.code(), predict_fn(), context); + type_dispatch_float(X.code(), predict_dense_fn(), context); } /*static*/ void PredictTreeCSRTask::cpu_variant(legate::TaskContext context) { const auto& X = context.input(0).data(); - type_dispatch_float(X.code(), predict_fn(), context); + type_dispatch_float(X.code(), predict_csr_fn(), context); } } // namespace legateboost From 937b7b71750b9e7745f18d344810f25df6d803d7 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 6 Dec 2024 05:45:30 -0800 Subject: [PATCH 06/10] Add cpu tree builder --- legateboost/test/models/test_tree.py | 6 ++- src/models/tree/build_tree.cc | 70 ++++++++++++++++++++++------ 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py index 99f82171..7d5f2e04 100644 --- a/legateboost/test/models/test_tree.py +++ b/legateboost/test/models/test_tree.py @@ -83,5 +83,9 @@ def test_csr(): g = cn.array([[1.0], [-1.0]]) h = cn.array([[1.0], [1.0]]) - model = lb.models.Tree().set_random_state(np.random.RandomState(2)).fit(X, g, h) + model = ( + lb.models.Tree(alpha=0.0) + .set_random_state(np.random.RandomState(2)) + .fit(X, g, h) + ) assert np.allclose(model.predict(X), -g / h) diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc index e7e08083..5e804ccb 100644 --- a/src/models/tree/build_tree.cc +++ b/src/models/tree/build_tree.cc @@ -172,8 +172,9 @@ auto SelectSplitSamples(legate::TaskContext context, split_proposals, row_pointers, X.NumFeatures(), split_proposals_tmp.size()); } -template +template struct TreeBuilder { + using T = typename MatrixT::value_type; TreeBuilder(int32_t num_rows, int32_t num_features, int32_t num_outputs, @@ -201,24 +202,22 @@ struct TreeBuilder { split_proposals.histogram_size); max_batch_size = max_histogram_nodes; } - template void ComputeHistogram(Histogram histogram, legate::TaskContext context, Tree& tree, - const legate::AccessorRO& X, - legate::Rect<3> X_shape, + const MatrixT& X, const legate::AccessorRO& g, const legate::AccessorRO& h, NodeBatch batch) { // Build the histogram for (auto [position, index_local] : batch) { - auto index_global = index_local + X_shape.lo[0]; + auto index_global = index_local + X.RowRange().lo[0]; bool const compute = ComputeHistogramBin( position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position))); if (position < 0 || !compute) { continue; } for (int64_t j = 0; j < num_features; j++) { - auto x_value = X[{index_global, j, 0}]; + auto x_value = X.Get(index_global, j); int const bin_idx = split_proposals.FindBin(x_value, j); if (bin_idx != SparseSplitProposals::NOT_FOUND) { @@ -343,8 +342,7 @@ struct TreeBuilder { } } } - template - void UpdatePositions(Tree& tree, const legate::AccessorRO& X, legate::Rect<3> X_shape) + void UpdatePositions(Tree& tree, const MatrixT& X) { // Update the positions for (int i = 0; i < num_rows; i++) { @@ -353,7 +351,7 @@ struct TreeBuilder { sorted_positions[i] = {-1, index_local}; continue; } - auto x = X[{X_shape.lo[0] + index_local, tree.feature[pos], 0}]; + auto x = X.Get(X.RowRange().lo[0] + index_local, tree.feature[pos]); bool const left = x <= tree.split_value[pos]; pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); sorted_positions[i] = {pos, index_local}; @@ -478,7 +476,7 @@ struct build_tree_dense_fn { SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); // Begin building the tree - TreeBuilder builder( + TreeBuilder> builder( num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals); builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha); @@ -487,14 +485,13 @@ struct build_tree_dense_fn { for (auto batch : batches) { auto histogram = builder.GetHistogram(batch); - builder.ComputeHistogram( - histogram, context, tree, X_accessor, X_shape, g_accessor, h_accessor, batch); + builder.ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch); builder.PerformBestSplit(tree, histogram, alpha, batch); } // Update position of entire level // Don't bother updating positions for the last level - if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_accessor, X_shape); } + if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); } } WriteTreeOutput(context, tree); } @@ -504,6 +501,53 @@ struct build_tree_csr_fn { template void operator()(legate::TaskContext context) { + auto [X_vals, X_vals_shape, X_vals_accessor] = GetInputStore(context.input(0).data()); + auto [X_coords, X_coords_shape, X_coords_accessor] = + GetInputStore(context.input(1).data()); + auto [X_offsets, X_offsets_shape, X_offsets_accessor] = + GetInputStore, 1>(context.input(2).data()); + auto [g, g_shape, g_accessor] = GetInputStore(context.input(3).data()); + auto [h, h_shape, h_accessor] = GetInputStore(context.input(4).data()); + + auto num_rows = std::max(X_offsets_shape.hi[0] - X_offsets_shape.lo[0] + 1, 0); + auto num_outputs = g_shape.hi[2] - g_shape.lo[2] + 1; + EXPECT(g_shape.lo[2] == 0, "Outputs should not be split between workers."); + + // Scalars + auto max_depth = context.scalars().at(0).value(); + auto max_nodes = context.scalars().at(1).value(); + auto alpha = context.scalars().at(2).value(); + auto split_samples = context.scalars().at(3).value(); + auto seed = context.scalars().at(4).value(); + auto dataset_rows = context.scalars().at(5).value(); + auto num_features = context.scalars().at(6).value(); + + Tree tree(max_nodes, num_outputs); + + CSRXMatrix X_matrix( + X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); + const SparseSplitProposals split_proposals = + SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); + + // Begin building the tree + TreeBuilder> builder( + num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals); + + builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha); + for (int depth = 0; depth < max_depth; ++depth) { + auto batches = builder.PrepareBatches(depth); + for (auto batch : batches) { + auto histogram = builder.GetHistogram(batch); + + builder.ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch); + + builder.PerformBestSplit(tree, histogram, alpha, batch); + } + // Update position of entire level + // Don't bother updating positions for the last level + if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); } + } + WriteTreeOutput(context, tree); } }; } // namespace From e11d81cd8e112b36ade6564731b98da9e727651a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 6 Dec 2024 06:45:33 -0800 Subject: [PATCH 07/10] Refactor --- src/models/tree/build_tree.cc | 74 +++++++++-------- src/models/tree/build_tree.cu | 144 +++++++++++++++++----------------- 2 files changed, 107 insertions(+), 111 deletions(-) diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc index 5e804ccb..a99d5e20 100644 --- a/src/models/tree/build_tree.cc +++ b/src/models/tree/build_tree.cc @@ -185,6 +185,7 @@ struct TreeBuilder { num_features(num_features), num_outputs(num_outputs), max_nodes(max_nodes), + max_depth(max_depth), split_proposals(split_proposals) { sorted_positions = legate::create_buffer>(num_rows); @@ -202,6 +203,33 @@ struct TreeBuilder { split_proposals.histogram_size); max_batch_size = max_histogram_nodes; } + + Tree Build(legate::TaskContext context, + const MatrixT& X_matrix, + legate::AccessorRO g_accessor, + legate::AccessorRO h_accessor, + legate::Rect<3> g_shape, + double alpha) + { + // Begin building the tree + Tree tree(max_nodes, narrow(num_outputs)); + this->InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha); + for (int depth = 0; depth < max_depth; ++depth) { + auto batches = this->PrepareBatches(depth); + for (auto batch : batches) { + auto histogram = this->GetHistogram(batch); + + this->ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch); + + this->PerformBestSplit(tree, histogram, alpha, batch); + } + // Update position of entire level + // Don't bother updating positions for the last level + if (depth < max_depth - 1) { this->UpdatePositions(tree, X_matrix); } + } + return tree; + } + void ComputeHistogram(Histogram histogram, legate::TaskContext context, Tree& tree, @@ -438,6 +466,7 @@ struct TreeBuilder { int32_t num_features; int32_t num_outputs; int32_t max_nodes; + int32_t max_depth; int max_batch_size; SparseSplitProposals split_proposals; Histogram histogram; @@ -468,31 +497,16 @@ struct build_tree_dense_fn { auto seed = context.scalars().at(4).value(); auto dataset_rows = context.scalars().at(5).value(); - Tree tree(max_nodes, narrow(num_outputs)); - DenseXMatrix X_matrix(X_accessor, X_shape); SparseSplitProposals const split_proposals = SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); - // Begin building the tree - TreeBuilder> builder( - num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals); + // Dispatch the tree building algorithm templated on the matrix type + auto tree = TreeBuilder>( + num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals) + .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha); - builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha); - for (int depth = 0; depth < max_depth; ++depth) { - auto batches = builder.PrepareBatches(depth); - for (auto batch : batches) { - auto histogram = builder.GetHistogram(batch); - - builder.ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch); - - builder.PerformBestSplit(tree, histogram, alpha, batch); - } - // Update position of entire level - // Don't bother updating positions for the last level - if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); } - } WriteTreeOutput(context, tree); } }; @@ -522,31 +536,15 @@ struct build_tree_csr_fn { auto dataset_rows = context.scalars().at(5).value(); auto num_features = context.scalars().at(6).value(); - Tree tree(max_nodes, num_outputs); - CSRXMatrix X_matrix( X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); const SparseSplitProposals split_proposals = SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); - // Begin building the tree - TreeBuilder> builder( - num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals); + auto tree = TreeBuilder>( + num_rows, num_features, num_outputs, max_nodes, max_depth, split_proposals) + .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha); - builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha); - for (int depth = 0; depth < max_depth; ++depth) { - auto batches = builder.PrepareBatches(depth); - for (auto batch : batches) { - auto histogram = builder.GetHistogram(batch); - - builder.ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch); - - builder.PerformBestSplit(tree, histogram, alpha, batch); - } - // Update position of entire level - // Don't bother updating positions for the last level - if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); } - } WriteTreeOutput(context, tree); } }; diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu index d8652037..f50565e8 100644 --- a/src/models/tree/build_tree.cu +++ b/src/models/tree/build_tree.cu @@ -819,11 +819,12 @@ struct Tree { [=] __device__(const legate::Point& p) { out_acc[p] = x[p]; }); } - template - void WriteTreeOutput(legate::TaskContext context, - const ThrustPolicyT& policy, - GradientQuantiser quantiser) + void WriteTreeOutput(legate::TaskContext context, GradientQuantiser quantiser) { + auto stream = context.get_task_stream(); + auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); + auto policy = DEFAULT_POLICY(thrust_alloc).on(stream); + WriteOutput(context.output(0).data(), leaf_value, policy); WriteOutput(context.output(1).data(), feature, policy); WriteOutput(context.output(2).data(), split_value, policy); @@ -843,7 +844,7 @@ struct Tree { ~Tree() = default; Tree(const Tree&) = delete; - Tree(Tree&&) = delete; + Tree(Tree&&) = default; auto operator=(const Tree&) -> Tree& = delete; auto operator=(Tree&&) -> Tree& = delete; @@ -975,15 +976,18 @@ struct TreeBuilder { int32_t max_nodes, int32_t max_depth, const SparseSplitProposals& split_proposals, - GradientQuantiser quantiser) + GradientQuantiser quantiser, + int64_t seed) : num_rows(num_rows), num_features(num_features), num_outputs(num_outputs), stream(stream), max_nodes(max_nodes), + max_depth(max_depth), split_proposals(split_proposals), quantiser(quantiser), - histogram_kernel(split_proposals, stream) + histogram_kernel(split_proposals, stream), + seed(seed) { sorted_positions = legate::create_buffer>(num_rows); FillPositions(sorted_positions, num_rows, stream); @@ -1007,6 +1011,38 @@ struct TreeBuilder { max_batch_size = max_histogram_nodes; } + Tree Build(legate::TaskContext context, + const MatrixT& X_matrix, + legate::AccessorRO g_accessor, + legate::AccessorRO h_accessor, + legate::Rect<3> g_shape, + double alpha) + { + auto stream = context.get_task_stream(); + auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); + auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream); + + Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy); + + this->InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha); + + for (int depth = 0; depth < max_depth; ++depth) { + auto batches = this->PrepareBatches(depth); + for (auto batch : batches) { + auto histogram = this->GetHistogram(batch); + + this->ComputeHistogram(histogram, context, tree, X_matrix, g_accessor, h_accessor, batch); + + this->PerformBestSplit(tree, histogram, alpha, batch); + } + // Update position of entire level + // Don't bother updating positions for the last level + if (depth < max_depth - 1) { this->UpdatePositions(tree, X_matrix); } + } + + return tree; + } + TreeBuilder(const TreeBuilder&) = delete; TreeBuilder(TreeBuilder&&) = delete; auto operator=(const TreeBuilder&) -> TreeBuilder& = delete; @@ -1075,8 +1111,7 @@ struct TreeBuilder { const MatrixT X, const legate::AccessorRO& g, const legate::AccessorRO& h, - NodeBatch batch, - int64_t seed) + NodeBatch batch) { histogram_kernel.BuildHistogram(X, g, @@ -1137,8 +1172,7 @@ struct TreeBuilder { const legate::AccessorRO& g, const legate::AccessorRO& h, legate::Rect<3> g_shape, - double alpha, - int64_t seed) + double alpha) { const int kBlockThreads = 256; const size_t blocks = (num_rows + kBlockThreads - 1) / kBlockThreads; @@ -1245,6 +1279,8 @@ struct TreeBuilder { const int32_t num_features; const int32_t num_outputs; const int32_t max_nodes; + const int32_t max_depth; + const int64_t seed; SparseSplitProposals split_proposals; Histogram histogram; int max_batch_size; @@ -1285,41 +1321,23 @@ struct build_tree_fn { auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream); - Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy); - const SparseSplitProposals split_proposals = SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream); GradientQuantiser const quantiser(context, g_accessor, h_accessor, g_shape, stream); - // Begin building the tree - TreeBuilder> builder(num_rows, - num_features, - num_outputs, - stream, - tree.max_nodes, - max_depth, - split_proposals, - quantiser); + auto tree = TreeBuilder>(num_rows, + num_features, + num_outputs, + stream, + max_nodes, + max_depth, + split_proposals, + quantiser, + seed) + .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha); - builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed); - - for (int depth = 0; depth < max_depth; ++depth) { - auto batches = builder.PrepareBatches(depth); - for (auto batch : batches) { - auto histogram = builder.GetHistogram(batch); - - builder.ComputeHistogram( - histogram, context, tree, X_matrix, g_accessor, h_accessor, batch, seed); - - builder.PerformBestSplit(tree, histogram, alpha, batch); - } - // Update position of entire level - // Don't bother updating positions for the last level - if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); } - } - - tree.WriteTreeOutput(context, thrust_exec_policy, quantiser); + tree.WriteTreeOutput(context, quantiser); CHECK_CUDA(cudaStreamSynchronize(stream)); CHECK_CUDA_STREAM(stream); @@ -1351,12 +1369,7 @@ struct build_tree_csr_fn { auto dataset_rows = context.scalars().at(5).value(); auto num_features = context.scalars().at(6).value(); - auto stream = context.get_task_stream(); - auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); - auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream); - - Tree tree(max_nodes, num_outputs, stream, thrust_exec_policy); - + auto* stream = context.get_task_stream(); CSRXMatrix X_matrix( X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); const SparseSplitProposals split_proposals = @@ -1365,33 +1378,18 @@ struct build_tree_csr_fn { GradientQuantiser quantiser(context, g_accessor, h_accessor, g_shape, stream); // Begin building the tree - TreeBuilder> builder(num_rows, - num_features, - num_outputs, - stream, - tree.max_nodes, - max_depth, - split_proposals, - quantiser); - - builder.InitialiseRoot(context, tree, g_accessor, h_accessor, g_shape, alpha, seed); - - for (int depth = 0; depth < max_depth; ++depth) { - auto batches = builder.PrepareBatches(depth); - for (auto batch : batches) { - auto histogram = builder.GetHistogram(batch); - - builder.ComputeHistogram( - histogram, context, tree, X_matrix, g_accessor, h_accessor, batch, seed); - - builder.PerformBestSplit(tree, histogram, alpha, batch); - } - // Update position of entire level - // Don't bother updating positions for the last level - if (depth < max_depth - 1) { builder.UpdatePositions(tree, X_matrix); } - } - - tree.WriteTreeOutput(context, thrust_exec_policy, quantiser); + auto tree = TreeBuilder>(num_rows, + num_features, + num_outputs, + stream, + max_nodes, + max_depth, + split_proposals, + quantiser, + seed) + .Build(context, X_matrix, g_accessor, h_accessor, g_shape, alpha); + + tree.WriteTreeOutput(context, quantiser); CHECK_CUDA(cudaStreamSynchronize(stream)); CHECK_CUDA_STREAM(stream); From 50ccc8a41af7772f530444ef71e6dc5e2bcfc7fb Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 11 Dec 2024 04:05:18 -0800 Subject: [PATCH 08/10] Add custom CSR kernel for GPU --- legateboost/test/test_estimator.py | 38 ++++- src/models/tree/build_tree.cc | 9 +- src/models/tree/build_tree.cu | 216 +++++++++++++++++++++-------- src/models/tree/build_tree.h | 6 + src/models/tree/matrix_types.h | 54 ++++++-- src/models/tree/predict.cc | 9 +- src/models/tree/predict.cu | 9 +- 7 files changed, 260 insertions(+), 81 deletions(-) diff --git a/legateboost/test/test_estimator.py b/legateboost/test/test_estimator.py index db206ab6..17affbbf 100644 --- a/legateboost/test/test_estimator.py +++ b/legateboost/test/test_estimator.py @@ -227,12 +227,44 @@ def test_iterator_methods(): ) def test_csr_input(base_model): csr_matrix = pytest.importorskip("legate_sparse").csr_matrix - X_scipy = scipy.sparse.csr_matrix([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]]) + X_dense = cn.array([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]]) + X_scipy = scipy.sparse.csr_matrix(X_dense) X_legate_sparse = csr_matrix(X_scipy) y = cn.array([1.0, 2.0]) model = lb.LBRegressor( + init=None, n_estimators=1, base_models=(base_model,), + learning_rate=1.0, ) - model.fit(X_scipy, y) - model.fit(X_legate_sparse, y) + sparse_pred = model.fit(X_scipy, y).predict(X_scipy) + legate_sparse_pred = model.fit(X_legate_sparse, y).predict(X_legate_sparse) + dense_pred = model.fit(X_dense, y).predict(X_dense) + assert cn.allclose(sparse_pred, legate_sparse_pred) + assert cn.allclose(sparse_pred, dense_pred) + + # Generate a sparse dataset and check that dense and sparse + # input give a bitwise equal result + rng = np.random.RandomState(0) + X = rng.binomial(1, 0.1, (100, 100)).astype(np.float32) + y = rng.randint(0, 5, 100) + X_scipy = scipy.sparse.csr_matrix(X) + + # regression + params = {"base_models": (base_model,), "n_estimators": 5, "random_state": 0} + sparse_model = lb.LBRegressor(**params) + dense_model = lb.LBRegressor(**params) + dense_pred = dense_model.fit(X, y).predict(X) + X_csr = csr_matrix(X) + assert X_csr.nnz < X.size + sparse_pred = sparse_model.fit(X_csr, y).predict(X_csr) + assert cn.all(dense_pred == sparse_pred) + sanity_check_models(sparse_model) + + # classification + sparse_model = lb.LBClassifier(**params) + dense_model = lb.LBClassifier(**params) + dense_pred = dense_model.fit(X, y).predict_proba(X) + sparse_pred = sparse_model.fit(X_csr, y).predict_proba(X_csr) + assert cn.all(dense_pred == sparse_pred) + sanity_check_models(sparse_model) diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc index a99d5e20..624b3da1 100644 --- a/src/models/tree/build_tree.cc +++ b/src/models/tree/build_tree.cc @@ -536,8 +536,13 @@ struct build_tree_csr_fn { auto dataset_rows = context.scalars().at(5).value(); auto num_features = context.scalars().at(6).value(); - CSRXMatrix X_matrix( - X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); + CSRXMatrix X_matrix(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); const SparseSplitProposals split_proposals = SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu index f50565e8..51be29ff 100644 --- a/src/models/tree/build_tree.cu +++ b/src/models/tree/build_tree.cu @@ -210,12 +210,11 @@ using SharedMemoryHistogramType = GPairBase; const int kMaxSharedBins = 2048; // 16KB shared memory. More is not helpful and creates more cache // misses for binary search in split_proposals. -template struct HistogramAgent { - using T = typename MatrixT::value_type; static const int kImpureTile = -1; // Special value for a tile that is not pure (contains // multiple nodes) struct SharedMemoryHistogram { @@ -273,7 +272,7 @@ struct HistogramAgent { } }; - const MatrixT& X; + const DenseXMatrix& X; const legate::AccessorRO& g; const legate::AccessorRO& h; const size_t& n_outputs; @@ -289,7 +288,7 @@ struct HistogramAgent { int feature_stride; SharedMemoryHistogram shared_histogram; - __device__ HistogramAgent(const MatrixT& X, + __device__ HistogramAgent(const DenseXMatrix& X, const legate::AccessorRO& g, const legate::AccessorRO& h, const size_t& n_outputs, @@ -342,8 +341,7 @@ struct HistogramAgent { sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node))); if (!computeHistogram) { continue; } - auto x = X.Get(X.RowRange().lo[0] + local_sample_idx, feature); - // int bin_idx = shared_split_proposals.FindBin(x, feature); + auto x = X.Get(X.RowRange().lo[0] + local_sample_idx, feature); int const bin_idx = split_proposals.FindBin(x, feature); legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx, 0, output}; @@ -454,19 +452,19 @@ struct HistogramAgent { }; // NOLINTBEGIN(performance-unnecessary-value-param) -template +template __global__ void __launch_bounds__(kBlockThreads) - fill_histogram_shared(MatrixT X, - legate::AccessorRO g, - legate::AccessorRO h, - size_t n_outputs, - SparseSplitProposals split_proposals, - NodeBatch batch, - Histogram histogram, - legate::Buffer node_sums, - GradientQuantiser quantiser, - legate::Buffer feature_groups, - int64_t seed) + fill_histogram_dense(DenseXMatrix X, + legate::AccessorRO g, + legate::AccessorRO h, + size_t n_outputs, + SparseSplitProposals split_proposals, + NodeBatch batch, + Histogram histogram, + legate::Buffer node_sums, + GradientQuantiser quantiser, + legate::Buffer feature_groups, + int64_t seed) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,hicpp-avoid-c-arrays) __shared__ char shared_char[kMaxSharedBins * sizeof(SharedMemoryHistogramType)]; @@ -474,22 +472,69 @@ __global__ void __launch_bounds__(kBlockThreads) auto* shared_memory = // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) reinterpret_cast(shared_char); - HistogramAgent agent(X, - g, - h, - n_outputs, - split_proposals, - batch, - histogram, - node_sums, - quantiser, - feature_groups, - seed, - shared_memory); + HistogramAgent agent(X, + g, + h, + n_outputs, + split_proposals, + batch, + histogram, + node_sums, + quantiser, + feature_groups, + seed, + shared_memory); agent.BuildHistogram(); } // NOLINTEND(performance-unnecessary-value-param) +// NOLINTBEGIN(performance-unnecessary-value-param) +template +__global__ void __launch_bounds__(kBlockThreads) + fill_histogram_csr(CSRXMatrix X, + legate::AccessorRO g, + legate::AccessorRO h, + size_t n_outputs, + SparseSplitProposals split_proposals, + NodeBatch batch, + Histogram histogram, + legate::Buffer node_sums, + GradientQuantiser quantiser, + int64_t seed) +{ + // Grid stride loop over rows + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < batch.InstancesInBatch(); + idx += blockDim.x * gridDim.x) { + auto [sample_node, local_sample_idx] = batch.instances[idx]; + // If we don't need to compute this node, skip + if (!ComputeHistogramBin( + sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node)))) { + continue; + } + // Which matrix elements belong to this row? + std::size_t const global_sample_idx = X.RowRange().lo[0] + local_sample_idx; + auto elements = X.row_ranges[global_sample_idx]; + for (std::size_t element = elements.lo[0]; element <= elements.hi[0]; element++) { + auto feature = X.column_indices[X.vals_shape.lo[0] + element]; + auto x = X.values[X.vals_shape.lo[0] + element]; + int const bin_idx = split_proposals.FindBin(x, feature); + if (bin_idx == SparseSplitProposals::NOT_FOUND) continue; + for (int output = 0; output < n_outputs; output++) { + legate::Point<3> p = {global_sample_idx, 0, output}; + auto gpair_quantised = + quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2])); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + auto* addPosition = reinterpret_cast::atomic_add_type*>( + &histogram[{sample_node, output, bin_idx}]); + atomicAdd(addPosition, gpair_quantised.grad); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + atomicAdd(addPosition + 1, gpair_quantised.hess); + } + } + } +} +// NOLINTEND(performance-unnecessary-value-param) + // Manage the launch parameters for histogram kernel template struct HistogramKernel { @@ -506,10 +551,7 @@ struct HistogramKernel { CHECK_CUDA(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); std::int32_t n_blocks_per_mp = 0; CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &n_blocks_per_mp, - fill_histogram_shared, - kBlockThreads, - 0)); + &n_blocks_per_mp, fill_histogram_dense, kBlockThreads, 0)); this->maximum_blocks_for_occupancy = n_blocks_per_mp * n_mps; FindFeatureGroups(split_proposals, stream); } @@ -561,24 +603,35 @@ struct HistogramKernel { int64_t seed, cudaStream_t stream) { - int const average_features_per_group = split_proposals.num_features / num_groups; - std::size_t const average_elements_per_group = - batch.InstancesInBatch() * average_features_per_group; - auto min_blocks = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile; - auto x_grid_size = std::min(static_cast(maximum_blocks_for_occupancy), min_blocks); - // Launch the kernel - fill_histogram_shared - <<>>(X, - g, - h, - n_outputs, - split_proposals, - batch, - histogram, - node_sums, - quantiser, - feature_groups, - seed); + if constexpr (std::is_same_v>) { + int const average_features_per_group = split_proposals.num_features / num_groups; + std::size_t const average_elements_per_group = + batch.InstancesInBatch() * average_features_per_group; + auto min_blocks = (average_elements_per_group + kItemsPerTile - 1) / kItemsPerTile; + auto x_grid_size = std::min(static_cast(maximum_blocks_for_occupancy), min_blocks); + // Launch the kernel + fill_histogram_dense + <<>>(X, + g, + h, + n_outputs, + split_proposals, + batch, + histogram, + node_sums, + quantiser, + feature_groups, + seed); + } else { + // For sparse data we don't currently make use of feature groups or shared memory + // Use 1 thread per row for lack of a better option currently + // Other methods might involve complicated load balancing + auto min_blocks = (batch.InstancesInBatch() + kItemsPerTile - 1) / kItemsPerTile; + auto x_grid_size = std::min(static_cast(maximum_blocks_for_occupancy), min_blocks); + // Launch the kernel + fill_histogram_csr<<>>( + X, g, h, n_outputs, split_proposals, batch, histogram, node_sums, quantiser, seed); + } } }; @@ -684,6 +737,36 @@ struct GainFeaturePair { } }; +// In the case where we have a sparse matrix, gradients for 0's have not been accumulated in the +// histogram We can infer the gradients at the matrix zeroes by subtracting the sum of the gradients +// at the last bin (which contains gradients from every non-zero element) +// from the sum of the gradients in the node (this sum always includes gradients for every element +// in that node) +template +__device__ auto GetSparseSum(Histogram& histogram, + const SparseSplitProposals& split_proposals, + const IntegerGPair& node_sum, + int node_id, + int output, + int bin_idx) +{ + auto left_sum = vectorised_load(&histogram[{node_id, output, bin_idx}]); + auto right_sum = node_sum - left_sum; + auto feature = split_proposals.FindFeature(bin_idx); + auto [feature_begin, feature_end] = split_proposals.FeatureRange(feature); + auto scan_sum = vectorised_load(&histogram[{node_id, output, feature_end - 1}]); + auto zero_bin = split_proposals.FindBin(0.0, feature); + auto sparse_sum = node_sum - scan_sum; + if (zero_bin == SparseSplitProposals::NOT_FOUND || bin_idx < zero_bin) { + // Do nothing, this amount is already on the right + } else { + // Move it to the left + left_sum += sparse_sum; + right_sum -= sparse_sum; + } + return std::make_tuple(left_sum, right_sum); +} + // NOLINTBEGIN(performance-unnecessary-value-param) template __global__ void __launch_bounds__(BLOCK_THREADS) @@ -718,9 +801,13 @@ __global__ void __launch_bounds__(BLOCK_THREADS) bin_idx += BLOCK_THREADS) { double gain = 0; for (int output = 0; output < n_outputs; ++output) { - auto node_sum = vectorised_load(&node_sums[{node_id, output}]); - auto left_sum = vectorised_load(&histogram[{node_id, output, bin_idx}]); - auto right_sum = node_sum - left_sum; + auto node_sum = vectorised_load(&node_sums[{node_id, output}]); + + auto [left_sum, right_sum] = + GetSparseSum(histogram, split_proposals, node_sum, node_id, output, bin_idx); + // printf("node %d , bin %d, left sum %ld %ld right sum %ld %ld \n", node_id ,bin_idx, + // left_sum.grad, left_sum.hess, right_sum.grad, right_sum.hess); + if (left_sum.hess <= 0 || right_sum.hess <= 0) { gain = 0; break; @@ -755,9 +842,11 @@ __global__ void __launch_bounds__(BLOCK_THREADS) if (node_best_gain > eps) { int const node_best_feature = split_proposals.FindFeature(node_best_bin_idx); for (int output = narrow_cast(threadIdx.x); output < n_outputs; output += BLOCK_THREADS) { - auto node_sum = vectorised_load(&node_sums[{node_id, output}]); - auto left_sum = vectorised_load(&histogram[{node_id, output, node_best_bin_idx}]); - auto right_sum = node_sum - left_sum; + auto node_sum = vectorised_load(&node_sums[{node_id, output}]); + + auto [left_sum, right_sum] = + GetSparseSum(histogram, split_proposals, node_sum, node_id, output, node_best_bin_idx); + node_sums[{BinaryTree::LeftChild(node_id), output}] = left_sum; node_sums[{BinaryTree::RightChild(node_id), output}] = right_sum; @@ -1069,6 +1158,8 @@ struct TreeBuilder { X.Get(X.RowRange().lo[0] + static_cast(row), tree_feature_span[pos]); bool left = x_value <= tree_split_value_span[pos]; pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); + // printf("Row %d, feature %d, value %f pos %d\n", row, tree_feature_span[pos], + // x_value, pos); sorted_positions[idx] = cuda::std::make_tuple(pos, row); }); CHECK_CUDA_STREAM(stream); @@ -1370,8 +1461,13 @@ struct build_tree_csr_fn { auto num_features = context.scalars().at(6).value(); auto* stream = context.get_task_stream(); - CSRXMatrix X_matrix( - X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); + CSRXMatrix X_matrix(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); const SparseSplitProposals split_proposals = SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream); diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h index ace0eef3..a0026b70 100644 --- a/src/models/tree/build_tree.h +++ b/src/models/tree/build_tree.h @@ -40,6 +40,12 @@ struct GPairBase { this->hess += b.hess; return *this; } + __host__ __device__ auto operator-=(const GPairBase& b) -> GPairBase& + { + this->grad -= b.grad; + this->hess -= b.hess; + return *this; + } }; template diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h index 508b351f..10891e02 100644 --- a/src/models/tree/matrix_types.h +++ b/src/models/tree/matrix_types.h @@ -16,6 +16,11 @@ #pragma once #include #include +#ifdef __CUDACC__ +#include +#else +#include +#endif // Create a uniform interface to two matrix formats // Dense and CSR @@ -30,7 +35,12 @@ class DenseXMatrix { public: DenseXMatrix(legate::AccessorRO x, legate::Rect<3> shape) : x(x), shape(shape) {} - __host__ __device__ T Get(uint32_t i, uint32_t j) const { return x[legate::Point<3>{i, j, 0}]; } + // Global row index refers to the index across partitions + // For features, each worker has every feature so the global is the same as the local index + __host__ __device__ T Get(std::size_t global_row_idx, uint32_t feature_idx) const + { + return x[legate::Point<3>{global_row_idx, feature_idx, 0}]; + } __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; } __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { @@ -43,38 +53,58 @@ class CSRXMatrix { public: using value_type = T; - private: legate::AccessorRO values; + legate::Rect<1, legate::coord_t> vals_shape; legate::AccessorRO column_indices; legate::AccessorRO, 1> row_ranges; legate::Rect<1, legate::coord_t> row_ranges_shape; int num_features; + std::size_t nnz; // The number of nnz in ths local partition - public: CSRXMatrix(legate::AccessorRO values, legate::AccessorRO column_indices, legate::AccessorRO, 1> row_ranges, + legate::Rect<1, legate::coord_t> vals_shape, legate::Rect<1, legate::coord_t> row_ranges_shape, - int num_features) + int num_features, + std::size_t nnz) : values(values), column_indices(column_indices), row_ranges(row_ranges), num_features(num_features), - row_ranges_shape(row_ranges_shape) + vals_shape(vals_shape), + row_ranges_shape(row_ranges_shape), + nnz(nnz) { } - // Slower than dense due to search for column index - __host__ __device__ T Get(uint32_t i, uint32_t j) const + // Global row index refers to the index across partitions + // For features, each worker has every feature so the global is the same as the local index + // This method is less efficient than its Dense counterpart due to the need to search for the + // feature + __host__ __device__ T Get(std::size_t global_row_idx, uint32_t feature_idx) const { - auto row_range = row_ranges[i]; - // TODO(Rory): Binary search? - for (int64_t k = row_range.lo; k <= row_range.hi; k++) { - if (column_indices[k] == j) return values[k]; - if (column_indices[k] > j) return 0; + auto row_range = row_ranges[global_row_idx]; + + tcb::span column_indices_span(column_indices.ptr(row_range.lo), + row_range.volume()); + +#ifdef __CUDACC__ + auto result = thrust::lower_bound( + thrust::seq, column_indices_span.begin(), column_indices_span.end(), feature_idx); +#else + auto result = + std::lower_bound(column_indices_span.begin(), column_indices_span.end(), feature_idx); +#endif + + if (result != column_indices_span.end() && *result == feature_idx) { + return values[row_range.lo + (result - column_indices_span.begin())]; } return 0; } + + auto NNZ() const { return nnz; } + __host__ __device__ int NumFeatures() const { return num_features; } __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; } }; diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc index d24eaf1f..19f80bc2 100644 --- a/src/models/tree/predict.cc +++ b/src/models/tree/predict.cc @@ -96,8 +96,13 @@ struct predict_csr_fn { auto pred_accessor = pred.write_accessor(); auto num_features = context.scalars().at(0).value(); - CSRXMatrix X( - X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); + CSRXMatrix X(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape); diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu index 181070f3..c1c94770 100644 --- a/src/models/tree/predict.cu +++ b/src/models/tree/predict.cu @@ -109,8 +109,13 @@ struct predict_csr_fn { auto pred_accessor = pred.write_accessor(); auto num_features = context.scalars().at(0).value(); - CSRXMatrix X( - X_vals_accessor, X_coords_accessor, X_offsets_accessor, X_offsets_shape, num_features); + CSRXMatrix X(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape); From ea45f872609563b48c1e3f7a16443743b97819ec Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 12 Dec 2024 03:21:10 -0800 Subject: [PATCH 09/10] Add CPU sparse algorithm --- legateboost/test/models/test_tree.py | 1 + legateboost/test/test_estimator.py | 11 ++- src/models/tree/build_tree.cc | 119 ++++++++++++++++++++------- src/models/tree/build_tree.cu | 71 ++++++---------- src/models/tree/build_tree.h | 34 ++++++++ src/models/tree/matrix_types.h | 7 +- src/models/tree/predict.cc | 2 +- src/models/tree/predict.cu | 4 +- 8 files changed, 162 insertions(+), 87 deletions(-) diff --git a/legateboost/test/models/test_tree.py b/legateboost/test/models/test_tree.py index 7d5f2e04..bb9fa9aa 100644 --- a/legateboost/test/models/test_tree.py +++ b/legateboost/test/models/test_tree.py @@ -88,4 +88,5 @@ def test_csr(): .set_random_state(np.random.RandomState(2)) .fit(X, g, h) ) + print(model) assert np.allclose(model.predict(X), -g / h) diff --git a/legateboost/test/test_estimator.py b/legateboost/test/test_estimator.py index 17affbbf..5af10a73 100644 --- a/legateboost/test/test_estimator.py +++ b/legateboost/test/test_estimator.py @@ -244,7 +244,10 @@ def test_csr_input(base_model): assert cn.allclose(sparse_pred, dense_pred) # Generate a sparse dataset and check that dense and sparse - # input give a bitwise equal result + # input give equivalent results + # unfortunately we can't test that they are bitwise identical + # the changing order of floating point sums can lead to different results + # so instead assert that sparse models are roughly as accurate as dense ones rng = np.random.RandomState(0) X = rng.binomial(1, 0.1, (100, 100)).astype(np.float32) y = rng.randint(0, 5, 100) @@ -258,7 +261,8 @@ def test_csr_input(base_model): X_csr = csr_matrix(X) assert X_csr.nnz < X.size sparse_pred = sparse_model.fit(X_csr, y).predict(X_csr) - assert cn.all(dense_pred == sparse_pred) + assert sparse_model.score(X, y) > 0.4 + assert dense_model.score(X, y) > 0.4 sanity_check_models(sparse_model) # classification @@ -266,5 +270,6 @@ def test_csr_input(base_model): dense_model = lb.LBClassifier(**params) dense_pred = dense_model.fit(X, y).predict_proba(X) sparse_pred = sparse_model.fit(X_csr, y).predict_proba(X_csr) - assert cn.all(dense_pred == sparse_pred) + assert sparse_model.score(X, y) > 0.8 + assert dense_model.score(X, y) > 0.8 sanity_check_models(sparse_model) diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc index 624b3da1..55957ecc 100644 --- a/src/models/tree/build_tree.cc +++ b/src/models/tree/build_tree.cc @@ -146,7 +146,7 @@ auto SelectSplitSamples(legate::TaskContext context, auto draft_proposals = legate::create_buffer({X.NumFeatures(), split_samples}); for (int i = 0; i < split_samples; i++) { auto row = row_samples[i]; - const bool has_data = X.RowRange().contains(row); + const bool has_data = X.RowSubset().contains(row); for (int j = 0; j < X.NumFeatures(); j++) { draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0); } @@ -230,24 +230,51 @@ struct TreeBuilder { return tree; } - void ComputeHistogram(Histogram histogram, - legate::TaskContext context, - Tree& tree, - const MatrixT& X, - const legate::AccessorRO& g, - const legate::AccessorRO& h, - NodeBatch batch) + void DenseHistogramKernel(const Tree& tree, + Histogram& histogram, + const DenseXMatrix& X, + legate::AccessorRO g, + legate::AccessorRO h, + NodeBatch batch) { // Build the histogram for (auto [position, index_local] : batch) { - auto index_global = index_local + X.RowRange().lo[0]; + auto index_global = index_local + X.RowSubset().lo[0]; bool const compute = ComputeHistogramBin( position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position))); if (position < 0 || !compute) { continue; } for (int64_t j = 0; j < num_features; j++) { auto x_value = X.Get(index_global, j); int const bin_idx = split_proposals.FindBin(x_value, j); + if (bin_idx != SparseSplitProposals::NOT_FOUND) { + for (int64_t k = 0; k < num_outputs; ++k) { + histogram[{position, k, bin_idx}] += + GPair{g[{index_global, 0, k}], h[{index_global, 0, k}]}; + } + } + } + } + } + // Kernel specialised to iterate only over the non-zero elements of the sparse matrix + void CSRHistogramKernel(const Tree& tree, + Histogram& histogram, + const CSRXMatrix& X, + legate::AccessorRO g, + legate::AccessorRO h, + NodeBatch batch) + { + // Build the histogram + for (auto [position, index_local] : batch) { + auto index_global = index_local + X.RowSubset().lo[0]; + bool const compute = ComputeHistogramBin( + position, tree.node_sums, histogram.ContainsNode(BinaryTree::Parent(position))); + if (position < 0 || !compute) { continue; } + auto row_range = X.row_ranges[index_global]; + for (auto element_idx = row_range.lo[0]; element_idx <= row_range.hi[0]; element_idx++) { + auto feature = X.column_indices[element_idx]; + auto x = X.values[element_idx]; + int const bin_idx = split_proposals.FindBin(x, feature); if (bin_idx != SparseSplitProposals::NOT_FOUND) { for (int64_t k = 0; k < num_outputs; ++k) { histogram[{position, k, bin_idx}] += @@ -256,6 +283,21 @@ struct TreeBuilder { } } } + } + + void ComputeHistogram(Histogram histogram, + legate::TaskContext context, + const Tree& tree, + const MatrixT& X, + const legate::AccessorRO& g, + const legate::AccessorRO& h, + NodeBatch batch) + { + if constexpr (std::is_same_v>) { + this->DenseHistogramKernel(tree, histogram, X, g, h, batch); + } else { + this->CSRHistogramKernel(tree, histogram, X, g, h, batch); + } // NCCL cannot allreduce custom types, need to reinterpret as double SumAllReduce( @@ -266,7 +308,7 @@ struct TreeBuilder { this->Scan(histogram, batch, tree); } - void Scan(Histogram histogram, NodeBatch batch, Tree& tree) + void Scan(Histogram histogram, NodeBatch batch, const Tree& tree) { auto scan_node_histogram = [&](int node_idx) { for (int feature = 0; feature < num_features; feature++) { @@ -319,6 +361,7 @@ struct TreeBuilder { } void PerformBestSplit(Tree& tree, Histogram histogram, double alpha, NodeBatch batch) { + const bool is_sparse_matrix = std::is_same_v>; for (int node_id = batch.node_idx_begin; node_id < batch.node_idx_end; node_id++) { double best_gain = 0; int best_feature = -1; @@ -328,10 +371,18 @@ struct TreeBuilder { for (int bin_idx = feature_begin; bin_idx < feature_end; bin_idx++) { double gain = 0; for (int output = 0; output < num_outputs; ++output) { - auto [G_L, H_L] = histogram[{node_id, output, bin_idx}]; - auto [G, H] = tree.node_sums[{node_id, output}]; - auto G_R = G - G_L; - auto H_R = H - H_L; + auto [left_sum, right_sum] = InferSplitSums(histogram, + split_proposals, + tree.node_sums[{node_id, output}], + node_id, + output, + bin_idx, + feature, + is_sparse_matrix); + auto [G_L, H_L] = left_sum; + auto [G_R, H_R] = right_sum; + auto [G, H] = tree.node_sums[{node_id, output}]; + double const reg = std::max(eps, alpha); // Regularisation term gain += 0.5 * ((G_L * G_L) / (H_L + reg) + (G_R * G_R) / (H_R + reg) - (G * G) / (H + reg)); @@ -344,29 +395,33 @@ struct TreeBuilder { } } if (best_gain > eps) { - std::vector left_leaf(num_outputs); - std::vector right_leaf(num_outputs); - std::vector left_sum(num_outputs); - std::vector right_sum(num_outputs); + std::vector left_leaves(num_outputs); + std::vector right_leaves(num_outputs); + std::vector left_sums(num_outputs); + std::vector right_sums(num_outputs); for (int output = 0; output < num_outputs; ++output) { - auto [G_L, H_L] = histogram[{node_id, output, best_bin}]; - auto [G, H] = tree.node_sums[{node_id, output}]; - auto G_R = G - G_L; - auto H_R = H - H_L; - left_leaf[output] = CalculateLeafValue(G_L, H_L, alpha); - right_leaf[output] = CalculateLeafValue(G_R, H_R, alpha); - left_sum[output] = {G_L, H_L}; - right_sum[output] = {G_R, H_R}; + auto [left_sum, right_sum] = InferSplitSums(histogram, + split_proposals, + tree.node_sums[{node_id, output}], + node_id, + output, + best_bin, + best_feature, + is_sparse_matrix); + left_leaves[output] = CalculateLeafValue(left_sum.grad, left_sum.hess, alpha); + right_leaves[output] = CalculateLeafValue(right_sum.grad, right_sum.hess, alpha); + left_sums[output] = left_sum; + right_sums[output] = right_sum; } - if (left_sum[0].hess <= 0.0 || right_sum[0].hess <= 0.0) { continue; } + if (left_sums[0].hess <= 0.0 || right_sums[0].hess <= 0.0) { continue; } tree.AddSplit(node_id, best_feature, split_proposals.split_proposals[legate::coord_t{best_bin}], - left_leaf, - right_leaf, + left_leaves, + right_leaves, best_gain, - left_sum, - right_sum); + left_sums, + right_sums); } } } @@ -379,7 +434,7 @@ struct TreeBuilder { sorted_positions[i] = {-1, index_local}; continue; } - auto x = X.Get(X.RowRange().lo[0] + index_local, tree.feature[pos]); + auto x = X.Get(X.RowSubset().lo[0] + index_local, tree.feature[pos]); bool const left = x <= tree.split_value[pos]; pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); sorted_positions[i] = {pos, index_local}; diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu index 51be29ff..884f945c 100644 --- a/src/models/tree/build_tree.cu +++ b/src/models/tree/build_tree.cu @@ -341,10 +341,10 @@ struct HistogramAgent { sample_node, node_sums, histogram.ContainsNode(BinaryTree::Parent(sample_node))); if (!computeHistogram) { continue; } - auto x = X.Get(X.RowRange().lo[0] + local_sample_idx, feature); + auto x = X.Get(X.RowSubset().lo[0] + local_sample_idx, feature); int const bin_idx = split_proposals.FindBin(x, feature); - legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx, 0, output}; + legate::Point<3> p = {X.RowSubset().lo[0] + local_sample_idx, 0, output}; auto gpair_quantised = quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2])); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) @@ -391,7 +391,7 @@ struct HistogramAgent { std::array x{}; #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - x[i] = X.Get(X.RowRange().lo[0] + local_sample_idx[i], feature[i]); + x[i] = X.Get(X.RowSubset().lo[0] + local_sample_idx[i], feature[i]); } std::array bin_idx{}; @@ -402,7 +402,7 @@ struct HistogramAgent { std::array gpair{}; #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - legate::Point<3> p = {X.RowRange().lo[0] + local_sample_idx[i], 0, output}; + legate::Point<3> p = {X.RowSubset().lo[0] + local_sample_idx[i], 0, output}; gpair[i] = bin_idx[i] != SparseSplitProposals::NOT_FOUND ? quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2])) @@ -512,7 +512,7 @@ __global__ void __launch_bounds__(kBlockThreads) continue; } // Which matrix elements belong to this row? - std::size_t const global_sample_idx = X.RowRange().lo[0] + local_sample_idx; + std::size_t const global_sample_idx = X.RowSubset().lo[0] + local_sample_idx; auto elements = X.row_ranges[global_sample_idx]; for (std::size_t element = elements.lo[0]; element <= elements.hi[0]; element++) { auto feature = X.column_indices[X.vals_shape.lo[0] + element]; @@ -737,42 +737,12 @@ struct GainFeaturePair { } }; -// In the case where we have a sparse matrix, gradients for 0's have not been accumulated in the -// histogram We can infer the gradients at the matrix zeroes by subtracting the sum of the gradients -// at the last bin (which contains gradients from every non-zero element) -// from the sum of the gradients in the node (this sum always includes gradients for every element -// in that node) -template -__device__ auto GetSparseSum(Histogram& histogram, - const SparseSplitProposals& split_proposals, - const IntegerGPair& node_sum, - int node_id, - int output, - int bin_idx) -{ - auto left_sum = vectorised_load(&histogram[{node_id, output, bin_idx}]); - auto right_sum = node_sum - left_sum; - auto feature = split_proposals.FindFeature(bin_idx); - auto [feature_begin, feature_end] = split_proposals.FeatureRange(feature); - auto scan_sum = vectorised_load(&histogram[{node_id, output, feature_end - 1}]); - auto zero_bin = split_proposals.FindBin(0.0, feature); - auto sparse_sum = node_sum - scan_sum; - if (zero_bin == SparseSplitProposals::NOT_FOUND || bin_idx < zero_bin) { - // Do nothing, this amount is already on the right - } else { - // Move it to the left - left_sum += sparse_sum; - right_sum -= sparse_sum; - } - return std::make_tuple(left_sum, right_sum); -} - // NOLINTBEGIN(performance-unnecessary-value-param) -template +template __global__ void __launch_bounds__(BLOCK_THREADS) perform_best_split(Histogram histogram, size_t n_outputs, - SparseSplitProposals split_proposals, + SparseSplitProposals split_proposals, double eps, double alpha, legate::Buffer tree_leaf_value, @@ -781,7 +751,8 @@ __global__ void __launch_bounds__(BLOCK_THREADS) legate::Buffer tree_split_value, legate::Buffer tree_gain, NodeBatch batch, - GradientQuantiser quantiser) + GradientQuantiser quantiser, + bool is_sparse) { // using one block per (level) node to have blockwise reductions int const node_id = narrow(batch.node_idx_begin + blockIdx.x); @@ -803,10 +774,9 @@ __global__ void __launch_bounds__(BLOCK_THREADS) for (int output = 0; output < n_outputs; ++output) { auto node_sum = vectorised_load(&node_sums[{node_id, output}]); - auto [left_sum, right_sum] = - GetSparseSum(histogram, split_proposals, node_sum, node_id, output, bin_idx); - // printf("node %d , bin %d, left sum %ld %ld right sum %ld %ld \n", node_id ,bin_idx, - // left_sum.grad, left_sum.hess, right_sum.grad, right_sum.hess); + auto feature = split_proposals.FindFeature(bin_idx); + auto [left_sum, right_sum] = InferSplitSums( + histogram, split_proposals, node_sum, node_id, output, bin_idx, feature, is_sparse); if (left_sum.hess <= 0 || right_sum.hess <= 0) { gain = 0; @@ -844,8 +814,14 @@ __global__ void __launch_bounds__(BLOCK_THREADS) for (int output = narrow_cast(threadIdx.x); output < n_outputs; output += BLOCK_THREADS) { auto node_sum = vectorised_load(&node_sums[{node_id, output}]); - auto [left_sum, right_sum] = - GetSparseSum(histogram, split_proposals, node_sum, node_id, output, node_best_bin_idx); + auto [left_sum, right_sum] = InferSplitSums(histogram, + split_proposals, + node_sum, + node_id, + output, + node_best_bin_idx, + node_best_feature, + is_sparse); node_sums[{BinaryTree::LeftChild(node_id), output}] = left_sum; node_sums[{BinaryTree::RightChild(node_id), output}] = right_sum; @@ -978,7 +954,7 @@ SparseSplitProposals SelectSplitSamples(legate::TaskContext context, auto i = idx / X.NumFeatures(); auto j = idx % X.NumFeatures(); auto row = row_samples[i]; - bool has_data = X.RowRange().contains(row); + bool has_data = X.RowSubset().contains(row); draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0); }); @@ -1155,7 +1131,7 @@ struct TreeBuilder { } double x_value = - X.Get(X.RowRange().lo[0] + static_cast(row), tree_feature_span[pos]); + X.Get(X.RowSubset().lo[0] + static_cast(row), tree_feature_span[pos]); bool left = x_value <= tree_split_value_span[pos]; pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); // printf("Row %d, feature %d, value %f pos %d\n", row, tree_feature_span[pos], @@ -1255,7 +1231,8 @@ struct TreeBuilder { tree.split_value, tree.gain, batch, - quantiser); + quantiser, + std::is_same_v>); CHECK_CUDA_STREAM(stream); } void InitialiseRoot(legate::TaskContext context, diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h index a0026b70..9c3f7e12 100644 --- a/src/models/tree/build_tree.h +++ b/src/models/tree/build_tree.h @@ -249,6 +249,40 @@ class Histogram { } }; +// From the scanned histogram gradients and the node sums, infer the gradients for the left and +// right partitions For a dense matrix the left sum is the scanned histogram and the right sum is +// the node sum minus the left sum In the case where we have a sparse matrix, gradients for 0's have +// not been accumulated in the histogram. We can infer the gradients at the matrix zeroes by +// subtracting the sum of the gradients at the last bin (which contains gradients from every +// non-zero element) from the sum of the gradients in the node (this sum always includes gradients +// for every element in that node) +template +__host__ __device__ auto InferSplitSums(Histogram& scanned_histogram, + const SparseSplitProposals& split_proposals, + const GPairT& node_sum, + int node_id, + int output, + int bin_idx, + int feature, + bool is_sparse) -> std::tuple +{ + auto left_sum = scanned_histogram[{node_id, output, bin_idx}]; + auto right_sum = node_sum - left_sum; + if (!is_sparse) { return std::make_tuple(left_sum, right_sum); } + auto [feature_begin, feature_end] = split_proposals.FeatureRange(feature); + auto scan_sum = scanned_histogram[{node_id, output, feature_end - 1}]; + auto zero_bin = split_proposals.FindBin(0.0, feature); + auto sparse_sum = node_sum - scan_sum; + if (zero_bin == SparseSplitProposals::NOT_FOUND || bin_idx < zero_bin) { + // Do nothing, this amount is already on the right + } else { + // Move it to the left + left_sum += sparse_sum; + right_sum -= sparse_sum; + } + return std::make_tuple(left_sum, right_sum); +} + class BuildTreeDenseTask : public Task { public: static void cpu_variant(legate::TaskContext context); diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h index 10891e02..a7c21839 100644 --- a/src/models/tree/matrix_types.h +++ b/src/models/tree/matrix_types.h @@ -42,7 +42,7 @@ class DenseXMatrix { return x[legate::Point<3>{global_row_idx, feature_idx, 0}]; } __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; } - __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const + __host__ __device__ legate::Rect<1, legate::coord_t> RowSubset() const { return {shape.lo[0], shape.hi[0]}; } @@ -106,5 +106,8 @@ class CSRXMatrix { auto NNZ() const { return nnz; } __host__ __device__ int NumFeatures() const { return num_features; } - __host__ __device__ legate::Rect<1, legate::coord_t> RowRange() const { return row_ranges_shape; } + __host__ __device__ legate::Rect<1, legate::coord_t> RowSubset() const + { + return row_ranges_shape; + } }; diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc index 19f80bc2..0e5cb88e 100644 --- a/src/models/tree/predict.cc +++ b/src/models/tree/predict.cc @@ -29,7 +29,7 @@ void PredictRows(const MatrixT& X, legate::AccessorRO feature, legate::AccessorRO leaf_value) { - for (int64_t i = X.RowRange().lo[0]; i <= X.RowRange().hi[0]; i++) { + for (int64_t i = X.RowSubset().lo[0]; i <= X.RowSubset().hi[0]; i++) { int pos = 0; // Use a max depth of 100 to avoid infinite loops const int max_depth = 100; diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu index c1c94770..3b573c5d 100644 --- a/src/models/tree/predict.cu +++ b/src/models/tree/predict.cu @@ -37,7 +37,7 @@ void PredictRows(const MatrixT& X, // rowwise kernel auto prediction_lambda = [=] __device__(size_t idx) { int64_t pos = 0; - auto global_row_idx = X.RowRange().lo + idx; + auto global_row_idx = X.RowSubset().lo + idx; // Use a max depth of 100 to avoid infinite loops const int max_depth = 100; for (int depth = 0; depth < max_depth; depth++) { @@ -50,7 +50,7 @@ void PredictRows(const MatrixT& X, } }; // NOLINT(readability/braces) - LaunchN(X.RowRange().volume(), stream, prediction_lambda); + LaunchN(X.RowSubset().volume(), stream, prediction_lambda); CHECK_CUDA_STREAM(stream); } From 936570e8edd957c95a7045d6a3a4ea3832b91334 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 9 Jan 2025 01:04:47 -0800 Subject: [PATCH 10/10] clang-tidy and other fixes --- src/models/tree/build_tree.cc | 47 +++++++++++------------ src/models/tree/build_tree.cu | 69 +++++++++++++++++----------------- src/models/tree/build_tree.h | 10 ++++- src/models/tree/matrix_types.h | 45 +++++++++++++--------- src/models/tree/predict.cc | 22 +++++------ src/models/tree/predict.cu | 22 +++++------ 6 files changed, 116 insertions(+), 99 deletions(-) diff --git a/src/models/tree/build_tree.cc b/src/models/tree/build_tree.cc index de3a0f39..0d14872a 100644 --- a/src/models/tree/build_tree.cc +++ b/src/models/tree/build_tree.cc @@ -172,7 +172,7 @@ auto SelectSplitSamples(legate::TaskContext context, // Set the largest split sample to +inf such that an element must belong to one of the bins // i.e. we cannot go off the end when searching for a bin - for (int feature = 0; feature < num_features; feature++) { + for (int feature = 0; feature < X.NumFeatures(); feature++) { auto end = row_pointers[feature + 1]; split_proposals[end - 1] = std::numeric_limits::infinity(); } @@ -213,12 +213,12 @@ struct TreeBuilder { max_batch_size = max_histogram_nodes; } - Tree Build(legate::TaskContext context, + auto Build(legate::TaskContext context, const MatrixT& X_matrix, - legate::AccessorRO g_accessor, - legate::AccessorRO h_accessor, + const legate::AccessorRO& g_accessor, + const legate::AccessorRO& h_accessor, legate::Rect<3> g_shape, - double alpha) + double alpha) -> Tree { // Begin building the tree Tree tree(max_nodes, narrow(num_outputs)); @@ -242,8 +242,8 @@ struct TreeBuilder { void DenseHistogramKernel(const Tree& tree, Histogram& histogram, const DenseXMatrix& X, - legate::AccessorRO g, - legate::AccessorRO h, + const legate::AccessorRO& g, + const legate::AccessorRO& h, NodeBatch batch) { // Build the histogram @@ -268,8 +268,8 @@ struct TreeBuilder { void CSRHistogramKernel(const Tree& tree, Histogram& histogram, const CSRXMatrix& X, - legate::AccessorRO g, - legate::AccessorRO h, + const legate::AccessorRO& g, + const legate::AccessorRO& h, NodeBatch batch) { // Build the histogram @@ -283,11 +283,9 @@ struct TreeBuilder { auto feature = X.column_indices[element_idx]; auto x = X.values[element_idx]; int const bin_idx = split_proposals.FindBin(x, feature); - if (bin_idx != SparseSplitProposals::NOT_FOUND) { - for (int64_t k = 0; k < num_outputs; ++k) { - histogram[{position, k, bin_idx}] += - GPair{g[{index_global, 0, k}], h[{index_global, 0, k}]}; - } + for (int64_t k = 0; k < num_outputs; ++k) { + histogram[{position, k, bin_idx}] += + GPair{g[{index_global, 0, k}], h[{index_global, 0, k}]}; } } } @@ -367,7 +365,10 @@ struct TreeBuilder { } } } - void PerformBestSplit(Tree& tree, Histogram histogram, double alpha, NodeBatch batch) + void PerformBestSplit(Tree& tree, + const Histogram& histogram, + double alpha, + NodeBatch batch) { const bool is_sparse_matrix = std::is_same_v>; for (int node_id = batch.node_idx_begin; node_id < batch.node_idx_end; node_id++) { @@ -560,7 +561,7 @@ struct build_tree_dense_fn { auto seed = context.scalars().at(4).value(); auto dataset_rows = context.scalars().at(5).value(); - DenseXMatrix X_matrix(X_accessor, X_shape); + DenseXMatrix const X_matrix(X_accessor, X_shape); SparseSplitProposals const split_proposals = SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); @@ -599,13 +600,13 @@ struct build_tree_csr_fn { auto dataset_rows = context.scalars().at(5).value(); auto num_features = context.scalars().at(6).value(); - CSRXMatrix X_matrix(X_vals_accessor, - X_coords_accessor, - X_offsets_accessor, - X_vals_shape, - X_offsets_shape, - num_features, - X_vals_shape.volume()); + CSRXMatrix const X_matrix(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); const SparseSplitProposals split_proposals = SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows); diff --git a/src/models/tree/build_tree.cu b/src/models/tree/build_tree.cu index c4b528fe..e248b655 100644 --- a/src/models/tree/build_tree.cu +++ b/src/models/tree/build_tree.cu @@ -21,11 +21,11 @@ #include #include #include -#include #include #include #include #include +#include #include "legate_library.h" #include "legateboost.h" #include "../../cpp_utils/cpp_utils.h" @@ -395,7 +395,7 @@ struct HistogramAgent { std::array gpair{}; #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - legate::Point<3> p = {sample_offset + local_sample_idx[i], 0, output}; + legate::Point<3> p = {sample_node[i] + local_sample_idx[i], 0, output}; gpair[i] = quantiser.QuantiseStochasticRounding({g[p], h[p]}, hash_combine(seed, p[0], p[2])); } #pragma unroll @@ -489,8 +489,8 @@ __global__ void __launch_bounds__(kBlockThreads) int64_t seed) { // Grid stride loop over rows - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < batch.InstancesInBatch(); - idx += blockDim.x * gridDim.x) { + for (std::size_t idx = (blockIdx.x * blockDim.x) + threadIdx.x; idx < batch.InstancesInBatch(); + idx += static_cast(blockDim.x * gridDim.x)) { auto [sample_node, local_sample_idx] = batch.instances[idx]; // If we don't need to compute this node, skip if (!ComputeHistogramBin( @@ -504,7 +504,6 @@ __global__ void __launch_bounds__(kBlockThreads) auto feature = X.column_indices[X.vals_shape.lo[0] + element]; auto x = X.values[X.vals_shape.lo[0] + element]; int const bin_idx = split_proposals.FindBin(x, feature); - if (bin_idx == SparseSplitProposals::NOT_FOUND) continue; for (int output = 0; output < n_outputs; output++) { legate::Point<3> p = {global_sample_idx, 0, output}; auto gpair_quantised = @@ -872,7 +871,7 @@ struct Tree { void WriteTreeOutput(legate::TaskContext context, GradientQuantiser quantiser) { - auto stream = context.get_task_stream(); + auto* stream = context.get_task_stream(); auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); auto policy = DEFAULT_POLICY(thrust_alloc).on(stream); @@ -904,8 +903,8 @@ struct Tree { legate::Buffer split_value; legate::Buffer gain; legate::Buffer node_sums; - const int num_outputs; - const int max_nodes; + int num_outputs; + int max_nodes; cudaStream_t stream; }; @@ -914,12 +913,12 @@ struct Tree { // Remove any duplicates // Return sparse matrix of split samples for each feature template class XMatrix> -SparseSplitProposals SelectSplitSamples(legate::TaskContext context, - const XMatrix& X, - int split_samples, - int seed, - int64_t dataset_rows, - cudaStream_t stream) +auto SelectSplitSamples(legate::TaskContext context, + const XMatrix& X, + int split_samples, + int seed, + int64_t dataset_rows, + cudaStream_t stream) -> SparseSplitProposals { auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); auto policy = DEFAULT_POLICY(thrust_alloc).on(stream); @@ -940,7 +939,7 @@ SparseSplitProposals SelectSplitSamples(legate::TaskContext context, auto i = idx / X.NumFeatures(); auto j = idx % X.NumFeatures(); auto row = row_samples[i]; - bool has_data = X.RowSubset().contains(row); + bool const has_data = X.RowSubset().contains(row); draft_proposals[{j, i}] = has_data ? X.Get(row, j) : T(0); }); @@ -1000,7 +999,7 @@ SparseSplitProposals SelectSplitSamples(legate::TaskContext context, // Set the largest split sample to +inf such that an element must belong to one of the bins // i.e. we cannot go off the end when searching for a bin - LaunchN(num_features, stream, [=] __device__(int i) { + LaunchN(X.NumFeatures(), stream, [=] __device__(int i) { auto end = row_pointers_span[i + 1]; split_proposals[end - 1] = std::numeric_limits::infinity(); }); @@ -1069,14 +1068,14 @@ struct TreeBuilder { max_batch_size = max_histogram_nodes; } - Tree Build(legate::TaskContext context, + auto Build(legate::TaskContext context, const MatrixT& X_matrix, - legate::AccessorRO g_accessor, - legate::AccessorRO h_accessor, + const legate::AccessorRO& g_accessor, + const legate::AccessorRO& h_accessor, legate::Rect<3> g_shape, - double alpha) + double alpha) -> Tree { - auto stream = context.get_task_stream(); + auto* stream = context.get_task_stream(); auto thrust_alloc = ThrustAllocator(legate::Memory::GPU_FB_MEM); auto thrust_exec_policy = DEFAULT_POLICY(thrust_alloc).on(stream); @@ -1107,7 +1106,7 @@ struct TreeBuilder { auto operator=(TreeBuilder&&) -> TreeBuilder& = delete; ~TreeBuilder() = default; - void UpdatePositions(Tree& tree, const MatrixT X) + void UpdatePositions(Tree& tree, const MatrixT& X) { tcb::span const tree_feature_span(tree.feature.ptr(0), max_nodes); tcb::span const tree_split_value_span(tree.split_value.ptr(0), max_nodes); @@ -1123,10 +1122,10 @@ struct TreeBuilder { return; } - double x_value = + double const x_value = X.Get(X.RowSubset().lo[0] + static_cast(row), tree_feature_span[pos]); - bool left = x_value <= tree_split_value_span[pos]; - pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); + bool const left = x_value <= tree_split_value_span[pos]; + pos = left ? BinaryTree::LeftChild(pos) : BinaryTree::RightChild(pos); // printf("Row %d, feature %d, value %f pos %d\n", row, tree_feature_span[pos], // x_value, pos); sorted_positions[idx] = cuda::std::make_tuple(pos, row); @@ -1168,7 +1167,7 @@ struct TreeBuilder { void ComputeHistogram(Histogram histogram, legate::TaskContext context, Tree& tree, - const MatrixT X, + const MatrixT& X, const legate::AccessorRO& g, const legate::AccessorRO& h, NodeBatch batch) @@ -1359,7 +1358,7 @@ struct build_tree_fn { auto [g, g_shape, g_accessor] = GetInputStore(context.input(1).data()); auto [h, h_shape, h_accessor] = GetInputStore(context.input(2).data()); - DenseXMatrix X_matrix(X_accessor, X_shape); + DenseXMatrix const X_matrix(X_accessor, X_shape); EXPECT_DENSE_ROW_MAJOR(X_accessor.accessor, X_shape); auto num_features = X_shape.hi[1] - X_shape.lo[1] + 1; @@ -1431,17 +1430,17 @@ struct build_tree_csr_fn { auto num_features = context.scalars().at(6).value(); auto* stream = context.get_task_stream(); - CSRXMatrix X_matrix(X_vals_accessor, - X_coords_accessor, - X_offsets_accessor, - X_vals_shape, - X_offsets_shape, - num_features, - X_vals_shape.volume()); + CSRXMatrix const X_matrix(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); const SparseSplitProposals split_proposals = SelectSplitSamples(context, X_matrix, split_samples, seed, dataset_rows, stream); - GradientQuantiser quantiser(context, g_accessor, h_accessor, g_shape, stream); + GradientQuantiser const quantiser(context, g_accessor, h_accessor, g_shape, stream); // Begin building the tree auto tree = TreeBuilder>(num_rows, diff --git a/src/models/tree/build_tree.h b/src/models/tree/build_tree.h index 5348bf2f..e45138c7 100644 --- a/src/models/tree/build_tree.h +++ b/src/models/tree/build_tree.h @@ -249,6 +249,12 @@ class Histogram { { return buffer_[{p[0] - node_begin_, p[1], p[2]}]; } + + // Node, output, bin + __host__ __device__ auto operator[](legate::Point<3> p) const -> GPairT + { + return buffer_[{p[0] - node_begin_, p[1], p[2]}]; + } }; // From the scanned histogram gradients and the node sums, infer the gradients for the left and @@ -259,7 +265,7 @@ class Histogram { // non-zero element) from the sum of the gradients in the node (this sum always includes gradients // for every element in that node) template -__host__ __device__ auto InferSplitSums(Histogram& scanned_histogram, +__host__ __device__ auto InferSplitSums(const Histogram& scanned_histogram, const SparseSplitProposals& split_proposals, const GPairT& node_sum, int node_id, @@ -275,7 +281,7 @@ __host__ __device__ auto InferSplitSums(Histogram& scanned_histogram, auto scan_sum = scanned_histogram[{node_id, output, feature_end - 1}]; auto zero_bin = split_proposals.FindBin(0.0, feature); auto sparse_sum = node_sum - scan_sum; - if (zero_bin == SparseSplitProposals::NOT_FOUND || bin_idx < zero_bin) { + if (bin_idx < zero_bin) { // Do nothing, this amount is already on the right } else { // Move it to the left diff --git a/src/models/tree/matrix_types.h b/src/models/tree/matrix_types.h index a7c21839..32eb62ab 100644 --- a/src/models/tree/matrix_types.h +++ b/src/models/tree/matrix_types.h @@ -16,12 +16,14 @@ #pragma once #include #include +#include #ifdef __CUDACC__ #include #else #include #endif +namespace legateboost { // Create a uniform interface to two matrix formats // Dense and CSR template @@ -34,15 +36,22 @@ class DenseXMatrix { legate::Rect<3> shape; public: - DenseXMatrix(legate::AccessorRO x, legate::Rect<3> shape) : x(x), shape(shape) {} + DenseXMatrix(const legate::AccessorRO& x, legate::Rect<3> shape) + : x(std::move(x)), shape(shape) + { + } // Global row index refers to the index across partitions // For features, each worker has every feature so the global is the same as the local index - __host__ __device__ T Get(std::size_t global_row_idx, uint32_t feature_idx) const + [[nodiscard]] __host__ __device__ auto Get(std::size_t global_row_idx, uint32_t feature_idx) const + -> T { return x[legate::Point<3>{global_row_idx, feature_idx, 0}]; } - __host__ __device__ int NumFeatures() const { return shape.hi[1] - shape.lo[1] + 1; } - __host__ __device__ legate::Rect<1, legate::coord_t> RowSubset() const + [[nodiscard]] __host__ __device__ auto NumFeatures() const -> int + { + return shape.hi[1] - shape.lo[1] + 1; + } + [[nodiscard]] __host__ __device__ auto RowSubset() const -> legate::Rect<1, legate::coord_t> { return {shape.lo[0], shape.hi[0]}; } @@ -61,14 +70,14 @@ class CSRXMatrix { int num_features; std::size_t nnz; // The number of nnz in ths local partition - CSRXMatrix(legate::AccessorRO values, - legate::AccessorRO column_indices, - legate::AccessorRO, 1> row_ranges, + CSRXMatrix(const legate::AccessorRO& values, + const legate::AccessorRO& column_indices, + const legate::AccessorRO, 1>& row_ranges, legate::Rect<1, legate::coord_t> vals_shape, legate::Rect<1, legate::coord_t> row_ranges_shape, int num_features, std::size_t nnz) - : values(values), + : values(std::move(values)), column_indices(column_indices), row_ranges(row_ranges), num_features(num_features), @@ -82,18 +91,19 @@ class CSRXMatrix { // For features, each worker has every feature so the global is the same as the local index // This method is less efficient than its Dense counterpart due to the need to search for the // feature - __host__ __device__ T Get(std::size_t global_row_idx, uint32_t feature_idx) const + [[nodiscard]] __host__ __device__ auto Get(std::size_t global_row_idx, uint32_t feature_idx) const + -> T { - auto row_range = row_ranges[global_row_idx]; + auto row_range = row_ranges[narrow(global_row_idx)]; - tcb::span column_indices_span(column_indices.ptr(row_range.lo), - row_range.volume()); + tcb::span const column_indices_span(column_indices.ptr(row_range.lo), + row_range.volume()); #ifdef __CUDACC__ - auto result = thrust::lower_bound( + const auto* result = thrust::lower_bound( thrust::seq, column_indices_span.begin(), column_indices_span.end(), feature_idx); #else - auto result = + const auto* result = std::lower_bound(column_indices_span.begin(), column_indices_span.end(), feature_idx); #endif @@ -103,11 +113,12 @@ class CSRXMatrix { return 0; } - auto NNZ() const { return nnz; } + [[nodiscard]] auto NNZ() const { return nnz; } - __host__ __device__ int NumFeatures() const { return num_features; } - __host__ __device__ legate::Rect<1, legate::coord_t> RowSubset() const + [[nodiscard]] __host__ __device__ auto NumFeatures() const -> int { return num_features; } + [[nodiscard]] __host__ __device__ auto RowSubset() const -> legate::Rect<1, legate::coord_t> { return row_ranges_shape; } }; +} // namespace legateboost diff --git a/src/models/tree/predict.cc b/src/models/tree/predict.cc index 0e5cb88e..13e36f0c 100644 --- a/src/models/tree/predict.cc +++ b/src/models/tree/predict.cc @@ -23,11 +23,11 @@ namespace legateboost { namespace { template void PredictRows(const MatrixT& X, - legate::AccessorWO pred_accessor, + const legate::AccessorWO& pred_accessor, legate::Rect<3, legate::coord_t> pred_shape, - legate::AccessorRO split_value, - legate::AccessorRO feature, - legate::AccessorRO leaf_value) + const legate::AccessorRO& split_value, + const legate::AccessorRO& feature, + const legate::AccessorRO& leaf_value) { for (int64_t i = X.RowSubset().lo[0]; i <= X.RowSubset().hi[0]; i++) { int pos = 0; @@ -96,13 +96,13 @@ struct predict_csr_fn { auto pred_accessor = pred.write_accessor(); auto num_features = context.scalars().at(0).value(); - CSRXMatrix X(X_vals_accessor, - X_coords_accessor, - X_offsets_accessor, - X_vals_shape, - X_offsets_shape, - num_features, - X_vals_shape.volume()); + CSRXMatrix const X(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape); diff --git a/src/models/tree/predict.cu b/src/models/tree/predict.cu index 3b573c5d..7c93b0f6 100644 --- a/src/models/tree/predict.cu +++ b/src/models/tree/predict.cu @@ -27,11 +27,11 @@ namespace { template void PredictRows(const MatrixT& X, - legate::AccessorWO pred_accessor, + const legate::AccessorWO& pred_accessor, legate::Rect<3, legate::coord_t> pred_shape, - legate::AccessorRO split_value, - legate::AccessorRO feature, - legate::AccessorRO leaf_value, + const legate::AccessorRO& split_value, + const legate::AccessorRO& feature, + const legate::AccessorRO& leaf_value, cudaStream_t stream) { // rowwise kernel @@ -109,13 +109,13 @@ struct predict_csr_fn { auto pred_accessor = pred.write_accessor(); auto num_features = context.scalars().at(0).value(); - CSRXMatrix X(X_vals_accessor, - X_coords_accessor, - X_offsets_accessor, - X_vals_shape, - X_offsets_shape, - num_features, - X_vals_shape.volume()); + CSRXMatrix const X(X_vals_accessor, + X_coords_accessor, + X_offsets_accessor, + X_vals_shape, + X_offsets_shape, + num_features, + X_vals_shape.volume()); EXPECT_AXIS_ALIGNED(0, X_offsets_shape, pred_shape);