From a530153ae8d8a89fe31caa190ce5384144071079 Mon Sep 17 00:00:00 2001 From: Alexander Popkov <30959770+alexander-pv@users.noreply.github.com> Date: Mon, 23 Oct 2023 09:19:19 +0300 Subject: [PATCH] Reduce sklearn dependency in causalml (#686) * Add private code with Cython tree structures * Add tree code compilation into setup.py * Update imports and remove redundant code in causal trees * Remove outdated commands in Makefile * Add details into contributing doc * Update sklearn and numpy dependencies * Keep line-by-line cython modules description in setup.py * Fix joblib support * Update joblib additional args parsing for older python versions * Add causalforest support for sklearn>=1.2.0 --- CONTRIBUTING.md | 4 +- Makefile | 6 +- causalml/inference/tree/_tree/__init__.py | 4 + causalml/inference/tree/_tree/_classes.py | 322 +++ causalml/inference/tree/_tree/_criterion.pxd | 78 + causalml/inference/tree/_tree/_criterion.pyx | 398 ++++ causalml/inference/tree/_tree/_splitter.pxd | 98 + causalml/inference/tree/_tree/_splitter.pyx | 1547 ++++++++++++++ causalml/inference/tree/_tree/_tree.pxd | 114 ++ causalml/inference/tree/_tree/_tree.pyx | 1771 +++++++++++++++++ causalml/inference/tree/_tree/_utils.pxd | 121 ++ causalml/inference/tree/_tree/_utils.pyx | 311 +++ causalml/inference/tree/causal/__init__.py | 0 causalml/inference/tree/causal/_builder.pxd | 12 +- causalml/inference/tree/causal/_builder.pyx | 2 +- causalml/inference/tree/causal/_criterion.pxd | 6 +- causalml/inference/tree/causal/_criterion.pyx | 1 + causalml/inference/tree/causal/_tree.py | 20 +- .../inference/tree/causal/causalforest.py | 83 +- causalml/inference/tree/causal/causaltree.py | 6 +- pyproject.toml | 10 +- setup.py | 36 +- 22 files changed, 4859 insertions(+), 91 deletions(-) create mode 100755 causalml/inference/tree/_tree/__init__.py create mode 100644 causalml/inference/tree/_tree/_classes.py create mode 100644 causalml/inference/tree/_tree/_criterion.pxd create mode 100755 causalml/inference/tree/_tree/_criterion.pyx create mode 100644 causalml/inference/tree/_tree/_splitter.pxd create mode 100644 causalml/inference/tree/_tree/_splitter.pyx create mode 100644 causalml/inference/tree/_tree/_tree.pxd create mode 100755 causalml/inference/tree/_tree/_tree.pyx create mode 100644 causalml/inference/tree/_tree/_utils.pxd create mode 100755 causalml/inference/tree/_tree/_utils.pyx mode change 100644 => 100755 causalml/inference/tree/causal/__init__.py mode change 100644 => 100755 causalml/inference/tree/causal/_builder.pxd mode change 100644 => 100755 causalml/inference/tree/causal/_builder.pyx mode change 100644 => 100755 causalml/inference/tree/causal/_criterion.pxd mode change 100644 => 100755 causalml/inference/tree/causal/_criterion.pyx mode change 100644 => 100755 causalml/inference/tree/causal/_tree.py mode change 100644 => 100755 causalml/inference/tree/causal/causalforest.py mode change 100644 => 100755 causalml/inference/tree/causal/causaltree.py mode change 100644 => 100755 pyproject.toml mode change 100644 => 100755 setup.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 27b1251f..bcae1a52 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,8 +66,9 @@ If you added a new inference method, add test code to the `tests/` folder. **CausalML** uses `pytest` for tests. Install `pytest` and `pytest-cov`, and the package dependencies: ```bash -$ pip install pytest pytest-cov -r requirements.txt +$ pip install .[test] ``` +See details for test dependencies in `pyproject.toml` ### Building Cython @@ -75,6 +76,7 @@ In order to run tests, you need to build the Cython modules ```bash $ python setup.py build_ext --inplace ``` +This is important because during testing causalml modules are imported from the source code. ### Testing diff --git a/Makefile b/Makefile index 70258628..4c1c383d 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,7 @@ .PHONY: build_ext -build_ext: install_req clean +build_ext: clean python setup.py build_ext --force --inplace -.PHONY: install_req -install_req: - pip install -r requirements.txt - .PHONY: build build: build_ext python setup.py bdist_wheel diff --git a/causalml/inference/tree/_tree/__init__.py b/causalml/inference/tree/_tree/__init__.py new file mode 100755 index 00000000..20af086a --- /dev/null +++ b/causalml/inference/tree/_tree/__init__.py @@ -0,0 +1,4 @@ +""" +This part of tree structures definition was initially borrowed from +https://github.com/scikit-learn/scikit-learn/tree/1.0.2/sklearn/tree +""" diff --git a/causalml/inference/tree/_tree/_classes.py b/causalml/inference/tree/_tree/_classes.py new file mode 100644 index 00000000..70031b6d --- /dev/null +++ b/causalml/inference/tree/_tree/_classes.py @@ -0,0 +1,322 @@ +""" +This module gathers tree-based methods, including decision, regression and +randomized trees. Single and multi-output problems are both handled. +""" + +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Noel Dawe +# Satrajit Gosh +# Joly Arnaud +# Fares Hedayati +# Nelson Liu +# +# License: BSD 3 clause + + +from abc import ABCMeta +from abc import abstractmethod +from sklearn.base import MultiOutputMixin +from sklearn.base import BaseEstimator +from sklearn.base import is_classifier, clone +from sklearn.utils import Bunch +from sklearn.utils.validation import check_is_fitted + +import numpy as np +from scipy.sparse import issparse + +from ._tree import Tree +from ._tree import _build_pruned_tree_ccp +from ._tree import ccp_pruning_path +from . import _tree, _splitter + +DTYPE = _tree.DTYPE +DOUBLE = _tree.DOUBLE + +DENSE_SPLITTERS = { + "best": _splitter.BestSplitter, + "random": _splitter.RandomSplitter, +} + +SPARSE_SPLITTERS = { + "best": _splitter.BestSparseSplitter, + "random": _splitter.RandomSparseSplitter, +} + + +# ============================================================================= +# Base decision tree +# ============================================================================= + + +class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): + """Base class for decision trees. + + Warning: This class should not be used directly. + Use derived classes instead. + """ + + @abstractmethod + def __init__( + self, + *, + criterion, + splitter, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_features, + max_leaf_nodes, + random_state, + min_impurity_decrease, + class_weight=None, + ccp_alpha=0.0, + ): + self.criterion = criterion + self.splitter = splitter + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.max_leaf_nodes = max_leaf_nodes + self.random_state = random_state + self.min_impurity_decrease = min_impurity_decrease + self.class_weight = class_weight + self.ccp_alpha = ccp_alpha + + @abstractmethod + def fit( + self, X, y, sample_weight=None, check_input=True, X_idx_sorted="deprecated" + ): + pass + + def get_depth(self): + """Return the depth of the decision tree. + + The depth of a tree is the maximum distance between the root + and any leaf. + + Returns + ------- + self.tree_.max_depth : int + The maximum depth of the tree. + """ + check_is_fitted(self) + return self.tree_.max_depth + + def get_n_leaves(self): + """Return the number of leaves of the decision tree. + + Returns + ------- + self.tree_.n_leaves : int + Number of leaves. + """ + check_is_fitted(self) + return self.tree_.n_leaves + + def _validate_X_predict(self, X, check_input): + """Validate the training data on predict (probabilities).""" + if check_input: + X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) + if issparse(X) and ( + X.indices.dtype != np.intc or X.indptr.dtype != np.intc + ): + raise ValueError("No support for np.int64 index based sparse matrices") + else: + # The number of features is checked regardless of `check_input` + self._check_n_features(X, reset=False) + return X + + def predict(self, X, check_input=True): + """Predict class or regression value for X. + + For a classification model, the predicted class for each sample in X is + returned. For a regression model, the predicted value based on X is + returned. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + Returns + ------- + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The predicted classes, or the predict values. + """ + check_is_fitted(self) + X = self._validate_X_predict(X, check_input) + proba = self.tree_.predict(X) + n_samples = X.shape[0] + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + return self.classes_.take(np.argmax(proba, axis=1), axis=0) + + else: + class_type = self.classes_[0].dtype + predictions = np.zeros((n_samples, self.n_outputs_), dtype=class_type) + for k in range(self.n_outputs_): + predictions[:, k] = self.classes_[k].take( + np.argmax(proba[:, k], axis=1), axis=0 + ) + + return predictions + + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, 0] + + else: + return proba[:, :, 0] + + def apply(self, X, check_input=True): + """Return the index of the leaf that each sample is predicted as. + + .. versionadded:: 0.17 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + Returns + ------- + X_leaves : array-like of shape (n_samples,) + For each datapoint x in X, return the index of the leaf x + ends up in. Leaves are numbered within + ``[0; self.tree_.node_count)``, possibly with gaps in the + numbering. + """ + check_is_fitted(self) + X = self._validate_X_predict(X, check_input) + return self.tree_.apply(X) + + def decision_path(self, X, check_input=True): + """Return the decision path in the tree. + + .. versionadded:: 0.18 + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csr_matrix``. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + Returns + ------- + indicator : sparse matrix of shape (n_samples, n_nodes) + Return a node indicator CSR matrix where non zero elements + indicates that the samples goes through the nodes. + """ + X = self._validate_X_predict(X, check_input) + return self.tree_.decision_path(X) + + def _prune_tree(self): + """Prune tree using Minimal Cost-Complexity Pruning.""" + check_is_fitted(self) + + if self.ccp_alpha < 0.0: + raise ValueError("ccp_alpha must be greater than or equal to 0") + + if self.ccp_alpha == 0.0: + return + + # build pruned tree + if is_classifier(self): + n_classes = np.atleast_1d(self.n_classes_) + pruned_tree = Tree(self.n_features_in_, n_classes, self.n_outputs_) + else: + pruned_tree = Tree( + self.n_features_in_, + # TODO: the tree shouldn't need this param + np.array([1] * self.n_outputs_, dtype=np.intp), + self.n_outputs_, + ) + _build_pruned_tree_ccp(pruned_tree, self.tree_, self.ccp_alpha) + + self.tree_ = pruned_tree + + def cost_complexity_pruning_path(self, X, y, sample_weight=None): + """Compute the pruning path during Minimal Cost-Complexity Pruning. + + See :ref:`minimal_cost_complexity_pruning` for details on the pruning + process. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + Returns + ------- + ccp_path : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + ccp_alphas : ndarray + Effective alphas of subtree during pruning. + + impurities : ndarray + Sum of the impurities of the subtree leaves for the + corresponding alpha value in ``ccp_alphas``. + """ + est = clone(self).set_params(ccp_alpha=0.0) + est.fit(X, y, sample_weight=sample_weight) + return Bunch(**ccp_pruning_path(est.tree_)) + + @property + def feature_importances_(self): + """Return the feature importances. + + The importance of a feature is computed as the (normalized) total + reduction of the criterion brought by that feature. + It is also known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + Returns + ------- + feature_importances_ : ndarray of shape (n_features,) + Normalized total reduction of criteria by feature + (Gini importance). + """ + check_is_fitted(self) + + return self.tree_.compute_feature_importances() diff --git a/causalml/inference/tree/_tree/_criterion.pxd b/causalml/inference/tree/_tree/_criterion.pxd new file mode 100644 index 00000000..762e3509 --- /dev/null +++ b/causalml/inference/tree/_tree/_criterion.pxd @@ -0,0 +1,78 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Joel Nothman +# Arnaud Joly +# Jacob Schreiber +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +import numpy as np +cimport numpy as cnp + +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport UINT32_t # Unsigned 32 bit integer + +cdef class Criterion: + # The criterion computes the impurity of a node and the reduction of + # impurity of a split on that node. It also computes the output statistics + # such as the mean in regression and class probabilities in classification. + + # Internal structures + cdef const DOUBLE_t[:, ::1] y # Values of y + cdef DOUBLE_t* sample_weight # Sample weights + + cdef SIZE_t* samples # Sample indices in X, y + cdef SIZE_t start # samples[start:pos] are the samples in the left node + cdef SIZE_t pos # samples[pos:end] are the samples in the right node + cdef SIZE_t end + + cdef SIZE_t n_outputs # Number of outputs + cdef SIZE_t n_samples # Number of samples + cdef SIZE_t n_node_samples # Number of samples in the node (end-start) + cdef double weighted_n_samples # Weighted number of samples (in total) + cdef double weighted_n_node_samples # Weighted number of samples in the node + cdef double weighted_n_left # Weighted number of samples in the left node + cdef double weighted_n_right # Weighted number of samples in the right node + + cdef double* sum_total # For classification criteria, the sum of the + # weighted count of each label. For regression, + # the sum of w*y. sum_total[k] is equal to + # sum_{i=start}^{end-1} w[samples[i]]*y[samples[i], k], + # where k is output index. + cdef double* sum_left # Same as above, but for the left side of the split + cdef double* sum_right # same as above, but for the right side of the split + + # The criterion object is maintained such that left and right collected + # statistics correspond to samples[start:pos] and samples[pos:end]. + + # Methods + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, + double weighted_n_samples, SIZE_t* samples, SIZE_t start, + SIZE_t end) nogil except -1 + cdef int reset(self) nogil except -1 + cdef int reverse_reset(self) nogil except -1 + cdef int update(self, SIZE_t new_pos) nogil except -1 + cdef double node_impurity(self) nogil + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil + cdef void node_value(self, double* dest) nogil + cdef double impurity_improvement(self, double impurity_parent, + double impurity_left, + double impurity_right) nogil + cdef double proxy_impurity_improvement(self) nogil + + +cdef class RegressionCriterion(Criterion): + """Abstract regression criterion.""" + + cdef double sq_sum_total diff --git a/causalml/inference/tree/_tree/_criterion.pyx b/causalml/inference/tree/_tree/_criterion.pyx new file mode 100755 index 00000000..ae281edd --- /dev/null +++ b/causalml/inference/tree/_tree/_criterion.pyx @@ -0,0 +1,398 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Noel Dawe +# Satrajit Gosh +# Lars Buitinck +# Arnaud Joly +# Joel Nothman +# Fares Hedayati +# Jacob Schreiber +# Nelson Liu +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +from libc.stdlib cimport calloc +from libc.stdlib cimport free +from libc.string cimport memcpy +from libc.string cimport memset +from libc.math cimport fabs + +import numpy as np +cimport numpy as cnp +cnp.import_array() + + +cdef class Criterion: + """Interface for impurity criteria. + + This object stores methods on how to calculate how good a split is using + different metrics. + """ + + def __dealloc__(self): + """Destructor.""" + free(self.sum_total) + free(self.sum_left) + free(self.sum_right) + + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, + double weighted_n_samples, SIZE_t* samples, SIZE_t start, + SIZE_t end) nogil except -1: + """Placeholder for a method which will initialize the criterion. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + y : array-like, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + sample_weight : array-like, dtype=DOUBLE_t + The weight of each sample + weighted_n_samples : double + The total weight of the samples being considered + samples : array-like, dtype=SIZE_t + Indices of the samples in X and y, where samples[start:end] + correspond to the samples in this node + start : SIZE_t + The first sample to be used on this node + end : SIZE_t + The last sample used on this node + + """ + pass + + cdef int reset(self) nogil except -1: + """Reset the criterion at pos=start. + + This method must be implemented by the subclass. + """ + pass + + cdef int reverse_reset(self) nogil except -1: + """Reset the criterion at pos=end. + + This method must be implemented by the subclass. + """ + pass + + cdef int update(self, SIZE_t new_pos) nogil except -1: + """Updated statistics by moving samples[pos:new_pos] to the left child. + + This updates the collected statistics by moving samples[pos:new_pos] + from the right child to the left child. It must be implemented by + the subclass. + + Parameters + ---------- + new_pos : SIZE_t + New starting index position of the samples in the right child + """ + pass + + cdef double node_impurity(self) nogil: + """Placeholder for calculating the impurity of the node. + + Placeholder for a method which will evaluate the impurity of + the current node, i.e. the impurity of samples[start:end]. This is the + primary function of the criterion class. The smaller the impurity the + better. + """ + pass + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + """Placeholder for calculating the impurity of children. + + Placeholder for a method which evaluates the impurity in + children nodes, i.e. the impurity of samples[start:pos] + the impurity + of samples[pos:end]. + + Parameters + ---------- + impurity_left : double pointer + The memory address where the impurity of the left child should be + stored. + impurity_right : double pointer + The memory address where the impurity of the right child should be + stored + """ + pass + + cdef void node_value(self, double* dest) nogil: + """Placeholder for storing the node value. + + Placeholder for a method which will compute the node value + of samples[start:end] and save the value into dest. + + Parameters + ---------- + dest : double pointer + The memory address where the node value should be stored. + """ + pass + + cdef double proxy_impurity_improvement(self) nogil: + """Compute a proxy of the impurity reduction. + + This method is used to speed up the search for the best split. + It is a proxy quantity such that the split that maximizes this value + also maximizes the impurity improvement. It neglects all constant terms + of the impurity decrease for a given split. + + The absolute impurity improvement is only computed by the + impurity_improvement method once the best split has been found. + """ + cdef double impurity_left + cdef double impurity_right + self.children_impurity(&impurity_left, &impurity_right) + + return (- self.weighted_n_right * impurity_right + - self.weighted_n_left * impurity_left) + + cdef double impurity_improvement(self, double impurity_parent, + double impurity_left, + double impurity_right) nogil: + """Compute the improvement in impurity. + + This method computes the improvement in impurity when a split occurs. + The weighted impurity improvement equation is the following: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where N is the total number of samples, N_t is the number of samples + at the current node, N_t_L is the number of samples in the left child, + and N_t_R is the number of samples in the right child, + + Parameters + ---------- + impurity_parent : double + The initial impurity of the parent node before the split + + impurity_left : double + The impurity of the left child + + impurity_right : double + The impurity of the right child + + Return + ------ + double : improvement in impurity after the split occurs + """ + return ((self.weighted_n_node_samples / self.weighted_n_samples) * + (impurity_parent - (self.weighted_n_right / + self.weighted_n_node_samples * impurity_right) + - (self.weighted_n_left / + self.weighted_n_node_samples * impurity_left))) + + +cdef class RegressionCriterion(Criterion): + r"""Abstract regression criterion. + + This handles cases where the target is a continuous value, and is + evaluated by computing the variance of the target values left and right + of the split point. The computation takes linear time with `n_samples` + by using :: + + var = \sum_i^n (y_i - y_bar) ** 2 + = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 + """ + + def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): + """Initialize parameters for this criterion. + + Parameters + ---------- + n_outputs : SIZE_t + The number of targets to be predicted + + n_samples : SIZE_t + The total number of samples to fit on + """ + # Default values + self.sample_weight = NULL + + self.samples = NULL + self.start = 0 + self.pos = 0 + self.end = 0 + + self.n_outputs = n_outputs + self.n_samples = n_samples + self.n_node_samples = 0 + self.weighted_n_node_samples = 0.0 + self.weighted_n_left = 0.0 + self.weighted_n_right = 0.0 + + self.sq_sum_total = 0.0 + + # Allocate accumulators. Make sure they are NULL, not uninitialized, + # before an exception can be raised (which triggers __dealloc__). + self.sum_total = NULL + self.sum_left = NULL + self.sum_right = NULL + + # Allocate memory for the accumulators + self.sum_total = calloc(n_outputs, sizeof(double)) + self.sum_left = calloc(n_outputs, sizeof(double)) + self.sum_right = calloc(n_outputs, sizeof(double)) + + if (self.sum_total == NULL or + self.sum_left == NULL or + self.sum_right == NULL): + raise MemoryError() + + def __reduce__(self): + return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) + + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, + double weighted_n_samples, SIZE_t* samples, SIZE_t start, + SIZE_t end) nogil except -1: + """Initialize the criterion. + + This initializes the criterion at node samples[start:end] and children + samples[start:start] and samples[start:end]. + """ + # Initialize fields + self.y = y + self.sample_weight = sample_weight + self.samples = samples + self.start = start + self.end = end + self.n_node_samples = end - start + self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0. + + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t y_ik + cdef DOUBLE_t w_y_ik + cdef DOUBLE_t w = 1.0 + + self.sq_sum_total = 0.0 + memset(self.sum_total, 0, self.n_outputs * sizeof(double)) + + for p in range(start, end): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + y_ik = self.y[i, k] + w_y_ik = w * y_ik + self.sum_total[k] += w_y_ik + self.sq_sum_total += w_y_ik * y_ik + + self.weighted_n_node_samples += w + + # Reset to pos=start + self.reset() + return 0 + + cdef int reset(self) nogil except -1: + """Reset the criterion at pos=start.""" + cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) + memset(self.sum_left, 0, n_bytes) + memcpy(self.sum_right, self.sum_total, n_bytes) + + self.weighted_n_left = 0.0 + self.weighted_n_right = self.weighted_n_node_samples + self.pos = self.start + return 0 + + cdef int reverse_reset(self) nogil except -1: + """Reset the criterion at pos=end.""" + cdef SIZE_t n_bytes = self.n_outputs * sizeof(double) + memset(self.sum_right, 0, n_bytes) + memcpy(self.sum_left, self.sum_total, n_bytes) + + self.weighted_n_right = 0.0 + self.weighted_n_left = self.weighted_n_node_samples + self.pos = self.end + return 0 + + cdef int update(self, SIZE_t new_pos) nogil except -1: + """Updated statistics by moving samples[pos:new_pos] to the left.""" + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + cdef double* sum_total = self.sum_total + + cdef double* sample_weight = self.sample_weight + cdef SIZE_t* samples = self.samples + + cdef SIZE_t pos = self.pos + cdef SIZE_t end = self.end + cdef SIZE_t i + cdef SIZE_t p + cdef SIZE_t k + cdef DOUBLE_t w = 1.0 + + # Update statistics up to new_pos + # + # Given that + # sum_left[x] + sum_right[x] = sum_total[x] + # and that sum_total is known, we are going to update + # sum_left from the direction that require the least amount + # of computations, i.e. from pos to new_pos or from end to new_pos. + if (new_pos - pos) <= (end - new_pos): + for p in range(pos, new_pos): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + sum_left[k] += w * self.y[i, k] + + self.weighted_n_left += w + else: + self.reverse_reset() + + for p in range(end - 1, new_pos - 1, -1): + i = samples[p] + + if sample_weight != NULL: + w = sample_weight[i] + + for k in range(self.n_outputs): + sum_left[k] -= w * self.y[i, k] + + self.weighted_n_left -= w + + self.weighted_n_right = (self.weighted_n_node_samples - + self.weighted_n_left) + for k in range(self.n_outputs): + sum_right[k] = sum_total[k] - sum_left[k] + + self.pos = new_pos + return 0 + + cdef double node_impurity(self) nogil: + pass + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + pass + + cdef void node_value(self, double* dest) nogil: + """Compute the node value of samples[start:end] into dest.""" + cdef SIZE_t k + + for k in range(self.n_outputs): + dest[k] = self.sum_total[k] / self.weighted_n_node_samples diff --git a/causalml/inference/tree/_tree/_splitter.pxd b/causalml/inference/tree/_tree/_splitter.pxd new file mode 100644 index 00000000..5b63fc1d --- /dev/null +++ b/causalml/inference/tree/_tree/_splitter.pxd @@ -0,0 +1,98 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Joel Nothman +# Arnaud Joly +# Jacob Schreiber +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +import numpy as np +cimport numpy as cnp + +from ._criterion cimport Criterion + +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport UINT32_t # Unsigned 32 bit integer + + +cdef struct SplitRecord: + # Data to track sample split + SIZE_t feature # Which feature to split on. + SIZE_t pos # Split samples array at the given position, + # i.e. count of samples below threshold for feature. + # pos is >= end if the node is a leaf. + double threshold # Threshold to split at. + double improvement # Impurity improvement given parent node. + double impurity_left # Impurity of the left split. + double impurity_right # Impurity of the right split. + +cdef class Splitter: + # The splitter searches in the input space for a feature and a threshold + # to split the samples samples[start:end]. + # + # The impurity computations are delegated to a criterion object. + + # Internal structures + cdef public Criterion criterion # Impurity criterion + cdef public SIZE_t max_features # Number of features to test + cdef public SIZE_t min_samples_leaf # Min samples in a leaf + cdef public double min_weight_leaf # Minimum weight in a leaf + + cdef object random_state # Random state + cdef UINT32_t rand_r_state # sklearn_rand_r random number state + + cdef SIZE_t* samples # Sample indices in X, y + cdef SIZE_t n_samples # X.shape[0] + cdef double weighted_n_samples # Weighted number of samples + cdef SIZE_t* features # Feature indices in X + cdef SIZE_t* constant_features # Constant features indices + cdef SIZE_t n_features # X.shape[1] + cdef DTYPE_t* feature_values # temp. array holding feature values + + cdef SIZE_t start # Start position for the current node + cdef SIZE_t end # End position for the current node + + cdef const DOUBLE_t[:, ::1] y + cdef DOUBLE_t* sample_weight + + # The samples vector `samples` is maintained by the Splitter object such + # that the samples contained in a node are contiguous. With this setting, + # `node_split` reorganizes the node samples `samples[start:end]` in two + # subsets `samples[start:pos]` and `samples[pos:end]`. + + # The 1-d `features` array of size n_features contains the features + # indices and allows fast sampling without replacement of features. + + # The 1-d `constant_features` array of size n_features holds in + # `constant_features[:n_constant_features]` the feature ids with + # constant values for all the samples that reached a specific node. + # The value `n_constant_features` is given by the parent node to its + # child nodes. The content of the range `[n_constant_features:]` is left + # undefined, but preallocated for performance reasons + # This allows optimization with depth-based tree building. + + # Methods + cdef int init(self, object X, const DOUBLE_t[:, ::1] y, + DOUBLE_t* sample_weight) except -1 + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) nogil except -1 + + cdef int node_split(self, + double impurity, # Impurity of the node + SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1 + + cdef void node_value(self, double* dest) nogil + + cdef double node_impurity(self) nogil diff --git a/causalml/inference/tree/_tree/_splitter.pyx b/causalml/inference/tree/_tree/_splitter.pyx new file mode 100644 index 00000000..f72bcce9 --- /dev/null +++ b/causalml/inference/tree/_tree/_splitter.pyx @@ -0,0 +1,1547 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Noel Dawe +# Satrajit Gosh +# Lars Buitinck +# Arnaud Joly +# Joel Nothman +# Fares Hedayati +# Jacob Schreiber +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +from ._criterion cimport Criterion + +from libc.stdlib cimport free +from libc.stdlib cimport qsort +from libc.string cimport memcpy +from libc.string cimport memset + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +from scipy.sparse import csc_matrix + +from ._utils cimport log +from ._utils cimport rand_int +from ._utils cimport rand_uniform +from ._utils cimport RAND_R_MAX +from ._utils cimport safe_realloc + +cdef double INFINITY = np.inf + +# Mitigate precision differences between 32 bit and 64 bit +cdef DTYPE_t FEATURE_THRESHOLD = 1e-7 + +# Constant to switch between algorithm non zero value extract algorithm +# in SparseSplitter +cdef DTYPE_t EXTRACT_NNZ_SWITCH = 0.1 + +cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) nogil: + self.impurity_left = INFINITY + self.impurity_right = INFINITY + self.pos = start_pos + self.feature = 0 + self.threshold = 0. + self.improvement = -INFINITY + +cdef class Splitter: + """Abstract splitter class. + + Splitters are called by tree builders to find the best splits on both + sparse and dense data, one split at a time. + """ + + def __cinit__(self, Criterion criterion, SIZE_t max_features, + SIZE_t min_samples_leaf, double min_weight_leaf, + object random_state): + """ + Parameters + ---------- + criterion : Criterion + The criterion to measure the quality of a split. + + max_features : SIZE_t + The maximal number of randomly selected features which can be + considered for a split. + + min_samples_leaf : SIZE_t + The minimal number of samples each leaf can have, where splits + which would result in having less samples in a leaf are not + considered. + + min_weight_leaf : double + The minimal weight each leaf can have, where the weight is the sum + of the weights of each sample in it. + + random_state : object + The user inputted random state to be used for pseudo-randomness + """ + + self.criterion = criterion + + self.samples = NULL + self.n_samples = 0 + self.features = NULL + self.n_features = 0 + self.feature_values = NULL + + self.sample_weight = NULL + + self.max_features = max_features + self.min_samples_leaf = min_samples_leaf + self.min_weight_leaf = min_weight_leaf + self.random_state = random_state + + def __dealloc__(self): + """Destructor.""" + + free(self.samples) + free(self.features) + free(self.constant_features) + free(self.feature_values) + + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int init(self, + object X, + const DOUBLE_t[:, ::1] y, + DOUBLE_t* sample_weight) except -1: + """Initialize the splitter. + + Take in the input data X, the target Y, and optional sample weights. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + X : object + This contains the inputs. Usually it is a 2d numpy array. + + y : ndarray, dtype=DOUBLE_t + This is the vector of targets, or true labels, for the samples + + sample_weight : DOUBLE_t* + The weights of the samples, where higher weighted samples are fit + closer than lower weight samples. If not provided, all samples + are assumed to have uniform weight. + """ + + self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) + cdef SIZE_t n_samples = X.shape[0] + + # Create a new array which will be used to store nonzero + # samples from the feature of interest + cdef SIZE_t* samples = safe_realloc(&self.samples, n_samples) + + cdef SIZE_t i, j + cdef double weighted_n_samples = 0.0 + j = 0 + + for i in range(n_samples): + # Only work with positively weighted samples + if sample_weight == NULL or sample_weight[i] != 0.0: + samples[j] = i + j += 1 + + if sample_weight != NULL: + weighted_n_samples += sample_weight[i] + else: + weighted_n_samples += 1.0 + + # Number of samples is number of positively weighted samples + self.n_samples = j + self.weighted_n_samples = weighted_n_samples + + cdef SIZE_t n_features = X.shape[1] + cdef SIZE_t* features = safe_realloc(&self.features, n_features) + + for i in range(n_features): + features[i] = i + + self.n_features = n_features + + safe_realloc(&self.feature_values, n_samples) + safe_realloc(&self.constant_features, n_features) + + self.y = y + + self.sample_weight = sample_weight + return 0 + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) nogil except -1: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + + self.start = start + self.end = end + + self.criterion.init(self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples, + start, + end) + + weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples + return 0 + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + """ + + pass + + cdef void node_value(self, double* dest) nogil: + """Copy the value of node samples[start:end] into dest.""" + + self.criterion.node_value(dest) + + cdef double node_impurity(self) nogil: + """Return the impurity of the current node.""" + + return self.criterion.node_impurity() + + +cdef class BaseDenseSplitter(Splitter): + cdef const DTYPE_t[:, :] X + + cdef SIZE_t n_total_samples + + cdef int init(self, + object X, + const DOUBLE_t[:, ::1] y, + DOUBLE_t* sample_weight) except -1: + """Initialize the splitter + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + + # Call parent init + Splitter.init(self, X, y, sample_weight) + + self.X = X + return 0 + + +cdef class BestSplitter(BaseDenseSplitter): + """Splitter for finding the best split.""" + def __reduce__(self): + return (BestSplitter, (self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state), self.__getstate__()) + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find the best split on node samples[start:end] + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + # Find the best split + cdef SIZE_t* samples = self.samples + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + + cdef SIZE_t* features = self.features + cdef SIZE_t* constant_features = self.constant_features + cdef SIZE_t n_features = self.n_features + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t max_features = self.max_features + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef UINT32_t* random_state = &self.rand_r_state + + cdef SplitRecord best, current + cdef double current_proxy_improvement = -INFINITY + cdef double best_proxy_improvement = -INFINITY + + cdef SIZE_t f_i = n_features + cdef SIZE_t f_j + cdef SIZE_t p + cdef SIZE_t feature_idx_offset + cdef SIZE_t feature_offset + cdef SIZE_t i + cdef SIZE_t j + + cdef SIZE_t n_visited_features = 0 + # Number of features discovered to be constant during the split search + cdef SIZE_t n_found_constants = 0 + # Number of features known to be constant and drawn without replacement + cdef SIZE_t n_drawn_constants = 0 + cdef SIZE_t n_known_constants = n_constant_features[0] + # n_total_constants = n_known_constants + n_found_constants + cdef SIZE_t n_total_constants = n_known_constants + cdef DTYPE_t current_feature_value + cdef SIZE_t partition_end + + _init_split(&best, end) + + # Sample up to max_features without replacement using a + # Fisher-Yates-based algorithm (using the local variables `f_i` and + # `f_j` to compute a permutation of the `features` array). + # + # Skip the CPU intensive evaluation of the impurity criterion for + # features that were already detected as constant (hence not suitable + # for good splitting) by ancestor nodes and save the information on + # newly discovered constant features to spare computation on descendant + # nodes. + while (f_i > n_total_constants and # Stop early if remaining features + # are constant + (n_visited_features < max_features or + # At least one drawn features must be non constant + n_visited_features <= n_found_constants + n_drawn_constants)): + + n_visited_features += 1 + + # Loop invariant: elements of features in + # - [:n_drawn_constant[ holds drawn and known constant features; + # - [n_drawn_constant:n_known_constant[ holds known constant + # features that haven't been drawn yet; + # - [n_known_constant:n_total_constant[ holds newly found constant + # features; + # - [n_total_constant:f_i[ holds features that haven't been drawn + # yet and aren't constant apriori. + # - [f_i:n_features[ holds features that have been drawn + # and aren't constant. + + # Draw a feature at random + f_j = rand_int(n_drawn_constants, f_i - n_found_constants, + random_state) + + if f_j < n_known_constants: + # f_j in the interval [n_drawn_constants, n_known_constants[ + features[n_drawn_constants], features[f_j] = features[f_j], features[n_drawn_constants] + + n_drawn_constants += 1 + + else: + # f_j in the interval [n_known_constants, f_i - n_found_constants[ + f_j += n_found_constants + # f_j in the interval [n_total_constants, f_i[ + current.feature = features[f_j] + + # Sort samples along that feature; by + # copying the values into an array and + # sorting the array in a manner which utilizes the cache more + # effectively. + for i in range(start, end): + Xf[i] = self.X[samples[i], current.feature] + + sort(Xf + start, samples + start, end - start) + + if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD: + features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j] + + n_found_constants += 1 + n_total_constants += 1 + + else: + f_i -= 1 + features[f_i], features[f_j] = features[f_j], features[f_i] + + # Evaluate all splits + self.criterion.reset() + p = start + + while p < end: + while (p + 1 < end and + Xf[p + 1] <= Xf[p] + FEATURE_THRESHOLD): + p += 1 + + # (p + 1 >= end) or (X[samples[p + 1], current.feature] > + # X[samples[p], current.feature]) + p += 1 + # (p >= end) or (X[samples[p], current.feature] > + # X[samples[p - 1], current.feature]) + + if p < end: + current.pos = p + + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue + + self.criterion.update(current.pos) + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + # sum of halves is used to avoid infinite value + current.threshold = Xf[p - 1] / 2.0 + Xf[p] / 2.0 + + if ((current.threshold == Xf[p]) or + (current.threshold == INFINITY) or + (current.threshold == -INFINITY)): + current.threshold = Xf[p - 1] + + best = current # copy + + # Reorganize into samples[start:best.pos] + samples[best.pos:end] + if best.pos < end: + partition_end = end + p = start + + while p < partition_end: + if self.X[samples[p], best.feature] <= best.threshold: + p += 1 + + else: + partition_end -= 1 + + samples[p], samples[partition_end] = samples[partition_end], samples[p] + + self.criterion.reset() + self.criterion.update(best.pos) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) + + # Respect invariant for constant features: the original order of + # element in features[:n_known_constants] must be preserved for sibling + # and child nodes + memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants) + + # Copy newly found constant features + memcpy(constant_features + n_known_constants, + features + n_known_constants, + sizeof(SIZE_t) * n_found_constants) + + # Return values + split[0] = best + n_constant_features[0] = n_total_constants + return 0 + + +# Sort n-element arrays pointed to by Xf and samples, simultaneously, +# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997). +cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil: + if n == 0: + return + cdef int maxd = 2 * log(n) + introsort(Xf, samples, n, maxd) + + +cdef inline void swap(DTYPE_t* Xf, SIZE_t* samples, + SIZE_t i, SIZE_t j) nogil: + # Helper for sort + Xf[i], Xf[j] = Xf[j], Xf[i] + samples[i], samples[j] = samples[j], samples[i] + + +cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) nogil: + # Median of three pivot selection, after Bentley and McIlroy (1993). + # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. + cdef DTYPE_t a = Xf[0], b = Xf[n / 2], c = Xf[n - 1] + if a < b: + if b < c: + return b + elif a < c: + return c + else: + return a + elif b < c: + if a < c: + return a + else: + return c + else: + return b + + +# Introsort with median of 3 pivot selection and 3-way partition function +# (robust to repeated elements, e.g. lots of zero features). +cdef void introsort(DTYPE_t* Xf, SIZE_t *samples, + SIZE_t n, int maxd) nogil: + cdef DTYPE_t pivot + cdef SIZE_t i, l, r + + while n > 1: + if maxd <= 0: # max depth limit exceeded ("gone quadratic") + heapsort(Xf, samples, n) + return + maxd -= 1 + + pivot = median3(Xf, n) + + # Three-way partition. + i = l = 0 + r = n + while i < r: + if Xf[i] < pivot: + swap(Xf, samples, i, l) + i += 1 + l += 1 + elif Xf[i] > pivot: + r -= 1 + swap(Xf, samples, i, r) + else: + i += 1 + + introsort(Xf, samples, l, maxd) + Xf += r + samples += r + n -= r + + +cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples, + SIZE_t start, SIZE_t end) nogil: + # Restore heap order in Xf[start:end] by moving the max element to start. + cdef SIZE_t child, maxind, root + + root = start + while True: + child = root * 2 + 1 + + # find max of root, left child, right child + maxind = root + if child < end and Xf[maxind] < Xf[child]: + maxind = child + if child + 1 < end and Xf[maxind] < Xf[child + 1]: + maxind = child + 1 + + if maxind == root: + break + else: + swap(Xf, samples, root, maxind) + root = maxind + + +cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil: + cdef SIZE_t start, end + + # heapify + start = (n - 2) / 2 + end = n + while True: + sift_down(Xf, samples, start, end) + if start == 0: + break + start -= 1 + + # sort by shrinking the heap, putting the max element immediately after it + end = n - 1 + while end > 0: + swap(Xf, samples, 0, end) + sift_down(Xf, samples, 0, end) + end = end - 1 + + +cdef class RandomSplitter(BaseDenseSplitter): + """Splitter for finding the best random split.""" + def __reduce__(self): + return (RandomSplitter, (self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state), self.__getstate__()) + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find the best random split on node samples[start:end] + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + # Draw random splits and pick the best + cdef SIZE_t* samples = self.samples + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + + cdef SIZE_t* features = self.features + cdef SIZE_t* constant_features = self.constant_features + cdef SIZE_t n_features = self.n_features + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t max_features = self.max_features + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef UINT32_t* random_state = &self.rand_r_state + + cdef SplitRecord best, current + cdef double current_proxy_improvement = - INFINITY + cdef double best_proxy_improvement = - INFINITY + + cdef SIZE_t f_i = n_features + cdef SIZE_t f_j + cdef SIZE_t p + cdef SIZE_t partition_end + cdef SIZE_t feature_stride + # Number of features discovered to be constant during the split search + cdef SIZE_t n_found_constants = 0 + # Number of features known to be constant and drawn without replacement + cdef SIZE_t n_drawn_constants = 0 + cdef SIZE_t n_known_constants = n_constant_features[0] + # n_total_constants = n_known_constants + n_found_constants + cdef SIZE_t n_total_constants = n_known_constants + cdef SIZE_t n_visited_features = 0 + cdef DTYPE_t min_feature_value + cdef DTYPE_t max_feature_value + cdef DTYPE_t current_feature_value + + _init_split(&best, end) + + # Sample up to max_features without replacement using a + # Fisher-Yates-based algorithm (using the local variables `f_i` and + # `f_j` to compute a permutation of the `features` array). + # + # Skip the CPU intensive evaluation of the impurity criterion for + # features that were already detected as constant (hence not suitable + # for good splitting) by ancestor nodes and save the information on + # newly discovered constant features to spare computation on descendant + # nodes. + while (f_i > n_total_constants and # Stop early if remaining features + # are constant + (n_visited_features < max_features or + # At least one drawn features must be non constant + n_visited_features <= n_found_constants + n_drawn_constants)): + n_visited_features += 1 + + # Loop invariant: elements of features in + # - [:n_drawn_constant[ holds drawn and known constant features; + # - [n_drawn_constant:n_known_constant[ holds known constant + # features that haven't been drawn yet; + # - [n_known_constant:n_total_constant[ holds newly found constant + # features; + # - [n_total_constant:f_i[ holds features that haven't been drawn + # yet and aren't constant apriori. + # - [f_i:n_features[ holds features that have been drawn + # and aren't constant. + + # Draw a feature at random + f_j = rand_int(n_drawn_constants, f_i - n_found_constants, + random_state) + + if f_j < n_known_constants: + # f_j in the interval [n_drawn_constants, n_known_constants[ + features[n_drawn_constants], features[f_j] = features[f_j], features[n_drawn_constants] + n_drawn_constants += 1 + + else: + # f_j in the interval [n_known_constants, f_i - n_found_constants[ + f_j += n_found_constants + # f_j in the interval [n_total_constants, f_i[ + + current.feature = features[f_j] + + # Find min, max + min_feature_value = self.X[samples[start], current.feature] + max_feature_value = min_feature_value + Xf[start] = min_feature_value + + for p in range(start + 1, end): + current_feature_value = self.X[samples[p], current.feature] + Xf[p] = current_feature_value + + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + + if max_feature_value <= min_feature_value + FEATURE_THRESHOLD: + features[f_j], features[n_total_constants] = features[n_total_constants], current.feature + + n_found_constants += 1 + n_total_constants += 1 + + else: + f_i -= 1 + features[f_i], features[f_j] = features[f_j], features[f_i] + + # Draw a random threshold + current.threshold = rand_uniform(min_feature_value, + max_feature_value, + random_state) + + if current.threshold == max_feature_value: + current.threshold = min_feature_value + + # Partition + p, partition_end = start, end + while p < partition_end: + if Xf[p] <= current.threshold: + p += 1 + else: + partition_end -= 1 + + Xf[p], Xf[partition_end] = Xf[partition_end], Xf[p] + samples[p], samples[partition_end] = samples[partition_end], samples[p] + + current.pos = partition_end + + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue + + # Evaluate split + self.criterion.reset() + self.criterion.update(current.pos) + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + best = current # copy + + # Reorganize into samples[start:best.pos] + samples[best.pos:end] + if best.pos < end: + if current.feature != best.feature: + p, partition_end = start, end + + while p < partition_end: + if self.X[samples[p], best.feature] <= best.threshold: + p += 1 + else: + partition_end -= 1 + + samples[p], samples[partition_end] = samples[partition_end], samples[p] + + self.criterion.reset() + self.criterion.update(best.pos) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) + + # Respect invariant for constant features: the original order of + # element in features[:n_known_constants] must be preserved for sibling + # and child nodes + memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants) + + # Copy newly found constant features + memcpy(constant_features + n_known_constants, + features + n_known_constants, + sizeof(SIZE_t) * n_found_constants) + + # Return values + split[0] = best + n_constant_features[0] = n_total_constants + return 0 + + +cdef class BaseSparseSplitter(Splitter): + # The sparse splitter works only with csc sparse matrix format + cdef DTYPE_t* X_data + cdef INT32_t* X_indices + cdef INT32_t* X_indptr + + cdef SIZE_t n_total_samples + + cdef SIZE_t* index_to_samples + cdef SIZE_t* sorted_samples + + def __cinit__(self, Criterion criterion, SIZE_t max_features, + SIZE_t min_samples_leaf, double min_weight_leaf, + object random_state): + # Parent __cinit__ is automatically called + + self.X_data = NULL + self.X_indices = NULL + self.X_indptr = NULL + + self.n_total_samples = 0 + + self.index_to_samples = NULL + self.sorted_samples = NULL + + def __dealloc__(self): + """Deallocate memory.""" + free(self.index_to_samples) + free(self.sorted_samples) + + cdef int init(self, + object X, + const DOUBLE_t[:, ::1] y, + DOUBLE_t* sample_weight) except -1: + """Initialize the splitter + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + # Call parent init + Splitter.init(self, X, y, sample_weight) + + if not isinstance(X, csc_matrix): + raise ValueError("X should be in csc format") + + cdef SIZE_t* samples = self.samples + cdef SIZE_t n_samples = self.n_samples + + # Initialize X + cdef cnp.ndarray[dtype=DTYPE_t, ndim=1] data = X.data + cdef cnp.ndarray[dtype=INT32_t, ndim=1] indices = X.indices + cdef cnp.ndarray[dtype=INT32_t, ndim=1] indptr = X.indptr + cdef SIZE_t n_total_samples = X.shape[0] + + self.X_data = data.data + self.X_indices = indices.data + self.X_indptr = indptr.data + self.n_total_samples = n_total_samples + + # Initialize auxiliary array used to perform split + safe_realloc(&self.index_to_samples, n_total_samples) + safe_realloc(&self.sorted_samples, n_samples) + + cdef SIZE_t* index_to_samples = self.index_to_samples + cdef SIZE_t p + for p in range(n_total_samples): + index_to_samples[p] = -1 + + for p in range(n_samples): + index_to_samples[samples[p]] = p + return 0 + + cdef inline SIZE_t _partition(self, double threshold, + SIZE_t end_negative, SIZE_t start_positive, + SIZE_t zero_pos) nogil: + """Partition samples[start:end] based on threshold.""" + + cdef SIZE_t p + cdef SIZE_t partition_end + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t* samples = self.samples + cdef SIZE_t* index_to_samples = self.index_to_samples + + if threshold < 0.: + p = self.start + partition_end = end_negative + elif threshold > 0.: + p = start_positive + partition_end = self.end + else: + # Data are already split + return zero_pos + + while p < partition_end: + if Xf[p] <= threshold: + p += 1 + + else: + partition_end -= 1 + + Xf[p], Xf[partition_end] = Xf[partition_end], Xf[p] + sparse_swap(index_to_samples, samples, p, partition_end) + + return partition_end + + cdef inline void extract_nnz(self, SIZE_t feature, + SIZE_t* end_negative, SIZE_t* start_positive, + bint* is_samples_sorted) nogil: + """Extract and partition values for a given feature. + + The extracted values are partitioned between negative values + Xf[start:end_negative[0]] and positive values Xf[start_positive[0]:end]. + The samples and index_to_samples are modified according to this + partition. + + The extraction corresponds to the intersection between the arrays + X_indices[indptr_start:indptr_end] and samples[start:end]. + This is done efficiently using either an index_to_samples based approach + or binary search based approach. + + Parameters + ---------- + feature : SIZE_t, + Index of the feature we want to extract non zero value. + + + end_negative, start_positive : SIZE_t*, SIZE_t*, + Return extracted non zero values in self.samples[start:end] where + negative values are in self.feature_values[start:end_negative[0]] + and positive values are in + self.feature_values[start_positive[0]:end]. + + is_samples_sorted : bint*, + If is_samples_sorted, then self.sorted_samples[start:end] will be + the sorted version of self.samples[start:end]. + + """ + cdef SIZE_t indptr_start = self.X_indptr[feature], + cdef SIZE_t indptr_end = self.X_indptr[feature + 1] + cdef SIZE_t n_indices = (indptr_end - indptr_start) + cdef SIZE_t n_samples = self.end - self.start + + # Use binary search if n_samples * log(n_indices) < + # n_indices and index_to_samples approach otherwise. + # O(n_samples * log(n_indices)) is the running time of binary + # search and O(n_indices) is the running time of index_to_samples + # approach. + if ((1 - is_samples_sorted[0]) * n_samples * log(n_samples) + + n_samples * log(n_indices) < EXTRACT_NNZ_SWITCH * n_indices): + extract_nnz_binary_search(self.X_indices, self.X_data, + indptr_start, indptr_end, + self.samples, self.start, self.end, + self.index_to_samples, + self.feature_values, + end_negative, start_positive, + self.sorted_samples, is_samples_sorted) + + # Using an index to samples technique to extract non zero values + # index_to_samples is a mapping from X_indices to samples + else: + extract_nnz_index_to_samples(self.X_indices, self.X_data, + indptr_start, indptr_end, + self.samples, self.start, self.end, + self.index_to_samples, + self.feature_values, + end_negative, start_positive) + + +cdef int compare_SIZE_t(const void* a, const void* b) nogil: + """Comparison function for sort.""" + return ((a)[0] - (b)[0]) + + +cdef inline void binary_search(INT32_t* sorted_array, + INT32_t start, INT32_t end, + SIZE_t value, SIZE_t* index, + INT32_t* new_start) nogil: + """Return the index of value in the sorted array. + + If not found, return -1. new_start is the last pivot + 1 + """ + cdef INT32_t pivot + index[0] = -1 + while start < end: + pivot = start + (end - start) / 2 + + if sorted_array[pivot] == value: + index[0] = pivot + start = pivot + 1 + break + + if sorted_array[pivot] < value: + start = pivot + 1 + else: + end = pivot + new_start[0] = start + + +cdef inline void extract_nnz_index_to_samples(INT32_t* X_indices, + DTYPE_t* X_data, + INT32_t indptr_start, + INT32_t indptr_end, + SIZE_t* samples, + SIZE_t start, + SIZE_t end, + SIZE_t* index_to_samples, + DTYPE_t* Xf, + SIZE_t* end_negative, + SIZE_t* start_positive) nogil: + """Extract and partition values for a feature using index_to_samples. + + Complexity is O(indptr_end - indptr_start). + """ + cdef INT32_t k + cdef SIZE_t index + cdef SIZE_t end_negative_ = start + cdef SIZE_t start_positive_ = end + + for k in range(indptr_start, indptr_end): + if start <= index_to_samples[X_indices[k]] < end: + if X_data[k] > 0: + start_positive_ -= 1 + Xf[start_positive_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, start_positive_) + + + elif X_data[k] < 0: + Xf[end_negative_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, end_negative_) + end_negative_ += 1 + + # Returned values + end_negative[0] = end_negative_ + start_positive[0] = start_positive_ + + +cdef inline void extract_nnz_binary_search(INT32_t* X_indices, + DTYPE_t* X_data, + INT32_t indptr_start, + INT32_t indptr_end, + SIZE_t* samples, + SIZE_t start, + SIZE_t end, + SIZE_t* index_to_samples, + DTYPE_t* Xf, + SIZE_t* end_negative, + SIZE_t* start_positive, + SIZE_t* sorted_samples, + bint* is_samples_sorted) nogil: + """Extract and partition values for a given feature using binary search. + + If n_samples = end - start and n_indices = indptr_end - indptr_start, + the complexity is + + O((1 - is_samples_sorted[0]) * n_samples * log(n_samples) + + n_samples * log(n_indices)). + """ + cdef SIZE_t n_samples + + if not is_samples_sorted[0]: + n_samples = end - start + memcpy(sorted_samples + start, samples + start, + n_samples * sizeof(SIZE_t)) + qsort(sorted_samples + start, n_samples, sizeof(SIZE_t), + compare_SIZE_t) + is_samples_sorted[0] = 1 + + while (indptr_start < indptr_end and + sorted_samples[start] > X_indices[indptr_start]): + indptr_start += 1 + + while (indptr_start < indptr_end and + sorted_samples[end - 1] < X_indices[indptr_end - 1]): + indptr_end -= 1 + + cdef SIZE_t p = start + cdef SIZE_t index + cdef SIZE_t k + cdef SIZE_t end_negative_ = start + cdef SIZE_t start_positive_ = end + + while (p < end and indptr_start < indptr_end): + # Find index of sorted_samples[p] in X_indices + binary_search(X_indices, indptr_start, indptr_end, + sorted_samples[p], &k, &indptr_start) + + if k != -1: + # If k != -1, we have found a non zero value + + if X_data[k] > 0: + start_positive_ -= 1 + Xf[start_positive_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, start_positive_) + + + elif X_data[k] < 0: + Xf[end_negative_] = X_data[k] + index = index_to_samples[X_indices[k]] + sparse_swap(index_to_samples, samples, index, end_negative_) + end_negative_ += 1 + p += 1 + + # Returned values + end_negative[0] = end_negative_ + start_positive[0] = start_positive_ + + +cdef inline void sparse_swap(SIZE_t* index_to_samples, SIZE_t* samples, + SIZE_t pos_1, SIZE_t pos_2) nogil: + """Swap sample pos_1 and pos_2 preserving sparse invariant.""" + samples[pos_1], samples[pos_2] = samples[pos_2], samples[pos_1] + index_to_samples[samples[pos_1]] = pos_1 + index_to_samples[samples[pos_2]] = pos_2 + + +cdef class BestSparseSplitter(BaseSparseSplitter): + """Splitter for finding the best split, using the sparse data.""" + + def __reduce__(self): + return (BestSparseSplitter, (self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state), self.__getstate__()) + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find the best split on node samples[start:end], using sparse features + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + # Find the best split + cdef SIZE_t* samples = self.samples + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + + cdef INT32_t* X_indices = self.X_indices + cdef INT32_t* X_indptr = self.X_indptr + cdef DTYPE_t* X_data = self.X_data + + cdef SIZE_t* features = self.features + cdef SIZE_t* constant_features = self.constant_features + cdef SIZE_t n_features = self.n_features + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t* sorted_samples = self.sorted_samples + cdef SIZE_t* index_to_samples = self.index_to_samples + cdef SIZE_t max_features = self.max_features + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef UINT32_t* random_state = &self.rand_r_state + + cdef SplitRecord best, current + _init_split(&best, end) + cdef double current_proxy_improvement = - INFINITY + cdef double best_proxy_improvement = - INFINITY + + cdef SIZE_t f_i = n_features + cdef SIZE_t f_j, p + cdef SIZE_t n_visited_features = 0 + # Number of features discovered to be constant during the split search + cdef SIZE_t n_found_constants = 0 + # Number of features known to be constant and drawn without replacement + cdef SIZE_t n_drawn_constants = 0 + cdef SIZE_t n_known_constants = n_constant_features[0] + # n_total_constants = n_known_constants + n_found_constants + cdef SIZE_t n_total_constants = n_known_constants + cdef DTYPE_t current_feature_value + + cdef SIZE_t p_next + cdef SIZE_t p_prev + cdef bint is_samples_sorted = 0 # indicate is sorted_samples is + # inititialized + + # We assume implicitly that end_positive = end and + # start_negative = start + cdef SIZE_t start_positive + cdef SIZE_t end_negative + + # Sample up to max_features without replacement using a + # Fisher-Yates-based algorithm (using the local variables `f_i` and + # `f_j` to compute a permutation of the `features` array). + # + # Skip the CPU intensive evaluation of the impurity criterion for + # features that were already detected as constant (hence not suitable + # for good splitting) by ancestor nodes and save the information on + # newly discovered constant features to spare computation on descendant + # nodes. + while (f_i > n_total_constants and # Stop early if remaining features + # are constant + (n_visited_features < max_features or + # At least one drawn features must be non constant + n_visited_features <= n_found_constants + n_drawn_constants)): + + n_visited_features += 1 + + # Loop invariant: elements of features in + # - [:n_drawn_constant[ holds drawn and known constant features; + # - [n_drawn_constant:n_known_constant[ holds known constant + # features that haven't been drawn yet; + # - [n_known_constant:n_total_constant[ holds newly found constant + # features; + # - [n_total_constant:f_i[ holds features that haven't been drawn + # yet and aren't constant apriori. + # - [f_i:n_features[ holds features that have been drawn + # and aren't constant. + + # Draw a feature at random + f_j = rand_int(n_drawn_constants, f_i - n_found_constants, + random_state) + + if f_j < n_known_constants: + # f_j in the interval [n_drawn_constants, n_known_constants[ + features[f_j], features[n_drawn_constants] = features[n_drawn_constants], features[f_j] + + n_drawn_constants += 1 + + else: + # f_j in the interval [n_known_constants, f_i - n_found_constants[ + f_j += n_found_constants + # f_j in the interval [n_total_constants, f_i[ + + current.feature = features[f_j] + self.extract_nnz(current.feature, + &end_negative, &start_positive, + &is_samples_sorted) + + # Sort the positive and negative parts of `Xf` + sort(Xf + start, samples + start, end_negative - start) + sort(Xf + start_positive, samples + start_positive, + end - start_positive) + + # Update index_to_samples to take into account the sort + for p in range(start, end_negative): + index_to_samples[samples[p]] = p + for p in range(start_positive, end): + index_to_samples[samples[p]] = p + + # Add one or two zeros in Xf, if there is any + if end_negative < start_positive: + start_positive -= 1 + Xf[start_positive] = 0. + + if end_negative != start_positive: + Xf[end_negative] = 0. + end_negative += 1 + + if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD: + features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j] + + n_found_constants += 1 + n_total_constants += 1 + + else: + f_i -= 1 + features[f_i], features[f_j] = features[f_j], features[f_i] + + # Evaluate all splits + self.criterion.reset() + p = start + + while p < end: + if p + 1 != end_negative: + p_next = p + 1 + else: + p_next = start_positive + + while (p_next < end and + Xf[p_next] <= Xf[p] + FEATURE_THRESHOLD): + p = p_next + if p + 1 != end_negative: + p_next = p + 1 + else: + p_next = start_positive + + + # (p_next >= end) or (X[samples[p_next], current.feature] > + # X[samples[p], current.feature]) + p_prev = p + p = p_next + # (p >= end) or (X[samples[p], current.feature] > + # X[samples[p_prev], current.feature]) + + + if p < end: + current.pos = p + + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue + + self.criterion.update(current.pos) + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + # sum of halves used to avoid infinite values + current.threshold = Xf[p_prev] / 2.0 + Xf[p] / 2.0 + + if ((current.threshold == Xf[p]) or + (current.threshold == INFINITY) or + (current.threshold == -INFINITY)): + current.threshold = Xf[p_prev] + + best = current + + # Reorganize into samples[start:best.pos] + samples[best.pos:end] + if best.pos < end: + self.extract_nnz(best.feature, &end_negative, &start_positive, + &is_samples_sorted) + + self._partition(best.threshold, end_negative, start_positive, + best.pos) + + self.criterion.reset() + self.criterion.update(best.pos) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) + + # Respect invariant for constant features: the original order of + # element in features[:n_known_constants] must be preserved for sibling + # and child nodes + memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants) + + # Copy newly found constant features + memcpy(constant_features + n_known_constants, + features + n_known_constants, + sizeof(SIZE_t) * n_found_constants) + + # Return values + split[0] = best + n_constant_features[0] = n_total_constants + return 0 + + +cdef class RandomSparseSplitter(BaseSparseSplitter): + """Splitter for finding a random split, using the sparse data.""" + + def __reduce__(self): + return (RandomSparseSplitter, (self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state), self.__getstate__()) + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) nogil except -1: + """Find a random split on node samples[start:end], using sparse features + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + # Find the best split + cdef SIZE_t* samples = self.samples + cdef SIZE_t start = self.start + cdef SIZE_t end = self.end + + cdef INT32_t* X_indices = self.X_indices + cdef INT32_t* X_indptr = self.X_indptr + cdef DTYPE_t* X_data = self.X_data + + cdef SIZE_t* features = self.features + cdef SIZE_t* constant_features = self.constant_features + cdef SIZE_t n_features = self.n_features + + cdef DTYPE_t* Xf = self.feature_values + cdef SIZE_t* sorted_samples = self.sorted_samples + cdef SIZE_t* index_to_samples = self.index_to_samples + cdef SIZE_t max_features = self.max_features + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef UINT32_t* random_state = &self.rand_r_state + + cdef SplitRecord best, current + _init_split(&best, end) + cdef double current_proxy_improvement = - INFINITY + cdef double best_proxy_improvement = - INFINITY + + cdef DTYPE_t current_feature_value + + cdef SIZE_t f_i = n_features + cdef SIZE_t f_j, p + cdef SIZE_t n_visited_features = 0 + # Number of features discovered to be constant during the split search + cdef SIZE_t n_found_constants = 0 + # Number of features known to be constant and drawn without replacement + cdef SIZE_t n_drawn_constants = 0 + cdef SIZE_t n_known_constants = n_constant_features[0] + # n_total_constants = n_known_constants + n_found_constants + cdef SIZE_t n_total_constants = n_known_constants + cdef SIZE_t partition_end + + cdef DTYPE_t min_feature_value + cdef DTYPE_t max_feature_value + + cdef bint is_samples_sorted = 0 # indicate that sorted_samples is + # inititialized + + # We assume implicitly that end_positive = end and + # start_negative = start + cdef SIZE_t start_positive + cdef SIZE_t end_negative + + # Sample up to max_features without replacement using a + # Fisher-Yates-based algorithm (using the local variables `f_i` and + # `f_j` to compute a permutation of the `features` array). + # + # Skip the CPU intensive evaluation of the impurity criterion for + # features that were already detected as constant (hence not suitable + # for good splitting) by ancestor nodes and save the information on + # newly discovered constant features to spare computation on descendant + # nodes. + while (f_i > n_total_constants and # Stop early if remaining features + # are constant + (n_visited_features < max_features or + # At least one drawn features must be non constant + n_visited_features <= n_found_constants + n_drawn_constants)): + + n_visited_features += 1 + + # Loop invariant: elements of features in + # - [:n_drawn_constant[ holds drawn and known constant features; + # - [n_drawn_constant:n_known_constant[ holds known constant + # features that haven't been drawn yet; + # - [n_known_constant:n_total_constant[ holds newly found constant + # features; + # - [n_total_constant:f_i[ holds features that haven't been drawn + # yet and aren't constant apriori. + # - [f_i:n_features[ holds features that have been drawn + # and aren't constant. + + # Draw a feature at random + f_j = rand_int(n_drawn_constants, f_i - n_found_constants, + random_state) + + if f_j < n_known_constants: + # f_j in the interval [n_drawn_constants, n_known_constants[ + features[f_j], features[n_drawn_constants] = features[n_drawn_constants], features[f_j] + + n_drawn_constants += 1 + + else: + # f_j in the interval [n_known_constants, f_i - n_found_constants[ + f_j += n_found_constants + # f_j in the interval [n_total_constants, f_i[ + + current.feature = features[f_j] + + self.extract_nnz(current.feature, + &end_negative, &start_positive, + &is_samples_sorted) + + # Add one or two zeros in Xf, if there is any + if end_negative < start_positive: + start_positive -= 1 + Xf[start_positive] = 0. + + if end_negative != start_positive: + Xf[end_negative] = 0. + end_negative += 1 + + # Find min, max in Xf[start:end_negative] + min_feature_value = Xf[start] + max_feature_value = min_feature_value + + for p in range(start, end_negative): + current_feature_value = Xf[p] + + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + + # Update min, max given Xf[start_positive:end] + for p in range(start_positive, end): + current_feature_value = Xf[p] + + if current_feature_value < min_feature_value: + min_feature_value = current_feature_value + elif current_feature_value > max_feature_value: + max_feature_value = current_feature_value + + if max_feature_value <= min_feature_value + FEATURE_THRESHOLD: + features[f_j] = features[n_total_constants] + features[n_total_constants] = current.feature + + n_found_constants += 1 + n_total_constants += 1 + + else: + f_i -= 1 + features[f_i], features[f_j] = features[f_j], features[f_i] + + # Draw a random threshold + current.threshold = rand_uniform(min_feature_value, + max_feature_value, + random_state) + + if current.threshold == max_feature_value: + current.threshold = min_feature_value + + # Partition + current.pos = self._partition(current.threshold, + end_negative, + start_positive, + start_positive + + (Xf[start_positive] == 0.)) + + # Reject if min_samples_leaf is not guaranteed + if (((current.pos - start) < min_samples_leaf) or + ((end - current.pos) < min_samples_leaf)): + continue + + # Evaluate split + self.criterion.reset() + self.criterion.update(current.pos) + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + continue + + current_proxy_improvement = self.criterion.proxy_impurity_improvement() + + if current_proxy_improvement > best_proxy_improvement: + best_proxy_improvement = current_proxy_improvement + self.criterion.children_impurity(¤t.impurity_left, + ¤t.impurity_right) + current.improvement = self.criterion.impurity_improvement( + impurity, current.impurity_left, current.impurity_right) + best = current + + # Reorganize into samples[start:best.pos] + samples[best.pos:end] + if best.pos < end: + if current.feature != best.feature: + self.extract_nnz(best.feature, &end_negative, &start_positive, + &is_samples_sorted) + + self._partition(best.threshold, end_negative, start_positive, + best.pos) + + self.criterion.reset() + self.criterion.update(best.pos) + self.criterion.children_impurity(&best.impurity_left, + &best.impurity_right) + best.improvement = self.criterion.impurity_improvement( + impurity, best.impurity_left, best.impurity_right) + + # Respect invariant for constant features: the original order of + # element in features[:n_known_constants] must be preserved for sibling + # and child nodes + memcpy(features, constant_features, sizeof(SIZE_t) * n_known_constants) + + # Copy newly found constant features + memcpy(constant_features + n_known_constants, + features + n_known_constants, + sizeof(SIZE_t) * n_found_constants) + + # Return values + split[0] = best + n_constant_features[0] = n_total_constants + return 0 diff --git a/causalml/inference/tree/_tree/_tree.pxd b/causalml/inference/tree/_tree/_tree.pxd new file mode 100644 index 00000000..d32b5eae --- /dev/null +++ b/causalml/inference/tree/_tree/_tree.pxd @@ -0,0 +1,114 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Joel Nothman +# Arnaud Joly +# Jacob Schreiber +# Nelson Liu +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +import numpy as np +cimport numpy as cnp + +ctypedef cnp.npy_float32 DTYPE_t # Type of X +ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight +ctypedef cnp.npy_intp SIZE_t # Type for indices and counters +ctypedef cnp.npy_int32 INT32_t # Signed 32 bit integer +ctypedef cnp.npy_uint32 UINT32_t # Unsigned 32 bit integer + +from ._splitter cimport Splitter +from ._splitter cimport SplitRecord + + +cdef struct Node: + # Base storage structure for the nodes in a Tree object + + SIZE_t left_child # id of the left child of the node + SIZE_t right_child # id of the right child of the node + SIZE_t feature # Feature used for splitting the node + DOUBLE_t threshold # Threshold value at the node + DOUBLE_t impurity # Impurity of the node (i.e., the value of the criterion) + SIZE_t n_node_samples # Number of samples at the node + DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node + + +cdef class Tree: + # The Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and feature importances. + + # Input/Output layout + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Inner structures: values are stored separately from node structure, + # since size is determined at runtime. + cdef public SIZE_t max_depth # Max depth of the tree + cdef public SIZE_t node_count # Counter for node IDs + cdef public SIZE_t capacity # Capacity of tree, in terms of nodes + cdef Node* nodes # Array of nodes + cdef double* value # (capacity, n_outputs, max_n_classes) array of values + cdef SIZE_t value_stride # = n_outputs * max_n_classes + + # Methods + cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, + SIZE_t feature, double threshold, double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples) nogil except -1 + cdef int _resize(self, SIZE_t capacity) nogil except -1 + cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1 + + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + + cpdef cnp.ndarray predict(self, object X) + + cpdef cnp.ndarray apply(self, object X) + cdef cnp.ndarray _apply_dense(self, object X) + cdef cnp.ndarray _apply_sparse_csr(self, object X) + + cpdef object decision_path(self, object X) + cdef object _decision_path_dense(self, object X) + cdef object _decision_path_sparse_csr(self, object X) + + cpdef compute_feature_importances(self, normalize=*) + + +cdef class TreeBuilder: + # The TreeBuilder recursively builds a Tree object from training samples, + # using a Splitter object for splitting internal nodes and assigning + # values to leaves. + # + # This class controls the various stopping criteria and the node splitting + # evaluation order, e.g. depth-first or best-first. + + cdef Splitter splitter # Splitting algorithm + + cdef SIZE_t min_samples_split # Minimum number of samples in an internal node + cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf + cdef double min_weight_leaf # Minimum weight in a leaf + cdef SIZE_t max_depth # Maximal tree depth + cdef double min_impurity_decrease # Impurity threshold for early stopping + + cpdef build( + self, + Tree tree, + object X, + cnp.ndarray y, + cnp.ndarray sample_weight=*, + ) + + cdef _check_input( + self, + object X, + cnp.ndarray y, + cnp.ndarray sample_weight, + ) diff --git a/causalml/inference/tree/_tree/_tree.pyx b/causalml/inference/tree/_tree/_tree.pyx new file mode 100755 index 00000000..b7af41a5 --- /dev/null +++ b/causalml/inference/tree/_tree/_tree.pyx @@ -0,0 +1,1771 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Brian Holt +# Noel Dawe +# Satrajit Gosh +# Lars Buitinck +# Arnaud Joly +# Joel Nothman +# Fares Hedayati +# Jacob Schreiber +# Nelson Liu +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +from cpython cimport Py_INCREF, PyObject, PyTypeObject + +from libc.stdlib cimport free +from libc.math cimport fabs +from libc.string cimport memcpy +from libc.string cimport memset +from libc.stdint cimport SIZE_MAX + +import struct + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +from scipy.sparse import issparse +from scipy.sparse import csr_matrix + +from ._utils cimport Stack +from ._utils cimport StackRecord +from ._utils cimport PriorityHeap +from ._utils cimport PriorityHeapRecord +from ._utils cimport safe_realloc +from ._utils cimport sizet_ptr_to_ndarray + + +cdef extern from "numpy/arrayobject.h": + object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, + int nd, cnp.npy_intp* dims, + cnp.npy_intp* strides, + void* data, int flags, object obj) + int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + +# ============================================================================= +# Types and constants +# ============================================================================= + +from numpy import float32 as DTYPE +from numpy import float64 as DOUBLE + +cdef double INFINITY = np.inf +cdef double EPSILON = np.finfo('double').eps + +# Some handy constants (BestFirstTreeBuilder) +cdef int IS_FIRST = 1 +cdef int IS_NOT_FIRST = 0 +cdef int IS_LEFT = 1 +cdef int IS_NOT_LEFT = 0 + +TREE_LEAF = -1 +TREE_UNDEFINED = -2 +cdef SIZE_t _TREE_LEAF = TREE_LEAF +cdef SIZE_t _TREE_UNDEFINED = TREE_UNDEFINED +cdef SIZE_t INITIAL_STACK_SIZE = 10 + +# Build the corresponding numpy dtype for Node. +# This works by casting `dummy` to an array of Node of length 1, which numpy +# can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946 +# for a more detailed explanation. +cdef Node dummy; +NODE_DTYPE = np.asarray((&dummy)).dtype + +# ============================================================================= +# TreeBuilder +# ============================================================================= + +cdef class TreeBuilder: + """Interface for different tree building strategies.""" + + cpdef build(self, Tree tree, object X, cnp.ndarray y, + cnp.ndarray sample_weight=None): + """Build a decision tree from the training set (X, y).""" + pass + + cdef inline _check_input(self, object X, cnp.ndarray y, + cnp.ndarray sample_weight): + """Check input dtype, layout and format""" + if issparse(X): + X = X.tocsc() + X.sort_indices() + + if X.data.dtype != DTYPE: + X.data = np.ascontiguousarray(X.data, dtype=DTYPE) + + if X.indices.dtype != np.int32 or X.indptr.dtype != np.int32: + raise ValueError("No support for np.int64 index based " + "sparse matrices") + + elif X.dtype != DTYPE: + # since we have to copy we will make it fortran for efficiency + X = np.asfortranarray(X, dtype=DTYPE) + + if y.dtype != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if (sample_weight is not None and + (sample_weight.dtype != DOUBLE or + not sample_weight.flags.contiguous)): + sample_weight = np.asarray(sample_weight, dtype=DOUBLE, + order="C") + + return X, y, sample_weight + +# Depth first builder --------------------------------------------------------- + +cdef class DepthFirstTreeBuilder(TreeBuilder): + """Build a decision tree in depth-first fashion.""" + + def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, + SIZE_t min_samples_leaf, double min_weight_leaf, + SIZE_t max_depth, double min_impurity_decrease): + self.splitter = splitter + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_leaf = min_weight_leaf + self.max_depth = max_depth + self.min_impurity_decrease = min_impurity_decrease + + cpdef build(self, Tree tree, object X, cnp.ndarray y, + cnp.ndarray sample_weight=None): + """Build a decision tree from the training set (X, y).""" + + # check input + X, y, sample_weight = self._check_input(X, y, sample_weight) + + cdef DOUBLE_t* sample_weight_ptr = NULL + if sample_weight is not None: + sample_weight_ptr = sample_weight.data + + # Initial capacity + cdef int init_capacity + + if tree.max_depth <= 10: + init_capacity = (2 ** (tree.max_depth + 1)) - 1 + else: + init_capacity = 2047 + + tree._resize(init_capacity) + + # Parameters + cdef Splitter splitter = self.splitter + cdef SIZE_t max_depth = self.max_depth + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef SIZE_t min_samples_split = self.min_samples_split + cdef double min_impurity_decrease = self.min_impurity_decrease + + # Recursive partition (without actual recursion) + splitter.init(X, y, sample_weight_ptr) + + cdef SIZE_t start + cdef SIZE_t end + cdef SIZE_t depth + cdef SIZE_t parent + cdef bint is_left + cdef SIZE_t n_node_samples = splitter.n_samples + cdef double weighted_n_samples = splitter.weighted_n_samples + cdef double weighted_n_node_samples + cdef SplitRecord split + cdef SIZE_t node_id + + cdef double impurity = INFINITY + cdef SIZE_t n_constant_features + cdef bint is_leaf + cdef bint first = 1 + cdef SIZE_t max_depth_seen = -1 + cdef int rc = 0 + + cdef Stack stack = Stack(INITIAL_STACK_SIZE) + cdef StackRecord stack_record + + with nogil: + # push root node onto stack + rc = stack.push(0, n_node_samples, 0, _TREE_UNDEFINED, 0, INFINITY, 0) + if rc == -1: + # got return code -1 - out-of-memory + with gil: + raise MemoryError() + + while not stack.is_empty(): + stack.pop(&stack_record) + + start = stack_record.start + end = stack_record.end + depth = stack_record.depth + parent = stack_record.parent + is_left = stack_record.is_left + impurity = stack_record.impurity + n_constant_features = stack_record.n_constant_features + + n_node_samples = end - start + splitter.node_reset(start, end, &weighted_n_node_samples) + + is_leaf = (depth >= max_depth or + n_node_samples < min_samples_split or + n_node_samples < 2 * min_samples_leaf or + weighted_n_node_samples < 2 * min_weight_leaf) + + if first: + impurity = splitter.node_impurity() + first = 0 + + # impurity == 0 with tolerance due to rounding errors + is_leaf = is_leaf or impurity <= EPSILON + + if not is_leaf: + splitter.node_split(impurity, &split, &n_constant_features) + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + is_leaf = (is_leaf or split.pos >= end or + (split.improvement + EPSILON < + min_impurity_decrease)) + + node_id = tree._add_node(parent, is_left, is_leaf, split.feature, + split.threshold, impurity, n_node_samples, + weighted_n_node_samples) + + if node_id == SIZE_MAX: + rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + node_id * tree.value_stride) + + if not is_leaf: + # Push right child on stack + rc = stack.push(split.pos, end, depth + 1, node_id, 0, + split.impurity_right, n_constant_features) + if rc == -1: + break + + # Push left child on stack + rc = stack.push(start, split.pos, depth + 1, node_id, 1, + split.impurity_left, n_constant_features) + if rc == -1: + break + + if depth > max_depth_seen: + max_depth_seen = depth + + if rc >= 0: + rc = tree._resize_c(tree.node_count) + + if rc >= 0: + tree.max_depth = max_depth_seen + if rc == -1: + raise MemoryError() + + +# Best first builder ---------------------------------------------------------- + +cdef inline int _add_to_frontier(PriorityHeapRecord* rec, + PriorityHeap frontier) nogil except -1: + """Adds record ``rec`` to the priority queue ``frontier`` + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + return frontier.push(rec.node_id, rec.start, rec.end, rec.pos, rec.depth, + rec.is_leaf, rec.improvement, rec.impurity, + rec.impurity_left, rec.impurity_right) + + +cdef class BestFirstTreeBuilder(TreeBuilder): + """Build a decision tree in best-first fashion. + + The best node to expand is given by the node at the frontier that has the + highest impurity improvement. + """ + cdef SIZE_t max_leaf_nodes + + def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, + SIZE_t min_samples_leaf, min_weight_leaf, + SIZE_t max_depth, SIZE_t max_leaf_nodes, + double min_impurity_decrease): + self.splitter = splitter + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_leaf = min_weight_leaf + self.max_depth = max_depth + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease + + cpdef build(self, Tree tree, object X, cnp.ndarray y, + cnp.ndarray sample_weight=None): + """Build a decision tree from the training set (X, y).""" + + # check input + X, y, sample_weight = self._check_input(X, y, sample_weight) + + cdef DOUBLE_t* sample_weight_ptr = NULL + if sample_weight is not None: + sample_weight_ptr = sample_weight.data + + # Parameters + cdef Splitter splitter = self.splitter + cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef double min_weight_leaf = self.min_weight_leaf + cdef SIZE_t min_samples_split = self.min_samples_split + + # Recursive partition (without actual recursion) + splitter.init(X, y, sample_weight_ptr) + + cdef PriorityHeap frontier = PriorityHeap(INITIAL_STACK_SIZE) + cdef PriorityHeapRecord record + cdef PriorityHeapRecord split_node_left + cdef PriorityHeapRecord split_node_right + + cdef SIZE_t n_node_samples = splitter.n_samples + cdef SIZE_t max_split_nodes = max_leaf_nodes - 1 + cdef bint is_leaf + cdef SIZE_t max_depth_seen = -1 + cdef int rc = 0 + cdef Node* node + + # Initial capacity + cdef SIZE_t init_capacity = max_split_nodes + max_leaf_nodes + tree._resize(init_capacity) + + with nogil: + # add root to frontier + rc = self._add_split_node(splitter, tree, 0, n_node_samples, + INFINITY, IS_FIRST, IS_LEFT, NULL, 0, + &split_node_left) + if rc >= 0: + rc = _add_to_frontier(&split_node_left, frontier) + + if rc == -1: + with gil: + raise MemoryError() + + while not frontier.is_empty(): + frontier.pop(&record) + + node = &tree.nodes[record.node_id] + is_leaf = (record.is_leaf or max_split_nodes <= 0) + + if is_leaf: + # Node is not expandable; set node as leaf + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + + else: + # Node is expandable + + # Decrement number of split nodes available + max_split_nodes -= 1 + + # Compute left split node + rc = self._add_split_node(splitter, tree, + record.start, record.pos, + record.impurity_left, + IS_NOT_FIRST, IS_LEFT, node, + record.depth + 1, + &split_node_left) + if rc == -1: + break + + # tree.nodes may have changed + node = &tree.nodes[record.node_id] + + # Compute right split node + rc = self._add_split_node(splitter, tree, record.pos, + record.end, + record.impurity_right, + IS_NOT_FIRST, IS_NOT_LEFT, node, + record.depth + 1, + &split_node_right) + if rc == -1: + break + + # Add nodes to queue + rc = _add_to_frontier(&split_node_left, frontier) + if rc == -1: + break + + rc = _add_to_frontier(&split_node_right, frontier) + if rc == -1: + break + + if record.depth > max_depth_seen: + max_depth_seen = record.depth + + if rc >= 0: + rc = tree._resize_c(tree.node_count) + + if rc >= 0: + tree.max_depth = max_depth_seen + + if rc == -1: + raise MemoryError() + + cdef inline int _add_split_node(self, Splitter splitter, Tree tree, + SIZE_t start, SIZE_t end, double impurity, + bint is_first, bint is_left, Node* parent, + SIZE_t depth, + PriorityHeapRecord* res) nogil except -1: + """Adds node w/ partition ``[start, end)`` to the frontier. """ + cdef SplitRecord split + cdef SIZE_t node_id + cdef SIZE_t n_node_samples + cdef SIZE_t n_constant_features = 0 + cdef double weighted_n_samples = splitter.weighted_n_samples + cdef double min_impurity_decrease = self.min_impurity_decrease + cdef double weighted_n_node_samples + cdef bint is_leaf + cdef SIZE_t n_left, n_right + cdef double imp_diff + + splitter.node_reset(start, end, &weighted_n_node_samples) + + if is_first: + impurity = splitter.node_impurity() + + n_node_samples = end - start + is_leaf = (depth >= self.max_depth or + n_node_samples < self.min_samples_split or + n_node_samples < 2 * self.min_samples_leaf or + weighted_n_node_samples < 2 * self.min_weight_leaf or + impurity <= EPSILON # impurity == 0 with tolerance + ) + + if not is_leaf: + splitter.node_split(impurity, &split, &n_constant_features) + # If EPSILON=0 in the below comparison, float precision issues stop + # splitting early, producing trees that are dissimilar to v0.18 + is_leaf = (is_leaf or split.pos >= end or + split.improvement + EPSILON < min_impurity_decrease) + + node_id = tree._add_node(parent - tree.nodes + if parent != NULL + else _TREE_UNDEFINED, + is_left, is_leaf, + split.feature, split.threshold, impurity, n_node_samples, + weighted_n_node_samples) + if node_id == SIZE_MAX: + return -1 + + # compute values also for split nodes (might become leafs later). + splitter.node_value(tree.value + node_id * tree.value_stride) + + res.node_id = node_id + res.start = start + res.end = end + res.depth = depth + res.impurity = impurity + + if not is_leaf: + # is split node + res.pos = split.pos + res.is_leaf = 0 + res.improvement = split.improvement + res.impurity_left = split.impurity_left + res.impurity_right = split.impurity_right + + else: + # is leaf => 0 improvement + res.pos = end + res.is_leaf = 1 + res.improvement = 0.0 + res.impurity_left = impurity + res.impurity_right = impurity + + return 0 + + +# ============================================================================= +# Tree +# ============================================================================= + +cdef class Tree: + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. + + weighted_n_node_samples : array of int, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + property n_classes: + def __get__(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + property children_left: + def __get__(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + property children_right: + def __get__(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + property n_leaves: + def __get__(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + property feature: + def __get__(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + property threshold: + def __get__(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + property impurity: + def __get__(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + property n_node_samples: + def __get__(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + property weighted_n_node_samples: + def __get__(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + property value: + def __get__(self): + return self._get_value_ndarray()[:self.node_count] + + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + cdef int _resize(self, SIZE_t capacity) nogil except -1: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() + + cdef int _resize_c(self, SIZE_t capacity=SIZE_MAX) nogil except -1: + """Guts of _resize + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if capacity == self.capacity and self.nodes != NULL: + return 0 + + if capacity == SIZE_MAX: + if self.capacity == 0: + capacity = 3 # default initial value + else: + capacity = 2 * self.capacity + + safe_realloc(&self.nodes, capacity) + safe_realloc(&self.value, capacity * self.value_stride) + + # value memory is initialised to 0 to enable classifier argmax + if capacity > self.capacity: + memset((self.value + self.capacity * self.value_stride), 0, + (capacity - self.capacity) * self.value_stride * + sizeof(double)) + + # if capacity smaller than node_count, adjust the counter + if capacity < self.node_count: + self.node_count = capacity + + self.capacity = capacity + return 0 + + cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, + SIZE_t feature, double threshold, double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples) nogil except -1: + """Add a node to the tree. + + The new node registers itself as the child of its parent. + + Returns (size_t)(-1) on error. + """ + cdef SIZE_t node_id = self.node_count + + if node_id >= self.capacity: + if self._resize_c() != 0: + return SIZE_MAX + + cdef Node* node = &self.nodes[node_id] + node.impurity = impurity + node.n_node_samples = n_node_samples + node.weighted_n_node_samples = weighted_n_node_samples + + if parent != _TREE_UNDEFINED: + if is_left: + self.nodes[parent].left_child = node_id + else: + self.nodes[parent].right_child = node_id + + if is_leaf: + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + + else: + # left_child and right_child will be set later + node.feature = feature + node.threshold = threshold + + self.node_count += 1 + + return node_id + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + cpdef cnp.ndarray apply(self, object X): + """Finds the terminal region (=leaf node) for each sample in X.""" + if issparse(X): + return self._apply_sparse_csr(X) + else: + return self._apply_dense(X) + + cdef inline cnp.ndarray _apply_dense(self, object X): + """Finds the terminal region (=leaf node) for each sample in X.""" + + # Check input + if not isinstance(X, np.ndarray): + raise ValueError("X should be in np.ndarray format, got %s" + % type(X)) + + if X.dtype != DTYPE: + raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) + + # Extract input + cdef const DTYPE_t[:, :] X_ndarray = X + cdef SIZE_t n_samples = X.shape[0] + + # Initialize output + cdef cnp.ndarray[SIZE_t] out = np.zeros((n_samples,), dtype=np.intp) + cdef SIZE_t* out_ptr = out.data + + # Initialize auxiliary data-structure + cdef Node* node = NULL + cdef SIZE_t i = 0 + + with nogil: + for i in range(n_samples): + node = self.nodes + # While node not a leaf + while node.left_child != _TREE_LEAF: + # ... and node.right_child != _TREE_LEAF: + if X_ndarray[i, node.feature] <= node.threshold: + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + + out_ptr[i] = (node - self.nodes) # node offset + + return out + + cdef inline cnp.ndarray _apply_sparse_csr(self, object X): + """Finds the terminal region (=leaf node) for each sample in sparse X. + """ + # Check input + if not isinstance(X, csr_matrix): + raise ValueError("X should be in csr_matrix format, got %s" + % type(X)) + + if X.dtype != DTYPE: + raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) + + # Extract input + cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data + cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices + cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr + + cdef DTYPE_t* X_data = X_data_ndarray.data + cdef INT32_t* X_indices = X_indices_ndarray.data + cdef INT32_t* X_indptr = X_indptr_ndarray.data + + cdef SIZE_t n_samples = X.shape[0] + cdef SIZE_t n_features = X.shape[1] + + # Initialize output + cdef cnp.ndarray[SIZE_t, ndim=1] out = np.zeros((n_samples,), + dtype=np.intp) + cdef SIZE_t* out_ptr = out.data + + # Initialize auxiliary data-structure + cdef DTYPE_t feature_value = 0. + cdef Node* node = NULL + cdef DTYPE_t* X_sample = NULL + cdef SIZE_t i = 0 + cdef INT32_t k = 0 + + # feature_to_sample as a data structure records the last seen sample + # for each feature; functionally, it is an efficient way to identify + # which features are nonzero in the present sample. + cdef SIZE_t* feature_to_sample = NULL + + safe_realloc(&X_sample, n_features) + safe_realloc(&feature_to_sample, n_features) + + with nogil: + memset(feature_to_sample, -1, n_features * sizeof(SIZE_t)) + + for i in range(n_samples): + node = self.nodes + + for k in range(X_indptr[i], X_indptr[i + 1]): + feature_to_sample[X_indices[k]] = i + X_sample[X_indices[k]] = X_data[k] + + # While node not a leaf + while node.left_child != _TREE_LEAF: + # ... and node.right_child != _TREE_LEAF: + if feature_to_sample[node.feature] == i: + feature_value = X_sample[node.feature] + + else: + feature_value = 0. + + if feature_value <= node.threshold: + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + + out_ptr[i] = (node - self.nodes) # node offset + + # Free auxiliary arrays + free(X_sample) + free(feature_to_sample) + + return out + + cpdef object decision_path(self, object X): + """Finds the decision path (=node) for each sample in X.""" + if issparse(X): + return self._decision_path_sparse_csr(X) + else: + return self._decision_path_dense(X) + + cdef inline object _decision_path_dense(self, object X): + """Finds the decision path (=node) for each sample in X.""" + + # Check input + if not isinstance(X, np.ndarray): + raise ValueError("X should be in np.ndarray format, got %s" + % type(X)) + + if X.dtype != DTYPE: + raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) + + # Extract input + cdef const DTYPE_t[:, :] X_ndarray = X + cdef SIZE_t n_samples = X.shape[0] + + # Initialize output + cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp) + cdef SIZE_t* indptr_ptr = indptr.data + + cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples * + (1 + self.max_depth), + dtype=np.intp) + cdef SIZE_t* indices_ptr = indices.data + + # Initialize auxiliary data-structure + cdef Node* node = NULL + cdef SIZE_t i = 0 + + with nogil: + for i in range(n_samples): + node = self.nodes + indptr_ptr[i + 1] = indptr_ptr[i] + + # Add all external nodes + while node.left_child != _TREE_LEAF: + # ... and node.right_child != _TREE_LEAF: + indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) + indptr_ptr[i + 1] += 1 + + if X_ndarray[i, node.feature] <= node.threshold: + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + + # Add the leave node + indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) + indptr_ptr[i + 1] += 1 + + indices = indices[:indptr[n_samples]] + cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices), + dtype=np.intp) + out = csr_matrix((data, indices, indptr), + shape=(n_samples, self.node_count)) + + return out + + cdef inline object _decision_path_sparse_csr(self, object X): + """Finds the decision path (=node) for each sample in X.""" + + # Check input + if not isinstance(X, csr_matrix): + raise ValueError("X should be in csr_matrix format, got %s" + % type(X)) + + if X.dtype != DTYPE: + raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) + + # Extract input + cdef cnp.ndarray[ndim=1, dtype=DTYPE_t] X_data_ndarray = X.data + cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indices_ndarray = X.indices + cdef cnp.ndarray[ndim=1, dtype=INT32_t] X_indptr_ndarray = X.indptr + + cdef DTYPE_t* X_data = X_data_ndarray.data + cdef INT32_t* X_indices = X_indices_ndarray.data + cdef INT32_t* X_indptr = X_indptr_ndarray.data + + cdef SIZE_t n_samples = X.shape[0] + cdef SIZE_t n_features = X.shape[1] + + # Initialize output + cdef cnp.ndarray[SIZE_t] indptr = np.zeros(n_samples + 1, dtype=np.intp) + cdef SIZE_t* indptr_ptr = indptr.data + + cdef cnp.ndarray[SIZE_t] indices = np.zeros(n_samples * + (1 + self.max_depth), + dtype=np.intp) + cdef SIZE_t* indices_ptr = indices.data + + # Initialize auxiliary data-structure + cdef DTYPE_t feature_value = 0. + cdef Node* node = NULL + cdef DTYPE_t* X_sample = NULL + cdef SIZE_t i = 0 + cdef INT32_t k = 0 + + # feature_to_sample as a data structure records the last seen sample + # for each feature; functionally, it is an efficient way to identify + # which features are nonzero in the present sample. + cdef SIZE_t* feature_to_sample = NULL + + safe_realloc(&X_sample, n_features) + safe_realloc(&feature_to_sample, n_features) + + with nogil: + memset(feature_to_sample, -1, n_features * sizeof(SIZE_t)) + + for i in range(n_samples): + node = self.nodes + indptr_ptr[i + 1] = indptr_ptr[i] + + for k in range(X_indptr[i], X_indptr[i + 1]): + feature_to_sample[X_indices[k]] = i + X_sample[X_indices[k]] = X_data[k] + + # While node not a leaf + while node.left_child != _TREE_LEAF: + # ... and node.right_child != _TREE_LEAF: + + indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) + indptr_ptr[i + 1] += 1 + + if feature_to_sample[node.feature] == i: + feature_value = X_sample[node.feature] + + else: + feature_value = 0. + + if feature_value <= node.threshold: + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + + # Add the leave node + indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) + indptr_ptr[i + 1] += 1 + + # Free auxiliary arrays + free(X_sample) + free(feature_to_sample) + + indices = indices[:indptr[n_samples]] + cdef cnp.ndarray[SIZE_t] data = np.ones(shape=len(indices), + dtype=np.intp) + out = csr_matrix((data, indices, indptr), + shape=(n_samples, self.node_count)) + + return out + + + cpdef compute_feature_importances(self, normalize=True): + """Computes the importance of each feature (aka variable).""" + cdef Node* left + cdef Node* right + cdef Node* nodes = self.nodes + cdef Node* node = nodes + cdef Node* end_node = node + self.node_count + + cdef double normalizer = 0. + + cdef cnp.ndarray[cnp.float64_t, ndim=1] importances + importances = np.zeros((self.n_features,)) + cdef DOUBLE_t* importance_data = importances.data + + with nogil: + while node != end_node: + if node.left_child != _TREE_LEAF: + # ... and node.right_child != _TREE_LEAF: + left = &nodes[node.left_child] + right = &nodes[node.right_child] + + importance_data[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) + node += 1 + + importances /= nodes[0].weighted_n_node_samples + + if normalize: + normalizer = np.sum(importances) + + if normalizer > 0.0: + # Avoid dividing by zero (e.g., when root is pure) + importances /= normalizer + + return importances + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + def compute_partial_dependence(self, DTYPE_t[:, ::1] X, + int[::1] target_features, + double[::1] out): + """Partial dependence of the response on the ``target_feature`` set. + + For each sample in ``X`` a tree traversal is performed. + Each traversal starts from the root with weight 1.0. + + At each non-leaf node that splits on a target feature, either + the left child or the right child is visited based on the feature + value of the current sample, and the weight is not modified. + At each non-leaf node that splits on a complementary feature, + both children are visited and the weight is multiplied by the fraction + of training samples which went to each child. + + At each leaf, the value of the node is multiplied by the current + weight (weights sum to 1 for all visited terminal nodes). + + Parameters + ---------- + X : view on 2d ndarray, shape (n_samples, n_target_features) + The grid points on which the partial dependence should be + evaluated. + target_features : view on 1d ndarray, shape (n_target_features) + The set of target features for which the partial dependence + should be evaluated. + out : view on 1d ndarray, shape (n_samples) + The value of the partial dependence function on each grid + point. + """ + cdef: + double[::1] weight_stack = np.zeros(self.node_count, + dtype=np.float64) + SIZE_t[::1] node_idx_stack = np.zeros(self.node_count, + dtype=np.intp) + SIZE_t sample_idx + SIZE_t feature_idx + int stack_size + double left_sample_frac + double current_weight + double total_weight # used for sanity check only + Node *current_node # use a pointer to avoid copying attributes + SIZE_t current_node_idx + bint is_target_feature + SIZE_t _TREE_LEAF = TREE_LEAF # to avoid python interactions + + for sample_idx in range(X.shape[0]): + # init stacks for current sample + stack_size = 1 + node_idx_stack[0] = 0 # root node + weight_stack[0] = 1 # all the samples are in the root node + total_weight = 0 + + while stack_size > 0: + # pop the stack + stack_size -= 1 + current_node_idx = node_idx_stack[stack_size] + current_node = &self.nodes[current_node_idx] + + if current_node.left_child == _TREE_LEAF: + # leaf node + out[sample_idx] += (weight_stack[stack_size] * + self.value[current_node_idx]) + total_weight += weight_stack[stack_size] + else: + # non-leaf node + + # determine if the split feature is a target feature + is_target_feature = False + for feature_idx in range(target_features.shape[0]): + if target_features[feature_idx] == current_node.feature: + is_target_feature = True + break + + if is_target_feature: + # In this case, we push left or right child on stack + if X[sample_idx, feature_idx] <= current_node.threshold: + node_idx_stack[stack_size] = current_node.left_child + else: + node_idx_stack[stack_size] = current_node.right_child + stack_size += 1 + else: + # In this case, we push both children onto the stack, + # and give a weight proportional to the number of + # samples going through each branch. + + # push left child + node_idx_stack[stack_size] = current_node.left_child + left_sample_frac = ( + self.nodes[current_node.left_child].weighted_n_node_samples / + current_node.weighted_n_node_samples) + current_weight = weight_stack[stack_size] + weight_stack[stack_size] = current_weight * left_sample_frac + stack_size += 1 + + # push right child + node_idx_stack[stack_size] = current_node.right_child + weight_stack[stack_size] = ( + current_weight * (1 - left_sample_frac)) + stack_size += 1 + + # Sanity check. Should never happen. + if not (0.999 < total_weight < 1.001): + raise ValueError("Total weight should be 1.0 but was %.9f" % + total_weight) + + +def _check_n_classes(n_classes, expected_dtype): + if n_classes.ndim != 1: + raise ValueError( + f"Wrong dimensions for n_classes from the pickle: " + f"expected 1, got {n_classes.ndim}" + ) + + if n_classes.dtype == expected_dtype: + return n_classes + + # Handles both different endianness and different bitness + if n_classes.dtype.kind == "i" and n_classes.dtype.itemsize in [4, 8]: + return n_classes.astype(expected_dtype, casting="same_kind") + + raise ValueError( + "n_classes from the pickle has an incompatible dtype:\n" + f"- expected: {expected_dtype}\n" + f"- got: {n_classes.dtype}" + ) + + +def _check_value_ndarray(value_ndarray, expected_dtype, expected_shape): + if value_ndarray.shape != expected_shape: + raise ValueError( + "Wrong shape for value array from the pickle: " + f"expected {expected_shape}, got {value_ndarray.shape}" + ) + + if not value_ndarray.flags.c_contiguous: + raise ValueError( + "value array from the pickle should be a C-contiguous array" + ) + + if value_ndarray.dtype == expected_dtype: + return value_ndarray + + # Handles different endianness + if value_ndarray.dtype.str.endswith('f8'): + return value_ndarray.astype(expected_dtype, casting='equiv') + + raise ValueError( + "value array from the pickle has an incompatible dtype:\n" + f"- expected: {expected_dtype}\n" + f"- got: {value_ndarray.dtype}" + ) + + +def _dtype_to_dict(dtype): + return {name: dt.str for name, (dt, *rest) in dtype.fields.items()} + + +def _dtype_dict_with_modified_bitness(dtype_dict): + # field names in Node struct with SIZE_t types (see sklearn/tree/_tree.pxd) + indexing_field_names = ["left_child", "right_child", "feature", "n_node_samples"] + + expected_dtype_size = str(struct.calcsize("P")) + allowed_dtype_size = "8" if expected_dtype_size == "4" else "4" + + allowed_dtype_dict = dtype_dict.copy() + for name in indexing_field_names: + allowed_dtype_dict[name] = allowed_dtype_dict[name].replace( + expected_dtype_size, allowed_dtype_size + ) + + return allowed_dtype_dict + + +def _all_compatible_dtype_dicts(dtype): + # The Cython code for decision trees uses platform-specific SIZE_t + # typed indexing fields that correspond to either i4 or i8 dtypes for + # the matching fields in the numpy array depending on the bitness of + # the platform (32 bit or 64 bit respectively). + # + # We need to cast the indexing fields of the NODE_DTYPE-dtyped array at + # pickle load time to enable cross-bitness deployment scenarios. We + # typically want to make it possible to run the expensive fit method of + # a tree estimator on a 64 bit server platform, pickle the estimator + # for deployment and run the predict method of a low power 32 bit edge + # platform. + # + # A similar thing happens for endianness, the machine where the pickle was + # saved can have a different endianness than the machine where the pickle + # is loaded + + dtype_dict = _dtype_to_dict(dtype) + dtype_dict_with_modified_bitness = _dtype_dict_with_modified_bitness(dtype_dict) + dtype_dict_with_modified_endianness = _dtype_to_dict(dtype.newbyteorder()) + dtype_dict_with_modified_bitness_and_endianness = _dtype_dict_with_modified_bitness( + dtype_dict_with_modified_endianness + ) + + return [ + dtype_dict, + dtype_dict_with_modified_bitness, + dtype_dict_with_modified_endianness, + dtype_dict_with_modified_bitness_and_endianness, + ] + + +def _check_node_ndarray(node_ndarray, expected_dtype): + if node_ndarray.ndim != 1: + raise ValueError( + "Wrong dimensions for node array from the pickle: " + f"expected 1, got {node_ndarray.ndim}" + ) + + if not node_ndarray.flags.c_contiguous: + raise ValueError( + "node array from the pickle should be a C-contiguous array" + ) + + node_ndarray_dtype = node_ndarray.dtype + if node_ndarray_dtype == expected_dtype: + return node_ndarray + + node_ndarray_dtype_dict = _dtype_to_dict(node_ndarray_dtype) + all_compatible_dtype_dicts = _all_compatible_dtype_dicts(expected_dtype) + + if node_ndarray_dtype_dict not in all_compatible_dtype_dicts: + raise ValueError( + "node array from the pickle has an incompatible dtype:\n" + f"- expected: {expected_dtype}\n" + f"- got : {node_ndarray_dtype}" + ) + + return node_ndarray.astype(expected_dtype, casting="same_kind") + + +# ============================================================================= +# Build Pruned Tree +# ============================================================================= + + +cdef class _CCPPruneController: + """Base class used by build_pruned_tree_ccp and ccp_pruning_path + to control pruning. + """ + cdef bint stop_pruning(self, DOUBLE_t effective_alpha) nogil: + """Return 1 to stop pruning and 0 to continue pruning""" + return 0 + + cdef void save_metrics(self, DOUBLE_t effective_alpha, + DOUBLE_t subtree_impurities) nogil: + """Save metrics when pruning""" + pass + + cdef void after_pruning(self, unsigned char[:] in_subtree) nogil: + """Called after pruning""" + pass + + +cdef class _AlphaPruner(_CCPPruneController): + """Use alpha to control when to stop pruning.""" + cdef DOUBLE_t ccp_alpha + cdef SIZE_t capacity + + def __cinit__(self, DOUBLE_t ccp_alpha): + self.ccp_alpha = ccp_alpha + self.capacity = 0 + + cdef bint stop_pruning(self, DOUBLE_t effective_alpha) nogil: + # The subtree on the previous iteration has the greatest ccp_alpha + # less than or equal to self.ccp_alpha + return self.ccp_alpha < effective_alpha + + cdef void after_pruning(self, unsigned char[:] in_subtree) nogil: + """Updates the number of leaves in subtree""" + for i in range(in_subtree.shape[0]): + if in_subtree[i]: + self.capacity += 1 + + +cdef class _PathFinder(_CCPPruneController): + """Record metrics used to return the cost complexity path.""" + cdef DOUBLE_t[:] ccp_alphas + cdef DOUBLE_t[:] impurities + cdef UINT32_t count + + def __cinit__(self, int node_count): + self.ccp_alphas = np.zeros(shape=(node_count), dtype=np.float64) + self.impurities = np.zeros(shape=(node_count), dtype=np.float64) + self.count = 0 + + cdef void save_metrics(self, + DOUBLE_t effective_alpha, + DOUBLE_t subtree_impurities) nogil: + self.ccp_alphas[self.count] = effective_alpha + self.impurities[self.count] = subtree_impurities + self.count += 1 + + +cdef _cost_complexity_prune(unsigned char[:] leaves_in_subtree, # OUT + Tree orig_tree, + _CCPPruneController controller): + """Perform cost complexity pruning. + + This function takes an already grown tree, `orig_tree` and outputs a + boolean mask `leaves_in_subtree` to are the leaves in the pruned tree. The + controller signals when the pruning should stop and is passed the + metrics of the subtrees during the pruning process. + + Parameters + ---------- + leaves_in_subtree : unsigned char[:] + Output for leaves of subtree + orig_tree : Tree + Original tree + ccp_controller : _CCPPruneController + Cost complexity controller + """ + + cdef: + SIZE_t i + SIZE_t n_nodes = orig_tree.node_count + # prior probability using weighted samples + DOUBLE_t[:] weighted_n_node_samples = orig_tree.weighted_n_node_samples + DOUBLE_t total_sum_weights = weighted_n_node_samples[0] + DOUBLE_t[:] impurity = orig_tree.impurity + # weighted impurity of each node + DOUBLE_t[:] r_node = np.empty(shape=n_nodes, dtype=np.float64) + + SIZE_t[:] child_l = orig_tree.children_left + SIZE_t[:] child_r = orig_tree.children_right + SIZE_t[:] parent = np.zeros(shape=n_nodes, dtype=np.intp) + + # Only uses the start and parent variables + Stack stack = Stack(INITIAL_STACK_SIZE) + StackRecord stack_record + int rc = 0 + SIZE_t node_idx + + SIZE_t[:] n_leaves = np.zeros(shape=n_nodes, dtype=np.intp) + DOUBLE_t[:] r_branch = np.zeros(shape=n_nodes, dtype=np.float64) + DOUBLE_t current_r + SIZE_t leaf_idx + SIZE_t parent_idx + + # candidate nodes that can be pruned + unsigned char[:] candidate_nodes = np.zeros(shape=n_nodes, + dtype=np.uint8) + # nodes in subtree + unsigned char[:] in_subtree = np.ones(shape=n_nodes, dtype=np.uint8) + DOUBLE_t[:] g_node = np.zeros(shape=n_nodes, dtype=np.float64) + SIZE_t pruned_branch_node_idx + DOUBLE_t subtree_alpha + DOUBLE_t effective_alpha + SIZE_t child_l_idx + SIZE_t child_r_idx + SIZE_t n_pruned_leaves + DOUBLE_t r_diff + DOUBLE_t max_float64 = np.finfo(np.float64).max + + # find parent node ids and leaves + with nogil: + + for i in range(r_node.shape[0]): + r_node[i] = ( + weighted_n_node_samples[i] * impurity[i] / total_sum_weights) + + # Push root node, using StackRecord.start as node id + rc = stack.push(0, 0, 0, -1, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + while not stack.is_empty(): + stack.pop(&stack_record) + node_idx = stack_record.start + parent[node_idx] = stack_record.parent + if child_l[node_idx] == _TREE_LEAF: + # ... and child_r[node_idx] == _TREE_LEAF: + leaves_in_subtree[node_idx] = 1 + else: + rc = stack.push(child_l[node_idx], 0, 0, node_idx, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + rc = stack.push(child_r[node_idx], 0, 0, node_idx, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + # computes number of leaves in all branches and the overall impurity of + # the branch. The overall impurity is the sum of r_node in its leaves. + for leaf_idx in range(leaves_in_subtree.shape[0]): + if not leaves_in_subtree[leaf_idx]: + continue + r_branch[leaf_idx] = r_node[leaf_idx] + + # bubble up values to ancestor nodes + current_r = r_node[leaf_idx] + while leaf_idx != 0: + parent_idx = parent[leaf_idx] + r_branch[parent_idx] += current_r + n_leaves[parent_idx] += 1 + leaf_idx = parent_idx + + for i in range(leaves_in_subtree.shape[0]): + candidate_nodes[i] = not leaves_in_subtree[i] + + # save metrics before pruning + controller.save_metrics(0.0, r_branch[0]) + + # while root node is not a leaf + while candidate_nodes[0]: + + # computes ccp_alpha for subtrees and finds the minimal alpha + effective_alpha = max_float64 + for i in range(n_nodes): + if not candidate_nodes[i]: + continue + subtree_alpha = (r_node[i] - r_branch[i]) / (n_leaves[i] - 1) + if subtree_alpha < effective_alpha: + effective_alpha = subtree_alpha + pruned_branch_node_idx = i + + if controller.stop_pruning(effective_alpha): + break + + # stack uses only the start variable + rc = stack.push(pruned_branch_node_idx, 0, 0, 0, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + # descendants of branch are not in subtree + while not stack.is_empty(): + stack.pop(&stack_record) + node_idx = stack_record.start + + if not in_subtree[node_idx]: + continue # branch has already been marked for pruning + candidate_nodes[node_idx] = 0 + leaves_in_subtree[node_idx] = 0 + in_subtree[node_idx] = 0 + + if child_l[node_idx] != _TREE_LEAF: + # ... and child_r[node_idx] != _TREE_LEAF: + rc = stack.push(child_l[node_idx], 0, 0, 0, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + rc = stack.push(child_r[node_idx], 0, 0, 0, 0, 0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + leaves_in_subtree[pruned_branch_node_idx] = 1 + in_subtree[pruned_branch_node_idx] = 1 + + # updates number of leaves + n_pruned_leaves = n_leaves[pruned_branch_node_idx] - 1 + n_leaves[pruned_branch_node_idx] = 0 + + # computes the increase in r_branch to bubble up + r_diff = r_node[pruned_branch_node_idx] - r_branch[pruned_branch_node_idx] + r_branch[pruned_branch_node_idx] = r_node[pruned_branch_node_idx] + + # bubble up values to ancestors + node_idx = parent[pruned_branch_node_idx] + while node_idx != -1: + n_leaves[node_idx] -= n_pruned_leaves + r_branch[node_idx] += r_diff + node_idx = parent[node_idx] + + controller.save_metrics(effective_alpha, r_branch[0]) + + controller.after_pruning(in_subtree) + + +def _build_pruned_tree_ccp( + Tree tree, # OUT + Tree orig_tree, + DOUBLE_t ccp_alpha): + """Build a pruned tree from the original tree using cost complexity + pruning. + + The values and nodes from the original tree are copied into the pruned + tree. + + Parameters + ---------- + tree : Tree + Location to place the pruned tree + orig_tree : Tree + Original tree + ccp_alpha : positive double + Complexity parameter. The subtree with the largest cost complexity + that is smaller than ``ccp_alpha`` will be chosen. By default, + no pruning is performed. + """ + + cdef: + SIZE_t n_nodes = orig_tree.node_count + unsigned char[:] leaves_in_subtree = np.zeros( + shape=n_nodes, dtype=np.uint8) + + pruning_controller = _AlphaPruner(ccp_alpha=ccp_alpha) + + _cost_complexity_prune(leaves_in_subtree, orig_tree, pruning_controller) + + _build_pruned_tree(tree, orig_tree, leaves_in_subtree, + pruning_controller.capacity) + + +def ccp_pruning_path(Tree orig_tree): + """Computes the cost complexity pruning path. + + Parameters + ---------- + tree : Tree + Original tree. + + Returns + ------- + path_info : dict + Information about pruning path with attributes: + + ccp_alphas : ndarray + Effective alphas of subtree during pruning. + + impurities : ndarray + Sum of the impurities of the subtree leaves for the + corresponding alpha value in ``ccp_alphas``. + """ + cdef: + unsigned char[:] leaves_in_subtree = np.zeros( + shape=orig_tree.node_count, dtype=np.uint8) + + path_finder = _PathFinder(orig_tree.node_count) + + _cost_complexity_prune(leaves_in_subtree, orig_tree, path_finder) + + cdef: + UINT32_t total_items = path_finder.count + cnp.ndarray ccp_alphas = np.empty(shape=total_items, + dtype=np.float64) + cnp.ndarray impurities = np.empty(shape=total_items, + dtype=np.float64) + UINT32_t count = 0 + + while count < total_items: + ccp_alphas[count] = path_finder.ccp_alphas[count] + impurities[count] = path_finder.impurities[count] + count += 1 + + return {'ccp_alphas': ccp_alphas, 'impurities': impurities} + + +cdef _build_pruned_tree( + Tree tree, # OUT + Tree orig_tree, + const unsigned char[:] leaves_in_subtree, + SIZE_t capacity): + """Build a pruned tree. + + Build a pruned tree from the original tree by transforming the nodes in + ``leaves_in_subtree`` into leaves. + + Parameters + ---------- + tree : Tree + Location to place the pruned tree + orig_tree : Tree + Original tree + leaves_in_subtree : unsigned char memoryview, shape=(node_count, ) + Boolean mask for leaves to include in subtree + capacity : SIZE_t + Number of nodes to initially allocate in pruned tree + """ + tree._resize(capacity) + + cdef: + SIZE_t orig_node_id + SIZE_t new_node_id + SIZE_t depth + SIZE_t parent + bint is_left + bint is_leaf + + # value_stride for original tree and new tree are the same + SIZE_t value_stride = orig_tree.value_stride + SIZE_t max_depth_seen = -1 + int rc = 0 + Node* node + double* orig_value_ptr + double* new_value_ptr + + # Only uses the start, depth, parent, and is_left variables + Stack stack = Stack(INITIAL_STACK_SIZE) + StackRecord stack_record + + with nogil: + # push root node onto stack + rc = stack.push(0, 0, 0, _TREE_UNDEFINED, 0, 0.0, 0) + if rc == -1: + with gil: + raise MemoryError("pruning tree") + + while not stack.is_empty(): + stack.pop(&stack_record) + + orig_node_id = stack_record.start + depth = stack_record.depth + parent = stack_record.parent + is_left = stack_record.is_left + + is_leaf = leaves_in_subtree[orig_node_id] + node = &orig_tree.nodes[orig_node_id] + + new_node_id = tree._add_node( + parent, is_left, is_leaf, node.feature, node.threshold, + node.impurity, node.n_node_samples, + node.weighted_n_node_samples) + + if new_node_id == SIZE_MAX: + rc = -1 + break + + # copy value from original tree to new tree + orig_value_ptr = orig_tree.value + value_stride * orig_node_id + new_value_ptr = tree.value + value_stride * new_node_id + memcpy(new_value_ptr, orig_value_ptr, sizeof(double) * value_stride) + + if not is_leaf: + # Push right child on stack + rc = stack.push( + node.right_child, 0, depth + 1, new_node_id, 0, 0.0, 0) + if rc == -1: + break + + # push left child on stack + rc = stack.push( + node.left_child, 0, depth + 1, new_node_id, 1, 0.0, 0) + if rc == -1: + break + + if depth > max_depth_seen: + max_depth_seen = depth + + if rc >= 0: + tree.max_depth = max_depth_seen + if rc == -1: + raise MemoryError("pruning tree") diff --git a/causalml/inference/tree/_tree/_utils.pxd b/causalml/inference/tree/_tree/_utils.pxd new file mode 100644 index 00000000..098a0028 --- /dev/null +++ b/causalml/inference/tree/_tree/_utils.pxd @@ -0,0 +1,121 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Arnaud Joly +# Jacob Schreiber +# Nelson Liu +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +import numpy as np +cimport numpy as cnp +from ._tree cimport Node + +ctypedef cnp.npy_float32 DTYPE_t # Type of X +ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight +ctypedef cnp.npy_intp SIZE_t # Type for indices and counters +ctypedef cnp.npy_int32 INT32_t # Signed 32 bit integer +ctypedef cnp.npy_uint32 UINT32_t # Unsigned 32 bit integer + + +cdef enum: + # Max value for our rand_r replacement (near the bottom). + # We don't use RAND_MAX because it's different across platforms and + # particularly tiny on Windows/MSVC. + RAND_R_MAX = 0x7FFFFFFF + + +# safe_realloc(&p, n) resizes the allocation of p to n * sizeof(*p) bytes or +# raises a MemoryError. It never calls free, since that's __dealloc__'s job. +# cdef DTYPE_t *p = NULL +# safe_realloc(&p, n) +# is equivalent to p = malloc(n * sizeof(*p)) with error checking. +ctypedef fused realloc_ptr: + # Add pointer types here as needed. + (DTYPE_t*) + (SIZE_t*) + (unsigned char*) + (DOUBLE_t*) + (DOUBLE_t**) + (Node*) + (Node**) + (StackRecord*) + (PriorityHeapRecord*) + +cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) nogil except * + + +cdef SIZE_t rand_int(SIZE_t low, SIZE_t high, + UINT32_t* random_state) nogil + + +cdef double rand_uniform(double low, double high, + UINT32_t* random_state) nogil + + +cdef cnp.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size) + + +cdef double log(double x) nogil + +# ============================================================================= +# Stack data structure +# ============================================================================= + +# A record on the stack for depth-first tree growing +cdef struct StackRecord: + SIZE_t start + SIZE_t end + SIZE_t depth + SIZE_t parent + bint is_left + double impurity + SIZE_t n_constant_features + +cdef class Stack: + cdef SIZE_t capacity + cdef SIZE_t top + cdef StackRecord* stack_ + + cdef bint is_empty(self) nogil + cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent, + bint is_left, double impurity, + SIZE_t n_constant_features) nogil except -1 + cdef int pop(self, StackRecord* res) nogil + + +# ============================================================================= +# PriorityHeap data structure +# ============================================================================= + +# A record on the frontier for best-first tree growing +cdef struct PriorityHeapRecord: + SIZE_t node_id + SIZE_t start + SIZE_t end + SIZE_t pos + SIZE_t depth + bint is_leaf + double impurity + double impurity_left + double impurity_right + double improvement + +cdef class PriorityHeap: + cdef SIZE_t capacity + cdef SIZE_t heap_ptr + cdef PriorityHeapRecord* heap_ + + cdef bint is_empty(self) nogil + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, SIZE_t heap_length) nogil + cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, + SIZE_t depth, bint is_leaf, double improvement, + double impurity, double impurity_left, + double impurity_right) nogil except -1 + cdef int pop(self, PriorityHeapRecord* res) nogil diff --git a/causalml/inference/tree/_tree/_utils.pyx b/causalml/inference/tree/_tree/_utils.pyx new file mode 100755 index 00000000..4b69bc23 --- /dev/null +++ b/causalml/inference/tree/_tree/_utils.pyx @@ -0,0 +1,311 @@ +# Authors: Gilles Louppe +# Peter Prettenhofer +# Arnaud Joly +# Jacob Schreiber +# Nelson Liu +# +# +# License: BSD 3 clause + +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 +# cython: linetrace=True + +from libc.stdlib cimport free +from libc.stdlib cimport malloc +from libc.stdlib cimport realloc +from libc.math cimport log as ln + +import numpy as np +cimport numpy as cnp +cnp.import_array() + +cdef inline UINT32_t DEFAULT_SEED = 1 + +# ============================================================================= +# Helper functions +# ============================================================================= + +# rand_r replacement using a 32bit XorShift generator +# See http://www.jstatsoft.org/v08/i14/paper for details +cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: + """Generate a pseudo-random np.uint32 from a np.uint32 seed""" + # seed shouldn't ever be 0. + if (seed[0] == 0): seed[0] = DEFAULT_SEED + + seed[0] ^= (seed[0] << 13) + seed[0] ^= (seed[0] >> 17) + seed[0] ^= (seed[0] << 5) + + # Note: we must be careful with the final line cast to np.uint32 so that + # the function behaves consistently across platforms. + # + # The following cast might yield different results on different platforms: + # wrong_cast = RAND_R_MAX + 1 + # + # We can use: + # good_cast = (RAND_R_MAX + 1) + # or: + # cdef np.uint32_t another_good_cast = RAND_R_MAX + 1 + return seed[0] % (RAND_R_MAX + 1) + + +cdef realloc_ptr safe_realloc(realloc_ptr* p, size_t nelems) nogil except *: + # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython + # 0.20.1 to crash. + cdef size_t nbytes = nelems * sizeof(p[0][0]) + if nbytes / sizeof(p[0][0]) != nelems: + # Overflow in the multiplication + with gil: + raise MemoryError("could not allocate (%d * %d) bytes" + % (nelems, sizeof(p[0][0]))) + cdef realloc_ptr tmp = realloc(p[0], nbytes) + if tmp == NULL: + with gil: + raise MemoryError("could not allocate %d bytes" % nbytes) + + p[0] = tmp + return tmp # for convenience + + +def _realloc_test(): + # Helper for tests. Tries to allocate (-1) / 2 * sizeof(size_t) + # bytes, which will always overflow. + cdef SIZE_t* p = NULL + safe_realloc(&p, (-1) / 2) + if p != NULL: + free(p) + assert False + + +cdef inline cnp.ndarray sizet_ptr_to_ndarray(SIZE_t* data, SIZE_t size): + """Return copied data as 1D numpy array of intp's.""" + cdef cnp.npy_intp shape[1] + shape[0] = size + return cnp.PyArray_SimpleNewFromData(1, shape, cnp.NPY_INTP, data).copy() + + +cdef inline SIZE_t rand_int(SIZE_t low, SIZE_t high, + UINT32_t* random_state) nogil: + """Generate a random integer in [low; end).""" + return low + our_rand_r(random_state) % (high - low) + + +cdef inline double rand_uniform(double low, double high, + UINT32_t* random_state) nogil: + """Generate a random double in [low; high).""" + return ((high - low) * our_rand_r(random_state) / + RAND_R_MAX) + low + + +cdef inline double log(double x) nogil: + return ln(x) / ln(2.0) + + +# ============================================================================= +# Stack data structure +# ============================================================================= + +cdef class Stack: + """A LIFO data structure. + + Attributes + ---------- + capacity : SIZE_t + The elements the stack can hold; if more added then ``self.stack_`` + needs to be resized. + + top : SIZE_t + The number of elements currently on the stack. + + stack : StackRecord pointer + The stack of records (upward in the stack corresponds to the right). + """ + + def __cinit__(self, SIZE_t capacity): + self.capacity = capacity + self.top = 0 + self.stack_ = malloc(capacity * sizeof(StackRecord)) + + def __dealloc__(self): + free(self.stack_) + + cdef bint is_empty(self) nogil: + return self.top <= 0 + + cdef int push(self, SIZE_t start, SIZE_t end, SIZE_t depth, SIZE_t parent, + bint is_left, double impurity, + SIZE_t n_constant_features) nogil except -1: + """Push a new element onto the stack. + + Return -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + cdef SIZE_t top = self.top + cdef StackRecord* stack = NULL + + # Resize if capacity not sufficient + if top >= self.capacity: + self.capacity *= 2 + # Since safe_realloc can raise MemoryError, use `except -1` + safe_realloc(&self.stack_, self.capacity) + + stack = self.stack_ + stack[top].start = start + stack[top].end = end + stack[top].depth = depth + stack[top].parent = parent + stack[top].is_left = is_left + stack[top].impurity = impurity + stack[top].n_constant_features = n_constant_features + + # Increment stack pointer + self.top = top + 1 + return 0 + + cdef int pop(self, StackRecord* res) nogil: + """Remove the top element from the stack and copy to ``res``. + + Returns 0 if pop was successful (and ``res`` is set); -1 + otherwise. + """ + cdef SIZE_t top = self.top + cdef StackRecord* stack = self.stack_ + + if top <= 0: + return -1 + + res[0] = stack[top - 1] + self.top = top - 1 + + return 0 + + +# ============================================================================= +# PriorityHeap data structure +# ============================================================================= + +cdef class PriorityHeap: + """A priority queue implemented as a binary heap. + + The heap invariant is that the impurity improvement of the parent record + is larger then the impurity improvement of the children. + + Attributes + ---------- + capacity : SIZE_t + The capacity of the heap + + heap_ptr : SIZE_t + The water mark of the heap; the heap grows from left to right in the + array ``heap_``. The following invariant holds ``heap_ptr < capacity``. + + heap_ : PriorityHeapRecord* + The array of heap records. The maximum element is on the left; + the heap grows from left to right + """ + + def __cinit__(self, SIZE_t capacity): + self.capacity = capacity + self.heap_ptr = 0 + safe_realloc(&self.heap_, capacity) + + def __dealloc__(self): + free(self.heap_) + + cdef bint is_empty(self) nogil: + return self.heap_ptr <= 0 + + cdef void heapify_up(self, PriorityHeapRecord* heap, SIZE_t pos) nogil: + """Restore heap invariant parent.improvement > child.improvement from + ``pos`` upwards. """ + if pos == 0: + return + + cdef SIZE_t parent_pos = (pos - 1) / 2 + + if heap[parent_pos].improvement < heap[pos].improvement: + heap[parent_pos], heap[pos] = heap[pos], heap[parent_pos] + self.heapify_up(heap, parent_pos) + + cdef void heapify_down(self, PriorityHeapRecord* heap, SIZE_t pos, + SIZE_t heap_length) nogil: + """Restore heap invariant parent.improvement > children.improvement from + ``pos`` downwards. """ + cdef SIZE_t left_pos = 2 * (pos + 1) - 1 + cdef SIZE_t right_pos = 2 * (pos + 1) + cdef SIZE_t largest = pos + + if (left_pos < heap_length and + heap[left_pos].improvement > heap[largest].improvement): + largest = left_pos + + if (right_pos < heap_length and + heap[right_pos].improvement > heap[largest].improvement): + largest = right_pos + + if largest != pos: + heap[pos], heap[largest] = heap[largest], heap[pos] + self.heapify_down(heap, largest, heap_length) + + cdef int push(self, SIZE_t node_id, SIZE_t start, SIZE_t end, SIZE_t pos, + SIZE_t depth, bint is_leaf, double improvement, + double impurity, double impurity_left, + double impurity_right) nogil except -1: + """Push record on the priority heap. + + Return -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + cdef SIZE_t heap_ptr = self.heap_ptr + cdef PriorityHeapRecord* heap = NULL + + # Resize if capacity not sufficient + if heap_ptr >= self.capacity: + self.capacity *= 2 + # Since safe_realloc can raise MemoryError, use `except -1` + safe_realloc(&self.heap_, self.capacity) + + # Put element as last element of heap + heap = self.heap_ + heap[heap_ptr].node_id = node_id + heap[heap_ptr].start = start + heap[heap_ptr].end = end + heap[heap_ptr].pos = pos + heap[heap_ptr].depth = depth + heap[heap_ptr].is_leaf = is_leaf + heap[heap_ptr].impurity = impurity + heap[heap_ptr].impurity_left = impurity_left + heap[heap_ptr].impurity_right = impurity_right + heap[heap_ptr].improvement = improvement + + # Heapify up + self.heapify_up(heap, heap_ptr) + + # Increase element count + self.heap_ptr = heap_ptr + 1 + return 0 + + cdef int pop(self, PriorityHeapRecord* res) nogil: + """Remove max element from the heap. """ + cdef SIZE_t heap_ptr = self.heap_ptr + cdef PriorityHeapRecord* heap = self.heap_ + + if heap_ptr <= 0: + return -1 + + # Take first element + res[0] = heap[0] + + # Put last element to the front + heap[0], heap[heap_ptr - 1] = heap[heap_ptr - 1], heap[0] + + # Restore heap invariant + if heap_ptr > 1: + self.heapify_down(heap, 0, heap_ptr - 1) + + self.heap_ptr = heap_ptr - 1 + + return 0 diff --git a/causalml/inference/tree/causal/__init__.py b/causalml/inference/tree/causal/__init__.py old mode 100644 new mode 100755 diff --git a/causalml/inference/tree/causal/_builder.pxd b/causalml/inference/tree/causal/_builder.pxd old mode 100644 new mode 100755 index f3b8bef0..12bf7e58 --- a/causalml/inference/tree/causal/_builder.pxd +++ b/causalml/inference/tree/causal/_builder.pxd @@ -4,11 +4,11 @@ # cython: language_level=3 # cython: linetrace=True -from sklearn.tree._tree cimport Node, Tree, TreeBuilder -from sklearn.tree._tree cimport Splitter, SplitRecord -from sklearn.tree._utils cimport StackRecord, Stack -from sklearn.tree._utils cimport PriorityHeapRecord, PriorityHeap -from sklearn.tree._tree cimport SIZE_t, DOUBLE_t +from .._tree._tree cimport Node, Tree, TreeBuilder +from .._tree._splitter cimport Splitter, SplitRecord +from .._tree._utils cimport StackRecord, Stack +from .._tree._utils cimport PriorityHeapRecord, PriorityHeap +from .._tree._tree cimport SIZE_t, DOUBLE_t cdef struct FrontierRecord: @@ -24,4 +24,4 @@ cdef struct FrontierRecord: double impurity double impurity_left double impurity_right - double improvement \ No newline at end of file + double improvement diff --git a/causalml/inference/tree/causal/_builder.pyx b/causalml/inference/tree/causal/_builder.pyx old mode 100644 new mode 100755 index 5ef59873..3c9d32ef --- a/causalml/inference/tree/causal/_builder.pyx +++ b/causalml/inference/tree/causal/_builder.pyx @@ -58,7 +58,7 @@ cdef class DepthFirstCausalTreeBuilder(TreeBuilder): cdef int init_capacity if tree.max_depth <= 10: - init_capacity = int(2 ** (tree.max_depth + 1)) - 1 + init_capacity = (2 ** (tree.max_depth + 1)) - 1 else: init_capacity = 2047 diff --git a/causalml/inference/tree/causal/_criterion.pxd b/causalml/inference/tree/causal/_criterion.pxd old mode 100644 new mode 100755 index 4a1c5764..ebc05055 --- a/causalml/inference/tree/causal/_criterion.pxd +++ b/causalml/inference/tree/causal/_criterion.pxd @@ -5,8 +5,8 @@ # cython: linetrace=True -from sklearn.tree._criterion cimport RegressionCriterion -from sklearn.tree._criterion cimport SIZE_t, DOUBLE_t +from .._tree._criterion cimport RegressionCriterion +from .._tree._criterion cimport SIZE_t, DOUBLE_t cdef struct NodeInfo: @@ -23,4 +23,4 @@ cdef struct NodeInfo: cdef struct SplitState: NodeInfo node # current node state NodeInfo right # right split state - NodeInfo left # left split state \ No newline at end of file + NodeInfo left # left split state diff --git a/causalml/inference/tree/causal/_criterion.pyx b/causalml/inference/tree/causal/_criterion.pyx old mode 100644 new mode 100755 index 3a5a413b..4ab9366f --- a/causalml/inference/tree/causal/_criterion.pyx +++ b/causalml/inference/tree/causal/_criterion.pyx @@ -213,6 +213,7 @@ cdef class CausalRegressionCriterion(RegressionCriterion): """Compute penalty for the sample size difference between groups""" return self.groups_penalty * fabs(tr_count- ct_count) + cdef class StandardMSE(CausalRegressionCriterion): """ Standard MSE with treatment effect estimates diff --git a/causalml/inference/tree/causal/_tree.py b/causalml/inference/tree/causal/_tree.py old mode 100644 new mode 100755 index bc319590..542f8bae --- a/causalml/inference/tree/causal/_tree.py +++ b/causalml/inference/tree/causal/_tree.py @@ -4,15 +4,16 @@ from math import ceil import numpy as np -from sklearn.tree._classes import CRITERIA_REG -from sklearn.tree._classes import DTYPE, DOUBLE -from sklearn.tree._classes import SPARSE_SPLITTERS, DENSE_SPLITTERS -from sklearn.tree._classes import Tree, BaseDecisionTree -from sklearn.tree._classes import issparse, check_random_state -from sklearn.tree._criterion import Criterion -from sklearn.tree._splitter import Splitter +from scipy.sparse import issparse +from sklearn.utils import check_random_state from sklearn.utils.validation import _check_sample_weight +from .._tree._classes import DTYPE, DOUBLE +from .._tree._classes import SPARSE_SPLITTERS, DENSE_SPLITTERS +from .._tree._classes import Tree, BaseDecisionTree +from .._tree._criterion import Criterion +from .._tree._splitter import Splitter + from ._builder import DepthFirstCausalTreeBuilder, BestFirstCausalTreeBuilder from ._criterion import StandardMSE, CausalMSE, TTest @@ -21,7 +22,6 @@ "standard_mse": StandardMSE, "t_test": TTest, } -CRITERIA_REG.update(CAUSAL_TREES_CRITERIA) class BaseCausalDecisionTree(BaseDecisionTree): @@ -193,7 +193,9 @@ def fit( # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): - criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + criterion = CAUSAL_TREES_CRITERIA[self.criterion]( + self.n_outputs_, n_samples + ) criterion.eps = self.eps criterion.groups_penalty = self.groups_penalty else: diff --git a/causalml/inference/tree/causal/causalforest.py b/causalml/inference/tree/causal/causalforest.py old mode 100644 new mode 100755 index 527762db..77992ced --- a/causalml/inference/tree/causal/causalforest.py +++ b/causalml/inference/tree/causal/causalforest.py @@ -1,23 +1,33 @@ from typing import Union + import numpy as np import forestci as fci from joblib import Parallel, delayed from warnings import catch_warnings, simplefilter, warn -from sklearn.ensemble._forest import DOUBLE, DTYPE, MAX_INT + from sklearn.exceptions import DataConversionWarning -from sklearn.ensemble._forest import compute_sample_weight, issparse -from sklearn.utils.fixes import _joblib_parallel_args from sklearn.utils.validation import check_random_state, _check_sample_weight from sklearn.utils.multiclass import type_of_target -from sklearn.ensemble._forest import ( - ForestRegressor, - RandomForestRegressor, - ExtraTreesRegressor, -) +from sklearn import __version__ as sklearn_version +from sklearn.ensemble._forest import DOUBLE, DTYPE, MAX_INT +from sklearn.ensemble._forest import ForestRegressor +from sklearn.ensemble._forest import compute_sample_weight, issparse from sklearn.ensemble._forest import _generate_sample_indices, _get_n_samples_bootstrap from .causaltree import CausalTreeRegressor +try: + from packaging.version import parse as Version +except ModuleNotFoundError: + from distutils.version import LooseVersion as Version + +if Version(sklearn_version) >= Version("1.1.0"): + _joblib_parallel_args = dict(prefer="threads") +else: + from sklearn.utils.fixes import _joblib_parallel_args + + _joblib_parallel_args = _joblib_parallel_args(prefer="threads") + def _parallel_build_trees( tree, @@ -139,12 +149,18 @@ def __init__( to train each base estimator. groups_cnt: (bool), count treatment and control groups for each node/leaf """ - super().__init__( - base_estimator=CausalTreeRegressor( - control_name=control_name, criterion=criterion, groups_cnt=groups_cnt - ), - n_estimators=n_estimators, - estimator_params=( + self._estimator = CausalTreeRegressor( + control_name=control_name, criterion=criterion, groups_cnt=groups_cnt + ) + _estimator_key = ( + "estimator" + if Version(sklearn_version) >= Version("1.2.0") + else "base_estimator" + ) + _parent_args = { + _estimator_key: self._estimator, + "n_estimators": n_estimators, + "estimator_params": ( "criterion", "control_name", "max_depth", @@ -158,14 +174,16 @@ def __init__( "min_samples_leaf", "random_state", ), - bootstrap=bootstrap, - oob_score=oob_score, - n_jobs=n_jobs, - random_state=random_state, - verbose=verbose, - warm_start=warm_start, - max_samples=max_samples, - ) + "bootstrap": bootstrap, + "oob_score": oob_score, + "n_jobs": n_jobs, + "random_state": random_state, + "verbose": verbose, + "warm_start": warm_start, + "max_samples": max_samples, + } + + super().__init__(**_parent_args) self.criterion = criterion self.control_name = control_name @@ -277,22 +295,6 @@ def _fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None): # Check parameters self._validate_estimator() - # TODO: Remove in v1.2 - if isinstance(self, (RandomForestRegressor, ExtraTreesRegressor)): - if self.criterion == "mse": - warn( - "Criterion 'mse' was deprecated in v1.0 and will be " - "removed in version 1.2. Use `criterion='squared_error'` " - "which is equivalent.", - FutureWarning, - ) - elif self.criterion == "mae": - warn( - "Criterion 'mae' was deprecated in v1.0 and will be " - "removed in version 1.2. Use `criterion='absolute_error'` " - "which is equivalent.", - FutureWarning, - ) if not self.bootstrap and self.oob_score: raise ValueError("Out of bag estimation only available if bootstrap=True") @@ -327,11 +329,10 @@ def _fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None): self._make_estimator(append=False, random_state=random_state) for i in range(n_more_estimators) ] - trees = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, - **_joblib_parallel_args(prefer="threads"), + **_joblib_parallel_args, )( delayed(_parallel_build_trees)( t, @@ -377,7 +378,7 @@ def fit(self, X: np.ndarray, treatment: np.ndarray, y: np.ndarray): Returns: self """ - X, y, w = self.base_estimator._prepare_data(X=X, y=y, treatment=treatment) + X, y, w = self._estimator._prepare_data(X=X, y=y, treatment=treatment) return self._fit(X=X, y=y, sample_weight=w) def predict(self, X: np.ndarray, with_outcomes: bool = False) -> np.ndarray: diff --git a/causalml/inference/tree/causal/causaltree.py b/causalml/inference/tree/causal/causaltree.py old mode 100644 new mode 100755 index bcf0b697..5e6c8180 --- a/causalml/inference/tree/causal/causaltree.py +++ b/causalml/inference/tree/causal/causaltree.py @@ -2,16 +2,18 @@ import sys from typing import Union -import numpy as np import tqdm +import numpy as np +from numpy import float32 as DTYPE + from pathos.pools import ProcessPool as PPool from scipy.stats import norm from sklearn.base import RegressorMixin -from sklearn.tree._tree import DTYPE from sklearn.utils import check_array from sklearn.utils.validation import check_is_fitted from causalml.inference.meta.utils import check_treatment_vector + from ._tree import BaseCausalDecisionTree from ..utils import get_tree_leaves_mask, timeit diff --git a/pyproject.toml b/pyproject.toml old mode 100644 new mode 100755 index 98f2542d..c4c0586c --- a/pyproject.toml +++ b/pyproject.toml @@ -26,11 +26,11 @@ dependencies = [ "forestci==0.6", "pathos==0.2.9", "pip>=10.0", - "numpy", + "numpy>=1.18.5", "scipy>=1.4.1", "matplotlib", - "pandas>=0.24.1", - "scikit-learn<=1.0.2", + "pandas>=0.24.1,<1.4.0", + "scikit-learn>=0.22.0", "statsmodels>=0.9.0", "Cython<=0.29.34", "seaborn", @@ -62,8 +62,8 @@ requires = [ "setuptools>=18.0", "wheel", "Cython<=0.29.34", - "numpy", - "scikit-learn<=1.0.2", + "numpy>=1.18.5", + "scikit-learn>=0.22.0", ] [project.urls] diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index bf67e9ac..6fcfb629 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +import multiprocessing as mp from setuptools import dist, setup, find_packages from setuptools.extension import Extension @@ -15,34 +16,33 @@ dist.Distribution().fetch_build_eggs(["numpy"]) from numpy import get_include as np_get_include +# fmt: off +cython_modules = [ + ("causalml.inference.tree._tree._tree", "causalml/inference/tree/_tree/_tree.pyx"), + ("causalml.inference.tree._tree._criterion", "causalml/inference/tree/_tree/_criterion.pyx"), + ("causalml.inference.tree._tree._splitter", "causalml/inference/tree/_tree/_splitter.pyx"), + ("causalml.inference.tree._tree._utils", "causalml/inference/tree/_tree/_utils.pyx"), + ("causalml.inference.tree.causal._criterion", "causalml/inference/tree/causal/_criterion.pyx"), + ("causalml.inference.tree.causal._builder", "causalml/inference/tree/causal/_builder.pyx"), + ("causalml.inference.tree.uplift", "causalml/inference/tree/uplift.pyx"), +] +# fmt: on + extensions = [ Extension( - "causalml.inference.tree.causal._criterion", - ["causalml/inference/tree/causal/_criterion.pyx"], - libraries=[], - include_dirs=[np_get_include()], - extra_compile_args=["-O3"], - ), - Extension( - "causalml.inference.tree.causal._builder", - ["causalml/inference/tree/causal/_builder.pyx"], - libraries=[], - include_dirs=[np_get_include()], - extra_compile_args=["-O3"], - ), - Extension( - "causalml.inference.tree.uplift", - ["causalml/inference/tree/uplift.pyx"], + name, + [source], libraries=[], include_dirs=[np_get_include()], extra_compile_args=["-O3"], - ), + ) + for name, source in cython_modules ] packages = find_packages(exclude=["tests", "tests.*"]) setup( packages=packages, - ext_modules=cythonize(extensions, annotate=True), + ext_modules=cythonize(extensions, annotate=True, nthreads=mp.cpu_count()), include_dirs=[np_get_include()], )