ENH: PCA via Array API based on #2096 (#2106)

Status: Draft. Wants to merge 44 commits into base: main.

Commits (44)
8bedde1  ENH: array api dispatching (samir-nasibli, Oct 2, 2024)
b11fcf3  Deselect some scikit-learn Array API tests (samir-nasibli, Oct 4, 2024)
467634a  Merge branch 'intel:main' into enh/array_api_dispatching (samir-nasibli, Oct 4, 2024)
31030f7  Merge branch 'intel:main' into enh/array_api_dispatching (samir-nasibli, Oct 8, 2024)
943796e  deselect more tests (samir-nasibli, Oct 8, 2024)
ef42daa  deselect more tests (samir-nasibli, Oct 8, 2024)
3bc755d  disabled tests for (samir-nasibli, Oct 8, 2024)
76f1876  fix the deselection comment (samir-nasibli, Oct 8, 2024)
ce0b8e1  disabled test for Ridge regression (samir-nasibli, Oct 8, 2024)
404e8c0  Disabled tests and added comment (samir-nasibli, Oct 8, 2024)
ced43bf  ENH: Array API dispatching (samir-nasibli, Oct 8, 2024)
968365f  Merge branch 'intel:main' into enh/array_api_dispatching_testing (samir-nasibli, Oct 9, 2024)
c395d03  Revert adding dpctl into Array PI conformance testing (samir-nasibli, Oct 9, 2024)
9271479  Merge branch 'enh/array_api_dispatching_testing' of https://github.co… (samir-nasibli, Oct 9, 2024)
5784c25  minor refactoring onedal _array_api (samir-nasibli, Oct 9, 2024)
8d7f664  add tests (samir-nasibli, Oct 9, 2024)
63d8f30  addressed memory usage tests (samir-nasibli, Oct 9, 2024)
6bd0280  Address some array api test fails (samir-nasibli, Oct 9, 2024)
90411e7  linting (samir-nasibli, Oct 9, 2024)
2b7bbc5  addressed test_get_namespace (samir-nasibli, Oct 9, 2024)
b7b8f03  adding test case for validate_data check with Array API inputs (samir-nasibli, Oct 9, 2024)
169009d  minor refactoring (samir-nasibli, Oct 9, 2024)
9ca118c  addressed test_patch_map_match fail (samir-nasibli, Oct 9, 2024)
7ddcf40  Added docstrings for get_namespace (samir-nasibli, Oct 9, 2024)
ec90d43  docstrings for Array API tests (samir-nasibli, Oct 9, 2024)
6e7e547  updated minimal scikit-learn version for Array API dispatching (samir-nasibli, Oct 9, 2024)
e5db839  updated minimal scikit-learn version for Array API dispatching in _de… (samir-nasibli, Oct 9, 2024)
f99a92b  fix test test_get_namespace_with_config_context (samir-nasibli, Oct 9, 2024)
4e3286a  Merge branch 'main' into enh/array_api_dispatching_testing (samir-nasibli, Oct 10, 2024)
bc10579  ENH: DBSCAN via Array API (samir-nasibli, Oct 10, 2024)
1cb07a2  refactor onedal/datatypes/_data_conversion.py (samir-nasibli, Oct 11, 2024)
acf5689  minor fix (samir-nasibli, Oct 11, 2024)
9b2a8e9  minor update (samir-nasibli, Oct 11, 2024)
cfd91ea  added _check_sample_weight via Array API (samir-nasibli, Oct 13, 2024)
6f242ec  ENH: PCA via Array API based on #2096 (samir-nasibli, Oct 13, 2024)
eb14aec  Merge branch 'main' into enh/pca_array_api (samir-nasibli, Oct 13, 2024)
3052e91  Linting (samir-nasibli, Oct 13, 2024)
53fae8d  correction for array api (samir-nasibli, Oct 13, 2024)
5b7638c  returned relative import for _is_csr (samir-nasibli, Oct 13, 2024)
aa5c38e  Merge branch 'intel:main' into enh/pca_array_api (samir-nasibli, Oct 14, 2024)
3850b45  Merge branch 'intel:main' into enh/pca_array_api (samir-nasibli, Oct 15, 2024)
154f61f  Merge branch 'intel:main' into enh/pca_array_api (samir-nasibli, Oct 18, 2024)
fb1059f  Merge branch 'intel:main' into enh/pca_array_api (samir-nasibli, Oct 19, 2024)
641318e  Merge branch 'main' into enh/pca_array_api (samir-nasibli, Oct 22, 2024)
Files changed (4)
18 changes: 11 additions & 7 deletions onedal/datatypes/_data_conversion.py
@@ -31,10 +31,14 @@
 import dpctl.tensor as dpt


-def _apply_and_pass(func, *args):
+def _apply_and_pass(func, *args, **kwargs):
     if len(args) == 1:
-        return func(args[0])
-    return tuple(map(func, args))
+        return func(args[0], **kwargs) if len(kwargs) > 0 else func(args[0])
+    return (
+        tuple(func(arg, **kwargs) for arg in args)
+        if len(kwargs) > 0
+        else tuple(func(arg) for arg in args)
+    )


 def from_table(*args):
@@ -58,7 +62,7 @@ def to_table(*args):
 if _is_dpc_backend:
     from ..common._policy import _HostInteropPolicy

-    def _convert_to_supported(policy, *data):
+    def _convert_to_supported(policy, *data, xp=np):
         def func(x):
             return x
@@ -70,13 +74,13 @@ def func(x):
         device = policy._queue.sycl_device

         def convert_or_pass(x):
-            if (x is not None) and (x.dtype == np.float64):
+            if (x is not None) and (x.dtype == xp.float64):
                 warnings.warn(
                     "Data will be converted into float32 from "
                     "float64 because device does not support it",
                     RuntimeWarning,
                 )
-                return x.astype(np.float32)
+                return xp.astype(x, xp.float32)
             else:
                 return x
@@ -87,7 +91,7 @@ def convert_or_pass(x):
 else:

-    def _convert_to_supported(policy, *data):
+    def _convert_to_supported(policy, *data, xp=np):
         def func(x):
             return x
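For context, a minimal standalone sketch of the updated `_apply_and_pass` keyword-forwarding behavior. The `as_dtype` converter and the usage lines are hypothetical illustrations, not code from the PR:

import numpy as np


def _apply_and_pass(func, *args, **kwargs):
    # One positional argument: return the converted value itself, not a 1-tuple.
    if len(args) == 1:
        return func(args[0], **kwargs) if len(kwargs) > 0 else func(args[0])
    # Several arguments: convert each, forwarding keyword arguments
    # (e.g. xp=<array namespace>) to every call, and return a tuple.
    return (
        tuple(func(arg, **kwargs) for arg in args)
        if len(kwargs) > 0
        else tuple(func(arg) for arg in args)
    )


def as_dtype(x, xp=np, dtype=np.float32):
    # Hypothetical stand-in for one of oneDAL's conversion callbacks.
    return xp.asarray(x, dtype=dtype)


a = np.ones(3, dtype=np.float64)
b = np.zeros(3, dtype=np.float64)
single = _apply_and_pass(as_dtype, a, xp=np)   # one array in -> one array out
pair = _apply_and_pass(as_dtype, a, b, xp=np)  # two arrays in -> tuple out
print(single.dtype, pair[0].dtype, pair[1].dtype)  # float32 float32 float32

Defaulting `xp=np` in `_convert_to_supported` keeps the non-DPC build path behaving exactly as before.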
2 changes: 2 additions & 0 deletions onedal/decomposition/incremental_pca.py
@@ -23,6 +23,8 @@
 from .pca import BasePCA


+# TODO:
+# update for BasePCA.
 class IncrementalPCA(BasePCA):
     """
     Incremental estimator for PCA based on oneDAL implementation.
116 changes: 89 additions & 27 deletions onedal/decomposition/pca.py
@@ -23,6 +23,12 @@

 from ..common._base import BaseEstimator
 from ..datatypes import _convert_to_supported, from_table, to_table
+from ..utils._array_api import (
+    _asarray,
+    _convert_to_numpy,
+    get_namespace,
+    sklearn_array_api_dispatch,
+)


 class BasePCA(BaseEstimator, metaclass=ABCMeta):
@@ -42,13 +48,13 @@ def __init__(
         self.is_deterministic = is_deterministic
         self.whiten = whiten

-    def _get_onedal_params(self, data, stage=None):
+    def _get_onedal_params(self, data, xp, stage=None):
         if stage is None:
             n_components = self._resolve_n_components_for_training(data.shape)
         elif stage == "predict":
             n_components = self.n_components_
         return {
-            "fptype": "float" if data.dtype == np.float32 else "double",
+            "fptype": "float" if data.dtype == xp.float32 else "double",
             "method": self.method,
             "n_components": n_components,
             "is_deterministic": self.is_deterministic,
@@ -95,77 +101,125 @@ def _resolve_n_components_for_result(self, shape_tuple):
         elif self.n_components == "mle":
             return _infer_dimension(self.explained_variance_, shape_tuple[0])
         elif 0.0 < self.n_components < 1.0:
+            # TODO:
+            # check for Array API.
             ratio_cumsum = stable_cumsum(self.explained_variance_ratio_)
             return np.searchsorted(ratio_cumsum, self.n_components, side="right") + 1
         elif isinstance(self.n_components, float) and self.n_components == 1.0:
             return min(shape_tuple)
         else:
             return self.n_components

-    def _compute_noise_variance(self, n_components, n_sf_min):
+    def _compute_noise_variance(self, xp, n_components, n_sf_min):
         if n_components < n_sf_min:
             if len(self.explained_variance_) == n_sf_min:
                 return self.explained_variance_[n_components:].mean()
             elif len(self.explained_variance_) < n_sf_min:
                 # TODO Rename variances_ to var_ to align with sklearn/sklearnex IncrementalPCA
+                # TODO:
+                # check xp.sum for Array API.
                 if hasattr(self, "variances_"):
-                    resid_var = self.variances_.sum()
+                    resid_var = xp.sum(self.variances_)
                 elif hasattr(self, "var_"):
-                    resid_var = self.var_.sum()
+                    resid_var = xp.sum(self.var_)

-                resid_var -= self.explained_variance_.sum()
+                resid_var -= xp.sum(self.explained_variance_)
                 return resid_var / (n_sf_min - n_components)
         else:
             return 0.0

-    def _create_model(self):
+    def _create_model(self, xp):
         m = self._get_backend("decomposition", "dim_reduction", "model")
-        m.eigenvectors = to_table(self.components_)
-        m.means = to_table(self.mean_)
+        m.eigenvectors = to_table(_convert_to_numpy(self.components_, xp=xp))
+        m.means = to_table(_convert_to_numpy(self.mean_, xp=xp))
         if self.whiten:
-            m.eigenvalues = to_table(self.explained_variance_)
+            m.eigenvalues = to_table(_convert_to_numpy(self.explained_variance_, xp=xp))
         self._onedal_model = m
         return m

-    def predict(self, X, queue=None):
+    def _predict(self, X, xp, queue=None):
         policy = self._get_policy(queue, X)
-        model = self._create_model()
-        X = _convert_to_supported(policy, X)
-        params = self._get_onedal_params(X, stage="predict")
+        model = self._create_model(xp)
+        X = _convert_to_supported(policy, X, xp=xp)
+        params = self._get_onedal_params(X, xp, stage="predict")

         result = self._get_backend(
-            "decomposition", "dim_reduction", "infer", policy, params, model, to_table(X)
+            "decomposition",
+            "dim_reduction",
+            "infer",
+            policy,
+            params,
+            model,
+            to_table(_convert_to_numpy(X, xp=xp)),
         )
-        return from_table(result.transformed_data)
+        # Since `from_table` data management is enabled only for numpy host data,
+        # copy the numpy host output into an array of the xp namespace.
+        return _asarray(from_table(result.transformed_data), xp=xp, sycl_queue=queue)
+
+    def predict(self, X, queue=None):
+        xp, is_array_api_compliant = get_namespace(X)
+        # TODO: update for queue getting.
+        queue = X.sycl_queue
+        return self._predict(X, xp, queue=queue)


 class PCA(BasePCA):

-    def fit(self, X, y=None, queue=None):
+    @sklearn_array_api_dispatch()
+    def _fit(self, X, xp, is_array_api_compliant, y=None, queue=None):
         n_samples, n_features = X.shape
         n_sf_min = min(n_samples, n_features)
         self._validate_n_components(self.n_components, n_samples, n_features)

         policy = self._get_policy(queue, X)
-        # TODO: investigate why np.ndarray with OWNDATA=FALSE flag
-        # fails to be converted to oneDAL table
+        # TODO:
+        # check whether this copy is a numpy-only issue.
         if isinstance(X, np.ndarray) and not X.flags["OWNDATA"]:
             X = X.copy()
-        X = _convert_to_supported(policy, X)
+        X = _convert_to_supported(policy, X, xp=xp)

-        params = self._get_onedal_params(X)
+        params = self._get_onedal_params(X, xp)
         result = self._get_backend(
-            "decomposition", "dim_reduction", "train", policy, params, to_table(X)
+            "decomposition",
+            "dim_reduction",
+            "train",
+            policy,
+            params,
+            to_table(_convert_to_numpy(X, xp=xp)),
         )

-        self.mean_ = from_table(result.means).ravel()
-        self.variances_ = from_table(result.variances)
-        self.components_ = from_table(result.eigenvectors)
-        self.singular_values_ = from_table(result.singular_values).ravel()
-        self.explained_variance_ = np.maximum(from_table(result.eigenvalues).ravel(), 0)
-        self.explained_variance_ratio_ = from_table(
-            result.explained_variances_ratio
-        ).ravel()
+        # Since `from_table` data management is enabled only for numpy host data,
+        # copy the numpy host outputs into arrays of the xp namespace.
+        self.mean_ = _asarray(
+            from_table(result.means).reshape(-1), xp=xp, sycl_queue=queue
+        )
+        self.variances_ = _asarray(from_table(result.variances), xp=xp, sycl_queue=queue)
+        self.components_ = _asarray(
+            from_table(result.eigenvectors), xp=xp, sycl_queue=queue
+        )
+        self.singular_values_ = _asarray(
+            from_table(result.singular_values).reshape(-1), xp=xp, sycl_queue=queue
+        )
+        # TODO:
+        # check elementwise maximum for Array API.
+        self.explained_variance_ = xp.maximum(
+            _asarray(
+                from_table(result.eigenvalues).reshape(-1), xp=xp, sycl_queue=queue
+            ),
+            0,
+        )
+        self.explained_variance_ratio_ = _asarray(
+            from_table(result.explained_variances_ratio).reshape(-1),
+            xp=xp,
+            sycl_queue=queue,
+        )
         self.n_samples_ = n_samples
         self.n_features_ = n_features
@@ -175,12 +229,20 @@ def fit(self, X, y=None, queue=None):

         n_components = self._resolve_n_components_for_result(X.shape)
         self.n_components_ = n_components
-        self.noise_variance_ = self._compute_noise_variance(n_components, n_sf_min)
+        self.noise_variance_ = self._compute_noise_variance(xp, n_components, n_sf_min)

+        # TODO:
+        # check that slicing works across Array API namespaces here.
         if n_components < params["n_components"]:
             self.explained_variance_ = self.explained_variance_[:n_components]
             self.components_ = self.components_[:n_components]
             self.singular_values_ = self.singular_values_[:n_components]
             self.explained_variance_ratio_ = self.explained_variance_ratio_[:n_components]

         return self

+    def fit(self, X, y=None, queue=None):
+        xp, is_array_api_compliant = get_namespace(X)
+        # TODO: update for queue getting.
+        queue = X.sycl_queue
+        return self._fit(X, xp, is_array_api_compliant, y, queue)
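A hedged usage sketch of the new dispatch flow under the draft's current queue handling (`queue = X.sycl_queue` works only for USM-backed inputs such as dpctl tensors). The device filter string, the data, and the constructor keywords below are illustrative assumptions, not confirmed by the PR:

import dpctl
import dpctl.tensor as dpt

from onedal.decomposition.pca import PCA  # import path as in this PR's tree

# Build a small USM-backed input; fit() reads the SYCL queue off X itself.
queue = dpctl.SyclQueue("gpu")
X = dpt.asarray(
    [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]],
    sycl_queue=queue,
)

est = PCA(n_components=1, method="cov", is_deterministic=True, whiten=False)
est.fit(X)  # get_namespace(X) resolves xp to dpctl.tensor; results stay in xp
print(type(est.components_), est.explained_variance_ratio_)

The main behavioral change for consumers is that fitted attributes such as `components_` now come back in the input's namespace via `_asarray` rather than as numpy host arrays.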
17 changes: 17 additions & 0 deletions onedal/decomposition/tests/test_pca.py
@@ -0,0 +1,17 @@
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# TODO:
+# TBD.
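The new test module is only a license header plus a TBD stub. As a purely illustrative sketch (not the author's planned tests), a first conformance check might look like the following; note that it must use a USM-backed namespace while `fit` still reads `X.sycl_queue` directly:

import pytest

dpctl = pytest.importorskip("dpctl")
dpt = pytest.importorskip("dpctl.tensor")

from onedal.decomposition.pca import PCA  # assumed import path


def test_pca_fit_keeps_input_namespace():
    # Illustrative only: fitted attributes should come back as dpctl
    # tensors, not numpy arrays, once Array API support lands.
    queue = dpctl.SyclQueue()
    X = dpt.asarray(
        [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.0, 0.0]], sycl_queue=queue
    )
    est = PCA(n_components=1, method="cov", is_deterministic=True, whiten=False)
    est.fit(X)
    assert isinstance(est.explained_variance_ratio_, dpt.usm_ndarray)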