Merge branch 'main' into enh/pca_array_api
samir-nasibli committed Oct 22, 2024
2 parents fb1059f + c0eb5ad commit 641318e
Showing 34 changed files with 420 additions and 120 deletions.
2 changes: 1 addition & 1 deletion .ci/pipeline/build-and-test-lnx.yml
@@ -55,7 +55,7 @@ steps:
. /usr/share/miniconda/etc/profile.d/conda.sh
conda activate CB
cd ..
-./s/conda-recipe/run_test.sh
+./s/conda-recipe/run_test.sh --json-report
displayName: "Sklearnex testing"
- script: |
. /usr/share/miniconda/etc/profile.d/conda.sh
2 changes: 1 addition & 1 deletion .ci/pipeline/build-and-test-win.yml
@@ -51,7 +51,7 @@ steps:
- script: |
call activate CB
cd ..
-call s\conda-recipe\run_test.bat s\
+call s\conda-recipe\run_test.bat s\ --json-report
displayName: 'Sklearnex testing'
- script: |
call activate CB
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -245,7 +245,7 @@ jobs:
call .\.github\scripts\activate_components.bat ${{ steps.set-env.outputs.DPCFLAG }}
set PYTHON=python
cd ..
-call scikit-learn-intelex\conda-recipe\run_test.bat scikit-learn-intelex\
+call scikit-learn-intelex\conda-recipe\run_test.bat scikit-learn-intelex\ --json-report
- name: Sklearn testing
shell: cmd
run: |
3 changes: 3 additions & 0 deletions .gitignore
@@ -30,3 +30,6 @@ record*

# example .res files
tests/_results*

+# json reports from pytest
+.pytest_reports/*
31 changes: 26 additions & 5 deletions conda-recipe/run_test.bat
@@ -23,10 +23,31 @@ IF NOT DEFINED PYTHON (set "PYTHON=python")

%PYTHON% -c "from sklearnex import patch_sklearn; patch_sklearn()" || set exitcode=1

-%PYTHON% -m pytest --verbose -s %1tests || set exitcode=1
+rem Note: execute with argument --json-report as second argument
+rem in order to produce a JSON report under folder '.pytest_reports'.
+set with_json_report=0
+if "%~2"=="--json-report" (
+    set with_json_report=1
+    mkdir .pytest_reports
+    del /q .pytest_reports\*.json
+)
+
+if "%with_json_report%"=="1" (
+    %PYTHON% -m pytest --verbose -s %1tests --json-report --json-report-file=.pytest_reports\legacy_report.json || set exitcode=1
+    pytest --verbose --pyargs daal4py --json-report --json-report-file=.pytest_reports\daal4py_report.json || set exitcode=1
+    pytest --verbose --pyargs sklearnex --json-report --json-report-file=.pytest_reports\sklearnex_report.json || set exitcode=1
+    pytest --verbose --pyargs onedal --json-report --json-report-file=.pytest_reports\onedal_report.json || set exitcode=1
+    pytest --verbose %1.ci\scripts\test_global_patch.py --json-report --json-report-file=.pytest_reports\global_patching_report.json || set exitcode=1
+    if NOT EXIST .pytest_reports\legacy_report.json (
+        echo "Error: JSON report files failed to be produced."
+        set exitcode=1
+    )
+) else (
+    %PYTHON% -m pytest --verbose -s %1tests || set exitcode=1
+    pytest --verbose --pyargs daal4py || set exitcode=1
+    pytest --verbose --pyargs sklearnex || set exitcode=1
+    pytest --verbose --pyargs onedal || set exitcode=1
+    pytest --verbose %1.ci\scripts\test_global_patch.py || set exitcode=1
+)

-pytest --verbose --pyargs daal4py || set exitcode=1
-pytest --verbose --pyargs sklearnex || set exitcode=1
-pytest --verbose --pyargs onedal || set exitcode=1
-pytest --verbose %1.ci\scripts\test_global_patch.py || set exitcode=1
EXIT /B %exitcode%
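Usage sketch (hedged; this mirrors the Windows CI invocation above, run from the parent directory of a checkout named `s`):

    call s\conda-recipe\run_test.bat s\ --json-report

With the flag given as the second argument, per-suite reports are written under `.pytest_reports` (`legacy_report.json`, `daal4py_report.json`, `sklearnex_report.json`, `onedal_report.json`, `global_patching_report.json`); without it, the same pytest suites run with no report files.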
30 changes: 24 additions & 6 deletions conda-recipe/run_test.sh
@@ -36,28 +36,46 @@ if [ -z "${PYTHON}" ]; then
export PYTHON=python
fi

+# Note: execute with argument --json-report in order to produce
+# a JSON report under folder '.pytest_reports'. Other arguments
+# will also be forwarded to pytest.
+with_json_report=0
+if [[ "$*" == *"--json-report"* ]]; then
+    echo "Will produce JSON report of tests"
+    with_json_report=1
+    mkdir -p .pytest_reports
+    if [[ ! -z "$(ls .pytest_reports)" ]]; then
+        rm .pytest_reports/*.json
+    fi
+fi
+function json_report_name {
+    if [[ "${with_json_report}" == "1" ]]; then
+        printf -- "--json-report-file=.pytest_reports/$1_report.json"
+    fi
+}

${PYTHON} -c "from sklearnex import patch_sklearn; patch_sklearn()"
return_code=$(($return_code + $?))

-pytest --verbose -s ${sklex_root}/tests
+pytest --verbose -s ${sklex_root}/tests $@ $(json_report_name legacy)
return_code=$(($return_code + $?))

-pytest --verbose --pyargs daal4py
+pytest --verbose --pyargs daal4py $@ $(json_report_name daal4py)
return_code=$(($return_code + $?))

-pytest --verbose --pyargs sklearnex
+pytest --verbose --pyargs sklearnex $@ $(json_report_name sklearnex)
return_code=$(($return_code + $?))

-pytest --verbose --pyargs onedal
+pytest --verbose --pyargs onedal $@ $(json_report_name onedal)
return_code=$(($return_code + $?))

-pytest --verbose -s ${sklex_root}/.ci/scripts/test_global_patch.py
+pytest --verbose -s ${sklex_root}/.ci/scripts/test_global_patch.py $@ $(json_report_name global_patching)
return_code=$(($return_code + $?))

echo "NO_DIST=$NO_DIST"
if [[ ! $NO_DIST ]]; then
mpirun --version
-mpirun -n 4 pytest --verbose -s ${sklex_root}/tests/test*spmd*.py
+mpirun -n 4 pytest --verbose -s ${sklex_root}/tests/test*spmd*.py $@ $(json_report_name mpi_legacy)
return_code=$(($return_code + $?))
fi

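Usage sketch (hedged; this mirrors the Linux CI invocation above):

    ./s/conda-recipe/run_test.sh --json-report

The flag switches on `with_json_report`, and `json_report_name` then appends `--json-report-file=.pytest_reports/<suite>_report.json` to each pytest call (legacy, daal4py, sklearnex, onedal, global_patching, plus mpi_legacy when distributed tests run). Unlike the batch script, any other arguments are forwarded to pytest via `$@`.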
16 changes: 10 additions & 6 deletions daal4py/sklearn/_utils.py
@@ -95,17 +95,21 @@ def daal_check_version(
return False


-@functools.lru_cache(maxsize=256, typed=False)
-def sklearn_check_version(ver):
-    if hasattr(Version(ver), "base_version"):
-        base_sklearn_version = Version(sklearn_version).base_version
-        res = bool(Version(base_sklearn_version) >= Version(ver))
+def _package_check_version(version_to_check, available_version):
+    if hasattr(Version(version_to_check), "base_version"):
+        base_package_version = Version(available_version).base_version
+        res = bool(Version(base_package_version) >= Version(version_to_check))
    else:
        # packaging module not available
-        res = bool(Version(sklearn_version) >= Version(ver))
+        res = bool(Version(available_version) >= Version(version_to_check))
    return res


+@functools.lru_cache(maxsize=256, typed=False)
+def sklearn_check_version(ver):
+    return _package_check_version(ver, sklearn_version)


def parse_dtype(dt):
    if dt == np.double:
        return "double"
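A minimal usage sketch of the refactored helpers (hedged: the version string below is illustrative, not from this diff):

    from daal4py.sklearn._utils import sklearn_check_version

    # sklearn_check_version stays the cached public entry point (functools.lru_cache),
    # while _package_check_version holds the package-agnostic comparison logic.
    if sklearn_check_version("1.4"):
        pass  # code path for scikit-learn >= 1.4

Splitting the comparison out this way lets the same logic be reused for other packages' version strings without duplicating the caching decorator.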
2 changes: 1 addition & 1 deletion dependencies-dev
@@ -3,5 +3,5 @@ Jinja2==3.1.4
numpy==2.0.1 ; python_version <= '3.9'
numpy==2.1.2 ; python_version > '3.9'
pybind11==2.13.6
-cmake==3.30.3
+cmake==3.30.5
setuptools==75.2.0
123 changes: 123 additions & 0 deletions doc/sources/array_api.rst
@@ -0,0 +1,123 @@
.. ******************************************************************************
.. * Copyright 2024 Intel Corporation
.. *
.. * Licensed under the Apache License, Version 2.0 (the "License");
.. * you may not use this file except in compliance with the License.
.. * You may obtain a copy of the License at
.. *
.. * http://www.apache.org/licenses/LICENSE-2.0
.. *
.. * Unless required by applicable law or agreed to in writing, software
.. * distributed under the License is distributed on an "AS IS" BASIS,
.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
.. * See the License for the specific language governing permissions and
.. * limitations under the License.
.. *******************************************************************************/
.. _array_api:

=================
Array API support
=================
The `Array API <https://data-apis.org/array-api/latest/>`_ specification defines
a standard API for array manipulation libraries with a NumPy-like API.
Intel(R) Extension for Scikit-Learn doesn't require
`array-api-compat <https://github.com/data-apis/array-api-compat>`__ to be installed for
functional support of the array API standard.
In the current implementation, functional support for the array API follows the existing
functional support for different array and DataFrame inputs, and the precision of the
input and output data formats is not modified unless necessary. Any array API input is
converted to host `numpy.ndarray` data, and all internal manipulations are performed on these
representations of the input data. DPNP's `ndarray` and Data Parallel Control's `usm_ndarray`
have special handling requirements, described in the relevant section of this document. In all
relevant cases, output values match the input data format.

.. note::
    Currently, only `array-api-strict <https://github.com/data-apis/array-api-strict>`__,
    `dpctl <https://intelpython.github.io/dpctl/latest/index.html>`__, `dpnp <https://github.com/IntelPython/dpnp>`__
    and `numpy <https://numpy.org/>`__ are known to work with sklearnex estimators.

.. note::
    Stock Scikit-learn's array API support requires `array-api-compat <https://github.com/data-apis/array-api-compat>`__ to be installed.


Support for DPNP and DPCTL
==========================
Functional support of input data for sklearnex estimators is also extended to SYCL USM array types:
`dpnp's <https://github.com/IntelPython/dpnp>`__ `ndarray` and
`Data Parallel Control's <https://intelpython.github.io/dpctl/latest/index.html>`__ `usm_ndarray`.
Both carry SYCL contexts that can be used for `sklearnex` device offloading.

.. note::
    With the current support, DPNP and DPCTL usm_ndarray data may be copied and moved to and from
    the device in sklearnex, which has an impact on memory utilization.

DPCTL or DPNP inputs do not require `config_context(target_offload=device)`:
`sklearnex` will use the SYCL context of the input usm_ndarray for device offloading.

.. note::
    Since DPCTL and DPNP inputs carry SYCL contexts, they do not require `config_context(target_offload=device)`.
    However, using `config_context` will override the contained SYCL context and force movement
    of the data to the targeted device.
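The sketch below illustrates both modes (a minimal sketch, assuming `dpnp` is installed and a
SYCL GPU device is available; the data values and estimator choice are illustrative only):

.. code-block:: python

    import dpnp

    from sklearnex import config_context, patch_sklearn

    patch_sklearn()

    from sklearn.cluster import DBSCAN

    # The SYCL queue of the input selects the device; no config_context is needed.
    X = dpnp.asarray([[1.0, 2.0], [2.0, 2.0], [8.0, 8.0]], device="gpu")
    DBSCAN(eps=3, min_samples=2).fit(X)

    # config_context overrides the input's SYCL context and forces data
    # movement to the targeted device.
    with config_context(target_offload="gpu:0"):
        DBSCAN(eps=3, min_samples=2).fit(X)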


Support for Array API-compatible inputs
=======================================
All patched estimators, metrics, tools and non-scikit-learn estimators functionally support the array API standard.
Intel(R) Extension for Scikit-Learn preserves the input data format for all outputs. For all array inputs except
the SYCL USM arrays (`dpnp's <https://github.com/IntelPython/dpnp>`__ `ndarray` and
`Data Parallel Control's <https://intelpython.github.io/dpctl/latest/index.html>`__ `usm_ndarray`), all computation
is performed on the CPU unless a `config_context` with an available GPU device specifies otherwise.

Stock scikit-learn uses `config_context(array_api_dispatch=True)` to enable array API
`support <https://scikit-learn.org/1.5/modules/array_api.html>`__.
If `array_api_dispatch` is enabled and the installed scikit-learn version supports the array API, then the original
inputs are used when falling back to scikit-learn functionality.
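
A minimal sketch of enabling dispatch (assuming the installed scikit-learn version supports the
array API, which requires `array-api-compat`, and that `array-api-strict` is installed):

.. code-block:: python

    import array_api_strict

    from sklearnex import config_context, patch_sklearn

    patch_sklearn()

    from sklearn.cluster import DBSCAN

    X = array_api_strict.asarray([[1.0, 2.0], [2.0, 2.0], [8.0, 8.0]])

    # With dispatch enabled, a fallback to stock scikit-learn receives the
    # original array API inputs rather than host NumPy copies.
    with config_context(array_api_dispatch=True):
        DBSCAN(eps=3, min_samples=2).fit(X)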

.. note::
    Data Parallel Control usm_ndarray and DPNP ndarray inputs will use host numpy data copies when
    falling back to scikit-learn, since they are not array API compliant.

.. note::
    Functional support doesn't guarantee that, after the model is trained, fitted attributes that are arrays
    will also be from the same namespace as the training data.


Example usage
=============

DPNP ndarrays
-------------

Here is example code demonstrating how to use `dpnp <https://github.com/IntelPython/dpnp>`__ arrays to
run `RandomForestRegressor` on a GPU without `config_context(array_api_dispatch=True)`:

.. literalinclude:: ../../examples/sklearnex/random_forest_regressor_dpnp.py
:language: python


.. note::
    As noted above, functional support doesn't guarantee that fitted attributes that are arrays
    will be from the same namespace as the training data. For example, if
    `dpnp's <https://github.com/IntelPython/dpnp>`__ namespace was used for training, then the
    fitted attributes will be on the CPU in `numpy.ndarray` format.

DPCTL usm_ndarrays
------------------
Here is example code demonstrating how to use `dpctl <https://intelpython.github.io/dpctl/latest/index.html>`__
arrays to run `RandomForestClassifier` on a GPU without `config_context(array_api_dispatch=True)`:

.. literalinclude:: ../../examples/sklearnex/random_forest_classifier_dpctl.py
:language: python

As in the previous example, if the `dpctl <https://intelpython.github.io/dpctl/latest/index.html>`__ array API
namespace was used for training, then the fitted attributes will be on the CPU in `numpy.ndarray` format.

Use of `array-api-strict`
-------------------------

Here is example code demonstrating how to use `array-api-strict <https://github.com/data-apis/array-api-strict>`__
arrays to run `DBSCAN`:

.. literalinclude:: ../../examples/sklearnex/dbscan_array_api.py
:language: python
4 changes: 2 additions & 2 deletions doc/sources/conf.py
@@ -42,9 +42,9 @@
author = "Intel"

# The short X.Y version
version = "2024.3.0"
version = "2025.0.0"
# The full version, including alpha/beta/rc tags
release = "2024.3.0"
release = "2025.0.0"


# -- General configuration ---------------------------------------------------
1 change: 1 addition & 0 deletions doc/sources/index.rst
@@ -106,6 +106,7 @@ Enable Intel(R) GPU optimizations
oneAPI and GPU support <oneapi-gpu.rst>
distributed-mode.rst
non-scikit-algorithms.rst
+array_api.rst
verbose.rst
deprecation.rst

36 changes: 36 additions & 0 deletions examples/sklearnex/dbscan_array_api.py
@@ -0,0 +1,36 @@
# ==============================================================================
# Copyright 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import array_api_strict

from sklearnex import config_context, patch_sklearn

patch_sklearn()

from sklearn.cluster import DBSCAN

X = array_api_strict.asarray(
    [[1.0, 2.0], [2.0, 2.0], [2.0, 3.0], [8.0, 7.0], [8.0, 8.0], [25.0, 80.0]],
    dtype=array_api_strict.float32,
)

# This example can be run without `config_context(array_api_dispatch=True)`. For
# sklearnex, that context manager only guarantees that, in case of a fallback to
# stock scikit-learn, the fitted attributes will be from the same array API
# namespace as the training data.
clustering = DBSCAN(eps=3, min_samples=2).fit(X)

print(f"Fitted labels :\n", clustering.labels_)
19 changes: 6 additions & 13 deletions onedal/_device_offload.py
@@ -23,25 +23,18 @@

from ._config import _get_config
from .utils._array_api import _asarray, _is_numpy_namespace
+from .utils._dpep_helpers import dpctl_available, dpnp_available

-try:
+if dpctl_available:
    from dpctl import SyclQueue
    from dpctl.memory import MemoryUSMDevice, as_usm_memory
    from dpctl.tensor import usm_ndarray

-    dpctl_available = True
-except ImportError:
-    dpctl_available = False

-try:
+if dpnp_available:
    import dpnp

    from .utils._array_api import _convert_to_dpnp

-    dpnp_available = True
-except ImportError:
-    dpnp_available = False


class DummySyclQueue:
"""This class is designed to act like dpctl.SyclQueue
@@ -140,7 +133,7 @@ def _transfer_to_host(queue, *data):
raise RuntimeError("Input data shall be located on single target device")

host_data.append(item)
-    return queue, host_data
+    return has_usm_data, queue, host_data


def _get_global_queue():
@@ -157,8 +150,8 @@ def _get_global_queue():

def _get_host_inputs(*args, **kwargs):
q = _get_global_queue()
-    q, hostargs = _transfer_to_host(q, *args)
-    q, hostvalues = _transfer_to_host(q, *kwargs.values())
+    _, q, hostargs = _transfer_to_host(q, *args)
+    _, q, hostvalues = _transfer_to_host(q, *kwargs.values())
hostkwargs = dict(zip(kwargs.keys(), hostvalues))
return q, hostargs, hostkwargs

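The `has_usm_data` flag added to `_transfer_to_host`'s return value is discarded by `_get_host_inputs` above; a minimal sketch of the new calling convention (the caller below is hypothetical, for illustration only, and not part of this diff):

    # _transfer_to_host now returns (has_usm_data, queue, host_data).
    def hypothetical_caller(queue, *arrays):
        has_usm_data, queue, host_arrays = _transfer_to_host(queue, *arrays)
        # The flag reports whether any input lived in USM memory, e.g. to decide
        # whether results should be moved back to the device afterwards.
        return has_usm_data, host_arrays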

