Merge branch 'main' into enh/pca_array_api
samir-nasibli committed Oct 22, 2024
2 parents fb1059f + c0eb5ad commit 641318e
Showing 34 changed files with 420 additions and 120 deletions.
2 changes: 1 addition & 1 deletion .ci/pipeline/build-and-test-lnx.yml
@@ -55,7 +55,7 @@ steps:
. /usr/share/miniconda/etc/profile.d/conda.sh
conda activate CB
cd ..
-./s/conda-recipe/run_test.sh
+./s/conda-recipe/run_test.sh --json-report
displayName: "Sklearnex testing"
- script: |
. /usr/share/miniconda/etc/profile.d/conda.sh
2 changes: 1 addition & 1 deletion .ci/pipeline/build-and-test-win.yml
@@ -51,7 +51,7 @@ steps:
- script: |
call activate CB
cd ..
-call s\conda-recipe\run_test.bat s\
+call s\conda-recipe\run_test.bat s\ --json-report
displayName: 'Sklearnex testing'
- script: |
call activate CB
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -245,7 +245,7 @@ jobs:
call .\.github\scripts\activate_components.bat ${{ steps.set-env.outputs.DPCFLAG }}
set PYTHON=python
cd ..
-call scikit-learn-intelex\conda-recipe\run_test.bat scikit-learn-intelex\
+call scikit-learn-intelex\conda-recipe\run_test.bat scikit-learn-intelex\ --json-report
- name: Sklearn testing
shell: cmd
run: |
3 changes: 3 additions & 0 deletions .gitignore
@@ -30,3 +30,6 @@ record*

# example .res files
tests/_results*

+# json reports from pytest
+.pytest_reports/*
31 changes: 26 additions & 5 deletions conda-recipe/run_test.bat
@@ -23,10 +23,31 @@ IF NOT DEFINED PYTHON (set "PYTHON=python")

%PYTHON% -c "from sklearnex import patch_sklearn; patch_sklearn()" || set exitcode=1

-%PYTHON% -m pytest --verbose -s %1tests || set exitcode=1
+rem Note: execute with argument --json-report as second argument
+rem in order to produce a JSON report under folder '.pytest_reports'.
+set with_json_report=0
+if "%~2"=="--json-report" (
+    set with_json_report=1
+    mkdir .pytest_reports
+    del /q .pytest_reports\*.json
+)
+
+if "%with_json_report%"=="1" (
+    %PYTHON% -m pytest --verbose -s %1tests --json-report --json-report-file=.pytest_reports\legacy_report.json || set exitcode=1
+    pytest --verbose --pyargs daal4py --json-report --json-report-file=.pytest_reports\daal4py_report.json || set exitcode=1
+    pytest --verbose --pyargs sklearnex --json-report --json-report-file=.pytest_reports\sklearnex_report.json || set exitcode=1
+    pytest --verbose --pyargs onedal --json-report --json-report-file=.pytest_reports\onedal_report.json || set exitcode=1
+    pytest --verbose %1.ci\scripts\test_global_patch.py --json-report --json-report-file=.pytest_reports\global_patching_report.json || set exitcode=1
+    if NOT EXIST .pytest_reports\legacy_report.json (
+        echo "Error: JSON report files failed to be produced."
+        set exitcode=1
+    )
+) else (
+    %PYTHON% -m pytest --verbose -s %1tests || set exitcode=1
+    pytest --verbose --pyargs daal4py || set exitcode=1
+    pytest --verbose --pyargs sklearnex || set exitcode=1
+    pytest --verbose --pyargs onedal || set exitcode=1
+    pytest --verbose %1.ci\scripts\test_global_patch.py || set exitcode=1
+)

-pytest --verbose --pyargs daal4py || set exitcode=1
-pytest --verbose --pyargs sklearnex || set exitcode=1
-pytest --verbose --pyargs onedal || set exitcode=1
-pytest --verbose %1.ci\scripts\test_global_patch.py || set exitcode=1
EXIT /B %exitcode%
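Usage sketch (hedged; this mirrors the Windows CI invocation above, run from the parent directory of a checkout named `s`):

    call s\conda-recipe\run_test.bat s\ --json-report

With the flag given as the second argument, per-suite reports are written under `.pytest_reports` (`legacy_report.json`, `daal4py_report.json`, `sklearnex_report.json`, `onedal_report.json`, `global_patching_report.json`); without it, the same pytest suites run with no report files.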
30 changes: 24 additions & 6 deletions conda-recipe/run_test.sh
@@ -36,28 +36,46 @@ if [ -z "${PYTHON}" ]; then
export PYTHON=python
fi

+# Note: execute with argument --json-report in order to produce
+# a JSON report under folder '.pytest_reports'. Other arguments
+# will also be forwarded to pytest.
+with_json_report=0
+if [[ "$*" == *"--json-report"* ]]; then
+    echo "Will produce JSON report of tests"
+    with_json_report=1
+    mkdir -p .pytest_reports
+    if [[ ! -z "$(ls .pytest_reports)" ]]; then
+        rm .pytest_reports/*.json
+    fi
+fi
+function json_report_name {
+    if [[ "${with_json_report}" == "1" ]]; then
+        printf -- "--json-report-file=.pytest_reports/$1_report.json"
+    fi
+}

${PYTHON} -c "from sklearnex import patch_sklearn; patch_sklearn()"
return_code=$(($return_code + $?))

-pytest --verbose -s ${sklex_root}/tests
+pytest --verbose -s ${sklex_root}/tests $@ $(json_report_name legacy)
return_code=$(($return_code + $?))

-pytest --verbose --pyargs daal4py
+pytest --verbose --pyargs daal4py $@ $(json_report_name daal4py)
return_code=$(($return_code + $?))

-pytest --verbose --pyargs sklearnex
+pytest --verbose --pyargs sklearnex $@ $(json_report_name sklearnex)
return_code=$(($return_code + $?))

-pytest --verbose --pyargs onedal
+pytest --verbose --pyargs onedal $@ $(json_report_name onedal)
return_code=$(($return_code + $?))

-pytest --verbose -s ${sklex_root}/.ci/scripts/test_global_patch.py
+pytest --verbose -s ${sklex_root}/.ci/scripts/test_global_patch.py $@ $(json_report_name global_patching)
return_code=$(($return_code + $?))

echo "NO_DIST=$NO_DIST"
if [[ ! $NO_DIST ]]; then
mpirun --version
-mpirun -n 4 pytest --verbose -s ${sklex_root}/tests/test*spmd*.py
+mpirun -n 4 pytest --verbose -s ${sklex_root}/tests/test*spmd*.py $@ $(json_report_name mpi_legacy)
return_code=$(($return_code + $?))
fi

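Usage sketch (hedged; this mirrors the Linux CI invocation above):

    ./s/conda-recipe/run_test.sh --json-report

The flag switches on `with_json_report`, and `json_report_name` then appends `--json-report-file=.pytest_reports/<suite>_report.json` to each pytest call (legacy, daal4py, sklearnex, onedal, global_patching, plus mpi_legacy when distributed tests run). Unlike the batch script, any other arguments are forwarded to pytest via `$@`.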
16 changes: 10 additions & 6 deletions daal4py/sklearn/_utils.py
@@ -95,17 +95,21 @@ def daal_check_version(
return False


-@functools.lru_cache(maxsize=256, typed=False)
-def sklearn_check_version(ver):
-    if hasattr(Version(ver), "base_version"):
-        base_sklearn_version = Version(sklearn_version).base_version
-        res = bool(Version(base_sklearn_version) >= Version(ver))
+def _package_check_version(version_to_check, available_version):
+    if hasattr(Version(version_to_check), "base_version"):
+        base_package_version = Version(available_version).base_version
+        res = bool(Version(base_package_version) >= Version(version_to_check))
    else:
        # packaging module not available
-        res = bool(Version(sklearn_version) >= Version(ver))
+        res = bool(Version(available_version) >= Version(version_to_check))
    return res


+@functools.lru_cache(maxsize=256, typed=False)
+def sklearn_check_version(ver):
+    return _package_check_version(ver, sklearn_version)


def parse_dtype(dt):
    if dt == np.double:
        return "double"
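A minimal usage sketch of the refactored helpers (hedged: the version string below is illustrative, not from this diff):

    from daal4py.sklearn._utils import sklearn_check_version

    # sklearn_check_version stays the cached public entry point (functools.lru_cache),
    # while _package_check_version holds the package-agnostic comparison logic.
    if sklearn_check_version("1.4"):
        pass  # code path for scikit-learn >= 1.4

Splitting the comparison out this way lets the same logic be reused for other packages' version strings without duplicating the caching decorator.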
2 changes: 1 addition & 1 deletion dependencies-dev
@@ -3,5 +3,5 @@ Jinja2==3.1.4
numpy==2.0.1 ; python_version <= '3.9'
numpy==2.1.2 ; python_version > '3.9'
pybind11==2.13.6
-cmake==3.30.3
+cmake==3.30.5
setuptools==75.2.0
123 changes: 123 additions & 0 deletions doc/sources/array_api.rst
@@ -0,0 +1,123 @@
.. ******************************************************************************
.. * Copyright 2024 Intel Corporation
.. *
.. * Licensed under the Apache License, Version 2.0 (the "License");
.. * you may not use this file except in compliance with the License.
.. * You may obtain a copy of the License at
.. *
.. * http://www.apache.org/licenses/LICENSE-2.0
.. *
.. * Unless required by applicable law or agreed to in writing, software
.. * distributed under the License is distributed on an "AS IS" BASIS,
.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
.. * See the License for the specific language governing permissions and
.. * limitations under the License.
.. *******************************************************************************/
.. _array_api:

=================
Array API support
=================
The `Array API <https://data-apis.org/array-api/latest/>`_ specification defines
a standard API for array manipulation libraries with a NumPy-like API.
Intel(R) Extension for Scikit-Learn doesn't require
`array-api-compat <https://github.com/data-apis/array-api-compat>`__ to be installed for
functional support of the array API standard.
In the current implementation, functional support for the array API follows the existing
functional support for different array and DataFrame inputs, and the precision of the
input and output data formats is not modified unless necessary. Any array API input is
converted to host `numpy.ndarray` data, and all internal manipulations are performed on these
representations of the input data. DPNP's `ndarray` and Data Parallel Control's `usm_ndarray`
have special handling requirements, described in the relevant section of this document. In all
relevant cases, output values match the input data format.

.. note::
    Currently, only `array-api-strict <https://github.com/data-apis/array-api-strict>`__,
    `dpctl <https://intelpython.github.io/dpctl/latest/index.html>`__, `dpnp <https://github.com/IntelPython/dpnp>`__
    and `numpy <https://numpy.org/>`__ are known to work with sklearnex estimators.

.. note::
    Stock Scikit-learn's array API support requires `array-api-compat <https://github.com/data-apis/array-api-compat>`__ to be installed.


Support for DPNP and DPCTL
==========================
Functional support of input data for sklearnex estimators is also extended to SYCL USM array types:
`dpnp's <https://github.com/IntelPython/dpnp>`__ `ndarray` and
`Data Parallel Control's <https://intelpython.github.io/dpctl/latest/index.html>`__ `usm_ndarray`.
Both carry SYCL contexts that can be used for `sklearnex` device offloading.

.. note::
    With the current support, DPNP and DPCTL usm_ndarray data may be copied and moved to and from
    the device in sklearnex, which has an impact on memory utilization.

DPCTL or DPNP inputs do not require `config_context(target_offload=device)`:
`sklearnex` will use the SYCL context of the input usm_ndarray for device offloading.

.. note::
    Since DPCTL and DPNP inputs carry SYCL contexts, they do not require `config_context(target_offload=device)`.
    However, using `config_context` will override the contained SYCL context and force movement
    of the data to the targeted device.
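The sketch below illustrates both modes (a minimal sketch, assuming `dpnp` is installed and a
SYCL GPU device is available; the data values and estimator choice are illustrative only):

.. code-block:: python

    import dpnp

    from sklearnex import config_context, patch_sklearn

    patch_sklearn()

    from sklearn.cluster import DBSCAN

    # The SYCL queue of the input selects the device; no config_context is needed.
    X = dpnp.asarray([[1.0, 2.0], [2.0, 2.0], [8.0, 8.0]], device="gpu")
    DBSCAN(eps=3, min_samples=2).fit(X)

    # config_context overrides the input's SYCL context and forces data
    # movement to the targeted device.
    with config_context(target_offload="gpu:0"):
        DBSCAN(eps=3, min_samples=2).fit(X)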


Support for Array API-compatible inputs
=======================================
All patched estimators, metrics, tools and non-scikit-learn estimators functionally support the array API standard.
Intel(R) Extension for Scikit-Learn preserves the input data format for all outputs. For all array inputs except
the SYCL USM arrays (`dpnp's <https://github.com/IntelPython/dpnp>`__ `ndarray` and
`Data Parallel Control's <https://intelpython.github.io/dpctl/latest/index.html>`__ `usm_ndarray`), all computation
is performed on the CPU unless a `config_context` with an available GPU device specifies otherwise.

Stock scikit-learn uses `config_context(array_api_dispatch=True)` to enable array API
`support <https://scikit-learn.org/1.5/modules/array_api.html>`__.
If `array_api_dispatch` is enabled and the installed scikit-learn version supports the array API, then the original
inputs are used when falling back to scikit-learn functionality.
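
A minimal sketch of enabling dispatch (assuming the installed scikit-learn version supports the
array API, which requires `array-api-compat`, and that `array-api-strict` is installed):

.. code-block:: python

    import array_api_strict

    from sklearnex import config_context, patch_sklearn

    patch_sklearn()

    from sklearn.cluster import DBSCAN

    X = array_api_strict.asarray([[1.0, 2.0], [2.0, 2.0], [8.0, 8.0]])

    # With dispatch enabled, a fallback to stock scikit-learn receives the
    # original array API inputs rather than host NumPy copies.
    with config_context(array_api_dispatch=True):
        DBSCAN(eps=3, min_samples=2).fit(X)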

.. note::
    Data Parallel Control usm_ndarray and DPNP ndarray inputs will use host numpy data copies when
    falling back to scikit-learn, since they are not array API compliant.

.. note::
    Functional support doesn't guarantee that, after the model is trained, fitted attributes that are arrays
    will also be from the same namespace as the training data.


Example usage
=============

DPNP ndarrays
-------------

Here is example code demonstrating how to use `dpnp <https://github.com/IntelPython/dpnp>`__ arrays to
run `RandomForestRegressor` on a GPU without `config_context(array_api_dispatch=True)`:

.. literalinclude:: ../../examples/sklearnex/random_forest_regressor_dpnp.py
:language: python


.. note::
    As noted above, functional support doesn't guarantee that fitted attributes that are arrays
    will be from the same namespace as the training data. For example, if
    `dpnp's <https://github.com/IntelPython/dpnp>`__ namespace was used for training, then the
    fitted attributes will be on the CPU in `numpy.ndarray` format.

DPCTL usm_ndarrays
------------------
Here is example code demonstrating how to use `dpctl <https://intelpython.github.io/dpctl/latest/index.html>`__
arrays to run `RandomForestClassifier` on a GPU without `config_context(array_api_dispatch=True)`:

.. literalinclude:: ../../examples/sklearnex/random_forest_classifier_dpctl.py
:language: python

As in the previous example, if the `dpctl <https://intelpython.github.io/dpctl/latest/index.html>`__ array API
namespace was used for training, then the fitted attributes will be on the CPU in `numpy.ndarray` format.

Use of `array-api-strict`
-------------------------

Here is example code demonstrating how to use `array-api-strict <https://github.com/data-apis/array-api-strict>`__
arrays to run `DBSCAN`:

.. literalinclude:: ../../examples/sklearnex/dbscan_array_api.py
:language: python
4 changes: 2 additions & 2 deletions doc/sources/conf.py
@@ -42,9 +42,9 @@
author = "Intel"

# The short X.Y version
version = "2024.3.0"
version = "2025.0.0"
# The full version, including alpha/beta/rc tags
release = "2024.3.0"
release = "2025.0.0"


# -- General configuration ---------------------------------------------------
1 change: 1 addition & 0 deletions doc/sources/index.rst
@@ -106,6 +106,7 @@ Enable Intel(R) GPU optimizations
oneAPI and GPU support <oneapi-gpu.rst>
distributed-mode.rst
non-scikit-algorithms.rst
+array_api.rst
verbose.rst
deprecation.rst

36 changes: 36 additions & 0 deletions examples/sklearnex/dbscan_array_api.py
@@ -0,0 +1,36 @@
# ==============================================================================
# Copyright 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import array_api_strict

from sklearnex import config_context, patch_sklearn

patch_sklearn()

from sklearn.cluster import DBSCAN

X = array_api_strict.asarray(
    [[1.0, 2.0], [2.0, 2.0], [2.0, 3.0], [8.0, 7.0], [8.0, 8.0], [25.0, 80.0]],
    dtype=array_api_strict.float32,
)

# This example can be run without `config_context(array_api_dispatch=True)`. For
# sklearnex, that context manager only guarantees that, in case of a fallback to
# stock scikit-learn, the fitted attributes will be from the same array API
# namespace as the training data.
clustering = DBSCAN(eps=3, min_samples=2).fit(X)

print(f"Fitted labels :\n", clustering.labels_)
19 changes: 6 additions & 13 deletions onedal/_device_offload.py
@@ -23,25 +23,18 @@

from ._config import _get_config
from .utils._array_api import _asarray, _is_numpy_namespace
+from .utils._dpep_helpers import dpctl_available, dpnp_available

-try:
+if dpctl_available:
    from dpctl import SyclQueue
    from dpctl.memory import MemoryUSMDevice, as_usm_memory
    from dpctl.tensor import usm_ndarray

-    dpctl_available = True
-except ImportError:
-    dpctl_available = False

-try:
+if dpnp_available:
    import dpnp

    from .utils._array_api import _convert_to_dpnp

-    dpnp_available = True
-except ImportError:
-    dpnp_available = False


class DummySyclQueue:
"""This class is designed to act like dpctl.SyclQueue
@@ -140,7 +133,7 @@ def _transfer_to_host(queue, *data):
raise RuntimeError("Input data shall be located on single target device")

host_data.append(item)
-    return queue, host_data
+    return has_usm_data, queue, host_data


def _get_global_queue():
@@ -157,8 +150,8 @@ def _get_global_queue():

def _get_host_inputs(*args, **kwargs):
q = _get_global_queue()
-    q, hostargs = _transfer_to_host(q, *args)
-    q, hostvalues = _transfer_to_host(q, *kwargs.values())
+    _, q, hostargs = _transfer_to_host(q, *args)
+    _, q, hostvalues = _transfer_to_host(q, *kwargs.values())
hostkwargs = dict(zip(kwargs.keys(), hostvalues))
return q, hostargs, hostkwargs

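The `has_usm_data` flag added to `_transfer_to_host`'s return value is discarded by `_get_host_inputs` above; a minimal sketch of the new calling convention (the caller below is hypothetical, for illustration only, and not part of this diff):

    # _transfer_to_host now returns (has_usm_data, queue, host_data).
    def hypothetical_caller(queue, *arrays):
        has_usm_data, queue, host_arrays = _transfer_to_host(queue, *arrays)
        # The flag reports whether any input lived in USM memory, e.g. to decide
        # whether results should be moved back to the device afterwards.
        return has_usm_data, host_arrays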

