From aa420375e24777fd52f04af5141c13637989a202 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 23 May 2025 22:47:32 +0700 Subject: [PATCH 01/13] Implemented NumbaExecutionEngine --- pandas/core/apply.py | 73 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 15 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 2c96f1ef020ac..fe87b1d2beaa6 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -45,9 +45,9 @@ ABCSeries, ) -from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike +from pandas.core._numba.executor import generate_apply_looper from pandas.core.util.numba_ import ( get_jit_arguments, prepare_function_arguments, @@ -178,6 +178,57 @@ def apply( """ +class NumbaExecutionEngine(BaseExecutionEngine): + """ + Numba-based execution engine for pandas apply and map operations. + """ + + @staticmethod + def map( + data: np.ndarray | Series | DataFrame, + func, + args: tuple, + kwargs: dict, + engine_kwargs: dict | None, + skip_na: bool, + ): + """ + Elementwise map for the Numba engine. Currently not supported. + """ + raise NotImplementedError("Numba map is not implemented yet.") + + @staticmethod + def apply( + data: np.ndarray | Series | DataFrame, + func, + args: tuple, + kwargs: dict, + engine_kwargs: dict | None, + axis: int | str, + ): + """ + Apply `func` along the given axis using Numba. 
+ """ + + looper_args, looper_kwargs = prepare_function_arguments( + func, # type: ignore[arg-type] + args, + kwargs, + num_required_args=1, + ) + # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has + # incompatible type "Callable[..., Any] | str | list[Callable + # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | + # list[Callable[..., Any] | str]]"; expected "Hashable" + nb_looper = generate_apply_looper( + func, # type: ignore[arg-type] + **get_jit_arguments(engine_kwargs) + ) + result = nb_looper(data, axis, *looper_args) + # If we made the result 2-D, squeeze it back to 1-D + return np.squeeze(result) + + def frame_apply( obj: DataFrame, func: AggFuncType, @@ -1094,23 +1145,15 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - args, kwargs = prepare_function_arguments( - self.func, # type: ignore[arg-type] + engine_obj = NumbaExecutionEngine() + result = engine_obj.apply( + self.values, + self.func, self.args, self.kwargs, - num_required_args=1, - ) - # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has - # incompatible type "Callable[..., Any] | str | list[Callable - # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | - # list[Callable[..., Any] | str]]"; expected "Hashable" - nb_looper = generate_apply_looper( - self.func, # type: ignore[arg-type] - **get_jit_arguments(engine_kwargs), + engine_kwargs, + self.axis, ) - result = nb_looper(self.values, self.axis, *args) - # If we made the result 2-D, squeeze it back to 1-D - result = np.squeeze(result) else: result = np.apply_along_axis( wrap_function(self.func), From db9f3b000f237a1fc580f3361e0984b410ee9d3e Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 24 May 2025 06:51:23 +0700 Subject: [PATCH 02/13] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ab3316e7fca4c..6948ffcde40b2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ 
b/doc/source/whatsnew/v3.0.0.rst @@ -31,6 +31,7 @@ Other enhancements - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) +- Added :class:`pandas.core.apply.NumbaExecutionEngine` as the built-in ``numba`` execution engine for ``apply`` and ``map`` operations (:issue:`61458`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` From 4cb240d95c139ef8956a0430287559a5d75a73bc Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 24 May 2025 06:56:06 +0700 Subject: [PATCH 03/13] precommit --- pandas/core/apply.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index fe87b1d2beaa6..ba240813d3229 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -45,9 +45,9 @@ ABCSeries, ) +from pandas.core._numba.executor import generate_apply_looper import pandas.core.common as com from pandas.core.construction import ensure_wrapped_if_datetimelike -from pandas.core._numba.executor import generate_apply_looper from pandas.core.util.numba_ import ( get_jit_arguments, prepare_function_arguments, @@ -211,7 +211,7 @@ def apply( """ looper_args, looper_kwargs = prepare_function_arguments( - func, # type: ignore[arg-type] + func, # type: ignore[arg-type] args, kwargs, num_required_args=1, @@ -221,8 +221,8 @@ def apply( # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | #
list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( - func, # type: ignore[arg-type] - **get_jit_arguments(engine_kwargs) + func, # type: ignore[arg-type] + **get_jit_arguments(engine_kwargs), ) result = nb_looper(data, axis, *looper_args) # If we made the result 2-D, squeeze it back to 1-D From 97d9063dcc65968956b282b13fbb49337f0388b2 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 24 May 2025 07:21:10 +0700 Subject: [PATCH 04/13] Match function arguments --- pandas/core/apply.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index ba240813d3229..3d760eaa8705a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -189,7 +189,7 @@ def map( func, args: tuple, kwargs: dict, - engine_kwargs: dict | None, + decorator: Callable | None, skip_na: bool, ): """ @@ -203,7 +203,7 @@ def apply( func, args: tuple, kwargs: dict, - engine_kwargs: dict | None, + decorator: Callable, axis: int | str, ): """ @@ -222,7 +222,7 @@ def apply( # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( func, # type: ignore[arg-type] - **get_jit_arguments(engine_kwargs), + **get_jit_arguments(decorator), ) result = nb_looper(data, axis, *looper_args) # If we made the result 2-D, squeeze it back to 1-D From 69e0e355e14312e19b1341157d1b6e100f8dcb3d Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 24 May 2025 07:54:50 +0700 Subject: [PATCH 05/13] Fix CI --- pandas/core/apply.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 3d760eaa8705a..b765088308b2d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -209,9 +209,12 @@ def apply( """ Apply `func` along the given axis using Numba. 
""" + engine_kwargs: dict[str, bool] | None = ( + decorator if isinstance(decorator, dict) else None + ) looper_args, looper_kwargs = prepare_function_arguments( - func, # type: ignore[arg-type] + func, args, kwargs, num_required_args=1, @@ -221,8 +224,8 @@ def apply( # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str | # list[Callable[..., Any] | str]]"; expected "Hashable" nb_looper = generate_apply_looper( - func, # type: ignore[arg-type] - **get_jit_arguments(decorator), + func, + **get_jit_arguments(engine_kwargs), ) result = nb_looper(data, axis, *looper_args) # If we made the result 2-D, squeeze it back to 1-D From 736507949fbc217fae93d061f02ab3f9e2899f05 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 28 May 2025 16:41:30 +0700 Subject: [PATCH 06/13] updated whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6948ffcde40b2..ea9b06a58be92 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -30,8 +30,8 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) +- :meth:`DataFrame.apply` accepts Numba as an engine by passing the JIT decorator directly, e.g. 
``df.apply(func, engine=numba.jit)`` (:issue:`61458`) - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) -- Added :class:`pandas.core.apply.NumbaExecutionEngine` as the built-in ``numba`` execution engine for ``apply`` and ``map`` operations (:issue:`61458`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` From c605857d16bde78f6a4b0cc04556bcf24f7844bc Mon Sep 17 00:00:00 2001 From: arthurlw Date: Thu, 29 May 2025 22:18:39 +0700 Subject: [PATCH 07/13] Updated conditions and delegate method to numba.jit --- pandas/core/apply.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b765088308b2d..a4cce45758feb 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -13,6 +13,7 @@ cast, ) +import numba import numpy as np from pandas._libs.internals import BlockValuesRefs @@ -1148,8 +1149,9 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - engine_obj = NumbaExecutionEngine() - result = engine_obj.apply( + if not hasattr(numba.jit, "__pandas_udf__"): + numba.jit.__pandas_udf__ = NumbaExecutionEngine + result = numba.jit.__pandas_udf__.apply( self.values, self.func, self.args, From 24a06150e01028a38f3466ded5c85e143ea41aef Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 3 Jun 2025 18:37:58 +0700 Subject: [PATCH 08/13] Added try and except to catch ImportError --- pandas/core/apply.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index a4cce45758feb..760fd111f21ce 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py
@@ -13,7 +13,6 @@ cast, ) -import numba import numpy as np from pandas._libs.internals import BlockValuesRefs @@ -1149,16 +1148,31 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - if not hasattr(numba.jit, "__pandas_udf__"): - numba.jit.__pandas_udf__ = NumbaExecutionEngine - result = numba.jit.__pandas_udf__.apply( - self.values, - self.func, - self.args, - self.kwargs, - engine_kwargs, - self.axis, - ) + try: + import numba + + if not hasattr(numba.jit, "__pandas_udf__"): + numba.jit.__pandas_udf__ = NumbaExecutionEngine + result = numba.jit.__pandas_udf__.apply( + self.values, + self.func, + self.args, + self.kwargs, + engine_kwargs, + self.axis, + ) + else: + raise ImportError + except ImportError: + engine_obj = NumbaExecutionEngine() + result = engine_obj.apply( + self.values, + self.func, + self.args, + self.kwargs, + engine_kwargs, + self.axis, + ) else: result = np.apply_along_axis( wrap_function(self.func), From b7a2ecbae9e64ae4f8e155a1cb0edb96f44b8e6b Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 10 Jun 2025 13:28:12 +0700 Subject: [PATCH 09/13] Use import_optional_dependency to load Numba --- pandas/core/apply.py | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 760fd111f21ce..f54c27c93a2a9 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1148,31 +1148,19 @@ def wrapper(*args, **kwargs): return wrapper if engine == "numba": - try: - import numba - - if not hasattr(numba.jit, "__pandas_udf__"): - numba.jit.__pandas_udf__ = NumbaExecutionEngine - result = numba.jit.__pandas_udf__.apply( - self.values, - self.func, - self.args, - self.kwargs, - engine_kwargs, - self.axis, - ) - else: - raise ImportError - except ImportError: - engine_obj = NumbaExecutionEngine() - result = engine_obj.apply( - self.values, - self.func, - self.args, - self.kwargs, - engine_kwargs, - self.axis, - ) + numba = 
import_optional_dependency("numba") + + if not hasattr(numba.jit, "__pandas_udf__"): + numba.jit.__pandas_udf__ = NumbaExecutionEngine + + result = numba.jit.__pandas_udf__.apply( + self.values, + self.func, + self.args, + self.kwargs, + engine_kwargs, + self.axis, + ) else: result = np.apply_along_axis( wrap_function(self.func), From 6f4fb501550fbd69ed10df1ca6c5c6107c26084e Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 17 Jun 2025 14:38:23 +0700 Subject: [PATCH 10/13] Updated engine handling: normalizing numba to a fake decorator and updating empty or python string condition --- pandas/core/frame.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8053c17437c5e..593e59457518d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -129,7 +129,7 @@ roperator, ) from pandas.core.accessor import Accessor -from pandas.core.apply import reconstruct_and_relabel_result +from pandas.core.apply import NumbaExecutionEngine, reconstruct_and_relabel_result from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( @@ -10616,14 +10616,14 @@ def apply( significant amount of time to run. Fast functions are unlikely to run faster with JIT compilation. 
""" - if engine is None or isinstance(engine, str): - from pandas.core.apply import frame_apply - - if engine is None: - engine = "python" + if engine == "numba": + numba = import_optional_dependency("numba") + numba_jit = numba.jit(**engine_kwargs) + numba_jit.__pandas_udf__ = NumbaExecutionEngine + engine = numba_jit - if engine not in ["python", "numba"]: - raise ValueError(f"Unknown engine '{engine}'") + if engine is None or engine == "python": + from pandas.core.apply import frame_apply op = frame_apply( self, @@ -10632,7 +10632,7 @@ def apply( raw=raw, result_type=result_type, by_row=by_row, - engine=engine, + engine="python", engine_kwargs=engine_kwargs, args=args, kwargs=kwargs, From 221cf7cd7b8f987bc149ef84d0f8365138e44f30 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 17 Jun 2025 14:47:27 +0700 Subject: [PATCH 11/13] Added check for empty engine_kwargs --- pandas/core/frame.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 593e59457518d..8c971603076cc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10618,7 +10618,10 @@ def apply( """ if engine == "numba": numba = import_optional_dependency("numba") - numba_jit = numba.jit(**engine_kwargs) + if engine_kwargs is not None: + numba_jit = numba.jit(**engine_kwargs) + else: + numba_jit = numba.jit() numba_jit.__pandas_udf__ = NumbaExecutionEngine engine = numba_jit From ed8dc7f15472c28c3d8fe425aa7a086fab05fea4 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 17 Jun 2025 17:13:25 +0700 Subject: [PATCH 12/13] Moved checks from Apply.apply to NumbaExecutionEngine.apply --- pandas/core/apply.py | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f54c27c93a2a9..290e72974e9aa 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -209,6 +209,30 @@ def apply( """ Apply `func` along the given axis using 
Numba. """ + + if is_list_like(func): + raise NotImplementedError( + "the 'numba' engine doesn't support lists of callables yet" + ) + + if isinstance(func, str): + raise NotImplementedError( + "the 'numba' engine doesn't support using " + "a string as the callable function" + ) + + elif isinstance(func, np.ufunc): + raise NotImplementedError( + "the 'numba' engine doesn't support " + "using a numpy ufunc as the callable function" + ) + + # check for data typing + if not isinstance(data, np.ndarray): + if len(data.columns) == 0 and len(data.index) == 0: + return data.copy() # mimic apply_empty_result() + return FrameApply.apply_standard() + engine_kwargs: dict[str, bool] | None = ( decorator if isinstance(decorator, dict) else None ) @@ -1011,10 +1035,6 @@ def apply(self) -> DataFrame | Series: # dispatch to handle list-like or dict-like if is_list_like(self.func): - if self.engine == "numba": - raise NotImplementedError( - "the 'numba' engine doesn't support lists of callables yet" - ) return self.apply_list_or_dict_like() # all empty @@ -1023,20 +1043,10 @@ def apply(self) -> DataFrame | Series: # string dispatch if isinstance(self.func, str): - if self.engine == "numba": - raise NotImplementedError( - "the 'numba' engine doesn't support using " - "a string as the callable function" - ) return self.apply_str() # ufunc elif isinstance(self.func, np.ufunc): - if self.engine == "numba": - raise NotImplementedError( - "the 'numba' engine doesn't support " - "using a numpy ufunc as the callable function" - ) with np.errstate(all="ignore"): results = self.obj._mgr.apply("apply", func=self.func) # _constructor will retain self.index and self.columns @@ -1044,10 +1054,6 @@ def apply(self) -> DataFrame | Series: # broadcasting if self.result_type == "broadcast": - if self.engine == "numba": - raise NotImplementedError( - "the 'numba' engine doesn't support result_type='broadcast'" - ) return self.apply_broadcast(self.obj) # one axis empty From 
65b9d320ffe83cac78f2640673fb7ca0bb49a778 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Wed, 18 Jun 2025 10:17:31 +0700 Subject: [PATCH 13/13] Fixed CI, removed unused numba checks, updated raw=false condition, updated engine checks --- pandas/core/apply.py | 43 +++++++++++++------------------------------ pandas/core/frame.py | 13 ++++++++++--- 2 files changed, 23 insertions(+), 33 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 290e72974e9aa..949959de7cbcd 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -230,8 +230,12 @@ def apply( # check for data typing if not isinstance(data, np.ndarray): if len(data.columns) == 0 and len(data.index) == 0: - return data.copy() # mimic apply_empty_result() - return FrameApply.apply_standard() + return data.copy() # mimic apply_empty_result() + # TODO: + # Rewrite FrameApply.apply_series_numba() logic without FrameApply object + raise NotImplementedError( + "raw=False is not yet supported in NumbaExecutionEngine." + ) engine_kwargs: dict[str, bool] | None = ( decorator if isinstance(decorator, dict) else None @@ -780,12 +784,6 @@ def apply_list_or_dict_like(self) -> DataFrame | Series: Result when self.func is a list-like or dict-like, None otherwise. """ - if self.engine == "numba": - raise NotImplementedError( - "The 'numba' engine doesn't support list-like/" - "dict likes of callables yet." 
- ) - if self.axis == 1 and isinstance(self.obj, ABCDataFrame): return self.obj.T.apply(self.func, 0, args=self.args, **self.kwargs).T @@ -1153,28 +1151,13 @@ def wrapper(*args, **kwargs): return wrapper - if engine == "numba": - numba = import_optional_dependency("numba") - - if not hasattr(numba.jit, "__pandas_udf__"): - numba.jit.__pandas_udf__ = NumbaExecutionEngine - - result = numba.jit.__pandas_udf__.apply( - self.values, - self.func, - self.args, - self.kwargs, - engine_kwargs, - self.axis, - ) - else: - result = np.apply_along_axis( - wrap_function(self.func), - self.axis, - self.values, - *self.args, - **self.kwargs, - ) + result = np.apply_along_axis( + wrap_function(self.func), + self.axis, + self.values, + *self.args, + **self.kwargs, + ) # TODO: mixed type case if result.ndim == 2: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8c971603076cc..78c9fc3fc10f1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -129,7 +129,10 @@ roperator, ) from pandas.core.accessor import Accessor -from pandas.core.apply import NumbaExecutionEngine, reconstruct_and_relabel_result +from pandas.core.apply import ( + NumbaExecutionEngine, + reconstruct_and_relabel_result, +) from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( @@ -10625,9 +10628,12 @@ def apply( numba_jit.__pandas_udf__ = NumbaExecutionEngine engine = numba_jit - if engine is None or engine == "python": + if engine is None or isinstance(engine, str): from pandas.core.apply import frame_apply + if engine not in ["python"] and engine is not None: + raise ValueError(f"Unknown engine '{engine}'") + op = frame_apply( self, func=func, @@ -10641,7 +10647,8 @@ def apply( kwargs=kwargs, ) return op.apply().__finalize__(self, method="apply") - elif hasattr(engine, "__pandas_udf__"): + + if hasattr(engine, "__pandas_udf__"): if result_type is not None: raise NotImplementedError( f"{result_type=} only implemented 
for the default engine"