Implement BandedDot Op #1416

Status: Open. This pull request wants to merge 29 commits into base branch main.

Commits (29)
7904256
Naive implementation, do not merge
jessegrabowski May 23, 2025
db5b23c
Implement suggestions
jessegrabowski May 23, 2025
c687856
Simplify perf test
jessegrabowski May 23, 2025
4db2a33
float32 compat in tests
jessegrabowski May 23, 2025
3504f0b
Remove np.pad
jessegrabowski May 23, 2025
1bcf463
set dtype correctly
jessegrabowski May 23, 2025
0ce2cae
fix signature, add infer_shape
jessegrabowski May 23, 2025
161e172
micro-optimizations
jessegrabowski May 23, 2025
1ddd529
Rename b to x, matching BLAS docs
jessegrabowski May 24, 2025
b16189e
Add numba dispatch for banded_dot
jessegrabowski May 24, 2025
a902694
Eliminate extra copy in numba impl
jessegrabowski May 24, 2025
6becc7d
Create `A_banded` as F-contiguous array
jessegrabowski May 24, 2025
22578f3
Remove benchmark
jessegrabowski May 24, 2025
65c485e
Don't cache numba function
jessegrabowski May 24, 2025
905fc7c
all hail mypy
jessegrabowski May 24, 2025
687877c
set INCX by strides
jessegrabowski May 24, 2025
62ccf13
relax tolerance of float32 test
jessegrabowski May 24, 2025
8d30a29
Add suggestions
jessegrabowski May 25, 2025
e3d0b14
Test strides
jessegrabowski May 25, 2025
21873a9
Add L_op
jessegrabowski May 25, 2025
c1b6e01
*remove* type hints to make mypy happy
jessegrabowski May 25, 2025
e62b613
Remove order argument from numba A_to_banded
jessegrabowski May 25, 2025
025879a
Incorporate feedback
jessegrabowski May 25, 2025
beeec6a
Adjust numba test
jessegrabowski May 25, 2025
f467322
Remove more useful type information for mypy
jessegrabowski May 25, 2025
976422f
Fix negative strides
jessegrabowski Jun 10, 2025
72ba0dc
Rename `BandedDot` to `BandedGEMV` and move to `blas.py`
jessegrabowski Jun 24, 2025
eb50ca6
Add numba `gemv` overload
jessegrabowski Jun 24, 2025
976fd5b
All hail mypy
jessegrabowski Jun 26, 2025
2 changes: 1 addition & 1 deletion pytensor/link/numba/dispatch/__init__.py
@@ -14,6 +14,6 @@
import pytensor.link.numba.dispatch.sparse
import pytensor.link.numba.dispatch.subtensor
import pytensor.link.numba.dispatch.tensor_basic

+import pytensor.link.numba.dispatch.blas

# isort: on
2 changes: 1 addition & 1 deletion pytensor/link/numba/dispatch/basic.py
@@ -75,7 +75,7 @@ def numba_njit(*args, fastmath=None, **kwargs):
message=(
"(\x1b\\[1m)*" # ansi escape code for bold text
"Cannot cache compiled function "
'"(numba_funcified_fgraph|store_core_outputs|cholesky|solve|solve_triangular|cho_solve|lu_factor)" '
'"(numba_funcified_fgraph|store_core_outputs|cholesky|solve|solve_triangular|cho_solve|lu_factor|banded_dot)" '
"as it uses dynamic globals"
),
category=NumbaWarning,
59 changes: 59 additions & 0 deletions pytensor/link/numba/dispatch/blas.py
@@ -0,0 +1,59 @@
from pytensor.link.numba.dispatch import numba_funcify
from pytensor.link.numba.dispatch.basic import numba_njit
from pytensor.link.numba.dispatch.linalg.dot.banded import _gbmv
from pytensor.link.numba.dispatch.linalg.dot.general import _matrix_vector_product
from pytensor.link.numba.dispatch.slinalg import _COMPLEX_DTYPE_NOT_SUPPORTED_MSG
from pytensor.tensor.blas import BandedGEMV, Gemv
from pytensor.tensor.type import complex_dtypes


@numba_funcify.register(Gemv)
def numba_funcify_Gemv(op, node, **kwargs):
"""
Function to handle the Gemv operation in Numba.
"""
overwrite_y = op.inplace

@numba_njit()
def numba_gemv(y, alpha, A, x, beta):
"""
Numba implementation of the Gemv operation.
"""
return _matrix_vector_product(
alpha=alpha,
A=A,
x=x,
beta=beta,
y=y,
overwrite_y=overwrite_y,
)

return numba_gemv
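
For reference, the semantics the dispatch above relies on are the standard GEMV update. A minimal NumPy sketch of the same contract, including the overwrite_y behavior (illustrative only, not code from this PR):

import numpy as np

def gemv_reference(y, alpha, A, x, beta, overwrite_y=False):
    # y <- alpha * A @ x + beta * y, optionally reusing y's buffer
    out = y if overwrite_y else y.copy()
    out *= beta
    out += alpha * (A @ x)
    return out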


@numba_funcify.register(BandedGEMV)
def numba_funcify_BandedGEMV(op, node, **kwargs):
kl = op.lower_diags
ku = op.upper_diags
overwrite_y = op.overwrite_y
trans = int(op.transpose)
dtype = node.inputs[0].dtype

if dtype in complex_dtypes:
raise NotImplementedError(_COMPLEX_DTYPE_NOT_SUPPORTED_MSG.format(op=op))

@numba_njit(cache=False)
def banded_gemv(A, x, y, alpha, beta):
return _gbmv(
A=A,
x=x,
kl=kl,
ku=ku,
y=y,
alpha=alpha,
beta=beta,
overwrite_y=overwrite_y,
trans=trans,
)

return banded_gemv
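
A note on the band-width convention used by the op above (an illustrative sketch, not code from this PR): lower_diags (kl) counts sub-diagonals and upper_diags (ku) counts super-diagonals.

import numpy as np

# A 5x5 matrix whose nonzeros sit on the main diagonal, one sub-diagonal
# and two super-diagonals, i.e. kl=1 and ku=2:
A = np.triu(np.tril(np.ones((5, 5)), k=2), k=-1)
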
93 changes: 93 additions & 0 deletions pytensor/link/numba/dispatch/linalg/_BLAS.py
@@ -0,0 +1,93 @@
import ctypes

from numba.core.extending import get_cython_function_address
from numba.np.linalg import ensure_blas, ensure_lapack, get_blas_kind

from pytensor.link.numba.dispatch.linalg._LAPACK import (
_get_float_pointer_for_dtype,
_ptr_int,
)


def _get_blas_ptr_and_ptr_type(dtype, name):
d = get_blas_kind(dtype)
func_name = f"{d}{name}"
float_pointer = _get_float_pointer_for_dtype(d)
lapack_ptr = get_cython_function_address("scipy.linalg.cython_blas", func_name)

return lapack_ptr, float_pointer
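
The func_name construction follows the BLAS type-prefix convention; a quick illustrative check using the get_blas_kind helper imported above:

import numpy as np
from numba.np.linalg import get_blas_kind

# get_blas_kind maps a dtype to the BLAS prefix ('s', 'd', 'c' or 'z'),
# so float64 gbmv resolves to the cython_blas symbol "dgbmv"
assert get_blas_kind(np.dtype(np.float32)) + "gbmv" == "sgbmv"
assert get_blas_kind(np.dtype(np.float64)) + "gbmv" == "dgbmv"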


class _BLAS:
"""
Functions to return type signatures for wrapped BLAS functions.

Here we are specifically concerned with BLAS functions exposed by scipy, but not used by numpy.

Patterned after https://github.com/numba/numba/blob/bd7ebcfd4b850208b627a3f75d4706000be36275/numba/np/linalg.py#L74
"""

def __init__(self):
ensure_lapack()
ensure_blas()

@classmethod
def numba_xgemv(cls, dtype):
"""
xGEMV performs one of the following matrix operations:

y = alpha * A @ x + beta * y, or y = alpha * A.T @ x + beta * y

Where alpha and beta are scalars, x and y are vectors, and A is a general matrix.
"""

blas_ptr, float_pointer = _get_blas_ptr_and_ptr_type(dtype, "gemv")

functype = ctypes.CFUNCTYPE(
None,
_ptr_int, # TRANS
_ptr_int, # M
_ptr_int, # N
float_pointer, # ALPHA
float_pointer, # A
_ptr_int, # LDA
float_pointer, # X
_ptr_int, # INCX
float_pointer, # BETA
float_pointer, # Y
_ptr_int, # INCY
)

return functype(blas_ptr)

@classmethod
def numba_xgbmv(cls, dtype):
"""
xGBMV performs one of the following matrix operations:

y = alpha * A @ x + beta * y, or y = alpha * A.T @ x + beta * y

Where alpha and beta are scalars, x and y are vectors, and A is a band matrix with kl sub-diagonals and ku
super-diagonals.
"""

blas_ptr, float_pointer = _get_blas_ptr_and_ptr_type(dtype, "gbmv")

functype = ctypes.CFUNCTYPE(
None,
_ptr_int, # TRANS
_ptr_int, # M
_ptr_int, # N
_ptr_int, # KL
_ptr_int, # KU
float_pointer, # ALPHA
float_pointer, # A
_ptr_int, # LDA
float_pointer, # X
_ptr_int, # INCX
float_pointer, # BETA
float_pointer, # Y
_ptr_int, # INCY
)

return functype(blas_ptr)
Empty file.
179 changes: 179 additions & 0 deletions pytensor/link/numba/dispatch/linalg/dot/banded.py
@@ -0,0 +1,179 @@
from collections.abc import Callable
from typing import Any

import numpy as np
from numba import njit as numba_njit
from numba.core.extending import overload
from numba.np.linalg import ensure_blas, ensure_lapack
from scipy import linalg

from pytensor.link.numba.dispatch.linalg._BLAS import _BLAS
from pytensor.link.numba.dispatch.linalg._LAPACK import (
_get_underlying_float,
val_to_int_ptr,
)
from pytensor.link.numba.dispatch.linalg.utils import (
_check_scipy_linalg_matrix,
_copy_to_fortran_order_even_if_1d,
_trans_char_to_int,
)


@numba_njit(inline="always")
def A_to_banded(A: np.ndarray, kl: int, ku: int) -> np.ndarray:
m, n = A.shape

# This matrix is built backwards then transposed to get it into Fortran order
# (order="F" is not allowed in Numba land)
A_banded = np.zeros((n, kl + ku + 1), dtype=A.dtype).T

for i, k in enumerate(range(ku, -kl - 1, -1)):
if k >= 0:
A_banded[i, k:] = np.diag(A, k=k)
else:
A_banded[i, : n + k] = np.diag(A, k=k)

return A_banded
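
The layout built here matches LAPACK band storage, AB[ku + i - j, j] = A[i, j] for the in-band entries. A small illustrative cross-check against that formula, assuming a 4x4 tridiagonal case:

import numpy as np

A = np.arange(1.0, 17.0).reshape(4, 4)
kl = ku = 1
AB = np.zeros((kl + ku + 1, 4))
for i in range(4):
    for j in range(max(0, i - kl), min(4, i + ku + 1)):
        AB[ku + i - j, j] = A[i, j]
# AB holds the same values as A_to_banded(A, kl=1, ku=1)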


def _gbmv(
alpha: np.ndarray,
A: np.ndarray,
x: np.ndarray,
kl: int,
ku: int,
beta: np.ndarray | None = None,
y: np.ndarray | None = None,
overwrite_y: bool = False,
trans: int = 1,
) -> Any:
"""
Thin wrapper around gbmv. This code will only be called if njit is disabled globally
(e.g. during testing)
"""
(fn,) = linalg.get_blas_funcs(("gbmv",), (A, x))
m, n = A.shape
A_banded = A_to_banded(A, kl=kl, ku=ku)

incx = x.strides[0] // x.itemsize
offx = 0 if incx >= 0 else -x.size + 1

if y is not None:
incy = y.strides[0] // y.itemsize
offy = 0 if incy >= 0 else -y.size + 1
else:
incy = 1
offy = 0

return fn(
m=m,
n=n,
kl=kl,
ku=ku,
a=A_banded,
alpha=alpha,
x=x,
incx=incx,
offx=offx,
beta=beta,
y=y,
overwrite_y=overwrite_y,
incy=incy,
offy=offy,
trans=trans,
)
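
An illustrative call of this eager path (it runs only when numba's JIT is disabled; beta, y and trans are passed explicitly here rather than relying on defaults):

import numpy as np

rng = np.random.default_rng(0)
# Tridiagonal test matrix: one sub- and one super-diagonal
A = (
    np.diag(rng.normal(size=5))
    + np.diag(rng.normal(size=4), 1)
    + np.diag(rng.normal(size=4), -1)
)
x = rng.normal(size=5)
y = _gbmv(
    alpha=np.array(1.0), A=A, x=x, kl=1, ku=1,
    beta=np.array(0.0), y=np.zeros(5), trans=0,
)
np.testing.assert_allclose(y, A @ x)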


@overload(_gbmv)
def gbmv_impl(
alpha: np.ndarray,
A: np.ndarray,
x: np.ndarray,
kl: int,
ku: int,
beta: np.ndarray | None = None,
y: np.ndarray | None = None,
overwrite_y: bool = False,
trans: int = 1,
) -> Callable[
[
np.ndarray,
np.ndarray,
np.ndarray,
int,
int,
np.ndarray | None,
np.ndarray | None,
bool,
int,
],
np.ndarray,
]:
ensure_lapack()
ensure_blas()
_check_scipy_linalg_matrix(A, "dot_banded")
dtype = A.dtype
w_type = _get_underlying_float(dtype)
numba_gbmv = _BLAS().numba_xgbmv(dtype)

def impl(
alpha: np.ndarray,
A: np.ndarray,
x: np.ndarray,
kl: int,
ku: int,
beta: np.ndarray | None = None,
y: np.ndarray | None = None,
overwrite_y: bool = False,
trans: int = 1,
) -> np.ndarray:
m, n = A.shape

A_banded = A_to_banded(A, kl=kl, ku=ku)
x_stride = x.strides[0] // x.itemsize

if beta is None:
beta = np.zeros((), dtype=dtype)

if y is None:
y_copy = np.empty(shape=(m,), dtype=dtype)
elif overwrite_y and y.flags.f_contiguous:
y_copy = y
else:
y_copy = _copy_to_fortran_order_even_if_1d(y)

y_stride = y_copy.strides[0] // y_copy.itemsize

TRANS = val_to_int_ptr(_trans_char_to_int(trans))
M = val_to_int_ptr(m)
N = val_to_int_ptr(n)
LDA = val_to_int_ptr(A_banded.shape[0])

KL = val_to_int_ptr(kl)
KU = val_to_int_ptr(ku)

INCX = val_to_int_ptr(x_stride)
INCY = val_to_int_ptr(y_stride)

numba_gbmv(
TRANS,
M,
N,
KL,
KU,
alpha.view(w_type).ctypes,
A_banded.view(w_type).ctypes,
LDA,
# x.view(...).ctypes gives a pointer to the first element of the view, which for a
# negatively-strided x is the highest address in the buffer. With a negative INCX,
# BLAS expects the lowest address instead, so we point at the last logical element.
# The [-1:] slice (rather than [-1]) keeps x an array; a scalar element has no .ctypes.
(x if x_stride >= 0 else x[-1:]).view(w_type).ctypes,
INCX,
beta.view(w_type).ctypes,
y_copy.view(w_type).ctypes,
INCY,
)

return y_copy

return impl
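
The negative-stride workaround in the comment above can be checked in plain NumPy (illustrative sketch): for a reversed view, x[-1:] starts at the lowest address of the underlying buffer, which is the pointer BLAS expects when INCX is negative.

import numpy as np

base = np.arange(5.0)
x = base[::-1]
assert x.strides[0] < 0
# x[-1:] points at the buffer's lowest address, i.e. the same address as base
assert x[-1:].__array_interface__["data"][0] == base.__array_interface__["data"][0]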