diff --git a/README.rst b/README.rst index ae6d0aac..cdd19d6f 100644 --- a/README.rst +++ b/README.rst @@ -86,6 +86,10 @@ Extended arguments to and functionality in ``split_after_count()`` to support th Now building wheels for 3.11. +0.1.12 +............ + +Implemented ``is_sorted``. 0.2.2 ............ diff --git a/src/__init__.py b/src/__init__.py index 85b62662..487cfe3a 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -19,6 +19,7 @@ from ._arraykit import delimited_to_arrays as delimited_to_arrays from ._arraykit import iterable_str_to_array_1d as iterable_str_to_array_1d from ._arraykit import get_new_indexers_and_screen as get_new_indexers_and_screen +from ._arraykit import is_sorted as is_sorted from ._arraykit import split_after_count as split_after_count from ._arraykit import count_iteration as count_iteration from ._arraykit import first_true_1d as first_true_1d diff --git a/src/__init__.pyi b/src/__init__.pyi index b925d0ad..c6820473 100644 --- a/src/__init__.pyi +++ b/src/__init__.pyi @@ -72,6 +72,7 @@ def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ... def isna_element(__value: tp.Any, include_none: bool = True) -> bool: ... def dtype_from_element(__value: tp.Optional[tp.Hashable]) -> np.dtype: ... def get_new_indexers_and_screen(indexers: np.ndarray, positions: np.ndarray) -> tp.Tuple[np.ndarray, np.ndarray]: ... +def is_sorted(arr: np.ndarray) -> bool: ... def first_true_1d(__array: np.ndarray, *, forward: bool) -> int: ... def first_true_2d(__array: np.ndarray, *, forward: bool, axis: int) -> np.ndarray: ... 
diff --git a/src/_arraykit.c b/src/_arraykit.c index 48bec1ec..7bc0905a 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -4031,7 +4031,7 @@ get_new_indexers_and_screen(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kw Py_DECREF(element_locations); // new_positions = order_found[:num_unique] - PyObject *new_positions = PySequence_GetSlice((PyObject*)order_found, 0, num_found); + PyObject *new_positions = PySequence_GetSlice((PyObject*)order_found, 0, (Py_ssize_t)num_found); Py_DECREF(order_found); if (new_positions == NULL) { return NULL; @@ -4058,6 +4058,181 @@ get_new_indexers_and_screen(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kw return NULL; } +//------------------------------------------------------------------------------ + +# define AK_COMPARE_SIMPLE(a, b) a > b +# define AK_COMPARE_COMPLEX(a, b) a.real > b.real || (a.real == b.real && a.imag > b.imag) + +/*Note: Data array needs a unique name for each case inside the switch*/ +# define AK_IS_SORTED(ctype, compare_macro) \ + if (contiguous) { \ + NPY_BEGIN_THREADS_DEF; \ + NPY_BEGIN_THREADS; \ + ctype* data_##ctype##_ = (ctype*)PyArray_DATA(arr); \ + for (size_t i = 0; i < arr_size - 1; ++i) { \ + ctype element = data_##ctype##_[i]; \ + ctype next = data_##ctype##_[i + 1]; \ + if (compare_macro(element, next)) { \ + NPY_END_THREADS; \ + Py_RETURN_FALSE; \ + } \ + } \ + NPY_END_THREADS; \ + } \ + else { \ + NPY_BEGIN_THREADS_DEF; \ + NPY_BEGIN_THREADS; \ + for (size_t i = 0; i < arr_size - 1; ++i) { \ + ctype element = *(ctype*)PyArray_GETPTR1(arr, i); \ + ctype next = *(ctype*)PyArray_GETPTR1(arr, i + 1); \ + if (compare_macro(element, next)) { \ + NPY_END_THREADS; \ + Py_RETURN_FALSE; \ + } \ + } \ + NPY_END_THREADS; \ + } \ + Py_RETURN_TRUE; \ + + +static bool +AK_is_sorted_string(PyArrayObject* arr, bool contiguous, size_t arr_size) +{ + size_t item_size = (size_t)PyArray_ITEMSIZE(arr); + + if (contiguous) { + NPY_BEGIN_THREADS_DEF; + NPY_BEGIN_THREADS; + char* data = 
(char*)PyArray_DATA(arr); + size_t i = 0; + while (i < (arr_size - 1) * item_size) { + if (strncmp(&data[i], &data[i + item_size], item_size) > 0) { + NPY_END_THREADS; + Py_RETURN_FALSE; + } + i += item_size; + } + NPY_END_THREADS; + } + else { + NPY_BEGIN_THREADS_DEF; + NPY_BEGIN_THREADS; + size_t i = 0; + while (i < (arr_size - 1) * item_size) { + char *element = PyArray_GETPTR1(arr, i); + char *next = PyArray_GETPTR1(arr, i + 1); + if (strncmp(element, next, item_size) > 0) { + NPY_END_THREADS; + Py_RETURN_FALSE; + } + i += item_size; + } + NPY_END_THREADS; + } + Py_RETURN_TRUE; +} + + +static PyObject * +is_sorted(PyObject *Py_UNUSED(m), PyObject *arg) +{ + AK_CHECK_NUMPY_ARRAY(arg); + PyArrayObject *arr = (PyArrayObject*)arg; + + if (PyArray_NDIM(arr) != 1) { + PyErr_SetString(PyExc_ValueError, "Array must be 1-dimensional"); + return NULL; + } + + bool contiguous = (bool)PyArray_IS_C_CONTIGUOUS(arr); + size_t arr_size = (size_t)PyArray_SIZE(arr); + + switch (PyArray_TYPE(arr)) { + case NPY_BOOL:; + AK_IS_SORTED(npy_bool, AK_COMPARE_SIMPLE) + case NPY_BYTE:; + AK_IS_SORTED(npy_byte, AK_COMPARE_SIMPLE) + case NPY_UBYTE:; + AK_IS_SORTED(npy_ubyte, AK_COMPARE_SIMPLE) + case NPY_SHORT:; + AK_IS_SORTED(npy_short, AK_COMPARE_SIMPLE) + case NPY_USHORT:; + AK_IS_SORTED(npy_ushort, AK_COMPARE_SIMPLE) + case NPY_INT:; + AK_IS_SORTED(npy_int, AK_COMPARE_SIMPLE) + case NPY_UINT:; + AK_IS_SORTED(npy_uint, AK_COMPARE_SIMPLE) + case NPY_LONG:; + AK_IS_SORTED(npy_long, AK_COMPARE_SIMPLE) + case NPY_ULONG:; + AK_IS_SORTED(npy_ulong, AK_COMPARE_SIMPLE) + case NPY_LONGLONG:; + AK_IS_SORTED(npy_longlong, AK_COMPARE_SIMPLE) + case NPY_ULONGLONG:; + AK_IS_SORTED(npy_ulonglong, AK_COMPARE_SIMPLE) + case NPY_FLOAT:; + AK_IS_SORTED(npy_float, AK_COMPARE_SIMPLE) + case NPY_DOUBLE:; + AK_IS_SORTED(npy_double, AK_COMPARE_SIMPLE) + + # ifdef PyFloat128ArrType_Type + case NPY_LONGDOUBLE:; + AK_IS_SORTED(npy_longdouble, AK_COMPARE_SIMPLE) + # endif + + case NPY_DATETIME:; + 
AK_IS_SORTED(npy_datetime, AK_COMPARE_SIMPLE) + case NPY_TIMEDELTA:; + AK_IS_SORTED(npy_timedelta, AK_COMPARE_SIMPLE) + case NPY_HALF:; + AK_IS_SORTED(npy_half, AK_COMPARE_SIMPLE) + case NPY_CFLOAT:; + AK_IS_SORTED(npy_complex64, AK_COMPARE_COMPLEX) + case NPY_CDOUBLE:; + AK_IS_SORTED(npy_complex128, AK_COMPARE_COMPLEX) + + # ifdef PyComplex256ArrType_Type + case NPY_CLONGDOUBLE:; + AK_IS_SORTED(npy_complex256, AK_COMPARE_COMPLEX) + # endif + + case NPY_STRING: + case NPY_UNICODE: + if (!AK_is_sorted_string(arr, contiguous, arr_size)) { + Py_RETURN_FALSE; + } + Py_RETURN_TRUE; + default:; + PyErr_Format(PyExc_ValueError, + "Unsupported dtype: %s", + PyArray_DESCR(arr)->typeobj->tp_name + ); + return NULL; + } + // // ------------------------------------------------------------------------ + // // perf is not good here - maybe drop support? + // else if (np_dtype == NPY_OBJECT) { + // do { + // char* data = *dataptr; + // npy_intp stride = *strideptr; + // npy_intp inner_size = *innersizeptr; + + // PyObject* prev = *((PyObject **)data); + // data += stride; + // inner_size--; + // while (inner_size--) { + // PyObject* element = *((PyObject **)data); + // if (PyObject_RichCompareBool(element, prev, Py_LT) == 1) { + // goto fail; + // } + // prev = element; + // data += stride; + // } + // } while(arr_iternext(arr_iter)); + // } + Py_UNREACHABLE(); +} + //------------------------------------------------------------------------------ // ArrayGO //------------------------------------------------------------------------------ @@ -4364,6 +4539,7 @@ static PyMethodDef arraykit_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"dtype_from_element", dtype_from_element, METH_O, NULL}, + {"is_sorted", is_sorted, METH_O, NULL}, {"get_new_indexers_and_screen", (PyCFunction)get_new_indexers_and_screen, METH_VARARGS | METH_KEYWORDS, diff --git a/test/test_util.py b/test/test_util.py index 87684f25..36c8b8f8 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -18,10 +18,10 
@@ from arraykit import array_deepcopy from arraykit import isna_element from arraykit import dtype_from_element -from arraykit import split_after_count from arraykit import count_iteration from arraykit import first_true_1d from arraykit import first_true_2d +from arraykit import is_sorted from performance.reference.util import get_new_indexers_and_screen_ak as get_new_indexers_and_screen_full from arraykit import get_new_indexers_and_screen @@ -84,7 +84,6 @@ def test_resolve_dtype_c(self) -> None: self.assertEqual(resolve_dtype(a1.dtype, a4.dtype), np.dtype('O')) - def test_resolve_dtype_d(self) -> None: dt1 = np.array(1).dtype dt2 = np.array(2.3).dtype @@ -96,7 +95,6 @@ def test_resolve_dtype_e(self) -> None: assert resolve_dtype(dt1, dt2) == np.dtype(object) assert resolve_dtype(dt1, dt1) == dt1 - #--------------------------------------------------------------------------- def test_resolve_dtype_iter_a(self) -> None: @@ -167,7 +165,6 @@ def test_column_2d_filter_a(self) -> None: with self.assertRaises(NotImplementedError): column_2d_filter(a1.reshape(1,2,5)) - #--------------------------------------------------------------------------- def test_column_1d_filter_a(self) -> None: @@ -219,7 +216,6 @@ def test_array_deepcopy_a2(self) -> None: self.assertEqual(memo[id(a1)].tolist(), a2.tolist()) self.assertFalse(a2.flags.writeable) - def test_array_deepcopy_b(self) -> None: a1 = np.arange(10) memo = {id(a1): a1} @@ -227,7 +223,6 @@ def test_array_deepcopy_b(self) -> None: self.assertEqual(mloc(a1), mloc(a2)) - def test_array_deepcopy_c1(self) -> None: mutable = [np.nan] memo = {} @@ -329,7 +324,6 @@ def test_isna_element_b(self) -> None: self.assertFalse(isna_element(datetime.date(2020, 12, 31))) self.assertFalse(isna_element(False)) - def test_isna_element_c(self) -> None: self.assertFalse(isna_element(None, include_none=False)) self.assertTrue(isna_element(None, include_none=True)) @@ -474,6 +468,7 @@ def test_get_new_indexers_and_screen_b(self) -> None: assert 
tuple(map(list, postB)) == (list(indexersB), list(indexersB)) #--------------------------------------------------------------------------- + def test_count_iteration_a(self) -> None: post = count_iteration(('a', 'b', 'c', 'd')) self.assertEqual(post, 4) @@ -484,6 +479,7 @@ def test_count_iteration_b(self) -> None: self.assertEqual(post, 5) #--------------------------------------------------------------------------- + def test_first_true_1d_a(self) -> None: a1 = np.arange(100) == 50 post = first_true_1d(a1, forward=True) @@ -552,8 +548,8 @@ def test_first_true_1d_multi_b(self) -> None: self.assertEqual(first_true_1d(a1, forward=True), 10) self.assertEqual(first_true_1d(a1, forward=False), 50) - #--------------------------------------------------------------------------- + def test_first_true_2d_a(self) -> None: a1 = np.isin(np.arange(100), (9, 19, 38, 68, 96)).reshape(5, 20) @@ -610,7 +606,6 @@ def test_first_true_2d_c(self) -> None: [-1, -1, -1, -1] ) - def test_first_true_2d_d(self) -> None: a1 = np.isin(np.arange(20), (0, 3, 4, 7, 8, 11, 12, 15, 16, 19)).reshape(5, 4) @@ -653,7 +648,6 @@ def test_first_true_2d_f(self) -> None: with self.assertRaises(ValueError): post1 = first_true_2d(a1, axis=2) - def test_first_true_2d_f(self) -> None: a1 = np.isin(np.arange(15), (1, 7, 14)).reshape(3, 5) post1 = first_true_2d(a1, axis=0, forward=True) @@ -662,7 +656,6 @@ def test_first_true_2d_f(self) -> None: post2 = first_true_2d(a1, axis=0, forward=False) self.assertEqual(post2.tolist(), [-1, 0, 1, -1, 2]) - def test_first_true_2d_g(self) -> None: a1 = np.isin(np.arange(15), (1, 7, 14)).reshape(3, 5).T # force fortran ordering self.assertEqual(first_true_2d(a1, axis=0, forward=True).tolist(), @@ -674,7 +667,6 @@ def test_first_true_2d_g(self) -> None: self.assertEqual(first_true_2d(a1, axis=1, forward=False).tolist(), [-1, 0, 1, -1, 2]) - def test_first_true_2d_h(self) -> None: # force fortran ordering, non-contiguous, non-owned a1 = np.isin(np.arange(15), (1, 4, 5, 7, 8, 
def test_is_sorted_a(self) -> None:
    """Ascending data is sorted, and its reverse is not, for every dtype.

    Each dtype is checked on a packed (C-contiguous) array and on a strided
    column view holding the same values.
    """
    ints = np.arange(25).reshape(5, 5)
    # Single letters are ascending both as text and as bytes, unlike the
    # decimal renderings of 0, 5, 10, ... ('5' orders after '10').
    letters = np.array(list('abcdefghijklmnopqrstuvwxy')).reshape(5, 5)

    # Canonical scalar names only: np.float_, np.longfloat, np.complex_ and
    # np.clongfloat were removed in NumPy 2.0.
    numeric_dtypes = (
        np.bool_,
        np.byte,
        np.ubyte,
        np.short,
        np.ushort,
        np.intc,
        np.uintc,
        np.int_,
        np.uint,
        np.longlong,
        np.ulonglong,
        np.half,
        np.single,
        np.double,
        np.longdouble,
        np.csingle,
        np.cdouble,
        np.clongdouble,
    )
    cases = [(ints, dtype) for dtype in numeric_dtypes]
    cases.extend((letters, kind) for kind in ('U', 'S'))

    # Extended precision may be rejected with ValueError on some builds.
    platform_dependent = (np.longdouble, np.clongdouble)

    for source, dtype in cases:
        with self.subTest(dtype=dtype):
            # Cast first, slice second: ``astype`` on a strided view would
            # hand back a packed copy and the strided path would go untested.
            strided = source.astype(dtype)[:, 0]
            packed = strided.copy()
            self.assertFalse(strided.flags.c_contiguous)
            self.assertTrue(packed.flags.c_contiguous)
            self.assertEqual(packed.tolist(), strided.tolist())

            try:
                ascending = is_sorted(packed)
            except ValueError:
                self.assertIn(dtype, platform_dependent)
                continue

            self.assertTrue(ascending)
            self.assertTrue(is_sorted(strided))
            self.assertFalse(is_sorted(packed[::-1]))
            self.assertFalse(is_sorted(strided[::-1]))


def test_is_sorted_b(self) -> None:
    """String arrays are ordered lexicographically, not numerically."""
    for kind in ('U', 'S'):
        with self.subTest(kind=kind):
            self.assertTrue(is_sorted(np.array(['10', '5'], dtype=kind)))
            self.assertFalse(is_sorted(np.array(['5', '10'], dtype=kind)))


def test_is_sorted_disallowed_inputs(self) -> None:
    """2D arrays and object arrays raise ValueError; non-arrays raise TypeError."""
    with self.assertRaises(ValueError):
        is_sorted(np.arange(25).reshape(5, 5))

    with self.assertRaises(TypeError):
        is_sorted(list(range(10)))

    with self.assertRaises(ValueError):
        is_sorted(np.arange(10).astype(object))