From 8f3e34765479d08ec2ae11ca6e38d978b08123bb Mon Sep 17 00:00:00 2001 From: Krishna Pai Date: Fri, 14 Oct 2022 10:13:57 -0700 Subject: [PATCH] Create PyVelox package and move type dependencies over. (#503) Summary: PyVelox package is gated with CMake flags in this pr. This refactoring will allow Torcharrow and Koski to use a common set of bindings. Pull Request resolved: https://github.com/pytorch/torcharrow/pull/503 Reviewed By: wenleix Differential Revision: D40082189 Pulled By: kgpai fbshipit-source-id: 069aed8ecf2275b0cc0ce3e223fe7d83e0678589 --- csrc/velox/CMakeLists.txt | 3 + csrc/velox/_torcharrow.pyi | 124 +++++++++---------- csrc/velox/lib.cpp | 84 +++---------- csrc/velox/pyvelox/CMakeLists.txt | 33 +++++ csrc/velox/pyvelox/__init__.py | 0 csrc/velox/pyvelox/pyvelox.cpp | 32 +++++ csrc/velox/pyvelox/pyvelox.h | 145 ++++++++++++++++++++++ torcharrow/test/lib_test/test_column.py | 152 ++++++++++++------------ torcharrow/test/lib_test/test_udf.py | 42 +++---- torcharrow/test/test_dtypes.py | 8 +- torcharrow/test/test_list_column.py | 6 +- torcharrow/velox_rt/list_column_cpu.py | 4 +- torcharrow/velox_rt/map_column_cpu.py | 2 +- torcharrow/velox_rt/typing.py | 36 +++--- 14 files changed, 415 insertions(+), 256 deletions(-) create mode 100644 csrc/velox/pyvelox/CMakeLists.txt create mode 100644 csrc/velox/pyvelox/__init__.py create mode 100644 csrc/velox/pyvelox/pyvelox.cpp create mode 100644 csrc/velox/pyvelox/pyvelox.h diff --git a/csrc/velox/CMakeLists.txt b/csrc/velox/CMakeLists.txt index 9aa97ce5e..7788b148f 100644 --- a/csrc/velox/CMakeLists.txt +++ b/csrc/velox/CMakeLists.txt @@ -9,6 +9,7 @@ cmake_minimum_required(VERSION 3.15) # _torcharrow is a shared library as it's a Python extension set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CREATE_PYVELOX_MODULE OFF) # To make the right CPython is built with on GitHub Actions, # see https://github.com/actions/setup-python/issues/121#issuecomment-1014500503 @@ -91,6 +92,7 @@ endif() # set_target_properties(_torcharrow PROPERTIES CXX_VISIBILITY_PRESET default) add_subdirectory(velox) add_subdirectory(functions) +add_subdirectory(pyvelox) # Link with Velox: @@ -103,6 +105,7 @@ target_link_libraries(_torcharrow PRIVATE velox_function_registry velox_arrow_bridge torcharrow_udfs + pyvelox ) target_compile_definitions( diff --git a/csrc/velox/_torcharrow.pyi b/csrc/velox/_torcharrow.pyi index a66b0d490..2952a6be6 100644 --- a/csrc/velox/_torcharrow.pyi +++ b/csrc/velox/_torcharrow.pyi @@ -49,18 +49,18 @@ __all__ = [ "SimpleColumnVARCHAR", "TypeKind", "VeloxType", -"VeloxArrayType", -"VeloxFixedArrayType", -"VeloxMapType", -"VeloxRowType", -"VeloxType_BIGINT", -"VeloxType_BOOLEAN", -"VeloxType_DOUBLE", -"VeloxType_INTEGER", -"VeloxType_REAL", -"VeloxType_SMALLINT", -"VeloxType_TINYINT", -"VeloxType_VARCHAR", +"ArrayType", +"FixedSizeArrayType", +"MapType", +"RowType", +"BigintType", +"BooleanType", +"DoubleType", +"IntegerType", +"RealType", +"SmallintType", +"TinyintType", +"VarcharType", "Vocab", "fb_SentencePiece", "Column", @@ -2153,14 +2153,14 @@ class VeloxType: def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxArrayType(VeloxType): +class ArrayType(VeloxType): def __init__(self, arg0: VeloxType) -> None: ... def element_type(self) -> VeloxType: ... def kind(self) -> TypeKind: ... def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxFixedArrayType(VeloxType): +class FixedSizeArrayType(VeloxType): def __init__(self, arg0: int, arg1: VeloxType) -> None: ... def element_type(self) -> VeloxType: ... def fixed_width(self) -> int: ... @@ -2168,7 +2168,7 @@ class VeloxFixedArrayType(VeloxType): def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxMapType(VeloxType): +class MapType(VeloxType): def __init__(self, arg0: VeloxType, arg1: VeloxType) -> None: ... def key_type(self) -> VeloxType: ... def kind(self) -> TypeKind: ... @@ -2176,7 +2176,7 @@ class VeloxMapType(VeloxType): def value_type(self) -> VeloxType: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxRowType(VeloxType): +class RowType(VeloxType): def __init__(self, arg0: List[str], arg1: List[VeloxType]) -> None: ... def child_at(self, arg0: int) -> VeloxType: ... def contains_child(self, arg0: str) -> bool: ... @@ -2187,49 +2187,49 @@ class VeloxRowType(VeloxType): def size(self) -> int: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxType_BIGINT(VeloxType): +class BigintType(VeloxType): def __init__(self) -> None: ... def kind(self) -> TypeKind: ... def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxType_BOOLEAN(VeloxType): +class BooleanType(VeloxType): def __init__(self) -> None: ... def kind(self) -> TypeKind: ... def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxType_DOUBLE(VeloxType): +class DoubleType(VeloxType): def __init__(self) -> None: ... def kind(self) -> TypeKind: ... def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxType_INTEGER(VeloxType): +class IntegerType(VeloxType): def __init__(self) -> None: ... def kind(self) -> TypeKind: ... def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxType_REAL(VeloxType): +class RealType(VeloxType): def __init__(self) -> None: ... def kind(self) -> TypeKind: ... def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxType_SMALLINT(VeloxType): +class SmallintType(VeloxType): def __init__(self) -> None: ... def kind(self) -> TypeKind: ... def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxType_TINYINT(VeloxType): +class TinyintType(VeloxType): def __init__(self) -> None: ... def kind(self) -> TypeKind: ... def kind_name(self) -> str: ... __pybind11_module_local_v4_clang_libstdcpp_cxxabi1002__ = ... # type: PyCapsule # value = pass -class VeloxType_VARCHAR(VeloxType): +class VarcharType(VeloxType): def __init__(self) -> None: ... def kind(self) -> TypeKind: ... def kind_name(self) -> str: ... @@ -2265,94 +2265,94 @@ class fb_SentencePiece: def process(self, arg0: str) -> List[str]: ... pass @overload -def Column(arg0: VeloxType_DOUBLE) -> SimpleColumnDOUBLE: +def Column(arg0: DoubleType) -> SimpleColumnDOUBLE: pass @overload -def Column(arg0: VeloxType_INTEGER) -> SimpleColumnINTEGER: +def Column(arg0: IntegerType) -> SimpleColumnINTEGER: pass @overload -def Column(arg0: VeloxType_INTEGER, arg1: list) -> SimpleColumnINTEGER: +def Column(arg0: IntegerType, arg1: list) -> SimpleColumnINTEGER: pass @overload -def Column(arg0: VeloxType_SMALLINT, arg1: tuple) -> SimpleColumnSMALLINT: +def Column(arg0: SmallintType, arg1: tuple) -> SimpleColumnSMALLINT: pass @overload -def Column(arg0: VeloxType_SMALLINT) -> SimpleColumnSMALLINT: +def Column(arg0: SmallintType) -> SimpleColumnSMALLINT: pass @overload -def Column(arg0: VeloxType_REAL, arg1: tuple) -> SimpleColumnREAL: +def Column(arg0: RealType, arg1: tuple) -> SimpleColumnREAL: pass @overload -def Column(arg0: VeloxRowType) -> RowColumn: +def Column(arg0: RowType) -> RowColumn: pass @overload -def Column(arg0: VeloxType_SMALLINT, arg1: list) -> SimpleColumnSMALLINT: +def Column(arg0: SmallintType, arg1: list) -> SimpleColumnSMALLINT: pass @overload -def Column(arg0: VeloxType_TINYINT, arg1: tuple) -> SimpleColumnTINYINT: +def Column(arg0: TinyintType, arg1: tuple) -> SimpleColumnTINYINT: pass @overload -def Column(arg0: VeloxFixedArrayType, arg1: list) -> ArrayColumn: +def Column(arg0: FixedSizeArrayType, arg1: list) -> ArrayColumn: pass @overload -def Column(arg0: VeloxType_BIGINT, arg1: tuple) -> SimpleColumnBIGINT: +def Column(arg0: BigintType, arg1: tuple) -> SimpleColumnBIGINT: pass @overload -def Column(arg0: VeloxArrayType) -> ArrayColumn: +def Column(arg0: ArrayType) -> ArrayColumn: pass @overload -def Column(arg0: VeloxType_DOUBLE, arg1: tuple) -> SimpleColumnDOUBLE: +def Column(arg0: DoubleType, arg1: tuple) -> SimpleColumnDOUBLE: pass @overload -def Column(arg0: VeloxArrayType, arg1: list) -> ArrayColumn: +def Column(arg0: ArrayType, arg1: list) -> ArrayColumn: pass @overload -def Column(arg0: VeloxType_BOOLEAN) -> SimpleColumnBOOLEAN: +def Column(arg0: BooleanType) -> SimpleColumnBOOLEAN: pass @overload -def Column(arg0: VeloxType_VARCHAR, arg1: tuple) -> SimpleColumnVARCHAR: +def Column(arg0: VarcharType, arg1: tuple) -> SimpleColumnVARCHAR: pass @overload -def Column(arg0: VeloxMapType) -> MapColumn: +def Column(arg0: MapType) -> MapColumn: pass @overload -def Column(arg0: VeloxType_TINYINT) -> SimpleColumnTINYINT: +def Column(arg0: TinyintType) -> SimpleColumnTINYINT: pass @overload -def Column(arg0: VeloxFixedArrayType) -> ArrayColumn: +def Column(arg0: FixedSizeArrayType) -> ArrayColumn: pass @overload -def Column(arg0: VeloxType_INTEGER, arg1: tuple) -> SimpleColumnINTEGER: +def Column(arg0: IntegerType, arg1: tuple) -> SimpleColumnINTEGER: pass @overload -def Column(arg0: VeloxType_REAL) -> SimpleColumnREAL: +def Column(arg0: RealType) -> SimpleColumnREAL: pass @overload -def Column(arg0: VeloxType_BIGINT) -> SimpleColumnBIGINT: +def Column(arg0: BigintType) -> SimpleColumnBIGINT: pass @overload -def Column(arg0: VeloxType_TINYINT, arg1: list) -> SimpleColumnTINYINT: +def Column(arg0: TinyintType, arg1: list) -> SimpleColumnTINYINT: pass @overload -def Column(arg0: VeloxType_DOUBLE, arg1: list) -> SimpleColumnDOUBLE: +def Column(arg0: DoubleType, arg1: list) -> SimpleColumnDOUBLE: pass @overload -def Column(arg0: VeloxType_BIGINT, arg1: list) -> SimpleColumnBIGINT: +def Column(arg0: BigintType, arg1: list) -> SimpleColumnBIGINT: pass @overload -def Column(arg0: VeloxType_REAL, arg1: list) -> SimpleColumnREAL: +def Column(arg0: RealType, arg1: list) -> SimpleColumnREAL: pass @overload -def Column(arg0: VeloxType_BOOLEAN, arg1: tuple) -> SimpleColumnBOOLEAN: +def Column(arg0: BooleanType, arg1: tuple) -> SimpleColumnBOOLEAN: pass @overload -def Column(arg0: VeloxType_VARCHAR) -> SimpleColumnVARCHAR: +def Column(arg0: VarcharType) -> SimpleColumnVARCHAR: pass @overload -def Column(arg0: VeloxType_VARCHAR, arg1: list) -> SimpleColumnVARCHAR: +def Column(arg0: VarcharType, arg1: list) -> SimpleColumnVARCHAR: pass @overload -def Column(arg0: VeloxType_BOOLEAN, arg1: list) -> SimpleColumnBOOLEAN: +def Column(arg0: BooleanType, arg1: list) -> SimpleColumnBOOLEAN: pass @overload def ConstantColumn(arg0: handle, arg1: int, arg2: VeloxType) -> BaseColumn: @@ -2361,28 +2361,28 @@ def ConstantColumn(arg0: handle, arg1: int, arg2: VeloxType) -> BaseColumn: def ConstantColumn(arg0: handle, arg1: int) -> BaseColumn: pass @overload -def _import_from_arrow(arg0: VeloxType_SMALLINT, arg1: int, arg2: int) -> SimpleColumnSMALLINT: +def _import_from_arrow(arg0: SmallintType, arg1: int, arg2: int) -> SimpleColumnSMALLINT: pass @overload -def _import_from_arrow(arg0: VeloxType_DOUBLE, arg1: int, arg2: int) -> SimpleColumnDOUBLE: +def _import_from_arrow(arg0: DoubleType, arg1: int, arg2: int) -> SimpleColumnDOUBLE: pass @overload -def _import_from_arrow(arg0: VeloxType_TINYINT, arg1: int, arg2: int) -> SimpleColumnTINYINT: +def _import_from_arrow(arg0: TinyintType, arg1: int, arg2: int) -> SimpleColumnTINYINT: pass @overload -def _import_from_arrow(arg0: VeloxType_BOOLEAN, arg1: int, arg2: int) -> SimpleColumnBOOLEAN: +def _import_from_arrow(arg0: BooleanType, arg1: int, arg2: int) -> SimpleColumnBOOLEAN: pass @overload -def _import_from_arrow(arg0: VeloxType_INTEGER, arg1: int, arg2: int) -> SimpleColumnINTEGER: +def _import_from_arrow(arg0: IntegerType, arg1: int, arg2: int) -> SimpleColumnINTEGER: pass @overload -def _import_from_arrow(arg0: VeloxRowType, arg1: int, arg2: int) -> RowColumn: +def _import_from_arrow(arg0: RowType, arg1: int, arg2: int) -> RowColumn: pass @overload -def _import_from_arrow(arg0: VeloxType_BIGINT, arg1: int, arg2: int) -> SimpleColumnBIGINT: +def _import_from_arrow(arg0: BigintType, arg1: int, arg2: int) -> SimpleColumnBIGINT: pass @overload -def _import_from_arrow(arg0: VeloxType_REAL, arg1: int, arg2: int) -> SimpleColumnREAL: +def _import_from_arrow(arg0: RealType, arg1: int, arg2: int) -> SimpleColumnREAL: pass def _populate_dense_features_nopresence(arg0: RowColumn, arg1: int) -> None: pass diff --git a/csrc/velox/lib.cpp b/csrc/velox/lib.cpp index 10952daec..27b1621bc 100644 --- a/csrc/velox/lib.cpp +++ b/csrc/velox/lib.cpp @@ -18,6 +18,7 @@ #include "bindings.h" #include "column.h" #include "functions/functions.h" // @manual=//pytorch/torcharrow/csrc/velox/functions:torcharrow_functions +#include "pyvelox/pyvelox.h" // @manual=//pytorch/torcharrow/csrc/velox/pyvelox:pyvelox #include "tensor_conversion.h" #include "vector.h" #include "velox/buffer/StringViewBufferHolder.h" @@ -41,6 +42,7 @@ namespace py = pybind11; PYBIND11_MAKE_OPAQUE(std::vector); namespace facebook::torcharrow { + // // SimpleColumn (scalar types) // @@ -136,12 +138,15 @@ py::class_, BaseColumn> declareSimpleType( }); using I = typename velox::TypeTraits::ImplType; - py::class_>( - m, - (std::string("VeloxType_") + velox::TypeTraits::name).c_str(), - // TODO: Move the Koksi binding of Velox type to OSS - py::module_local()) - .def(py::init()); + + if constexpr (false) { + py::class_>( + m, + (std::string("VeloxType_") + velox::TypeTraits::name).c_str(), + // TODO: Move the Koksi binding of Velox type to OSS + py::module_local()) + .def(py::init()); + } // Empty Column m.def("Column", [](std::shared_ptr type) { @@ -681,20 +686,8 @@ void declareArrayType(py::module& m) { .def("withElements", &ArrayColumn::withElements); using I = typename velox::TypeTraits::ImplType; - py::class_>( - m, - "VeloxArrayType", - // TODO: Move the Koksi binding of Velox type to OSS - py::module_local()) - .def(py::init()) - .def("element_type", &velox::ArrayType::elementType); - - using J = typename velox::FixedSizeArrayType; - py::class_>( - m, "VeloxFixedArrayType", py::module_local()) - .def(py::init()) - .def("element_type", &velox::FixedSizeArrayType::elementType) - .def("fixed_width", &velox::FixedSizeArrayType::fixedElementsWidth); + + using J = velox::FixedSizeArrayType; // Empty Column m.def("Column", [](std::shared_ptr type) { @@ -737,14 +730,6 @@ void declareMapType(py::module& m) { .def("slice", &MapColumn::slice); using I = typename velox::TypeTraits::ImplType; - py::class_>( - m, - "VeloxMapType", - // TODO: Move the Koksi binding of Velox type to OSS - py::module_local()) - .def(py::init()) - .def("key_type", &velox::MapType::keyType) - .def("value_type", &velox::MapType::valueType); m.def("Column", [](std::shared_ptr type) { return std::make_unique(type); @@ -772,19 +757,7 @@ void declareRowType(py::module& m) { }); using I = typename velox::TypeTraits::ImplType; - py::class_>( - m, - "VeloxRowType", - // TODO: Move the Koksi binding of Velox type to OSS - py::module_local()) - .def(py::init< - std::vector&&, - std::vector>&&>()) - .def("size", &I::size) - .def("get_child_idx", &I::getChildIdx) - .def("contains_child", &I::containsChild) - .def("name_of", &I::nameOf) - .def("child_at", &I::childAt); + m.def("Column", [](std::shared_ptr type) { return std::make_unique(type); }); @@ -833,33 +806,6 @@ PYBIND11_MODULE(_torcharrow, m) { .def_property_readonly("length", &BaseColumn::getLength) .def("__len__", &BaseColumn::getLength); - py::enum_( - m, - "TypeKind", // TODO: Move the Koksi binding of Velox type to OSS - py::module_local()) - .value("BOOLEAN", velox::TypeKind::BOOLEAN) - .value("TINYINT", velox::TypeKind::TINYINT) - .value("SMALLINT", velox::TypeKind::SMALLINT) - .value("INTEGER", velox::TypeKind::INTEGER) - .value("BIGINT", velox::TypeKind::BIGINT) - .value("REAL", velox::TypeKind::REAL) - .value("DOUBLE", velox::TypeKind::DOUBLE) - .value("VARCHAR", velox::TypeKind::VARCHAR) - .value("VARBINARY", velox::TypeKind::VARBINARY) - .value("TIMESTAMP", velox::TypeKind::TIMESTAMP) - .value("ARRAY", velox::TypeKind::ARRAY) - .value("MAP", velox::TypeKind::MAP) - .value("ROW", velox::TypeKind::ROW) - .export_values(); - - py::class_>( - m, - "VeloxType", - // TODO: Move the Koksi binding of Velox type to OSS - py::module_local()) - .def("kind", &velox::Type::kind) - .def("kind_name", &velox::Type::kindName); - declareIntegralType(m); declareIntegralType(m); declareIntegralType(m); @@ -889,6 +835,8 @@ PYBIND11_MODULE(_torcharrow, m) { declareFloatingType(m); declareFloatingType(m); + pyvelox::addVeloxBindings(m); + declareSimpleType( m, [](const velox::StringView& val) { diff --git a/csrc/velox/pyvelox/CMakeLists.txt b/csrc/velox/pyvelox/CMakeLists.txt new file mode 100644 index 000000000..5c4cc01ea --- /dev/null +++ b/csrc/velox/pyvelox/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +if(${CREATE_PYVELOX_MODULE}) + + # Define our Python module: + pybind11_add_module( + _pyvelox + MODULE + NO_EXTRAS # TODO: LTO crashes GCC9.2. File issues to pybind11 + pyvelox.cpp + pyvelox.h + ) + + # Link with Velox: + target_link_libraries(_pyvelox PRIVATE + velox_type + ) + + install( + TARGETS _pyvelox + LIBRARY DESTINATION . + ) +else() + add_library(pyvelox pyvelox.cpp pyvelox.h) + target_link_libraries( + pyvelox + velox_type + pybind11::module) +endif() diff --git a/csrc/velox/pyvelox/__init__.py b/csrc/velox/pyvelox/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/csrc/velox/pyvelox/pyvelox.cpp b/csrc/velox/pyvelox/pyvelox.cpp new file mode 100644 index 000000000..320a52038 --- /dev/null +++ b/csrc/velox/pyvelox/pyvelox.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "pyvelox.h" // @manual + +namespace facebook::pyvelox { +using namespace velox; +namespace py = pybind11; + +std::string serializeType(const std::shared_ptr& type) { + const auto& obj = type->serialize(); + return folly::json::serialize(obj, velox::getSerializationOptions()); +} + +#ifdef CREATE_PYVELOX_MODULE +PYBIND11_MODULE(_pyvelox, m) { + m.doc() = R"pbdoc( + PyVelox native code module + ----------------------- + )pbdoc"; + + addVeloxBindings(m); + + m.attr("__version__") = "dev"; +} +#endif +} // namespace facebook::pyvelox diff --git a/csrc/velox/pyvelox/pyvelox.h b/csrc/velox/pyvelox/pyvelox.h new file mode 100644 index 000000000..0eebdc144 --- /dev/null +++ b/csrc/velox/pyvelox/pyvelox.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace facebook::pyvelox { + +std::string serializeType(const std::shared_ptr& type); + +// Inlining these bindings since adding them to the cpp file results in a +// ASAN error. +inline void addVeloxBindings(pybind11::module& m, bool asLocalModule = true) { + using namespace velox; + namespace py = pybind11; + + py::enum_(m, "TypeKind", py::module_local(asLocalModule)) + .value("BOOLEAN", velox::TypeKind::BOOLEAN) + .value("TINYINT", velox::TypeKind::TINYINT) + .value("SMALLINT", velox::TypeKind::SMALLINT) + .value("INTEGER", velox::TypeKind::INTEGER) + .value("BIGINT", velox::TypeKind::BIGINT) + .value("REAL", velox::TypeKind::REAL) + .value("DOUBLE", velox::TypeKind::DOUBLE) + .value("VARCHAR", velox::TypeKind::VARCHAR) + .value("VARBINARY", velox::TypeKind::VARBINARY) + .value("TIMESTAMP", velox::TypeKind::TIMESTAMP) + .value("OPAQUE", velox::TypeKind::OPAQUE) + .value("ARRAY", velox::TypeKind::ARRAY) + .value("MAP", velox::TypeKind::MAP) + .value("ROW", velox::TypeKind::ROW) + .export_values(); + + py::class_> type( + m, "VeloxType", py::module_local(asLocalModule)); + + py::class_> booleanType( + m, "BooleanType", py::module_local(asLocalModule)); + py::class_> integerType( + m, "IntegerType", py::module_local(asLocalModule)); + py::class_> bigintType( + m, "BigintType", py::module_local(asLocalModule)); + py::class_> smallintType( + m, "SmallintType", py::module_local(asLocalModule)); + py::class_> tinyintType( + m, "TinyintType", py::module_local(asLocalModule)); + py::class_> realType( + m, "RealType", py::module_local(asLocalModule)); + py::class_> doubleType( + m, "DoubleType", py::module_local(asLocalModule)); + py::class_> timestampType( + m, "TimestampType", py::module_local(asLocalModule)); + py::class_> varcharType( + m, "VarcharType", py::module_local(asLocalModule)); + py::class_> varbinaryType( + m, "VarbinaryType", py::module_local(asLocalModule)); + py::class_> arrayType( + m, "ArrayType", py::module_local(asLocalModule)); + py::class_> mapType( + m, "MapType", py::module_local(asLocalModule)); + py::class_> rowType( + m, "RowType", py::module_local(asLocalModule)); + py::class_> + fixedArrayType(m, "FixedSizeArrayType", py::module_local(asLocalModule)); + + type.def("__str__", &Type::toString); + // Gcc doesnt support the below kind of templatization. +#if defined(__clang__) + type.def(py::self == py::self); + type.def(py::self != py::self); +#endif + type.def( + "cpp_size_in_bytes", + &Type::cppSizeInBytes, + "Return the C++ size in bytes"); + type.def( + "is_fixed_width", + &Type::isFixedWidth, + "Check if the type is fixed width"); + type.def( + "is_primitive_type", + &Type::isPrimitiveType, + "Check if the type is a primitive type"); + type.def("kind", &Type::kind, "Returns the kind of the type"); + type.def("serialize", &serializeType, "Serializes the type as JSON"); + + booleanType.def(py::init()); + tinyintType.def(py::init()); + smallintType.def(py::init()); + integerType.def(py::init()); + bigintType.def(py::init()); + realType.def(py::init()); + doubleType.def(py::init()); + varcharType.def(py::init()); + varbinaryType.def(py::init()); + timestampType.def(py::init()); + arrayType.def(py::init>()); + arrayType.def( + "element_type", &ArrayType::elementType, "Return the element type"); + fixedArrayType.def(py::init()) + .def("element_type", &velox::FixedSizeArrayType::elementType) + .def("fixed_width", &velox::FixedSizeArrayType::fixedElementsWidth); + mapType.def(py::init, std::shared_ptr>()); + mapType.def("key_type", &MapType::keyType, "Return the key type"); + mapType.def("value_type", &MapType::valueType, "Return the value type"); + + rowType.def(py::init< + std::vector, + std::vector>>()); + rowType.def("size", &RowType::size, "Return the number of columns"); + rowType.def( + "child_at", + &RowType::childAt, + "Return the type of the column at a given index", + py::arg("idx")); + rowType.def( + "find_child", + [](const std::shared_ptr& type, const std::string& name) { + return type->findChild(name); + }, + "Return the type of the column with the given name", + py::arg("name")); + rowType.def( + "get_child_idx", + &RowType::getChildIdx, + "Return the index of the column with the given name", + py::arg("name")); + rowType.def( + "name_of", + &RowType::nameOf, + "Return the name of the column at the given index", + py::arg("idx")); + rowType.def("names", &RowType::names, "Return the names of the columns"); +} + +} // namespace facebook::pyvelox diff --git a/torcharrow/test/lib_test/test_column.py b/torcharrow/test/lib_test/test_column.py index 5ce18a012..efacdacca 100644 --- a/torcharrow/test/lib_test/test_column.py +++ b/torcharrow/test/lib_test/test_column.py @@ -277,111 +277,111 @@ def test_SimpleColumnUTF(self) -> None: def test_ConstantColumn(self) -> None: # INTEGER - col = ta.ConstantColumn(42, 6, ta.VeloxType_INTEGER()) - self.assertTrue(isinstance(col.type(), ta.VeloxType_INTEGER)) + col = ta.ConstantColumn(42, 6, ta.IntegerType()) + self.assertTrue(isinstance(col.type(), ta.IntegerType)) self.assert_Column(col, [42] * 6) ########### # BIGINT col = ta.ConstantColumn(42, 6) - self.assertTrue(isinstance(col.type(), ta.VeloxType_BIGINT)) + self.assertTrue(isinstance(col.type(), ta.BigintType)) self.assert_Column(col, [42] * 6) # Test use constant column for normal add data = [1, -2, None, 3, -4, None] num_column = infer_column(data) add_result = num_column.add(col) - self.assertTrue(isinstance(add_result.type(), ta.VeloxType_BIGINT)) + self.assertTrue(isinstance(add_result.type(), ta.BigintType)) self.assert_Column(add_result, [43, 40, None, 45, 38, None]) add_result = col.add(num_column) - self.assertTrue(isinstance(add_result.type(), ta.VeloxType_BIGINT)) + self.assertTrue(isinstance(add_result.type(), ta.BigintType)) self.assert_Column(add_result, [43, 40, None, 45, 38, None]) ########### # REAL col = ta.ConstantColumn(4.2, 6) - self.assertTrue(isinstance(col.type(), ta.VeloxType_REAL)) + self.assertTrue(isinstance(col.type(), ta.RealType)) self.assert_Column(col, [4.2] * 6) # Test use constant column for normal add data = [1.2, -2.3, None, 3.4, -4.6, None] num_column = infer_column(data) add_result = num_column.add(col) - self.assertTrue(isinstance(add_result.type(), ta.VeloxType_REAL)) + self.assertTrue(isinstance(add_result.type(), ta.RealType)) self.assert_Column(add_result, [5.4, 1.9, None, 7.6, -0.4, None]) add_result = col.add(num_column) - self.assertTrue(isinstance(add_result.type(), ta.VeloxType_REAL)) + self.assertTrue(isinstance(add_result.type(), ta.RealType)) self.assert_Column(add_result, [5.4, 1.9, None, 7.6, -0.4, None]) ########### # VARCHAR col = ta.ConstantColumn("abc", 6) - self.assertTrue(isinstance(col.type(), ta.VeloxType_VARCHAR)) + self.assertTrue(isinstance(col.type(), ta.VarcharType)) self.assert_Column(col, ["abc"] * 6) def test_FromPyList(self) -> None: # BIGINT - col = ta.Column(ta.VeloxType_BIGINT(), [1, 2, None, 4]) - self.assertTrue(isinstance(col.type(), ta.VeloxType_BIGINT)) + col = ta.Column(ta.BigintType(), [1, 2, None, 4]) + self.assertTrue(isinstance(col.type(), ta.BigintType)) self.assert_Column(col, [1, 2, None, 4]) # INTEGER - col = ta.Column(ta.VeloxType_INTEGER(), [1, 2, None, 4]) - self.assertTrue(isinstance(col.type(), ta.VeloxType_INTEGER)) + col = ta.Column(ta.IntegerType(), [1, 2, None, 4]) + self.assertTrue(isinstance(col.type(), ta.IntegerType)) self.assert_Column(col, [1, 2, None, 4]) # SMALLINT - col = ta.Column(ta.VeloxType_SMALLINT(), [1, 2, None, 4]) - self.assertTrue(isinstance(col.type(), ta.VeloxType_SMALLINT)) + col = ta.Column(ta.SmallintType(), [1, 2, None, 4]) + self.assertTrue(isinstance(col.type(), ta.SmallintType)) self.assert_Column(col, [1, 2, None, 4]) # TINYINT - col = ta.Column(ta.VeloxType_TINYINT(), [1, 2, None, 4]) - self.assertTrue(isinstance(col.type(), ta.VeloxType_TINYINT)) + col = ta.Column(ta.TinyintType(), [1, 2, None, 4]) + self.assertTrue(isinstance(col.type(), ta.TinyintType)) self.assert_Column(col, [1, 2, None, 4]) # REAL - col = ta.Column(ta.VeloxType_REAL(), [1, 2, None, 4]) - self.assertTrue(isinstance(col.type(), ta.VeloxType_REAL)) + col = ta.Column(ta.RealType(), [1, 2, None, 4]) + self.assertTrue(isinstance(col.type(), ta.RealType)) self.assert_Column(col, [1.0, 2.0, None, 4.0]) # DOUBLE - col = ta.Column(ta.VeloxType_DOUBLE(), [1, 2, None, 4]) - self.assertTrue(isinstance(col.type(), ta.VeloxType_DOUBLE)) + col = ta.Column(ta.DoubleType(), [1, 2, None, 4]) + self.assertTrue(isinstance(col.type(), ta.DoubleType)) self.assert_Column(col, [1.0, 2.0, None, 4.0]) # BOOLEAN - col = ta.Column(ta.VeloxType_BOOLEAN(), [True, False, None, True]) - self.assertTrue(isinstance(col.type(), ta.VeloxType_BOOLEAN)) + col = ta.Column(ta.BooleanType(), [True, False, None, True]) + self.assertTrue(isinstance(col.type(), ta.BooleanType)) self.assert_Column(col, [True, False, None, True]) # VARCHAR - col = ta.Column(ta.VeloxType_VARCHAR(), ["foo", "bar", None, "abc"]) - self.assertTrue(isinstance(col.type(), ta.VeloxType_VARCHAR)) + col = ta.Column(ta.VarcharType(), ["foo", "bar", None, "abc"]) + self.assertTrue(isinstance(col.type(), ta.VarcharType)) self.assert_Column(col, ["foo", "bar", None, "abc"]) # ARRAY of scalar element col = ta.Column( - ta.VeloxArrayType(ta.VeloxType_VARCHAR()), + ta.ArrayType(ta.VarcharType()), [["foo", "bar"], None, ["abc", None]], ) - self.assertTrue(isinstance(col.type(), ta.VeloxArrayType)) - self.assertTrue(isinstance(col.type().element_type(), ta.VeloxType_VARCHAR)) + self.assertTrue(isinstance(col.type(), ta.ArrayType)) + self.assertTrue(isinstance(col.type().element_type(), ta.VarcharType)) self.assert_Column(col, [["foo", "bar"], None, ["abc", None]]) # ARRAY of ROW element # col = ta.Column( - # ta.VeloxArrayType( - # ta.VeloxRowType( - # ["f1", "f2"], [ta.VeloxType_VARCHAR(), ta.VeloxType_BIGINT()] + # ta.ArrayType( + # ta.RowType( + # ["f1", "f2"], [ta.VarcharType(), ta.BigintType()] # ) # ), # [[("foo", 1), ("bar", 2)], None, [("abc", 3), ("def", 4)]], # ) - # self.assertTrue(isinstance(col.type(), ta.VeloxArrayType)) - # self.assertTrue(isinstance(col.type().element_type(), ta.VeloxRowType)) + # self.assertTrue(isinstance(col.type(), ta.ArrayType)) + # self.assertTrue(isinstance(col.type().element_type(), ta.RowType)) # self.assert_Column( # col, [[("foo", 1), ("bar", 2)], None, [("abc", 3), ("def", 4)]] # ) @@ -435,9 +435,9 @@ def test_ToArrow_Struct(self) -> None: ptr_array = int(ffi.cast("uintptr_t", c_array)) col = ta.Column( - ta.VeloxRowType( + ta.RowType( ["f1", "f2"], - [ta.VeloxType_INTEGER(), ta.VeloxType_INTEGER()], + [ta.IntegerType(), ta.IntegerType()], ) ) col.child_at(0).append(1) @@ -480,7 +480,7 @@ def test_FromArrow_Numerical(self) -> None: # pyre-fixme[16]: Item `Array` of `Union[Array[typing.Any], ChunkedArray]` # has no attribute `_export_to_c`. a._export_to_c(ptr_array, ptr_schema) - col = ta._import_from_arrow(ta.VeloxType_BIGINT(), ptr_array, ptr_schema) + col = ta._import_from_arrow(ta.BigintType(), ptr_array, ptr_schema) self.assertEqual(len(col), 4) self.assertEqual(col.get_null_count(), 2) self.assertTrue(col.is_null_at(0)) @@ -513,9 +513,9 @@ def test_FromArrow_Struct(self) -> None: # has no attribute `_export_to_c`. s._export_to_c(ptr_array, ptr_schema) col = ta._import_from_arrow( - ta.VeloxRowType( + ta.RowType( ["f1", "f2"], - [ta.VeloxType_INTEGER(), ta.VeloxType_BOOLEAN()], + [ta.IntegerType(), ta.BooleanType()], ), ptr_array, ptr_schema, @@ -533,14 +533,14 @@ def test_FromArrow_Struct(self) -> None: def is_same_type(a, b) -> bool: - if isinstance(a, ta.VeloxType_BIGINT): - return isinstance(b, ta.VeloxType_BIGINT) - if isinstance(a, ta.VeloxType_VARCHAR): - return isinstance(b, ta.VeloxType_VARCHAR) - if isinstance(a, ta.VeloxType_BOOLEAN): - return isinstance(b, ta.VeloxType_BOOLEAN) - if isinstance(a, ta.VeloxArrayType): - return isinstance(b, ta.VeloxArrayType) and is_same_type( + if isinstance(a, ta.BigintType): + return isinstance(b, ta.BigintType) + if isinstance(a, ta.VarcharType): + return isinstance(b, ta.VarcharType) + if isinstance(a, ta.BooleanType): + return isinstance(b, ta.BooleanType) + if isinstance(a, ta.ArrayType): + return isinstance(b, ta.ArrayType) and is_same_type( a.element_type(), b.element_type() ) raise NotImplementedError() @@ -569,11 +569,11 @@ def infer_column(data) -> ta.BaseColumn: def resolve_column_with_arbitrary_type(unresolved: Unresolved) -> ta.BaseColumn: if isinstance(unresolved, UnresolvedArray): element = resolve_column_with_arbitrary_type(unresolved.element_type) - col = ta.Column(ta.VeloxArrayType(element.type())) + col = ta.Column(ta.ArrayType(element.type())) col.append(element) return col else: - return ta.Column(ta.VeloxType_BIGINT()) + return ta.Column(ta.BigintType()) def get_union_type(inferred_columns: List[Union[ta.BaseColumn, Unresolved, None]]): @@ -620,7 +620,7 @@ def _infer_column(data) -> Union[ta.BaseColumn, Unresolved, None]: return UnresolvedArray(union_type) else: resolved_item_type = union_type - col = ta.Column(ta.VeloxArrayType(resolved_item_type)) + col = ta.Column(ta.ArrayType(resolved_item_type)) for item_col, item in zip(inferred_columns, data): if item is None: resolved_item_col = None @@ -653,12 +653,12 @@ def _infer_column(data) -> Union[ta.BaseColumn, Unresolved, None]: keys_array_type = inferred_keys_array_columns.type() values_array_type = inferred_values_array_columns.type() - if isinstance(keys_array_type, ta.VeloxArrayType) and isinstance( + if isinstance(keys_array_type, ta.ArrayType) and isinstance( values_array_type, - ta.VeloxArrayType, + ta.ArrayType, ): col = ta.Column( - ta.VeloxMapType( + ta.MapType( keys_array_type.element_type(), values_array_type.element_type() ) ) @@ -681,10 +681,10 @@ def _infer_column(data) -> Union[ta.BaseColumn, Unresolved, None]: else: type_ = { - int: ta.VeloxType_BIGINT(), - float: ta.VeloxType_REAL(), - str: ta.VeloxType_VARCHAR(), - bool: ta.VeloxType_BOOLEAN(), + int: ta.BigintType(), + float: ta.RealType(), + str: ta.VarcharType(), + bool: ta.BooleanType(), }.get(type(non_null_item)) if type_ is None: raise NotImplementedError(f"Cannot infer {type(non_null_item)}") @@ -705,12 +705,12 @@ def resolve_column(item, type_) -> ta.BaseColumn: col.append_null() else: if type(type_) in ( - ta.VeloxType_INTEGER, - ta.VeloxType_VARCHAR, - ta.VeloxType_BOOLEAN, + ta.IntegerType, + ta.VarcharType, + ta.BooleanType, ): col.append(value) - elif type(type_) == ta.VeloxArrayType: + elif type(type_) == ta.ArrayType: col.append(resolve_column(value, type_.element_type())) else: raise NotImplementedError(f"{type(type_)}") @@ -721,12 +721,12 @@ class TestInferColumn(unittest.TestCase): def test_infer_simple(self) -> None: data = [1, 2, 3] type_ = infer_column(data).type() - self.assertTrue(is_same_type(type_, ta.VeloxType_BIGINT())) + self.assertTrue(is_same_type(type_, ta.BigintType())) def test_infer_array(self) -> None: data = [[1], [2], [3]] type_ = infer_column(data).type() - self.assertTrue(is_same_type(type_, ta.VeloxArrayType(ta.VeloxType_BIGINT()))) + self.assertTrue(is_same_type(type_, ta.ArrayType(ta.BigintType()))) def test_infer_nested_array(self) -> None: data = [[[1]], [[2], [5]], [[3, 4]]] @@ -734,34 +734,34 @@ def test_infer_nested_array(self) -> None: self.assertTrue( is_same_type( type_, - ta.VeloxArrayType(ta.VeloxArrayType(ta.VeloxType_BIGINT())), + ta.ArrayType(ta.ArrayType(ta.BigintType())), ) ) def test_unresolved(self) -> None: data = [] type_ = infer_column(data).type() - self.assertTrue(is_same_type(type_, ta.VeloxType_BIGINT())) + self.assertTrue(is_same_type(type_, ta.BigintType())) def test_nested_unresolved1(self) -> None: data = [[]] type_ = infer_column(data).type() - self.assertTrue(is_same_type(type_, ta.VeloxArrayType(ta.VeloxType_BIGINT()))) + self.assertTrue(is_same_type(type_, ta.ArrayType(ta.BigintType()))) def test_nested_unresolved2(self) -> None: data = [None] type_ = infer_column(data).type() - self.assertTrue(is_same_type(type_, ta.VeloxType_BIGINT())) + self.assertTrue(is_same_type(type_, ta.BigintType())) def test_nested_unresolved3(self) -> None: data = [[None]] type_ = infer_column(data).type() - self.assertTrue(is_same_type(type_, ta.VeloxArrayType(ta.VeloxType_BIGINT()))) + self.assertTrue(is_same_type(type_, ta.ArrayType(ta.BigintType()))) def test_propagate_unresolved(self) -> None: data = [None, [], [1], [1, None, 2], None] type_ = infer_column(data).type() - self.assertTrue(is_same_type(type_, ta.VeloxArrayType(ta.VeloxType_BIGINT()))) + self.assertTrue(is_same_type(type_, ta.ArrayType(ta.BigintType()))) class TestArrayColumns(BaseTestColumns): @@ -899,9 +899,9 @@ def test_MapColumnInt64_with_none(self) -> None: class TestRowColumns(unittest.TestCase): def test_RowColumn1(self) -> None: col = ta.Column( - ta.VeloxRowType( + ta.RowType( ["a", "b"], - [ta.VeloxType_INTEGER(), ta.VeloxType_VARCHAR()], + [ta.IntegerType(), ta.VarcharType()], ) ) col.child_at(0).append(1) @@ -928,9 +928,9 @@ def test_RowColumn1(self) -> None: def test_set_child(self) -> None: col = ta.Column( - ta.VeloxRowType( + ta.RowType( ["a", "b"], - [ta.VeloxType_INTEGER(), ta.VeloxType_VARCHAR()], + [ta.IntegerType(), ta.VarcharType()], ) ) col.child_at(0).append(1) @@ -952,13 +952,13 @@ def test_set_child(self) -> None: def test_nested_row(self) -> None: col = ta.Column( - ta.VeloxRowType( + ta.RowType( ["a", "b"], [ - ta.VeloxType_INTEGER(), - ta.VeloxRowType( + ta.IntegerType(), + ta.RowType( ["b1", "b2"], - [ta.VeloxType_VARCHAR(), ta.VeloxType_INTEGER()], + [ta.VarcharType(), ta.IntegerType()], ), ], ) diff --git a/torcharrow/test/lib_test/test_udf.py b/torcharrow/test/lib_test/test_udf.py index ab465df3e..b54dc35ed 100644 --- a/torcharrow/test/lib_test/test_udf.py +++ b/torcharrow/test/lib_test/test_udf.py @@ -41,7 +41,7 @@ def construct_simple_column(velox_type, data: List[Any]): def test_basic(self) -> None: # test some UDFs together data = ["abc", "ABC", "XYZ123", None, "xYZ", "123", "äöå"] - col = self.construct_simple_column(ta.VeloxType_VARCHAR(), data) + col = self.construct_simple_column(ta.VarcharType(), data) lcol = ta.generic_udf_dispatch("lower", [col]) self.assert_SimpleColumn( @@ -83,18 +83,18 @@ def test_basic(self) -> None: ) data2 = [1, 2, 3, None, 5, None, -7] - col2 = self.construct_simple_column(ta.VeloxType_BIGINT(), data2) + col2 = self.construct_simple_column(ta.BigintType(), data2) neg = ta.generic_udf_dispatch("negate", [col2]) self.assert_SimpleColumn(neg, [-1, -2, -3, None, -5, None, 7]) data3 = ["\n", "a", "\t", "76", " ", None] - col3 = self.construct_simple_column(ta.VeloxType_VARCHAR(), data3) + col3 = self.construct_simple_column(ta.VarcharType(), data3) isspace = ta.generic_udf_dispatch("torcharrow_isspace", [col3]) self.assert_SimpleColumn(isspace, [True, False, True, False, True, None]) data4 = ["a b c", "d,e,f"] - col4 = self.construct_simple_column(ta.VeloxType_VARCHAR(), data4) + col4 = self.construct_simple_column(ta.VarcharType(), data4) splits = ta.generic_udf_dispatch( "split", [col4, ta.ConstantColumn(" ", len(data4))], @@ -105,26 +105,26 @@ def test_basic(self) -> None: self.assert_SimpleColumn(splits[i], expected[i]) def test_coalesce(self) -> None: - col1 = self.construct_simple_column(ta.VeloxType_BIGINT(), [1, 2, None, 3]) + col1 = self.construct_simple_column(ta.BigintType(), [1, 2, None, 3]) result = ta.generic_udf_dispatch( "coalesce", [col1, ta.ConstantColumn(42, len(col1))], ) self.assert_SimpleColumn(result, [1, 2, 42, 3]) - self.assertTrue(isinstance(result.type(), ta.VeloxType_BIGINT)) + self.assertTrue(isinstance(result.type(), ta.BigintType)) - col2 = self.construct_simple_column(ta.VeloxType_INTEGER(), [1, 2, None, 3]) + col2 = self.construct_simple_column(ta.IntegerType(), [1, 2, None, 3]) result = ta.generic_udf_dispatch( "coalesce", - [col2, ta.ConstantColumn(42, len(col2), ta.VeloxType_INTEGER())], + [col2, ta.ConstantColumn(42, len(col2), ta.IntegerType())], ) self.assert_SimpleColumn(result, [1, 2, 42, 3]) - self.assertTrue(isinstance(result.type(), ta.VeloxType_INTEGER)) + self.assertTrue(isinstance(result.type(), ta.IntegerType)) def test_regex(self) -> None: # test some regex UDF data = ["abc", "a1", "b2", "c3", "___d4___", None] - col = self.construct_simple_column(ta.VeloxType_VARCHAR(), data) + col = self.construct_simple_column(ta.VarcharType(), data) match = ta.generic_udf_dispatch( "match_re", @@ -139,7 +139,7 @@ def test_regex(self) -> None: self.assert_SimpleColumn(search, [False, True, True, True, True, None]) data = ["d4e5", "a1", "b2", "c3", "___d4___f6"] - col = self.construct_simple_column(ta.VeloxType_VARCHAR(), data) + col = self.construct_simple_column(ta.VarcharType(), data) extract = ta.generic_udf_dispatch( "regexp_extract_all", [col, ta.ConstantColumn("([a-z])\\d", 5)], @@ -151,7 +151,7 @@ def test_regex(self) -> None: def test_lower(self) -> None: data = ["abc", "ABC", "XYZ123", None, "xYZ", "123", "äöå"] - col = self.construct_simple_column(ta.VeloxType_VARCHAR(), data) + col = self.construct_simple_column(ta.VarcharType(), data) lcol = ta.generic_udf_dispatch("lower", [col]) self.assert_SimpleColumn( lcol, ["abc", "abc", "xyz123", None, "xyz", "123", "äöå"] @@ -170,7 +170,7 @@ def test_istitle(self) -> None: "AaBbCd", None, ] - col = self.construct_simple_column(ta.VeloxType_VARCHAR(), data) + col = self.construct_simple_column(ta.VarcharType(), data) istitle = ta.generic_udf_dispatch("torcharrow_istitle", [col]) self.assert_SimpleColumn( istitle, @@ -197,14 +197,14 @@ def test_istitle(self) -> None: "A1 B2", "A1B2", ] - col = self.construct_simple_column(ta.VeloxType_VARCHAR(), data) + col = self.construct_simple_column(ta.VarcharType(), data) istitle = ta.generic_udf_dispatch("torcharrow_istitle", [col]) self.assert_SimpleColumn(istitle, [True, True, True, True, True, True, True]) def test_isnumeric(self) -> None: # All False data = ["-1", "1.5", "+2", "abc", "AA", "VIII", "1/3", None] - col = self.construct_simple_column(ta.VeloxType_VARCHAR(), data) + col = self.construct_simple_column(ta.VarcharType(), data) lcol = ta.generic_udf_dispatch("torcharrow_isnumeric", [col]) self.assert_SimpleColumn( lcol, [False, False, False, False, False, False, False, None] @@ -212,13 +212,13 @@ def test_isnumeric(self) -> None: # All True data = ["9876543210123456789", "ⅧⅪ", "ⅷ〩𐍁ᛯ", "᧖७𝟡௫6", "¼⑲⑹⓲➎㉏𐧯"] - col = self.construct_simple_column(ta.VeloxType_VARCHAR(), data) + col = self.construct_simple_column(ta.VarcharType(), data) lcol = ta.generic_udf_dispatch("torcharrow_isnumeric", [col]) self.assert_SimpleColumn(lcol, [True, True, True, True, True]) def test_trinary(self) -> None: data = ["abc", "ABC", "XYZ123", None, "xYZ", "123", "äöå"] - col = self.construct_simple_column(ta.VeloxType_VARCHAR(), data) + col = self.construct_simple_column(ta.VarcharType(), data) # substr, 3 parameters substr = ta.generic_udf_dispatch( @@ -229,10 +229,10 @@ def test_trinary(self) -> None: def test_quaternary(self) -> None: # Based on https://github.com/facebookincubator/velox/blob/8d7fe84d2ce43df952e08ac1015d5bc7c6b868ab/velox/functions/prestosql/tests/ArithmeticTest.cpp#L353-L357 - x = self.construct_simple_column(ta.VeloxType_DOUBLE(), [3.14, 2, -1]) - bound1 = self.construct_simple_column(ta.VeloxType_DOUBLE(), [0, 0, 0]) - bound2 = self.construct_simple_column(ta.VeloxType_DOUBLE(), [4, 4, 3.2]) - bucketCount = self.construct_simple_column(ta.VeloxType_BIGINT(), [3, 3, 4]) + x = self.construct_simple_column(ta.DoubleType(), [3.14, 2, -1]) + bound1 = self.construct_simple_column(ta.DoubleType(), [0, 0, 0]) + bound2 = self.construct_simple_column(ta.DoubleType(), [4, 4, 3.2]) + bucketCount = self.construct_simple_column(ta.BigintType(), [3, 3, 4]) widthBucket = ta.generic_udf_dispatch( "width_bucket", [x, bound1, bound2, bucketCount] diff --git a/torcharrow/test/test_dtypes.py b/torcharrow/test/test_dtypes.py index 25f2bac4e..a938c2573 100644 --- a/torcharrow/test/test_dtypes.py +++ b/torcharrow/test/test_dtypes.py @@ -110,23 +110,21 @@ def test_serialization(self): self.assertEqual(dtype, deserialized_dtype) def test_convert_velox_type_array(self): - vType = velox.VeloxArrayType(velox.VeloxArrayType(velox.VeloxType_VARCHAR())) + vType = velox.ArrayType(velox.ArrayType(velox.VarcharType())) dType = dtype_of_velox_type(vType) self.assertTrue(is_list(dType)) self.assertTrue(is_list(dType.item_dtype)) self.assertTrue(is_string(dType.item_dtype.item_dtype)) def test_convert_velox_type_map(self): - vType = velox.VeloxMapType(velox.VeloxType_VARCHAR(), velox.VeloxType_BIGINT()) + vType = velox.MapType(velox.VarcharType(), velox.BigintType()) dType = dtype_of_velox_type(vType) self.assertTrue(is_map(dType)) self.assertTrue(is_string(dType.key_dtype)) self.assertTrue(is_int64(dType.item_dtype)) def test_convert_velox_type_row(self): - vType = velox.VeloxRowType( - ["c0", "c1"], [velox.VeloxType_VARCHAR(), velox.VeloxType_BIGINT()] - ) + vType = velox.RowType(["c0", "c1"], [velox.VarcharType(), velox.BigintType()]) dType = dtype_of_velox_type(vType) self.assertTrue(is_struct(dType)) self.assertEqual( diff --git a/torcharrow/test/test_list_column.py b/torcharrow/test/test_list_column.py index b18db0b52..1464237f0 100644 --- a/torcharrow/test/test_list_column.py +++ b/torcharrow/test/test_list_column.py @@ -160,7 +160,7 @@ def base_test_fixed_size_list(self): [[1, 2], [3, 4]], dtype=dt.List(item_dtype=dt.int64, fixed_size=2) ) self.assertEqual(d.dtype.fixed_size, 2) - self.assertEqual(type(d._data.type()), velox.VeloxFixedArrayType) + self.assertEqual(type(d._data.type()), velox.FixedSizeArrayType) self.assertEqual(d._data.type().fixed_width(), 2) # Unequal length cells are disallowed @@ -180,14 +180,14 @@ def base_test_fixed_size_list(self): e = d.append(d) self.assertEqual(list(e), [[1, 2], [3, 4], [1, 2], [3, 4]]) self.assertEqual(e.dtype.fixed_size, 2) - self.assertEqual(type(e._data.type()), velox.VeloxFixedArrayType) + self.assertEqual(type(e._data.type()), velox.FixedSizeArrayType) self.assertEqual(e._data.type().fixed_width(), 2) # Appending a non-fixed list of same size f = d.append(ta.column([[4, 5], [5, 6]])) self.assertEqual(list(f), [[1, 2], [3, 4], [4, 5], [5, 6]]) self.assertEqual(f.dtype.fixed_size, 2) - self.assertEqual(type(f._data.type()), velox.VeloxFixedArrayType) + self.assertEqual(type(f._data.type()), velox.FixedSizeArrayType) self.assertEqual(f._data.type().fixed_width(), 2) # Appending a fixed list of different size diff --git a/torcharrow/velox_rt/list_column_cpu.py b/torcharrow/velox_rt/list_column_cpu.py index 3f7555614..65cab76e6 100644 --- a/torcharrow/velox_rt/list_column_cpu.py +++ b/torcharrow/velox_rt/list_column_cpu.py @@ -34,9 +34,9 @@ def __init__(self, device, dtype, data, offsets, mask): ListColumn.__init__(self, device, dtype) self._data = velox.Column( - velox.VeloxArrayType(get_velox_type(dtype.item_dtype)) + velox.ArrayType(get_velox_type(dtype.item_dtype)) if dtype.fixed_size == -1 - else velox.VeloxFixedArrayType( + else velox.FixedSizeArrayType( dtype.fixed_size, get_velox_type(dtype.item_dtype) ) ) diff --git a/torcharrow/velox_rt/map_column_cpu.py b/torcharrow/velox_rt/map_column_cpu.py index e88b55990..4bc86bd83 100644 --- a/torcharrow/velox_rt/map_column_cpu.py +++ b/torcharrow/velox_rt/map_column_cpu.py @@ -33,7 +33,7 @@ def __init__(self, device, dtype, key_data, item_data, mask): MapColumn.__init__(self, device, dtype) self._data = velox.Column( - velox.VeloxMapType( + velox.MapType( get_velox_type(dtype.key_dtype), get_velox_type(dtype.item_dtype) ) ) diff --git a/torcharrow/velox_rt/typing.py b/torcharrow/velox_rt/typing.py index 38e6ba61f..a72cf14c5 100644 --- a/torcharrow/velox_rt/typing.py +++ b/torcharrow/velox_rt/typing.py @@ -16,35 +16,35 @@ def get_velox_type(dtype: dt.DType) -> velox.VeloxType: underlying_dtype = dt.get_underlying_dtype(dtype) if underlying_dtype == dt.int64: - return velox.VeloxType_BIGINT() + return velox.BigintType() elif underlying_dtype == dt.int32: - return velox.VeloxType_INTEGER() + return velox.IntegerType() elif underlying_dtype == dt.int16: - return velox.VeloxType_SMALLINT() + return velox.SmallintType() elif underlying_dtype == dt.int8: - return velox.VeloxType_TINYINT() + return velox.TinyintType() elif underlying_dtype == dt.float32: - return velox.VeloxType_REAL() + return velox.RealType() elif underlying_dtype == dt.float64: - return velox.VeloxType_DOUBLE() + return velox.DoubleType() elif underlying_dtype == dt.string: - return velox.VeloxType_VARCHAR() + return velox.VarcharType() elif underlying_dtype == dt.boolean: - return velox.VeloxType_BOOLEAN() + return velox.BooleanType() elif isinstance(underlying_dtype, dt.List): if underlying_dtype.fixed_size == -1: - return velox.VeloxArrayType(get_velox_type(underlying_dtype.item_dtype)) + return velox.ArrayType(get_velox_type(underlying_dtype.item_dtype)) else: - return velox.VeloxFixedArrayType( + return velox.FixedSizeArrayType( underlying_dtype.fixed_size, get_velox_type(underlying_dtype.item_dtype) ) elif isinstance(underlying_dtype, dt.Map): - return velox.VeloxMapType( + return velox.MapType( get_velox_type(underlying_dtype.key_dtype), get_velox_type(underlying_dtype.item_dtype), ) elif isinstance(underlying_dtype, dt.Struct): - return velox.VeloxRowType( + return velox.RowType( [f.name for f in underlying_dtype.fields], [get_velox_type(f.dtype) for f in underlying_dtype.fields], ) @@ -70,12 +70,12 @@ def dtype_of_velox_type(vtype: velox.VeloxType) -> dt.DType: if vtype.kind() == velox.TypeKind.VARCHAR: return dt.String(nullable=True) if vtype.kind() == velox.TypeKind.ARRAY: - if type(vtype) == velox.VeloxArrayType: + if type(vtype) == velox.ArrayType: return dt.List( item_dtype=dtype_of_velox_type(vtype.element_type()), nullable=True, ) - elif type(vtype) == velox.VeloxFixedArrayType: + elif type(vtype) == velox.FixedSizeArrayType: return dt.List( item_dtype=dtype_of_velox_type(vtype.element_type()), fixed_size=vtype.fixed_width(), @@ -84,16 +84,16 @@ def dtype_of_velox_type(vtype: velox.VeloxType) -> dt.DType: else: raise TypeError(f"Unknown array type {vtype}") if vtype.kind() == velox.TypeKind.MAP: - # pyre-fixme[11]: Annotation `VeloxMapType` is not defined as a type. - vtype = ty.cast(velox.VeloxMapType, vtype) + # pyre-fixme[11]: Annotation `MapType` is not defined as a type. + vtype = ty.cast(velox.MapType, vtype) return dt.Map( key_dtype=dtype_of_velox_type(vtype.key_type()), item_dtype=dtype_of_velox_type(vtype.value_type()), nullable=True, ) if vtype.kind() == velox.TypeKind.ROW: - # pyre-fixme[11]: Annotation `VeloxRowType` is not defined as a type. - vtype = ty.cast(velox.VeloxRowType, vtype) + # pyre-fixme[11]: Annotation `RowType` is not defined as a type. + vtype = ty.cast(velox.RowType, vtype) fields = [ dt.Field( name=vtype.name_of(i), dtype=dtype_of_velox_type(vtype.child_at(i))