From 63f5fe5c9469736bc8a6be27d9f9fca04188d3c2 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Tue, 21 Feb 2023 15:58:56 +0100 Subject: [PATCH] [turbodbc] Add example "Using CrateDB with turbodbc" --- by-language/python-turbodbc/.gitignore | 1 + by-language/python-turbodbc/README.rst | 166 ++++++++++++++++++ by-language/python-turbodbc/backlog.rst | 22 +++ by-language/python-turbodbc/demo.py | 69 ++++++++ .../dockerfiles/archlinux.Dockerfile | 48 +++++ .../dockerfiles/centos.Dockerfile | 9 + .../dockerfiles/debian.Dockerfile | 12 ++ .../dockerfiles/sles.Dockerfile | 24 +++ by-language/python-turbodbc/odbc.ini | 19 ++ by-language/python-turbodbc/pyproject.toml | 7 + .../python-turbodbc/requirements-prereq.txt | 4 + by-language/python-turbodbc/requirements.txt | 2 + 12 files changed, 383 insertions(+) create mode 100644 by-language/python-turbodbc/.gitignore create mode 100644 by-language/python-turbodbc/README.rst create mode 100644 by-language/python-turbodbc/backlog.rst create mode 100644 by-language/python-turbodbc/demo.py create mode 100644 by-language/python-turbodbc/dockerfiles/archlinux.Dockerfile create mode 100644 by-language/python-turbodbc/dockerfiles/centos.Dockerfile create mode 100644 by-language/python-turbodbc/dockerfiles/debian.Dockerfile create mode 100644 by-language/python-turbodbc/dockerfiles/sles.Dockerfile create mode 100644 by-language/python-turbodbc/odbc.ini create mode 100644 by-language/python-turbodbc/pyproject.toml create mode 100644 by-language/python-turbodbc/requirements-prereq.txt create mode 100644 by-language/python-turbodbc/requirements.txt diff --git a/by-language/python-turbodbc/.gitignore b/by-language/python-turbodbc/.gitignore new file mode 100644 index 00000000..831fce04 --- /dev/null +++ b/by-language/python-turbodbc/.gitignore @@ -0,0 +1 @@ +.venv* diff --git a/by-language/python-turbodbc/README.rst b/by-language/python-turbodbc/README.rst new file mode 100644 index 00000000..c9e9523b --- /dev/null +++ 
b/by-language/python-turbodbc/README.rst @@ -0,0 +1,166 @@ +########################### +Using CrateDB with turbodbc +########################### + + +***** +About +***** + +This section of the documentation describes how to connect to `CrateDB`_ +with `turbodbc`_, by providing a few example programs. + +The examples use the `unixODBC`_ implementation of `ODBC`_, and the `PostgreSQL +ODBC driver`_, for connecting to the `PostgreSQL wire protocol`_ interface of +`CrateDB`_. + +This folder also contains ``Dockerfile`` files providing environments to +exercise the examples on different operating systems, like Arch Linux, +Red Hat (CentOS), Debian, and SUSE Linux. + + +************ +Introduction +************ + +`Turbodbc`_ is a Python module to access relational databases via the `Open +Database Connectivity (ODBC)`_ interface. In addition to complying with +the `Python Database API Specification 2.0`_, turbodbc offers built-in `NumPy`_ +and `Apache Arrow`_ support for improved performance. Their slogan is: + + Don’t wait minutes for your results, just blink. + +*Note: The description texts have been taken from turbodbc's documentation 1:1.* + +Description +=========== + +Its primary target audience are data scientists that use databases for which no +efficient native Python drivers are available. + +For maximum compatibility, turbodbc complies with the `Python Database API +Specification 2.0`_ (PEP 249). For maximum performance, turbodbc internally +relies on batched data transfer instead of single-record communication as +other popular ODBC modules do. + +Why should I use turbodbc instead of other ODBC modules? +======================================================== + +- Short answer: turbodbc is faster. +- Slightly longer answer: turbodbc is faster, *much* faster if you want to + work with NumPy. 
+- Medium-length answer: The author has tested turbodbc and pyodbc (probably + the most popular Python ODBC module) with various databases (Exasol, + PostgreSQL, MySQL) and corresponding ODBC drivers. He found turbodbc to be + consistently faster. + +Smooth. What is the trick? +========================== + +Turbodbc exploits buffering. + +- Turbodbc implements both sending parameters and retrieving result sets using + buffers of multiple rows/parameter sets. This avoids round trips to the ODBC + driver and (depending how well the ODBC driver is written) to the database. +- Multiple buffers are used for asynchronous I/O. This allows to interleave + Python object conversion and direct database interaction (see performance + options below). +- Buffers contain binary representations of data. NumPy arrays contain binary + representations of data. Good thing they are often the same, so instead of + converting, the driver can just copy data. + + +***** +Setup +***** + +Install prerequisites +===================== + +Arch Linux:: + + # See `dockerfiles/archlinux.Dockerfile`. + +CentOS Stream:: + + dnf install --enablerepo=crb -y boost-devel g++ postgresql-odbc python3 python3-devel python3-pip unixODBC-devel + +Debian:: + + apt-get install --yes build-essential libboost-dev odbc-postgresql unixodbc-dev + +macOS/Homebrew:: + + brew install psqlodbc unixodbc + +SUSE Linux Enterprise Server:: + + # See `dockerfiles/sles.Dockerfile`. + +Install Python sandbox +====================== +:: + + # Create Python virtualenv and install dependency packages. + python3 -m venv .venv + source .venv/bin/activate + pip install --upgrade --requirement=requirements-prereq.txt + pip install --upgrade --requirement=requirements.txt --verbose + +.. note:: + + The `turbodbc pip installation documentation`_ says: + Please ``pip install numpy`` before installing turbodbc, because turbodbc + will search for the ``numpy`` Python package at installation/compile time. 
+ If NumPy is not installed, turbodbc will not compile the `NumPy + support`_ features. Similarly, please ``pip install pyarrow`` before + installing turbodbc if you would like to use the `Apache Arrow + support`_. + + +***** +Usage +***** + +Run CrateDB:: + + docker run --rm -it --publish=4200:4200 --publish=5432:5432 crate \ + -Cdiscovery.type=single-node -Ccluster.routing.allocation.disk.threshold_enabled=false + +Invoke demo program on workstation:: + + python demo.py + +Exercise demo program using Docker, on different operating systems:: + + docker build --progress=plain --tag local/python-turbodbc-demo --file=dockerfiles/archlinux.Dockerfile . + docker build --progress=plain --tag local/python-turbodbc-demo --file=dockerfiles/centos.Dockerfile . + docker build --progress=plain --tag local/python-turbodbc-demo --file=dockerfiles/debian.Dockerfile . + docker build --progress=plain --tag local/python-turbodbc-demo --file=dockerfiles/sles.Dockerfile . + + docker run --rm -it --volume=$(pwd):/src --network=host local/python-turbodbc-demo python3 /src/demo.py + + +******* +Backlog +******* + +The patch just contains a basic example within ``demo.py``. Advanced usage +examples to be exercised are tracked within the `backlog`_. + + + +.. _Apache Arrow: https://en.wikipedia.org/wiki/Apache_Arrow +.. _Apache Arrow support: https://turbodbc.readthedocs.io/en/latest/pages/advanced_usage.html#advanced-usage-arrow +.. _backlog: https://github.com/crate/cratedb-examples/blob/main/by-language/python-turbodbc/backlog.rst +.. _CrateDB: https://github.com/crate/crate +.. _NumPy: https://en.wikipedia.org/wiki/NumPy +.. _NumPy support: https://turbodbc.readthedocs.io/en/latest/pages/advanced_usage.html#advanced-usage-numpy +.. _ODBC: https://en.wikipedia.org/wiki/Open_Database_Connectivity +.. _Open Database Connectivity (ODBC): https://en.wikipedia.org/wiki/Open_Database_Connectivity +.. _PostgreSQL ODBC driver: https://odbc.postgresql.org/ +..
_PostgreSQL wire protocol: https://crate.io/docs/crate/reference/en/latest/interfaces/postgres.html +.. _Python Database API Specification 2.0: https://peps.python.org/pep-0249/ +.. _turbodbc: https://turbodbc.readthedocs.io/ +.. _turbodbc pip installation documentation: https://turbodbc.readthedocs.io/en/latest/pages/getting_started.html#pip +.. _unixODBC: https://www.unixodbc.org/ diff --git a/by-language/python-turbodbc/backlog.rst b/by-language/python-turbodbc/backlog.rst new file mode 100644 index 00000000..29d96ded --- /dev/null +++ b/by-language/python-turbodbc/backlog.rst @@ -0,0 +1,22 @@ +####################### +python-turbodbc backlog +####################### + +Various ideas on how this little code example can be improved. + +- [x] Provide basic example +- [x] Insert multiple records using parameters +- [x] Docs: Add installation on SUSE +- [x] Provide example(s) for different operating systems (Linux, macOS) +- [o] Docs: Drop a note about connecting with driver file vs. connecting via DSN +- [o] Evaluate different ODBC drivers +- [o] Provide an example scenario of how to run it on Windows +- [o] Exercise advanced NumPy and PyArrow options +- [o] Exchange advanced CrateDB data types like ``OBJECT``, ``ARRAY``, and friends +- [o] Use ``SSLmode = Yes`` to connect to CrateDB Cloud +- [o] Explore other driver options at `Zabbix » Recommended UnixODBC settings for PostgreSQL`_ +- [o] Check out https://github.com/dirkjonker/sqlalchemy-turbodbc +- [o] Check out https://docs.devart.com/odbc/postgresql/centos.htm + + +..
def demo_pg():
    """
    Connect to the database through the PostgreSQL ODBC driver, write a few
    records into the `testdrive` table, and print them back.

    The driver's shared library is referenced directly by file path. It is
    located by probing a list of well-known install locations for the current
    platform, in order.

    Raises:
        NotImplementedError: When running on a platform other than Linux or macOS.
        ValueError: When no driver file exists at any of the candidate locations.
    """
    # Connect to database.
    # https://turbodbc.readthedocs.io/en/latest/pages/getting_started.html#establish-a-connection-with-your-database

    # Either connect per data source name defined within the ODBC configuration,
    # connection = connect(dsn="postgresql", server="localhost", database="testdrive", uid="crate", pwd=None)

    # or connect per connection string, referencing a driver file directly.
    if sys.platform == "linux":
        candidates = [
            # archlinux
            "/usr/lib/psqlodbcw.so",
            # Debian
            "/usr/lib/x86_64-linux-gnu/odbc/psqlodbcw.so",
            # Red Hat
            "/usr/lib64/psqlodbcw.so",
        ]
    elif sys.platform == "darwin":
        candidates = [
            # Homebrew on Intel (probed first, so existing setups keep resolving the same file)
            "/usr/local/lib/psqlodbcw.so",
            # Homebrew on Apple Silicon
            "/opt/homebrew/lib/psqlodbcw.so",
        ]
    else:
        raise NotImplementedError(f"Platform {sys.platform} not supported yet")

    # Fail early with a clear message, instead of handing a non-existent
    # driver path to the ODBC driver manager.
    driver_file = find_program(candidates)
    if driver_file is None:
        raise ValueError(f"Unable to detect driver file at {candidates}")

    connection_string = f"Driver={driver_file};Server=localhost;Port=5432;Database=testdrive;Uid=crate;Pwd=;"
    print(f"INFO: Connecting to '{connection_string}'")
    connection = connect(connection_string=connection_string)

    # Make sure cursors and the connection are released even when a statement fails.
    try:
        # Insert data.
        cursor = connection.cursor()
        try:
            cursor.execute("CREATE TABLE IF NOT EXISTS testdrive (id INT PRIMARY KEY, data TEXT);")
            cursor.execute("DELETE FROM testdrive;")
            cursor.execute("INSERT INTO testdrive VALUES (0, 'zero'), (1, 'one'), (2, 'two');")
            cursor.executemany("INSERT INTO testdrive VALUES (?, ?);", [(3, "three"), (4, "four"), (5, "five")])
            # Make the freshly written records visible to the query below.
            cursor.execute("REFRESH TABLE testdrive;")
        finally:
            cursor.close()

        # Query data.
        cursor = connection.cursor()
        try:
            cursor.execute("SELECT * FROM testdrive ORDER BY id")

            print("Column metadata:")
            print(cursor.description)

            print("Results by row:")
            for row in cursor:
                print(row)
        finally:
            cursor.close()
    finally:
        # Terminate database connection.
        connection.close()


def find_program(candidates):
    """
    Return the first entry of `candidates` which exists on the filesystem.

    Args:
        candidates: Iterable of filesystem paths, probed in the given order.

    Returns:
        The first existing path, or `None` when `candidates` is empty or
        none of its entries exists.
    """
    return next((candidate for candidate in candidates if os.path.exists(candidate)), None)


if __name__ == "__main__":
    demo_pg()
+# https://aur.archlinux.org/packages/yay +RUN mkdir /yay-bin; chmod ugo+rwX /yay-bin +USER build +RUN \ + git clone https://aur.archlinux.org/yay-bin.git && \ + cd yay-bin && \ + makepkg -si --noconfirm +USER root + + +# -------------------------- +# Setup turbodbc environment +# -------------------------- + +# Install Python, unixODBC, PostgreSQL ODBC driver, and turbodbc. + +FROM archlinux-build + +# Install unixODBC. +# https://archlinux.org/packages/core/x86_64/unixodbc/ +RUN pacman -Sy --noconfirm --needed unixodbc + +# Install PostgreSQL ODBC driver. +# https://aur.archlinux.org/packages/psqlodbc +USER build +RUN yay -S --noconfirm psqlodbc +USER root + +# Install NumPy, PyArrow, and turbodbc. +RUN pacman -Sy --noconfirm --needed boost python python-pip python-setuptools +ADD requirements*.txt . +RUN pip install --upgrade --requirement=requirements-prereq.txt +RUN MAKEFLAGS="-j$(nproc)" pip install --upgrade --requirement=requirements.txt --verbose diff --git a/by-language/python-turbodbc/dockerfiles/centos.Dockerfile b/by-language/python-turbodbc/dockerfiles/centos.Dockerfile new file mode 100644 index 00000000..c81020ad --- /dev/null +++ b/by-language/python-turbodbc/dockerfiles/centos.Dockerfile @@ -0,0 +1,9 @@ +FROM quay.io/centos/centos:stream9 + +# Install Python, unixODBC, the PostgreSQL ODBC driver, and development libraries. +RUN dnf install --enablerepo=crb -y boost-devel g++ postgresql-odbc python3 python3-devel python3-pip unixODBC-devel + +# Install Python, NumPy, PyArrow, and turbodbc. +ADD requirements*.txt . 
+RUN pip install --upgrade --requirement=requirements-prereq.txt +RUN MAKEFLAGS="-j$(nproc)" pip install --upgrade --requirement=requirements.txt --verbose diff --git a/by-language/python-turbodbc/dockerfiles/debian.Dockerfile b/by-language/python-turbodbc/dockerfiles/debian.Dockerfile new file mode 100644 index 00000000..a53d164c --- /dev/null +++ b/by-language/python-turbodbc/dockerfiles/debian.Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.11-slim-bullseye + +ENV DEBIAN_FRONTEND=noninteractive + +# Install prerequisites. +RUN apt-get update +RUN apt-get install --yes build-essential libboost-dev odbc-postgresql unixodbc-dev + +# Install NumPy, PyArrow, and turbodbc. +ADD requirements*.txt . +RUN pip install --upgrade --requirement=requirements-prereq.txt +RUN MAKEFLAGS="-j$(nproc)" pip install --upgrade --requirement=requirements.txt --verbose diff --git a/by-language/python-turbodbc/dockerfiles/sles.Dockerfile b/by-language/python-turbodbc/dockerfiles/sles.Dockerfile new file mode 100644 index 00000000..5fb376d8 --- /dev/null +++ b/by-language/python-turbodbc/dockerfiles/sles.Dockerfile @@ -0,0 +1,24 @@ +FROM registry.suse.com/suse/sle15 + +# Add package repository for acquiring `boost-devel`. +# https://software.opensuse.org//download.html?project=home%3Afsirl%3Aboost1651&package=boost +RUN zypper addrepo https://download.opensuse.org/repositories/home:fsirl:boost1651/15.4/home:fsirl:boost1651.repo + +# Add package repository for acquiring `python310`. +# https://download.opensuse.org/repositories/devel:/languages:/python:/backports/15.4/ +RUN zypper addrepo https://download.opensuse.org/repositories/devel:/languages:/python:/backports/15.4/devel:languages:python:backports.repo + +# Activate package repositories. +RUN zypper --gpg-auto-import-keys refresh + +# Install Python, unixODBC, the PostgreSQL ODBC driver, and development libraries. 
+RUN zypper install -y boost-devel gcc-c++ psqlODBC python310 python310-devel python310-pip unixODBC-devel update-alternatives + +# Make Python 3.10 the default Python 3, and add an alias `python3`. +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 0 +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 0 + +# Install Python, NumPy, PyArrow, and turbodbc. +ADD requirements*.txt . +RUN pip install --upgrade --requirement=requirements-prereq.txt +RUN MAKEFLAGS="-j$(nproc)" pip install --upgrade --requirement=requirements.txt --verbose diff --git a/by-language/python-turbodbc/odbc.ini b/by-language/python-turbodbc/odbc.ini new file mode 100644 index 00000000..7233522d --- /dev/null +++ b/by-language/python-turbodbc/odbc.ini @@ -0,0 +1,19 @@ +# More options: +# https://www.zabbix.com/documentation/current/en/manual/config/items/itemtypes/odbc_checks/unixodbc_postgresql + +[postgresql] +Description = General ODBC for PostgreSQL + +# General +FileUsage = 1 + +# If the driver manager was built with thread support, this entry +# alters the default thread serialization level (available since 1.6). +Threading = 2 + +# Linux +#Driver = /usr/lib64/libodbcpsql.so +#Setup = /usr/lib64/libodbcpsqlS.so + +# macOS +Driver = /usr/local/lib/psqlodbcw.so diff --git a/by-language/python-turbodbc/pyproject.toml b/by-language/python-turbodbc/pyproject.toml new file mode 100644 index 00000000..f99d70c4 --- /dev/null +++ b/by-language/python-turbodbc/pyproject.toml @@ -0,0 +1,7 @@ +[tool.black] +line-length = 120 + +[tool.isort] +profile = "black" +skip_glob = "**/site-packages/**" +skip_gitignore = false diff --git a/by-language/python-turbodbc/requirements-prereq.txt b/by-language/python-turbodbc/requirements-prereq.txt new file mode 100644 index 00000000..405a75fb --- /dev/null +++ b/by-language/python-turbodbc/requirements-prereq.txt @@ -0,0 +1,4 @@ +# Turbodbc wants NumPy and PyArrow to be installed upfront. 
+numpy<1.25 +pyarrow<11 +wheel diff --git a/by-language/python-turbodbc/requirements.txt b/by-language/python-turbodbc/requirements.txt new file mode 100644 index 00000000..4245ad11 --- /dev/null +++ b/by-language/python-turbodbc/requirements.txt @@ -0,0 +1,2 @@ +pytest<8 +turbodbc<5