From 8553677c9065867861f54eabb6cd8be62ae49d3b Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Wed, 23 Oct 2024 20:46:29 +0300 Subject: [PATCH 01/15] Adds support for COPY TO/FROM Azure Blob Storage Supports following Azure Blob uri forms: - `az://{container}/key` - `azure://{container}/key` - `https://{account}.blob.core.windows.net/{container}` **Configuration** The simplest way to configure object storage is by creating the standard [`~/.azure/config`](https://learn.microsoft.com/en-us/cli/azure/azure-cli-configuration?view=azure-cli-latest) file: ```bash $ cat ~/.azure/config [storage] account = devstoreaccount1 key = Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== ``` Alternatively, you can use the following environment variables when starting postgres to configure the Azure Blob Storage client: - `AZURE_STORAGE_ACCOUNT`: the storage account name of the Azure Blob - `AZURE_STORAGE_KEY`: the storage key of the Azure Blob - `AZURE_STORAGE_SAS_TOKEN`: the storage SAS token for the Azure Blob - `AZURE_CONFIG_FILE`: an alternative location for the config file **Bonus** Additionally, PR supports following S3 uri forms: - `s3://{bucket}/key` - `s3a://{bucket}/key` - `https://s3.amazonaws.com/{bucket}/key` - `https://{bucket}.s3.amazonaws.com/key` Closes #50 --- .devcontainer/.env | 8 + .devcontainer/Dockerfile | 5 + .devcontainer/create-test-buckets.sh | 2 + .devcontainer/docker-compose.yml | 17 +- .github/workflows/ci.yml | 16 ++ Cargo.lock | 50 ++++- Cargo.toml | 4 +- README.md | 38 +++- src/arrow_parquet/uri_utils.rs | 214 +++++++++++++++++---- src/pgrx_tests/object_store.rs | 271 +++++++++++++++++++++++++-- 10 files changed, 566 insertions(+), 59 deletions(-) diff --git a/.devcontainer/.env b/.devcontainer/.env index 14f05d0..d94153e 100644 --- a/.devcontainer/.env +++ b/.devcontainer/.env @@ -6,6 +6,14 @@ AWS_S3_TEST_BUCKET=testbucket MINIO_ROOT_USER=minioadmin MINIO_ROOT_PASSWORD=minioadmin +# Azure Blob tests +AZURE_STORAGE_ACCOUNT=devstoreaccount1 +AZURE_STORAGE_KEY="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" +AZURE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://localhost:10000/devstoreaccount1;" +AZURE_TEST_CONTAINER_NAME=testcontainer +AZURE_TEST_READ_ONLY_SAS="se=2100-05-05&sp=r&sv=2022-11-02&sr=c&sig=YMPFnAHKe9y0o3hFegncbwQTXtAyvsJEgPB2Ne1b9CQ%3D" +AZURE_TEST_READ_WRITE_SAS="se=2100-05-05&sp=rcw&sv=2022-11-02&sr=c&sig=TPz2jEz0t9L651t6rTCQr%2BOjmJHkM76tnCGdcyttnlA%3D" + # Others RUST_TEST_THREADS=1 PG_PARQUET_TEST=true diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 522d00a..dfad9b6 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -12,6 +12,11 @@ RUN apt-get update && apt-get -y install build-essential libreadline-dev zlib1g- curl lsb-release ca-certificates gnupg sudo git \ nano net-tools awscli +# install azure-cli +RUN curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | tee /etc/apt/keyrings/microsoft.gpg > /dev/null +RUN echo "deb [arch=`dpkg --print-architecture` signed-by=/etc/apt/keyrings/microsoft.gpg] https://packages.microsoft.com/repos/azure-cli/ `lsb_release -cs` main" | tee /etc/apt/sources.list.d/azure-cli.list +RUN apt-get update && apt-get install -y azure-cli + # install Postgres RUN sh -c 'echo "deb https://apt.postgresql.org/pub/repos/apt 
$(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' RUN wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - diff --git a/.devcontainer/create-test-buckets.sh b/.devcontainer/create-test-buckets.sh index 65dfef0..9ad1360 100644 --- a/.devcontainer/create-test-buckets.sh +++ b/.devcontainer/create-test-buckets.sh @@ -1,3 +1,5 @@ #!/bin/bash aws --endpoint-url http://localhost:9000 s3 mb s3://$AWS_S3_TEST_BUCKET + +az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 259cfc8..5147410 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -10,13 +10,16 @@ services: - ${USERPROFILE}${HOME}/.ssh:/home/rust/.ssh:ro - ${USERPROFILE}${HOME}/.ssh/known_hosts:/home/rust/.ssh/known_hosts:rw - ${USERPROFILE}${HOME}/.gitconfig:/home/rust/.gitconfig:ro - - ${USERPROFILE}${HOME}/.aws:/home/rust/.aws:ro + - ${USERPROFILE}${HOME}/.aws:/home/rust/.aws:rw + - ${USERPROFILE}${HOME}/.azure:/home/rust/.azure:rw + env_file: - .env cap_add: - SYS_PTRACE depends_on: - minio + - azurite minio: image: minio/minio @@ -30,3 +33,15 @@ services: interval: 6s timeout: 2s retries: 3 + + azurite: + image: mcr.microsoft.com/azure-storage/azurite + env_file: + - .env + network_mode: host + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "http://localhost:10000"] + interval: 6s + timeout: 2s + retries: 3 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7d4ce9d..723c037 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -85,6 +85,11 @@ jobs: postgresql-client-${{ env.PG_MAJOR }} \ libpq-dev + - name: Install azure-cli + run: | + curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/keyrings/microsoft.gpg > /dev/null + echo "deb [arch=`dpkg --print-architecture` signed-by=/etc/apt/keyrings/microsoft.gpg] https://packages.microsoft.com/repos/azure-cli/ `lsb_release -cs` main" | sudo tee /etc/apt/sources.list.d/azure-cli.list + sudo apt-get update && sudo apt-get install -y azure-cli - name: Install and configure pgrx run: | @@ -116,6 +121,17 @@ jobs: aws --endpoint-url http://localhost:9000 s3 mb s3://$AWS_S3_TEST_BUCKET + - name: Start Azurite for Azure Blob Storage emulator tests + run: | + docker run -d --env-file .devcontainer/.env -p 10000:10000 mcr.microsoft.com/azure-storage/azurite + + while ! nc -z localhost 10000; do + echo "Waiting for localhost:10000..." 
+ sleep 1 + done + + az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING + - name: Run tests run: | # Run tests with coverage tool diff --git a/Cargo.lock b/Cargo.lock index a6702d3..b547a37 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -137,7 +137,7 @@ dependencies = [ "arrow-schema", "chrono", "half 2.4.1", - "hashbrown", + "hashbrown 0.15.2", "num", ] @@ -1009,6 +1009,15 @@ dependencies = [ "syn", ] +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", +] + [[package]] name = "either" version = "1.13.0" @@ -1308,6 +1317,12 @@ dependencies = [ "byteorder", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "hashbrown" version = "0.15.2" @@ -1706,7 +1721,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.15.2", ] [[package]] @@ -2108,6 +2123,16 @@ dependencies = [ "num-traits", ] +[[package]] +name = "ordered-multimap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" +dependencies = [ + "dlv-list", + "hashbrown 0.14.5", +] + [[package]] name = "outref" version = "0.5.1" @@ -2168,7 +2193,7 @@ dependencies = [ "flate2", "futures", "half 2.4.1", - "hashbrown", + "hashbrown 0.15.2", "lz4_flex", "num", "num-bigint", @@ -2236,11 +2261,13 @@ dependencies = [ "aws-config", "aws-credential-types", "futures", + "home", "object_store", "once_cell", "parquet", "pgrx", "pgrx-tests", + "rust-ini", "tokio", "url", ] @@ -2741,6 +2768,17 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rust-ini" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e310ef0e1b6eeb79169a1171daf9abcb87a2e17c03bee2c4bb100b55c75409f" +dependencies = [ + "cfg-if", + "ordered-multimap", + "trim-in-place", +] + [[package]] name = "rustc-demangle" version = "0.1.24" @@ -3551,6 +3589,12 @@ dependencies = [ "once_cell", ] +[[package]] +name = "trim-in-place" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "343e926fc669bc8cde4fa3129ab681c63671bae288b1f1081ceee6d9d37904fc" + [[package]] name = "try-lock" version = "0.2.5" diff --git a/Cargo.toml b/Cargo.toml index e59a625..d71a000 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,8 @@ arrow-schema = {version = "53", default-features = false} aws-config = { version = "1.5", default-features = false, features = ["rustls"]} aws-credential-types = {version = "1.2", default-features = false} futures = "0.3" -object_store = {version = "0.11", default-features = false, features = ["aws"]} +home = "0.5" +object_store = {version = "0.11", default-features = false, features = ["aws", "azure"]} once_cell = "1" parquet = {version = "53", default-features = false, features = [ "arrow", @@ -38,6 +39,7 @@ parquet = {version = "53", default-features = false, features = [ "object_store", ]} pgrx = "=0.12.9" +rust-ini = "0.21" tokio = {version = "1", default-features = false, features = ["rt", "time", "macros"]} url = "2" 
diff --git a/README.md b/README.md index 353b01f..2060589 100644 --- a/README.md +++ b/README.md @@ -156,7 +156,13 @@ SELECT uri, encode(key, 'escape') as key, encode(value, 'escape') as value FROM ``` ## Object Store Support -`pg_parquet` supports reading and writing Parquet files from/to `S3` object store. Only the uris with `s3://` scheme is supported. +`pg_parquet` supports reading and writing Parquet files from/to `S3` and `Azure Blob Storage` object stores. + +> [!NOTE] +> To be able to write into a object store location, you need to grant `parquet_object_store_write` role to your current postgres user. +> Similarly, to read from an object store location, you need to grant `parquet_object_store_read` role to your current postgres user. + +#### S3 Storage The simplest way to configure object storage is by creating the standard `~/.aws/credentials` and `~/.aws/config` files: @@ -179,9 +185,33 @@ Alternatively, you can use the following environment variables when starting pos - `AWS_CONFIG_FILE`: an alternative location for the config file - `AWS_PROFILE`: the name of the profile from the credentials and config file (default profile name is `default`) -> [!NOTE] -> To be able to write into a object store location, you need to grant `parquet_object_store_write` role to your current postgres user. -> Similarly, to read from an object store location, you need to grant `parquet_object_store_read` role to your current postgres user. +Supported S3 uri formats are shown below: +- s3:// \ / \ +- s3a:// \ / \ +- https:// \.s3.amazonaws.com / \ +- https:// s3.amazonaws.com / \ / \ + +#### Azure Blob Storage + +The simplest way to configure object storage is by creating the standard [`~/.azure/config`](https://learn.microsoft.com/en-us/cli/azure/azure-cli-configuration?view=azure-cli-latest) file: + +```bash +$ cat ~/.azure/config +[storage] +account = devstoreaccount1 +key = Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== +``` + +Alternatively, you can use the following environment variables when starting postgres to configure the Azure Blob Storage client: +- `AZURE_STORAGE_ACCOUNT`: the storage account name of the Azure Blob +- `AZURE_STORAGE_KEY`: the storage key of the Azure Blob +- `AZURE_STORAGE_SAS_TOKEN`: the storage SAS token for the Azure Blob +- `AZURE_CONFIG_FILE`: an alternative location for the config file + +Supported Azure Blob Storage uri formats are shown below: +- az:// \ / \ +- azure:// \ / \ +- https:// \.blob.core.windows.net / \ ## Copy Options `pg_parquet` supports the following options in the `COPY TO` command: diff --git a/src/arrow_parquet/uri_utils.rs b/src/arrow_parquet/uri_utils.rs index 3ff97af..438bc35 100644 --- a/src/arrow_parquet/uri_utils.rs +++ b/src/arrow_parquet/uri_utils.rs @@ -1,4 +1,7 @@ -use std::{sync::Arc, sync::LazyLock}; +use std::{ + panic, + sync::{Arc, LazyLock}, +}; use arrow::datatypes::SchemaRef; use aws_config::{ @@ -7,11 +10,14 @@ use aws_config::{ profile::{ProfileFileCredentialsProvider, ProfileFileRegionProvider}, }; use aws_credential_types::provider::ProvideCredentials; +use home::home_dir; +use ini::Ini; use object_store::{ aws::{AmazonS3, AmazonS3Builder}, + azure::{AzureConfigKey, MicrosoftAzure, MicrosoftAzureBuilder}, local::LocalFileSystem, path::Path, - ObjectStore, + ObjectStore, ObjectStoreScheme, }; use parquet::{ arrow::{ @@ -44,57 +50,106 @@ pub(crate) static PG_BACKEND_TOKIO_RUNTIME: LazyLock = LazyLock::new(|| .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)) }); 
-fn parse_bucket_and_key(uri: &Url) -> (String, String) { - debug_assert!(uri.scheme() == "s3"); +fn parse_azure_blob_container(uri: &Url) -> Option { + let host = uri.host_str()?; - let bucket = uri - .host_str() - .unwrap_or_else(|| panic!("bucket not found in uri: {}", uri)); + // az(ure)://{container}/key + if uri.scheme() == "az" || uri.scheme() == "azure" { + return Some(host.to_string()); + } + // https://{account}.blob.core.windows.net/{container} + else if host.ends_with("blob.core.windows.net") { + let path_segments: Vec<&str> = uri.path_segments()?.collect(); - let key = uri.path(); + if !path_segments.is_empty() { + return Some(path_segments[0].to_string()); + } else { + return None; + } + } - (bucket.to_string(), key.to_string()) + None +} + +fn parse_s3_bucket(uri: &Url) -> Option { + let host = uri.host_str()?; + + // s3(a)://{bucket}/key + if uri.scheme() == "s3" || uri.scheme() == "s3a" { + return Some(host.to_string()); + } + // https://s3.amazonaws.com/{bucket}/key + else if host == "s3.amazonaws.com" { + let path_segments: Vec<&str> = uri.path_segments()?.collect(); + if !path_segments.is_empty() { + return Some(path_segments[0].to_string()); // Bucket name is the first part of the path + } else { + return None; + } + } + // https://{bucket}.s3.amazonaws.com/key + else if host.ends_with("s3.amazonaws.com") { + let bucket_name = host.split('.').next()?; + return Some(bucket_name.to_string()); + } + + None } fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc, Path) { - if uri.scheme() == "s3" { - let (bucket_name, key) = parse_bucket_and_key(uri); + let (scheme, path) = + ObjectStoreScheme::parse(uri).unwrap_or_else(|_| panic!("unsupported uri {}", uri)); - let storage_container = PG_BACKEND_TOKIO_RUNTIME - .block_on(async { Arc::new(get_s3_object_store(&bucket_name).await) }); + match scheme { + ObjectStoreScheme::AmazonS3 => { + let bucket_name = parse_s3_bucket(uri).unwrap_or_else(|| { + panic!("failed to parse bucket name from uri: {}", uri); + }); - let location = Path::from(key); + let storage_container = PG_BACKEND_TOKIO_RUNTIME + .block_on(async { Arc::new(get_s3_object_store(&bucket_name).await) }); - (storage_container, location) - } else { - debug_assert!(uri.scheme() == "file"); - - let uri = uri_as_string(uri); - - if !copy_from { - // create or overwrite the local file - std::fs::OpenOptions::new() - .write(true) - .truncate(true) - .create(true) - .open(&uri) - .unwrap_or_else(|e| panic!("{}", e)); + (storage_container, path) + } + ObjectStoreScheme::MicrosoftAzure => { + let container_name = parse_azure_blob_container(uri).unwrap_or_else(|| { + panic!("failed to parse container name from uri: {}", uri); + }); + + let storage_container = PG_BACKEND_TOKIO_RUNTIME + .block_on(async { Arc::new(get_azure_object_store(&container_name).await) }); + + (storage_container, path) } + ObjectStoreScheme::Local => { + let uri = uri_as_string(uri); + + if !copy_from { + // create or overwrite the local file + std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(&uri) + .unwrap_or_else(|e| panic!("{}", e)); + } - let storage_container = Arc::new(LocalFileSystem::new()); + let storage_container = Arc::new(LocalFileSystem::new()); - let location = Path::from_filesystem_path(&uri).unwrap_or_else(|e| panic!("{}", e)); + let path = Path::from_filesystem_path(&uri).unwrap_or_else(|e| panic!("{}", e)); - (storage_container, location) + (storage_container, path) + } + _ => { + panic!("unsupported uri {}", uri); + } } } async fn 
get_s3_object_store(bucket_name: &str) -> AmazonS3 { let mut aws_s3_builder = AmazonS3Builder::new().with_bucket_name(bucket_name); - let is_test_running = std::env::var("PG_PARQUET_TEST").is_ok(); - - if is_test_running { + if is_testing() { // use minio for testing aws_s3_builder = aws_s3_builder.with_endpoint("http://localhost:9000"); aws_s3_builder = aws_s3_builder.with_allow_http(true); @@ -139,6 +194,78 @@ async fn get_s3_object_store(bucket_name: &str) -> AmazonS3 { aws_s3_builder.build().unwrap_or_else(|e| panic!("{}", e)) } +async fn get_azure_object_store(container_name: &str) -> MicrosoftAzure { + let mut azure_builder = MicrosoftAzureBuilder::new().with_container_name(container_name); + + if is_testing() { + // use azurite for testing + azure_builder = + azure_builder.with_endpoint("http://localhost:10000/devstoreaccount1".into()); + azure_builder = azure_builder.with_allow_http(true); + } + + // ~/.azure/config + let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( + home_dir() + .expect("failed to get home directory") + .join(".azure") + .join("config") + .to_str() + .expect("failed to convert path to string") + .to_string(), + ); + + let azure_config_content = Ini::load_from_file(&azure_config_file_path).ok(); + + // storage account + let azure_blob_account = match std::env::var("AZURE_STORAGE_ACCOUNT") { + Ok(account) => Some(account), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("account")) + .map(|account| account.to_string()), + }; + + if let Some(azure_blob_account) = azure_blob_account { + azure_builder = azure_builder.with_account(azure_blob_account); + } + + // storage key + let azure_blob_key = match std::env::var("AZURE_STORAGE_KEY") { + Ok(key) => Some(key), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("key")) + .map(|key| key.to_string()), + }; + + if let Some(azure_blob_key) = azure_blob_key { + azure_builder = azure_builder.with_access_key(azure_blob_key); + } + + // sas token + let azure_blob_sas_token = match std::env::var("AZURE_STORAGE_SAS_TOKEN") { + Ok(token) => Some(token), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("sas_token")) + .map(|token| token.to_string()), + }; + + if let Some(azure_blob_sas_token) = azure_blob_sas_token { + azure_builder = azure_builder.with_config(AzureConfigKey::SasKey, azure_blob_sas_token); + } + + azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) +} + +fn is_testing() -> bool { + std::env::var("PG_PARQUET_TEST").is_ok() +} + pub(crate) fn parse_uri(uri: &str) -> Url { if !uri.contains("://") { // local file @@ -148,12 +275,25 @@ pub(crate) fn parse_uri(uri: &str) -> Url { let uri = Url::parse(uri).unwrap_or_else(|e| panic!("{}", e)); - if uri.scheme() != "s3" { + let (scheme, _) = + ObjectStoreScheme::parse(&uri).unwrap_or_else(|_| panic!("unsupported uri {}", uri)); + + if scheme == ObjectStoreScheme::AmazonS3 { + parse_s3_bucket(&uri) + .unwrap_or_else(|| panic!("failed to parse bucket name from s3 uri {}", uri)); + } else if scheme == ObjectStoreScheme::MicrosoftAzure { + parse_azure_blob_container(&uri).unwrap_or_else(|| { + panic!( + "failed to parse container name from azure blob storage uri {}", + uri + ) + }); + } else { panic!( - "unsupported uri {}. Only local files and URIs with s3:// prefix are supported.", + "unsupported uri {}. 
Only Azure and S3 uris are supported.", uri ); - } + }; uri } diff --git a/src/pgrx_tests/object_store.rs b/src/pgrx_tests/object_store.rs index 4272027..89edffe 100644 --- a/src/pgrx_tests/object_store.rs +++ b/src/pgrx_tests/object_store.rs @@ -7,20 +7,33 @@ mod tests { use crate::pgrx_tests::common::TestTable; #[pg_test] - fn test_s3_object_store_from_env() { + fn test_s3_from_env() { let test_bucket_name: String = std::env::var("AWS_S3_TEST_BUCKET").expect("AWS_S3_TEST_BUCKET not found"); - let s3_uri = format!("s3://{}/pg_parquet_test.parquet", test_bucket_name); - - let test_table = TestTable::::new("int4".into()).with_uri(s3_uri); - - test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); - test_table.assert_expected_and_result_rows(); + let s3_uris = [ + format!("s3://{}/pg_parquet_test.parquet", test_bucket_name), + format!("s3a://{}/pg_parquet_test.parquet", test_bucket_name), + format!( + "https://s3.amazonaws.com/{}/pg_parquet_test.parquet", + test_bucket_name + ), + format!( + "https://{}.s3.amazonaws.com/pg_parquet_test.parquet", + test_bucket_name + ), + ]; + + for s3_uri in s3_uris { + let test_table = TestTable::::new("int4".into()).with_uri(s3_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } } #[pg_test] - fn test_s3_object_store_from_config_file() { + fn test_s3_from_config_file() { let test_bucket_name: String = std::env::var("AWS_S3_TEST_BUCKET").expect("AWS_S3_TEST_BUCKET not found"); @@ -32,14 +45,16 @@ mod tests { let region = std::env::var("AWS_REGION").unwrap(); std::env::remove_var("AWS_REGION"); + let profile = "pg_parquet_test"; + // create a config file let aws_config_file_content = format!( - "[profile pg_parquet_test]\nregion = {}\naws_access_key_id = {}\naws_secret_access_key = {}\n", + "[profile {profile}]\nregion = {}\naws_access_key_id = {}\naws_secret_access_key = {}\n", region, access_key_id, secret_access_key ); - std::env::set_var("AWS_PROFILE", "pg_parquet_test"); + std::env::set_var("AWS_PROFILE", profile); - let aws_config_file = "/tmp/aws_config"; + let aws_config_file = "/tmp/pg_parquet_aws_config"; std::env::set_var("AWS_CONFIG_FILE", aws_config_file); let mut aws_config_file = std::fs::OpenOptions::new() @@ -61,6 +76,38 @@ mod tests { test_table.assert_expected_and_result_rows(); } + #[pg_test] + #[should_panic(expected = "403 Forbidden")] + fn test_s3_with_wrong_access_key_id() { + std::env::set_var("AWS_ACCESS_KEY_ID", "wrong_access_key_id"); + + let test_bucket_name: String = + std::env::var("AWS_S3_TEST_BUCKET").expect("AWS_S3_TEST_BUCKET not found"); + + let s3_uri = format!("s3://{}/pg_parquet_test.parquet", test_bucket_name); + + let test_table = TestTable::::new("int4".into()).with_uri(s3_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + #[should_panic(expected = "403 Forbidden")] + fn test_s3_with_wrong_secret_access_key() { + std::env::set_var("AWS_SECRET_ACCESS_KEY", "wrong_secret_access_key"); + + let test_bucket_name: String = + std::env::var("AWS_S3_TEST_BUCKET").expect("AWS_S3_TEST_BUCKET not found"); + + let s3_uri = format!("s3://{}/pg_parquet_test.parquet", test_bucket_name); + + let test_table = TestTable::::new("int4".into()).with_uri(s3_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + #[pg_test] #[should_panic(expected 
= "permission denied to COPY from a remote uri")] fn test_s3_no_read_access() { @@ -143,7 +190,7 @@ mod tests { #[pg_test] #[should_panic(expected = "404 Not Found")] - fn test_s3_object_store_write_invalid_uri() { + fn test_s3_write_wrong_bucket() { let s3_uri = "s3://randombucketwhichdoesnotexist/pg_parquet_test.parquet"; let copy_to_command = format!( @@ -155,7 +202,7 @@ mod tests { #[pg_test] #[should_panic(expected = "404 Not Found")] - fn test_s3_object_store_read_invalid_uri() { + fn test_s3_read_wrong_bucket() { let s3_uri = "s3://randombucketwhichdoesnotexist/pg_parquet_test.parquet"; let create_table_command = "CREATE TABLE test_table (a int);"; @@ -165,6 +212,204 @@ mod tests { Spi::run(copy_from_command.as_str()).unwrap(); } + #[pg_test] + #[should_panic(expected = "failed to parse bucket name")] + fn test_s3_unsupported_uri() { + let cloudflare_s3_uri = "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket".into(); + + let test_table = TestTable::::new("int4".into()).with_uri(cloudflare_s3_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + fn test_azure_blob_from_env() { + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + let test_account_name: String = + std::env::var("AZURE_STORAGE_ACCOUNT").expect("AZURE_STORAGE_ACCOUNT not found"); + + let azure_blob_uris = [ + format!("az://{}/pg_parquet_test.parquet", test_container_name), + format!("azure://{}/pg_parquet_test.parquet", test_container_name), + format!( + "https://{}.blob.core.windows.net/{}", + test_account_name, test_container_name + ), + ]; + + for azure_blob_uri in azure_blob_uris { + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + } + + #[pg_test] + fn test_azure_from_config_file() { + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + // remove these to make sure the config file is used + let account_name = std::env::var("AZURE_STORAGE_ACCOUNT").unwrap(); + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); + let account_key = std::env::var("AZURE_STORAGE_KEY").unwrap(); + std::env::remove_var("AZURE_STORAGE_KEY"); + + // create a config file + let azure_config_file_content = format!( + "[storage]\naccount = {}\nkey = {}\n", + account_name, account_key + ); + + let azure_config_file = "/tmp/pg_parquet_azure_config"; + std::env::set_var("AZURE_CONFIG_FILE", azure_config_file); + + let mut azure_config_file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(azure_config_file) + .unwrap(); + + azure_config_file + .write_all(azure_config_file_content.as_bytes()) + .unwrap(); + + let azure_blob_uri = format!("az://{}/pg_parquet_test.parquet", test_container_name); + + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + #[should_panic(expected = "Account must be specified")] + fn test_azure_with_no_storage_account() { + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); + + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + let 
azure_blob_uri = format!("az://{}/pg_parquet_test.parquet", test_container_name); + + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + #[should_panic(expected = "403 Forbidden")] + fn test_azure_blob_with_wrong_storage_key() { + let wrong_account_key = String::from("FFy8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="); + std::env::set_var("AZURE_STORAGE_KEY", wrong_account_key); + + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + let test_account_name: String = + std::env::var("AZURE_STORAGE_ACCOUNT").expect("AZURE_STORAGE_ACCOUNT not found"); + + let azure_blob_uri = format!( + "https://{}.blob.core.windows.net/{}", + test_account_name, test_container_name + ); + + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + #[should_panic(expected = "404 Not Found")] + fn test_azure_blob_write_wrong_container() { + let test_account_name: String = + std::env::var("AZURE_STORAGE_ACCOUNT").expect("AZURE_STORAGE_ACCOUNT not found"); + + let azure_blob_uri = format!( + "https://{}.blob.core.windows.net/nonexistentcontainer", + test_account_name + ); + + let copy_to_command = format!( + "COPY (SELECT i FROM generate_series(1,10) i) TO '{}' WITH (format parquet);;", + azure_blob_uri + ); + Spi::run(copy_to_command.as_str()).unwrap(); + } + + #[pg_test] + fn test_azure_blob_read_write_sas() { + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + let test_account_name: String = + std::env::var("AZURE_STORAGE_ACCOUNT").expect("AZURE_STORAGE_ACCOUNT not found"); + + let read_write_sas_token = std::env::var("AZURE_TEST_READ_WRITE_SAS") + .expect("AZURE_TEST_READ_WRITE_SAS not found"); + + // remove account key to make sure the sas token is used + std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::set_var("AZURE_STORAGE_SAS_TOKEN", read_write_sas_token); + + let azure_blob_uri = format!( + "https://{}.blob.core.windows.net/{}", + test_account_name, test_container_name + ); + + let copy_to_command = format!( + "COPY (SELECT i FROM generate_series(1,10) i) TO '{}' WITH (format parquet);;", + azure_blob_uri + ); + Spi::run(copy_to_command.as_str()).unwrap(); + } + + #[pg_test] + #[should_panic(expected = "403 Forbidden")] + fn test_azure_blob_read_only_sas() { + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + let test_account_name: String = + std::env::var("AZURE_STORAGE_ACCOUNT").expect("AZURE_STORAGE_ACCOUNT not found"); + + let read_only_sas_token: String = + std::env::var("AZURE_TEST_READ_ONLY_SAS").expect("AZURE_TEST_READ_ONLY_SAS not found"); + + // remove account key to make sure the sas token is used + std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::set_var("AZURE_STORAGE_SAS_TOKEN", read_only_sas_token); + + let azure_blob_uri = format!( + "https://{}.blob.core.windows.net/{}", + test_account_name, test_container_name + ); + + let copy_to_command = format!( + "COPY (SELECT i FROM generate_series(1,10) i) TO '{}' WITH (format parquet);", + azure_blob_uri + ); + 
Spi::run(copy_to_command.as_str()).unwrap(); + } + + #[pg_test] + #[should_panic(expected = "failed to parse container name")] + fn test_azure_blob_unsupported_uri() { + let fabric_azure_blob_uri = "https://ACCOUNT.dfs.fabric.microsoft.com".into(); + + let test_table = TestTable::::new("int4".into()).with_uri(fabric_azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + #[pg_test] #[should_panic(expected = "unsupported uri gs://testbucket")] fn test_unsupported_uri() { From f187bc41ef791cef822c65d9ef500f675a863c39 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Wed, 23 Oct 2024 20:46:29 +0300 Subject: [PATCH 02/15] Adds support for COPY TO/FROM Google Cloud Storage Supports following Google Cloud Storage uri forms: - gs:// \ / \ **Configuration** The simplest way to configure object storage is by creating a json config file like [`/tmp/gcs.json`]: ```bash $ cat /tmp/gcs.json { "gcs_base_url": "gs://testbucket/test.parquet", "disable_oauth": false, "client_email": "...", "private_key_id": "...", "private_key": "..." } ``` Alternatively, you can use the following environment variables when starting postgres to configure the Google Cloud Storage client: - `GOOGLE_SERVICE_ACCOUNT_KEY`: json serialized service account key - `GOOGLE_SERVICE_ACCOUNT_PATH`: an alternative location for the config file Closes #62 --- .devcontainer/.env | 3 ++ .devcontainer/create-test-buckets.sh | 2 ++ .devcontainer/docker-compose.yml | 15 ++++++++ .github/workflows/ci.yml | 11 ++++++ .vscode/settings.json | 4 +-- Cargo.lock | 1 + Cargo.toml | 2 +- README.md | 22 ++++++++++++ src/arrow_parquet/uri_utils.rs | 52 ++++++++++++++++++++++++++-- src/pgrx_tests/object_store.rs | 45 +++++++++++++++++++++--- 10 files changed, 148 insertions(+), 9 deletions(-) diff --git a/.devcontainer/.env b/.devcontainer/.env index d94153e..ebc69c1 100644 --- a/.devcontainer/.env +++ b/.devcontainer/.env @@ -14,6 +14,9 @@ AZURE_TEST_CONTAINER_NAME=testcontainer AZURE_TEST_READ_ONLY_SAS="se=2100-05-05&sp=r&sv=2022-11-02&sr=c&sig=YMPFnAHKe9y0o3hFegncbwQTXtAyvsJEgPB2Ne1b9CQ%3D" AZURE_TEST_READ_WRITE_SAS="se=2100-05-05&sp=rcw&sv=2022-11-02&sr=c&sig=TPz2jEz0t9L651t6rTCQr%2BOjmJHkM76tnCGdcyttnlA%3D" +# GCS tests +GOOGLE_TEST_BUCKET=testbucket + # Others RUST_TEST_THREADS=1 PG_PARQUET_TEST=true diff --git a/.devcontainer/create-test-buckets.sh b/.devcontainer/create-test-buckets.sh index 9ad1360..4c45e61 100644 --- a/.devcontainer/create-test-buckets.sh +++ b/.devcontainer/create-test-buckets.sh @@ -3,3 +3,5 @@ aws --endpoint-url http://localhost:9000 s3 mb s3://$AWS_S3_TEST_BUCKET az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING + +curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 5147410..321f19f 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -12,6 +12,7 @@ services: - ${USERPROFILE}${HOME}/.gitconfig:/home/rust/.gitconfig:ro - ${USERPROFILE}${HOME}/.aws:/home/rust/.aws:rw - ${USERPROFILE}${HOME}/.azure:/home/rust/.azure:rw + - ${USERPROFILE}${HOME}/.config/gcloud:/home/rust/.config/gcloud:rw env_file: - .env @@ -20,6 +21,7 @@ services: depends_on: - minio - azurite + - fake-gcs-server minio: image: minio/minio @@ -45,3 +47,16 @@ services: interval: 6s timeout: 2s 
retries: 3 + + fake-gcs-server: + image: tustvold/fake-gcs-server + env_file: + - .env + network_mode: host + command: -scheme http -public-host localhost:4443 + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "http://localhost:4443"] + interval: 6s + timeout: 2s + retries: 3 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 723c037..46625b6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -132,6 +132,17 @@ jobs: az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING + - name: Start fake-gcs-server for Google Cloud Storage emulator tests + run: | + docker run -d --env-file .devcontainer/.env -p 4443:4443 tustvold/fake-gcs-server -scheme http -filesystem-root /tmp/gcs -public-host localhost:4443 + + while ! nc -z localhost 4443; do + echo "Waiting for localhost:4443..." + sleep 1 + done + + curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" + - name: Run tests run: | # Run tests with coverage tool diff --git a/.vscode/settings.json b/.vscode/settings.json index f6ad919..f90c4ab 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,6 +5,6 @@ "rust-analyzer.checkOnSave": true, "editor.inlayHints.enabled": "offUnlessPressed", "files.watcherExclude": { - "**/target/**": true - } + "**/target/**": true + } } diff --git a/Cargo.lock b/Cargo.lock index b547a37..7305067 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2093,6 +2093,7 @@ dependencies = [ "rand", "reqwest", "ring", + "rustls-pemfile 2.2.0", "serde", "serde_json", "snafu", diff --git a/Cargo.toml b/Cargo.toml index d71a000..87856e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,7 @@ aws-config = { version = "1.5", default-features = false, features = ["rustls"]} aws-credential-types = {version = "1.2", default-features = false} futures = "0.3" home = "0.5" -object_store = {version = "0.11", default-features = false, features = ["aws", "azure"]} +object_store = {version = "0.11", default-features = false, features = ["aws", "azure", "gcp"]} once_cell = "1" parquet = {version = "53", default-features = false, features = [ "arrow", diff --git a/README.md b/README.md index 2060589..4f3b681 100644 --- a/README.md +++ b/README.md @@ -213,6 +213,28 @@ Supported Azure Blob Storage uri formats are shown below: - azure:// \ / \ - https:// \.blob.core.windows.net / \ +#### Google Cloud Storage + +The simplest way to configure object storage is by creating a json config file like [`/tmp/gcs.json`]: + +```bash +$ cat /tmp/gcs.json +{ + "gcs_base_url": "http://localhost:4443", + "disable_oauth": true, + "client_email": "", + "private_key_id": "", + "private_key": "" +} +``` + +Alternatively, you can use the following environment variables when starting postgres to configure the Google Cloud Storage client: +- `GOOGLE_SERVICE_ACCOUNT_KEY`: json serialized service account key +- `GOOGLE_SERVICE_ACCOUNT_PATH`: an alternative location for the config file + +Supported Google Cloud Storage uri formats are shown below: +- gs:// \ / \ + ## Copy Options `pg_parquet` supports the following options in the `COPY TO` command: - `format parquet`: you need to specify this option to read or write Parquet files which does not end with `.parquet[.]` extension, diff --git a/src/arrow_parquet/uri_utils.rs b/src/arrow_parquet/uri_utils.rs index 438bc35..9f4dd72 100644 --- a/src/arrow_parquet/uri_utils.rs +++ b/src/arrow_parquet/uri_utils.rs @@ -15,6 
+15,7 @@ use ini::Ini; use object_store::{ aws::{AmazonS3, AmazonS3Builder}, azure::{AzureConfigKey, MicrosoftAzure, MicrosoftAzureBuilder}, + gcp::{GoogleCloudStorage, GoogleCloudStorageBuilder}, local::LocalFileSystem, path::Path, ObjectStore, ObjectStoreScheme, @@ -96,6 +97,17 @@ fn parse_s3_bucket(uri: &Url) -> Option { None } +fn parse_gcs_bucket(uri: &Url) -> Option { + let host = uri.host_str()?; + + // gs://{bucket}/key + if uri.scheme() == "gs" { + return Some(host.to_string()); + } + + None +} + fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc, Path) { let (scheme, path) = ObjectStoreScheme::parse(uri).unwrap_or_else(|_| panic!("unsupported uri {}", uri)); @@ -121,6 +133,16 @@ fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc { + let bucket_name = parse_gcs_bucket(uri).unwrap_or_else(|| { + panic!("failed to parse bucket name from uri: {}", uri); + }); + + let storage_container = PG_BACKEND_TOKIO_RUNTIME + .block_on(async { Arc::new(get_gcs_object_store(&bucket_name).await) }); + + (storage_container, path) + } ObjectStoreScheme::Local => { let uri = uri_as_string(uri); @@ -262,6 +284,25 @@ async fn get_azure_object_store(container_name: &str) -> MicrosoftAzure { azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) } +async fn get_gcs_object_store(bucket_name: &str) -> GoogleCloudStorage { + let mut gcs_builder = GoogleCloudStorageBuilder::from_env().with_bucket_name(bucket_name); + + if is_testing() { + // use fake-gcp-server for testing + gcs_builder = gcs_builder.with_service_account_key( + "{ + \"gcs_base_url\": \"http://localhost:4443\", + \"disable_oauth\": true, + \"client_email\": \"\", + \"private_key_id\": \"\", + \"private_key\": \"\" + }", + ); + } + + gcs_builder.build().unwrap_or_else(|e| panic!("{}", e)) +} + fn is_testing() -> bool { std::env::var("PG_PARQUET_TEST").is_ok() } @@ -284,13 +325,20 @@ pub(crate) fn parse_uri(uri: &str) -> Url { } else if scheme == ObjectStoreScheme::MicrosoftAzure { parse_azure_blob_container(&uri).unwrap_or_else(|| { panic!( - "failed to parse container name from azure blob storage uri {}", + "failed to parse container name from Azure Blob Storage uri {}", + uri + ) + }); + } else if scheme == ObjectStoreScheme::GoogleCloudStorage { + parse_gcs_bucket(&uri).unwrap_or_else(|| { + panic!( + "failed to parse bucket name from Google Cloud Storage uri {}", uri ) }); } else { panic!( - "unsupported uri {}. Only Azure and S3 uris are supported.", + "unsupported uri {}. 
Only Azure Blob Storage, S3 and Google Cloud Storage uris are supported.", uri ); }; diff --git a/src/pgrx_tests/object_store.rs b/src/pgrx_tests/object_store.rs index 89edffe..2e985f6 100644 --- a/src/pgrx_tests/object_store.rs +++ b/src/pgrx_tests/object_store.rs @@ -338,7 +338,7 @@ mod tests { ); let copy_to_command = format!( - "COPY (SELECT i FROM generate_series(1,10) i) TO '{}' WITH (format parquet);;", + "COPY (SELECT i FROM generate_series(1,10) i) TO '{}' WITH (format parquet);", azure_blob_uri ); Spi::run(copy_to_command.as_str()).unwrap(); @@ -365,7 +365,7 @@ mod tests { ); let copy_to_command = format!( - "COPY (SELECT i FROM generate_series(1,10) i) TO '{}' WITH (format parquet);;", + "COPY (SELECT i FROM generate_series(1,10) i) TO '{}' WITH (format parquet);", azure_blob_uri ); Spi::run(copy_to_command.as_str()).unwrap(); @@ -411,10 +411,47 @@ mod tests { } #[pg_test] - #[should_panic(expected = "unsupported uri gs://testbucket")] + fn test_gcs_from_env() { + let test_bucket_name: String = + std::env::var("GOOGLE_TEST_BUCKET").expect("GOOGLE_TEST_BUCKET not found"); + + let gcs_uri = format!("gs://{}/pg_parquet_test.parquet", test_bucket_name); + + let test_table = TestTable::::new("int4".into()).with_uri(gcs_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + #[should_panic(expected = "404 Not Found")] + fn test_gcs_write_wrong_bucket() { + let s3_uri = "gs://randombucketwhichdoesnotexist/pg_parquet_test.parquet"; + + let copy_to_command = format!( + "COPY (SELECT i FROM generate_series(1,10) i) TO '{}';", + s3_uri + ); + Spi::run(copy_to_command.as_str()).unwrap(); + } + + #[pg_test] + #[should_panic(expected = "404 Not Found")] + fn test_gcs_read_wrong_bucket() { + let gcs_uri = "gs://randombucketwhichdoesnotexist/pg_parquet_test.parquet"; + + let create_table_command = "CREATE TABLE test_table (a int);"; + Spi::run(create_table_command).unwrap(); + + let copy_from_command = format!("COPY test_table FROM '{}';", gcs_uri); + Spi::run(copy_from_command.as_str()).unwrap(); + } + + #[pg_test] + #[should_panic(expected = "unsupported uri http://testbucket")] fn test_unsupported_uri() { let test_table = - TestTable::::new("int4".into()).with_uri("gs://testbucket".to_string()); + TestTable::::new("int4".into()).with_uri("http://testbucket".to_string()); test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); test_table.assert_expected_and_result_rows(); } From f1b7114d1a221cb8c232adcb2924e96755fe3d7a Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Mon, 16 Dec 2024 18:41:52 +0300 Subject: [PATCH 03/15] configure test endpoint via env var --- .devcontainer/.env | 2 ++ .devcontainer/docker-compose.yml | 12 +++++------- .../{azurite-entrypoint.sh => entrypoint.sh} | 13 ++----------- .github/workflows/ci.yml | 13 +++++++++---- README.md | 4 +++- src/arrow_parquet/uri_utils.rs | 13 +------------ 6 files changed, 22 insertions(+), 35 deletions(-) rename .devcontainer/{azurite-entrypoint.sh => entrypoint.sh} (53%) diff --git a/.devcontainer/.env b/.devcontainer/.env index ef12430..7e7f783 100644 --- a/.devcontainer/.env +++ b/.devcontainer/.env @@ -12,6 +12,8 @@ MINIO_ROOT_PASSWORD=minioadmin AZURE_STORAGE_ACCOUNT=devstoreaccount1 AZURE_STORAGE_KEY="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" 
AZURE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://localhost:10000/devstoreaccount1;" +AZURE_STORAGE_ENDPOINT=http://localhost:10000/devstoreaccount1 +AZURE_ALLOW_HTTP=true AZURE_TEST_CONTAINER_NAME=testcontainer AZURE_TEST_READ_ONLY_SAS="se=2100-05-05&sp=r&sv=2022-11-02&sr=c&sig=YMPFnAHKe9y0o3hFegncbwQTXtAyvsJEgPB2Ne1b9CQ%3D" AZURE_TEST_READ_WRITE_SAS="se=2100-05-05&sp=rcw&sv=2022-11-02&sr=c&sig=TPz2jEz0t9L651t6rTCQr%2BOjmJHkM76tnCGdcyttnlA%3D" diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 9b145b8..a432f90 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -3,7 +3,7 @@ services: build: context: . dockerfile: Dockerfile - command: sleep infinity + entrypoint: "./entrypoint.sh" network_mode: host volumes: - ..:/workspace @@ -12,7 +12,7 @@ services: - ${USERPROFILE}${HOME}/.gitconfig:/home/rust/.gitconfig:ro - ${USERPROFILE}${HOME}/.aws:/home/rust/.aws:rw - ${USERPROFILE}${HOME}/.azure:/home/rust/.azure:rw - + - ./entrypoint.sh:/entrypoint.sh env_file: - .env cap_add: @@ -33,8 +33,8 @@ services: interval: 6s timeout: 2s retries: 3 - volumes: - - ./minio-entrypoint.sh:/entrypoint.sh + volumes: + - ./minio-entrypoint.sh:/entrypoint.sh azurite: image: mcr.microsoft.com/azure-storage/azurite @@ -43,9 +43,7 @@ services: network_mode: host restart: unless-stopped healthcheck: - test: ["CMD", "curl", "http://localhost:10000"] + test: ["CMD", "nc", "-z", "localhost", "10000"] interval: 6s timeout: 2s retries: 3 - volumes: - - ./azurite-entrypoint.sh:/entrypoint.sh diff --git a/.devcontainer/azurite-entrypoint.sh b/.devcontainer/entrypoint.sh similarity index 53% rename from .devcontainer/azurite-entrypoint.sh rename to .devcontainer/entrypoint.sh index 5e98cb5..43944ed 100755 --- a/.devcontainer/azurite-entrypoint.sh +++ b/.devcontainer/entrypoint.sh @@ -2,16 +2,7 @@ trap "echo 'Caught termination signal. Exiting...'; exit 0" SIGINT SIGTERM -minio server /data & - -azurite_pid=$! - -while ! curl $AWS_ENDPOINT_URL; do - echo "Waiting for $AWS_ENDPOINT_URL..." - sleep 1 -done - -# create container +# create azurite container az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING -wait $azurite_pid +sleep infinity diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 943b77e..9b289fb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -117,7 +117,6 @@ jobs: -p 9000:9000 \ --entrypoint "./entrypoint.sh" \ --volume ./.devcontainer/minio-entrypoint.sh:/entrypoint.sh \ - --name miniocontainer \ minio/minio while ! curl $AWS_ENDPOINT_URL; do @@ -127,13 +126,19 @@ jobs: - name: Start Azurite for Azure Blob Storage emulator tests run: | - docker run -d --env-file .devcontainer/.env -p 10000:10000 mcr.microsoft.com/azure-storage/azurite + docker run -d \ + --env-file .devcontainer/.env \ + -p 10000:10000 \ + mcr.microsoft.com/azure-storage/azurite - while ! nc -z localhost 10000; do - echo "Waiting for localhost:10000..." + while ! curl $AZURITE_URL; do + echo "Waiting for $AZURITE_URL..." 
sleep 1 done + # create container + az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING + - name: Run tests run: | # Run tests with coverage tool diff --git a/README.md b/README.md index d485f7e..fdcbaba 100644 --- a/README.md +++ b/README.md @@ -210,7 +210,9 @@ Alternatively, you can use the following environment variables when starting pos - `AZURE_STORAGE_ACCOUNT`: the storage account name of the Azure Blob - `AZURE_STORAGE_KEY`: the storage key of the Azure Blob - `AZURE_STORAGE_SAS_TOKEN`: the storage SAS token for the Azure Blob -- `AZURE_CONFIG_FILE`: an alternative location for the config file +- `AZURE_STORAGE_ENDPOINT`: the endpoint **(only via environment variables)** +- `AZURE_CONFIG_FILE`: an alternative location for the config file **(only via environment variables)** +- `AZURE_ALLOW_HTTP`: allows http endpoints **(only via environment variables)** Supported Azure Blob Storage uri formats are shown below: - az:// \ / \ diff --git a/src/arrow_parquet/uri_utils.rs b/src/arrow_parquet/uri_utils.rs index 8996741..9e20663 100644 --- a/src/arrow_parquet/uri_utils.rs +++ b/src/arrow_parquet/uri_utils.rs @@ -185,14 +185,7 @@ async fn get_s3_object_store(bucket_name: &str) -> AmazonS3 { } async fn get_azure_object_store(container_name: &str) -> MicrosoftAzure { - let mut azure_builder = MicrosoftAzureBuilder::new().with_container_name(container_name); - - if is_testing() { - // use azurite for testing - azure_builder = - azure_builder.with_endpoint("http://localhost:10000/devstoreaccount1".into()); - azure_builder = azure_builder.with_allow_http(true); - } + let mut azure_builder = MicrosoftAzureBuilder::from_env().with_container_name(container_name); // ~/.azure/config let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( @@ -252,10 +245,6 @@ async fn get_azure_object_store(container_name: &str) -> MicrosoftAzure { azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) } -fn is_testing() -> bool { - std::env::var("PG_PARQUET_TEST").is_ok() -} - pub(crate) fn parse_uri(uri: &str) -> Url { if !uri.contains("://") { // local file From 31f6701acde4b0c0cabdf98c7d154bcdf4cc62d8 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Mon, 16 Dec 2024 19:00:03 +0300 Subject: [PATCH 04/15] merge --- .devcontainer/.env | 7 +- .devcontainer/create-test-buckets.sh | 7 -- .devcontainer/devcontainer.json | 1 - .devcontainer/docker-compose.yml | 11 +-- .devcontainer/gcs-entrypoint.sh | 17 +++++ .devcontainer/minio-entrypoint.sh | 20 ++++++ .github/workflows/ci.yml | 40 +++++++---- Cargo.lock | 33 ++++++--- Cargo.toml | 5 +- README.md | 18 +++-- src/arrow_parquet/uri_utils.rs | 103 +++++++++------------------ src/lib.rs | 1 + src/parquet_copy_hook/copy_utils.rs | 66 ++++++++++------- src/parquet_copy_hook/hook.rs | 4 +- src/pgrx_tests/object_store.rs | 82 ++++++++++++++++++++- src/pgrx_utils.rs | 10 ++- 16 files changed, 281 insertions(+), 144 deletions(-) delete mode 100644 .devcontainer/create-test-buckets.sh create mode 100755 .devcontainer/gcs-entrypoint.sh create mode 100755 .devcontainer/minio-entrypoint.sh diff --git a/.devcontainer/.env b/.devcontainer/.env index ebc69c1..866fcd7 100644 --- a/.devcontainer/.env +++ b/.devcontainer/.env @@ -2,6 +2,8 @@ AWS_ACCESS_KEY_ID=minioadmin AWS_SECRET_ACCESS_KEY=minioadmin AWS_REGION=us-east-1 +AWS_ENDPOINT_URL=http://localhost:9000 +AWS_ALLOW_HTTP=true AWS_S3_TEST_BUCKET=testbucket MINIO_ROOT_USER=minioadmin MINIO_ROOT_PASSWORD=minioadmin @@ -10,13 +12,16 @@ 
MINIO_ROOT_PASSWORD=minioadmin AZURE_STORAGE_ACCOUNT=devstoreaccount1 AZURE_STORAGE_KEY="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" AZURE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://localhost:10000/devstoreaccount1;" +AZURE_STORAGE_ENDPOINT=http://localhost:10000/devstoreaccount1 +AZURE_ALLOW_HTTP=true AZURE_TEST_CONTAINER_NAME=testcontainer AZURE_TEST_READ_ONLY_SAS="se=2100-05-05&sp=r&sv=2022-11-02&sr=c&sig=YMPFnAHKe9y0o3hFegncbwQTXtAyvsJEgPB2Ne1b9CQ%3D" AZURE_TEST_READ_WRITE_SAS="se=2100-05-05&sp=rcw&sv=2022-11-02&sr=c&sig=TPz2jEz0t9L651t6rTCQr%2BOjmJHkM76tnCGdcyttnlA%3D" # GCS tests GOOGLE_TEST_BUCKET=testbucket +GOOGLE_SERVICE_ACCOUNT_KEY='{"gcs_base_url": "http://localhost:4443","disable_oauth": true,"client_email": "","private_key_id": "","private_key": ""}' +GOOGLE_SERVICE_ENDPOINT=http://localhost:4443 # Others RUST_TEST_THREADS=1 -PG_PARQUET_TEST=true diff --git a/.devcontainer/create-test-buckets.sh b/.devcontainer/create-test-buckets.sh deleted file mode 100644 index 4c45e61..0000000 --- a/.devcontainer/create-test-buckets.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -aws --endpoint-url http://localhost:9000 s3 mb s3://$AWS_S3_TEST_BUCKET - -az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING - -curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index e2c90a8..90f0a2d 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -3,7 +3,6 @@ "dockerComposeFile": "docker-compose.yml", "service": "app", "workspaceFolder": "/workspace", - "postStartCommand": "bash .devcontainer/create-test-buckets.sh", "postAttachCommand": "sudo chown -R rust /workspace", "customizations": { "vscode": { diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 321f19f..12e48d8 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -3,7 +3,7 @@ services: build: context: . 
dockerfile: Dockerfile - command: sleep infinity + entrypoint: "./entrypoint.sh" network_mode: host volumes: - ..:/workspace @@ -13,7 +13,7 @@ services: - ${USERPROFILE}${HOME}/.aws:/home/rust/.aws:rw - ${USERPROFILE}${HOME}/.azure:/home/rust/.azure:rw - ${USERPROFILE}${HOME}/.config/gcloud:/home/rust/.config/gcloud:rw - + - ./entrypoint.sh:/entrypoint.sh env_file: - .env cap_add: @@ -28,13 +28,15 @@ services: env_file: - .env network_mode: host - command: server /data + entrypoint: "./entrypoint.sh" restart: unless-stopped healthcheck: test: ["CMD", "curl", "http://localhost:9000"] interval: 6s timeout: 2s retries: 3 + volumes: + - ./minio-entrypoint.sh:/entrypoint.sh azurite: image: mcr.microsoft.com/azure-storage/azurite @@ -43,7 +45,7 @@ services: network_mode: host restart: unless-stopped healthcheck: - test: ["CMD", "curl", "http://localhost:10000"] + test: ["CMD", "nc", "-z", "localhost", "10000"] interval: 6s timeout: 2s retries: 3 @@ -53,6 +55,7 @@ services: env_file: - .env network_mode: host + entrypoint: "./entrypoint.sh" command: -scheme http -public-host localhost:4443 restart: unless-stopped healthcheck: diff --git a/.devcontainer/gcs-entrypoint.sh b/.devcontainer/gcs-entrypoint.sh new file mode 100755 index 0000000..45d7315 --- /dev/null +++ b/.devcontainer/gcs-entrypoint.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +trap "echo 'Caught termination signal. Exiting...'; exit 0" SIGINT SIGTERM + +/bin/fake-gcs-server -data /data -scheme http -public-host localhost:4443 & + +gcs_pid=$! + +while ! curl $GOOGLE_SERVICE_ENDPOINT; do + echo "Waiting for $GOOGLE_SERVICE_ENDPOINT..." + sleep 1 +done + +# create bucket +curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "$GOOGLE_SERVICE_ENDPOINT/storage/v1/b" + +wait $gcs_pid diff --git a/.devcontainer/minio-entrypoint.sh b/.devcontainer/minio-entrypoint.sh new file mode 100755 index 0000000..7831ba5 --- /dev/null +++ b/.devcontainer/minio-entrypoint.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +trap "echo 'Caught termination signal. Exiting...'; exit 0" SIGINT SIGTERM + +minio server /data & + +minio_pid=$! + +while ! curl $AWS_ENDPOINT_URL; do + echo "Waiting for $AWS_ENDPOINT_URL..." + sleep 1 +done + +# set access key and secret key +mc alias set local $AWS_ENDPOINT_URL $MINIO_ROOT_USER $MINIO_ROOT_PASSWORD + +# create bucket +mc mb local/$AWS_S3_TEST_BUCKET + +wait $minio_pid diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 46625b6..c2c09ab 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -94,7 +94,7 @@ jobs: - name: Install and configure pgrx run: | cargo install --locked cargo-pgrx@0.12.9 - cargo pgrx init --pg${{ env.PG_MAJOR }} $(which pg_config) + cargo pgrx init --pg${{ env.PG_MAJOR }} /usr/lib/postgresql/${{ env.PG_MAJOR }}/bin/pg_config - name: Install cargo-llvm-cov for coverage report run: cargo install --locked cargo-llvm-cov@0.6.12 @@ -106,35 +106,47 @@ jobs: - name: Set up permissions for PostgreSQL run: | - sudo chmod a+rwx $(pg_config --pkglibdir) \ - $(pg_config --sharedir)/extension \ + sudo chmod a+rwx $(/usr/lib/postgresql/${{ env.PG_MAJOR }}/bin/pg_config --pkglibdir) \ + $(/usr/lib/postgresql/${{ env.PG_MAJOR }}/bin/pg_config --sharedir)/extension \ /var/run/postgresql/ - name: Start Minio for s3 emulator tests run: | - docker run -d --env-file .devcontainer/.env -p 9000:9000 minio/minio server /data - - while ! nc -z localhost 9000; do - echo "Waiting for localhost:9000..." 
- sleep 1 + docker run -d \ + --env-file .devcontainer/.env \ + -p 9000:9000 \ + --entrypoint "./entrypoint.sh" \ + --volume ./.devcontainer/minio-entrypoint.sh:/entrypoint.sh \ + minio/minio + + while ! curl $AWS_ENDPOINT_URL; do + echo "Waiting for $AWS_ENDPOINT_URL..." + sleep 1 done - aws --endpoint-url http://localhost:9000 s3 mb s3://$AWS_S3_TEST_BUCKET - - name: Start Azurite for Azure Blob Storage emulator tests run: | - docker run -d --env-file .devcontainer/.env -p 10000:10000 mcr.microsoft.com/azure-storage/azurite + docker run -d \ + --env-file .devcontainer/.env \ + -p 10000:10000 \ + mcr.microsoft.com/azure-storage/azurite - while ! nc -z localhost 10000; do - echo "Waiting for localhost:10000..." + while ! curl $AZURITE_URL; do + echo "Waiting for $AZURITE_URL..." sleep 1 done + # create container az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING - name: Start fake-gcs-server for Google Cloud Storage emulator tests run: | - docker run -d --env-file .devcontainer/.env -p 4443:4443 tustvold/fake-gcs-server -scheme http -filesystem-root /tmp/gcs -public-host localhost:4443 + docker run -d \ + --env-file .devcontainer/.env \ + -p 4443:4443 \ + --entrypoint "./entrypoint.sh" \ + --volume ./.devcontainer/gcs-entrypoint.sh:/entrypoint.sh \ + tustvold/fake-gcs-server while ! nc -z localhost 4443; do echo "Waiting for localhost:4443..." diff --git a/Cargo.lock b/Cargo.lock index 7305067..3fe968c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -317,7 +317,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.60.7", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -345,9 +345,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.3" +version = "1.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468" +checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -370,15 +370,15 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.50.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ada54e5f26ac246dc79727def52f7f8ed38915cb47781e2a72213957dc3a7d5" +checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.1", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -393,9 +393,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.2.5" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5619742a0d8f253be760bfbb8e8e8368c69e3587e4637af5754e488a611499b1" +checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -454,6 +454,15 @@ dependencies = [ "aws-smithy-types", ] +[[package]] +name = "aws-smithy-json" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095" +dependencies = [ + "aws-smithy-types", +] + [[package]] name = "aws-smithy-query" version = "0.60.7" @@ -466,9 +475,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.7.3" +version = "1.7.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "be28bd063fa91fd871d131fc8b68d7cd4c5fa0869bea68daca50dcb1cbd76be2" +checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -517,6 +526,7 @@ dependencies = [ "base64-simd", "bytes", "bytes-utils", + "futures-core", "http 0.2.12", "http 1.1.0", "http-body 0.4.6", @@ -529,6 +539,8 @@ dependencies = [ "ryu", "serde", "time", + "tokio", + "tokio-util", ] [[package]] @@ -2261,6 +2273,7 @@ dependencies = [ "arrow-schema", "aws-config", "aws-credential-types", + "aws-sdk-sts", "futures", "home", "object_store", diff --git a/Cargo.toml b/Cargo.toml index 87856e3..4742d06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,8 +23,9 @@ pg_test = [] arrow = {version = "53", default-features = false} arrow-cast = {version = "53", default-features = false} arrow-schema = {version = "53", default-features = false} -aws-config = { version = "1.5", default-features = false, features = ["rustls"]} -aws-credential-types = {version = "1.2", default-features = false} +aws-config = { version = "1", default-features = false, features = ["rustls"]} +aws-credential-types = {version = "1", default-features = false} +aws-sdk-sts = "1" futures = "0.3" home = "0.5" object_store = {version = "0.11", default-features = false, features = ["aws", "azure", "gcp"]} diff --git a/README.md b/README.md index 4f3b681..a0cb3b1 100644 --- a/README.md +++ b/README.md @@ -180,10 +180,14 @@ region = eu-central-1 Alternatively, you can use the following environment variables when starting postgres to configure the S3 client: - `AWS_ACCESS_KEY_ID`: the access key ID of the AWS account - `AWS_SECRET_ACCESS_KEY`: the secret access key of the AWS account +- `AWS_SESSION_TOKEN`: the session token for the AWS account - `AWS_REGION`: the default region of the AWS account -- `AWS_SHARED_CREDENTIALS_FILE`: an alternative location for the credentials file -- `AWS_CONFIG_FILE`: an alternative location for the config file -- `AWS_PROFILE`: the name of the profile from the credentials and config file (default profile name is `default`) +- `AWS_ENDPOINT_URL`: the endpoint +- `AWS_SHARED_CREDENTIALS_FILE`: an alternative location for the credentials file **(only via environment variables)** +- `AWS_CONFIG_FILE`: an alternative location for the config file **(only via environment variables)** +- `AWS_PROFILE`: the name of the profile from the credentials and config file (default profile name is `default`) **(only via environment variables)** +- `AWS_ALLOW_HTTP`: allows http endpoints **(only via environment variables)** + Supported S3 uri formats are shown below: - s3:// \ / \ @@ -206,7 +210,9 @@ Alternatively, you can use the following environment variables when starting pos - `AZURE_STORAGE_ACCOUNT`: the storage account name of the Azure Blob - `AZURE_STORAGE_KEY`: the storage key of the Azure Blob - `AZURE_STORAGE_SAS_TOKEN`: the storage SAS token for the Azure Blob -- `AZURE_CONFIG_FILE`: an alternative location for the config file +- `AZURE_STORAGE_ENDPOINT`: the endpoint **(only via environment variables)** +- `AZURE_CONFIG_FILE`: an alternative location for the config file **(only via environment variables)** +- `AZURE_ALLOW_HTTP`: allows http endpoints **(only via environment variables)** Supported Azure Blob Storage uri formats are shown below: - az:// \ / \ @@ -229,8 +235,8 @@ $ cat /tmp/gcs.json ``` Alternatively, you can use the following environment variables when starting postgres to configure 
the Google Cloud Storage client: -- `GOOGLE_SERVICE_ACCOUNT_KEY`: json serialized service account key -- `GOOGLE_SERVICE_ACCOUNT_PATH`: an alternative location for the config file +- `GOOGLE_SERVICE_ACCOUNT_KEY`: json serialized service account key **(only via environment variables)** +- `GOOGLE_SERVICE_ACCOUNT_PATH`: an alternative location for the config file **(only via environment variables)** Supported Google Cloud Storage uri formats are shown below: - gs:// \ / \ diff --git a/src/arrow_parquet/uri_utils.rs b/src/arrow_parquet/uri_utils.rs index 9f4dd72..7e82c69 100644 --- a/src/arrow_parquet/uri_utils.rs +++ b/src/arrow_parquet/uri_utils.rs @@ -4,11 +4,7 @@ use std::{ }; use arrow::datatypes::SchemaRef; -use aws_config::{ - environment::{EnvironmentVariableCredentialsProvider, EnvironmentVariableRegionProvider}, - meta::{credentials::CredentialsProviderChain, region::RegionProviderChain}, - profile::{ProfileFileCredentialsProvider, ProfileFileRegionProvider}, -}; +use aws_config::BehaviorVersion; use aws_credential_types::provider::ProvideCredentials; use home::home_dir; use ini::Ini; @@ -168,63 +164,50 @@ fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc AmazonS3 { - let mut aws_s3_builder = AmazonS3Builder::new().with_bucket_name(bucket_name); + let mut aws_s3_builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name); - if is_testing() { - // use minio for testing - aws_s3_builder = aws_s3_builder.with_endpoint("http://localhost:9000"); - aws_s3_builder = aws_s3_builder.with_allow_http(true); - } + // first tries environment variables and then the config files + let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) + .load() + .await; - let aws_profile_name = std::env::var("AWS_PROFILE").unwrap_or("default".to_string()); + if let Some(credential_provider) = sdk_config.credentials_provider() { + if let Ok(credentials) = credential_provider.provide_credentials().await { + // AWS_ACCESS_KEY_ID + aws_s3_builder = aws_s3_builder.with_access_key_id(credentials.access_key_id()); - let region_provider = RegionProviderChain::first_try(EnvironmentVariableRegionProvider::new()) - .or_else( - ProfileFileRegionProvider::builder() - .profile_name(aws_profile_name.clone()) - .build(), - ); + // AWS_SECRET_ACCESS_KEY + aws_s3_builder = aws_s3_builder.with_secret_access_key(credentials.secret_access_key()); - let region = region_provider.region().await; - - if let Some(region) = region { - aws_s3_builder = aws_s3_builder.with_region(region.to_string()); + if let Some(token) = credentials.session_token() { + // AWS_SESSION_TOKEN + aws_s3_builder = aws_s3_builder.with_token(token); + } + } } - let credential_provider = CredentialsProviderChain::first_try( - "Environment", - EnvironmentVariableCredentialsProvider::new(), - ) - .or_else( - "Profile", - ProfileFileCredentialsProvider::builder() - .profile_name(aws_profile_name) - .build(), - ); - - if let Ok(credentials) = credential_provider.provide_credentials().await { - aws_s3_builder = aws_s3_builder.with_access_key_id(credentials.access_key_id()); - - aws_s3_builder = aws_s3_builder.with_secret_access_key(credentials.secret_access_key()); + // AWS_ENDPOINT_URL + if let Some(aws_endpoint_url) = sdk_config.endpoint_url() { + aws_s3_builder = aws_s3_builder.with_endpoint(aws_endpoint_url); + } - if let Some(token) = credentials.session_token() { - aws_s3_builder = aws_s3_builder.with_token(token); - } + // AWS_REGION + if let Some(aws_region) = sdk_config.region() { + aws_s3_builder = 
aws_s3_builder.with_region(aws_region.as_ref()); } aws_s3_builder.build().unwrap_or_else(|e| panic!("{}", e)) } async fn get_azure_object_store(container_name: &str) -> MicrosoftAzure { - let mut azure_builder = MicrosoftAzureBuilder::new().with_container_name(container_name); - - if is_testing() { - // use azurite for testing - azure_builder = - azure_builder.with_endpoint("http://localhost:10000/devstoreaccount1".into()); - azure_builder = azure_builder.with_allow_http(true); - } + let mut azure_builder = MicrosoftAzureBuilder::from_env().with_container_name(container_name); // ~/.azure/config let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( @@ -285,26 +268,10 @@ async fn get_azure_object_store(container_name: &str) -> MicrosoftAzure { } async fn get_gcs_object_store(bucket_name: &str) -> GoogleCloudStorage { - let mut gcs_builder = GoogleCloudStorageBuilder::from_env().with_bucket_name(bucket_name); - - if is_testing() { - // use fake-gcp-server for testing - gcs_builder = gcs_builder.with_service_account_key( - "{ - \"gcs_base_url\": \"http://localhost:4443\", - \"disable_oauth\": true, - \"client_email\": \"\", - \"private_key_id\": \"\", - \"private_key\": \"\" - }", - ); - } - - gcs_builder.build().unwrap_or_else(|e| panic!("{}", e)) -} - -fn is_testing() -> bool { - std::env::var("PG_PARQUET_TEST").is_ok() + GoogleCloudStorageBuilder::from_env() + .with_bucket_name(bucket_name) + .build() + .unwrap_or_else(|e| panic!("{}", e)) } pub(crate) fn parse_uri(uri: &str) -> Url { diff --git a/src/lib.rs b/src/lib.rs index 817f224..100c80b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,7 @@ use pgrx::{prelude::*, GucContext, GucFlags, GucRegistry}; mod arrow_parquet; mod parquet_copy_hook; mod parquet_udfs; +#[cfg(any(test, feature = "pg_test"))] mod pgrx_tests; mod pgrx_utils; mod type_compat; diff --git a/src/parquet_copy_hook/copy_utils.rs b/src/parquet_copy_hook/copy_utils.rs index dde7848..48fa24f 100644 --- a/src/parquet_copy_hook/copy_utils.rs +++ b/src/parquet_copy_hook/copy_utils.rs @@ -1,7 +1,7 @@ use std::{ffi::CStr, str::FromStr}; use pgrx::{ - is_a, + ereport, is_a, pg_sys::{ addRangeTableEntryForRelation, defGetInt32, defGetInt64, defGetString, get_namespace_name, get_rel_namespace, makeDefElem, makeString, make_parsestate, quote_qualified_identifier, @@ -9,18 +9,21 @@ use pgrx::{ NodeTag::T_CopyStmt, Oid, ParseNamespaceItem, ParseState, PlannedStmt, QueryEnvironment, RangeVar, RangeVarGetRelidExtended, RowExclusiveLock, TupleDescInitEntry, }, - PgBox, PgList, PgRelation, PgTupleDesc, + PgBox, PgList, PgLogLevel, PgRelation, PgSqlErrorCode, PgTupleDesc, }; use url::Url; -use crate::arrow_parquet::{ - compression::{all_supported_compressions, PgParquetCompression}, - match_by::MatchBy, - parquet_writer::{DEFAULT_ROW_GROUP_SIZE, DEFAULT_ROW_GROUP_SIZE_BYTES}, - uri_utils::parse_uri, +use crate::{ + arrow_parquet::{ + compression::{all_supported_compressions, PgParquetCompression}, + match_by::MatchBy, + parquet_writer::{DEFAULT_ROW_GROUP_SIZE, DEFAULT_ROW_GROUP_SIZE_BYTES}, + uri_utils::parse_uri, + }, + pgrx_utils::extension_exists, }; -use super::pg_compat::strVal; +use super::{hook::ENABLE_PARQUET_COPY_HOOK, pg_compat::strVal}; pub(crate) fn validate_copy_to_options(p_stmt: &PgBox, uri: &Url) { validate_copy_option_names( @@ -297,7 +300,12 @@ pub(crate) fn copy_stmt_get_option( PgBox::null() } -pub(crate) fn is_copy_to_parquet_stmt(p_stmt: &PgBox) -> bool { +fn is_copy_parquet_stmt(p_stmt: &PgBox, copy_from: bool) -> bool { + // the GUC 
pg_parquet.enable_copy_hook must be set to true + if !ENABLE_PARQUET_COPY_HOOK.get() { + return false; + } + let is_copy_stmt = unsafe { is_a(p_stmt.utilityStmt, T_CopyStmt) }; if !is_copy_stmt { @@ -306,7 +314,7 @@ pub(crate) fn is_copy_to_parquet_stmt(p_stmt: &PgBox) -> bool { let copy_stmt = unsafe { PgBox::::from_pg(p_stmt.utilityStmt as _) }; - if copy_stmt.is_from { + if copy_from != copy_stmt.is_from { return false; } @@ -320,33 +328,41 @@ pub(crate) fn is_copy_to_parquet_stmt(p_stmt: &PgBox) -> bool { let uri = copy_stmt_uri(p_stmt).expect("uri is None"); - is_parquet_format_option(p_stmt) || is_parquet_uri(uri) -} - -pub(crate) fn is_copy_from_parquet_stmt(p_stmt: &PgBox) -> bool { - let is_copy_stmt = unsafe { is_a(p_stmt.utilityStmt, T_CopyStmt) }; - - if !is_copy_stmt { + if !is_parquet_format_option(p_stmt) && !is_parquet_uri(uri) { return false; } - let copy_stmt = unsafe { PgBox::::from_pg(p_stmt.utilityStmt as _) }; + // extension checks are done via catalog (not yet searched via cache by postgres till pg18) + // this is why we check them after the uri checks - if !copy_stmt.is_from { + // crunchy_query_engine should not be created + if extension_exists("crunchy_query_engine") { return false; } - if copy_stmt.is_program { - return false; - } + // pg_parquet should be created + if !extension_exists("pg_parquet") { + ereport!( + PgLogLevel::WARNING, + PgSqlErrorCode::ERRCODE_WARNING, + "pg_parquet can handle this COPY command but is not enabled", + "Run CREATE EXTENSION pg_parquet; to enable the pg_parquet extension.", + ); - if copy_stmt.filename.is_null() { return false; } - let uri = copy_stmt_uri(p_stmt).expect("uri is None"); + true +} - is_parquet_format_option(p_stmt) || is_parquet_uri(uri) +pub(crate) fn is_copy_to_parquet_stmt(p_stmt: &PgBox) -> bool { + let copy_from = false; + is_copy_parquet_stmt(p_stmt, copy_from) +} + +pub(crate) fn is_copy_from_parquet_stmt(p_stmt: &PgBox) -> bool { + let copy_from = true; + is_copy_parquet_stmt(p_stmt, copy_from) } fn is_parquet_format_option(p_stmt: &PgBox) -> bool { diff --git a/src/parquet_copy_hook/hook.rs b/src/parquet_copy_hook/hook.rs index cd167c0..e159937 100644 --- a/src/parquet_copy_hook/hook.rs +++ b/src/parquet_copy_hook/hook.rs @@ -128,7 +128,7 @@ extern "C" fn parquet_copy_hook( let query_env = unsafe { PgBox::from_pg(query_env) }; let mut completion_tag = unsafe { PgBox::from_pg(completion_tag) }; - if ENABLE_PARQUET_COPY_HOOK.get() && is_copy_to_parquet_stmt(&p_stmt) { + if is_copy_to_parquet_stmt(&p_stmt) { let nprocessed = process_copy_to_parquet(&p_stmt, query_string, ¶ms, &query_env); if !completion_tag.is_null() { @@ -136,7 +136,7 @@ extern "C" fn parquet_copy_hook( completion_tag.commandTag = CommandTag::CMDTAG_COPY; } return; - } else if ENABLE_PARQUET_COPY_HOOK.get() && is_copy_from_parquet_stmt(&p_stmt) { + } else if is_copy_from_parquet_stmt(&p_stmt) { let nprocessed = process_copy_from_parquet(&p_stmt, query_string, &query_env); if !completion_tag.is_null() { diff --git a/src/pgrx_tests/object_store.rs b/src/pgrx_tests/object_store.rs index 2e985f6..05744e3 100644 --- a/src/pgrx_tests/object_store.rs +++ b/src/pgrx_tests/object_store.rs @@ -2,6 +2,7 @@ mod tests { use std::io::Write; + use aws_config::BehaviorVersion; use pgrx::{pg_test, Spi}; use crate::pgrx_tests::common::TestTable; @@ -44,13 +45,15 @@ mod tests { std::env::remove_var("AWS_SECRET_ACCESS_KEY"); let region = std::env::var("AWS_REGION").unwrap(); std::env::remove_var("AWS_REGION"); + let endpoint = 
std::env::var("AWS_ENDPOINT_URL").unwrap(); + std::env::remove_var("AWS_ENDPOINT_URL"); let profile = "pg_parquet_test"; // create a config file let aws_config_file_content = format!( - "[profile {profile}]\nregion = {}\naws_access_key_id = {}\naws_secret_access_key = {}\n", - region, access_key_id, secret_access_key + "[profile {profile}]\nregion = {}\naws_access_key_id = {}\naws_secret_access_key = {}\nendpoint_url = {}\n", + region, access_key_id, secret_access_key, endpoint ); std::env::set_var("AWS_PROFILE", profile); @@ -212,6 +215,81 @@ mod tests { Spi::run(copy_from_command.as_str()).unwrap(); } + #[pg_test] + fn test_s3_object_store_with_temporary_token() { + let tokio_rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)); + + let s3_uri = tokio_rt.block_on(async { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let client = aws_sdk_sts::Client::new(&config); + + let assume_role_result = client + .assume_role() + .role_session_name("testsession") + .role_arn("arn:xxx:xxx:xxx:xxxx") + .send() + .await + .unwrap(); + + let assumed_creds = assume_role_result.credentials().unwrap(); + + std::env::set_var("AWS_ACCESS_KEY_ID", assumed_creds.access_key_id()); + std::env::set_var("AWS_SECRET_ACCESS_KEY", assumed_creds.secret_access_key()); + std::env::set_var("AWS_SESSION_TOKEN", assumed_creds.session_token()); + + let test_bucket_name: String = + std::env::var("AWS_S3_TEST_BUCKET").expect("AWS_S3_TEST_BUCKET not found"); + + format!("s3://{}/pg_parquet_test.parquet", test_bucket_name) + }); + + let test_table = TestTable::::new("int4".into()).with_uri(s3_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + #[should_panic(expected = "403 Forbidden")] + fn test_s3_object_store_with_missing_temporary_token_fail() { + let tokio_rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)); + + let s3_uri = tokio_rt.block_on(async { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let client = aws_sdk_sts::Client::new(&config); + + let assume_role_result = client + .assume_role() + .role_session_name("testsession") + .role_arn("arn:xxx:xxx:xxx:xxxx") + .send() + .await + .unwrap(); + + let assumed_creds = assume_role_result.credentials().unwrap(); + + // we do not set the session token on purpose + std::env::set_var("AWS_ACCESS_KEY_ID", assumed_creds.access_key_id()); + std::env::set_var("AWS_SECRET_ACCESS_KEY", assumed_creds.secret_access_key()); + + let test_bucket_name: String = + std::env::var("AWS_S3_TEST_BUCKET").expect("AWS_S3_TEST_BUCKET not found"); + + format!("s3://{}/pg_parquet_test.parquet", test_bucket_name) + }); + + let test_table = TestTable::::new("int4".into()).with_uri(s3_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + #[pg_test] #[should_panic(expected = "failed to parse bucket name")] fn test_s3_unsupported_uri() { diff --git a/src/pgrx_utils.rs b/src/pgrx_utils.rs index 0793def..6b2d0b4 100644 --- a/src/pgrx_utils.rs +++ b/src/pgrx_utils.rs @@ -2,8 +2,8 @@ use std::collections::HashSet; use pgrx::{ pg_sys::{ - getBaseType, get_element_type, lookup_rowtype_tupdesc, type_is_array, type_is_rowtype, - FormData_pg_attribute, InvalidOid, 
Oid, + getBaseType, get_element_type, get_extension_oid, lookup_rowtype_tupdesc, type_is_array, + type_is_rowtype, AsPgCStr, FormData_pg_attribute, InvalidOid, Oid, }, PgTupleDesc, }; @@ -99,3 +99,9 @@ pub(crate) fn domain_array_base_elem_typoid(domain_typoid: Oid) -> Oid { array_element_typoid(base_array_typoid) } + +pub(crate) fn extension_exists(extension_name: &str) -> bool { + let extension_name = extension_name.as_pg_cstr(); + let extension_oid = unsafe { get_extension_oid(extension_name, true) }; + extension_oid != InvalidOid +} From f3a31d0e9ce24046c90faa1a6a2927e7f9547b6f Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Mon, 16 Dec 2024 19:10:20 +0300 Subject: [PATCH 05/15] no curl in fake-gcs --- .devcontainer/docker-compose.yml | 3 +-- .devcontainer/entrypoint.sh | 3 +++ .devcontainer/gcs-entrypoint.sh | 17 ----------------- .github/workflows/ci.yml | 6 +++--- 4 files changed, 7 insertions(+), 22 deletions(-) delete mode 100755 .devcontainer/gcs-entrypoint.sh diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml index 12e48d8..2b75da6 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/docker-compose.yml @@ -55,11 +55,10 @@ services: env_file: - .env network_mode: host - entrypoint: "./entrypoint.sh" command: -scheme http -public-host localhost:4443 restart: unless-stopped healthcheck: - test: ["CMD", "curl", "http://localhost:4443"] + test: ["CMD", "nc", "-z", "localhost", "4443"] interval: 6s timeout: 2s retries: 3 diff --git a/.devcontainer/entrypoint.sh b/.devcontainer/entrypoint.sh index 43944ed..5408468 100755 --- a/.devcontainer/entrypoint.sh +++ b/.devcontainer/entrypoint.sh @@ -5,4 +5,7 @@ trap "echo 'Caught termination signal. Exiting...'; exit 0" SIGINT SIGTERM # create azurite container az storage container create -n $AZURE_TEST_CONTAINER_NAME --connection-string $AZURE_STORAGE_CONNECTION_STRING +# create fake-gcs bucket +curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "$GOOGLE_SERVICE_ENDPOINT/storage/v1/b" + sleep infinity diff --git a/.devcontainer/gcs-entrypoint.sh b/.devcontainer/gcs-entrypoint.sh deleted file mode 100755 index 45d7315..0000000 --- a/.devcontainer/gcs-entrypoint.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh - -trap "echo 'Caught termination signal. Exiting...'; exit 0" SIGINT SIGTERM - -/bin/fake-gcs-server -data /data -scheme http -public-host localhost:4443 & - -gcs_pid=$! - -while ! curl $GOOGLE_SERVICE_ENDPOINT; do - echo "Waiting for $GOOGLE_SERVICE_ENDPOINT..." - sleep 1 -done - -# create bucket -curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "$GOOGLE_SERVICE_ENDPOINT/storage/v1/b" - -wait $gcs_pid diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c2c09ab..687867f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -148,12 +148,12 @@ jobs: --volume ./.devcontainer/gcs-entrypoint.sh:/entrypoint.sh \ tustvold/fake-gcs-server - while ! nc -z localhost 4443; do - echo "Waiting for localhost:4443..." + while ! curl $GOOGLE_SERVICE_ENDPOINT; do + echo "Waiting for $GOOGLE_SERVICE_ENDPOINT..." 
sleep 1 done - curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" + curl -v -X POST --data-binary "{\"name\":\"$GOOGLE_TEST_BUCKET\"}" -H "Content-Type: application/json" "$GOOGLE_SERVICE_ENDPOINT/storage/v1/b" - name: Run tests run: | From 0c65a3b84a6dd3e6dfae48ba20a17be508d64249 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Mon, 16 Dec 2024 19:15:05 +0300 Subject: [PATCH 06/15] ci uri fix --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9b289fb..9a16df7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -131,8 +131,8 @@ jobs: -p 10000:10000 \ mcr.microsoft.com/azure-storage/azurite - while ! curl $AZURITE_URL; do - echo "Waiting for $AZURITE_URL..." + while ! curl $AZURE_STORAGE_ENDPOINT; do - echo "Waiting for $AZURE_STORAGE_ENDPOINT..." sleep 1 done From d76bf55dcf049ce3ebe389d045b58d5cebb6761a Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Mon, 16 Dec 2024 19:15:33 +0300 Subject: [PATCH 07/15] ci uri fix --- .github/workflows/ci.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 687867f..db7f7fb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -131,8 +131,8 @@ jobs: -p 10000:10000 \ mcr.microsoft.com/azure-storage/azurite - while ! curl $AZURITE_URL; do - echo "Waiting for $AZURITE_URL..." + while ! curl $AZURE_STORAGE_ENDPOINT; do + echo "Waiting for $AZURE_STORAGE_ENDPOINT..." sleep 1 done - name: Start fake-gcs-server for Google Cloud Storage emulator tests run: | docker run -d \ --env-file .devcontainer/.env \ -p 4443:4443 \ - --entrypoint "./entrypoint.sh" \ - --volume ./.devcontainer/gcs-entrypoint.sh:/entrypoint.sh \ tustvold/fake-gcs-server From 74df5b2f71305e45ac134b830c69d401cdd33513 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Mon, 16 Dec 2024 19:22:06 +0300 Subject: [PATCH 08/15] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a0cb3b1..370db7d 100644 --- a/README.md +++ b/README.md @@ -156,7 +156,7 @@ SELECT uri, encode(key, 'escape') as key, encode(value, 'escape') as value FROM ``` ## Object Store Support -`pg_parquet` supports reading and writing Parquet files from/to `S3` and `Azure Blob Storage` object stores. +`pg_parquet` supports reading and writing Parquet files from/to `S3`, `Azure Blob Storage` and `Google Cloud Storage` object stores. > [!NOTE] > To be able to write into a object store location, you need to grant `parquet_object_store_write` role to your current postgres user.
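The README lists the supported S3 and Azure Blob Storage URI forms. As a rough illustration of how such URIs reduce to a bucket name before an object store client is built, here is a minimal standalone sketch; it is not code from this patch series and only assumes the `url` crate, which the extension already depends on. The Azure forms (`az://{container}/key` and `https://{account}.blob.core.windows.net/{container}`) resolve a container name analogously, from the host or from the first path segment.

```rust
// Hypothetical, simplified sketch (not the extension's actual implementation):
// resolving a bucket name from the S3 URI forms listed in the README.
use url::Url;

fn bucket_from_s3_uri(uri: &Url) -> Option<String> {
    let host = uri.host_str()?;

    match uri.scheme() {
        // s3://{bucket}/key -> bucket is the host
        "s3" => Some(host.to_string()),
        // https://s3.amazonaws.com/{bucket}/key -> bucket is the first path segment
        "https" if host == "s3.amazonaws.com" => {
            uri.path_segments()?.next().map(str::to_string)
        }
        // https://{bucket}.s3.amazonaws.com/key -> bucket is the first host label
        "https" if host.ends_with(".s3.amazonaws.com") => {
            host.split('.').next().map(str::to_string)
        }
        _ => None,
    }
}

fn main() {
    for uri in [
        "s3://testbucket/pg_parquet_test.parquet",
        "https://s3.amazonaws.com/testbucket/pg_parquet_test.parquet",
        "https://testbucket.s3.amazonaws.com/pg_parquet_test.parquet",
    ] {
        let parsed = Url::parse(uri).expect("valid uri");
        assert_eq!(bucket_from_s3_uri(&parsed).as_deref(), Some("testbucket"));
    }
}
```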
From 6102ba903f494ef34f0b902f725043cdd5aac682 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Fri, 3 Jan 2025 17:17:35 +0300 Subject: [PATCH 09/15] improve coverage --- src/arrow_parquet/uri_utils.rs | 63 +++++++++++++--------------------- src/pgrx_tests/object_store.rs | 6 ++-- 2 files changed, 27 insertions(+), 42 deletions(-) diff --git a/src/arrow_parquet/uri_utils.rs b/src/arrow_parquet/uri_utils.rs index 9e20663..c15ac5f 100644 --- a/src/arrow_parquet/uri_utils.rs +++ b/src/arrow_parquet/uri_utils.rs @@ -54,14 +54,16 @@ fn parse_azure_blob_container(uri: &Url) -> Option { return Some(host.to_string()); } // https://{account}.blob.core.windows.net/{container} - else if host.ends_with("blob.core.windows.net") { + else if host.ends_with(".blob.core.windows.net") { let path_segments: Vec<&str> = uri.path_segments()?.collect(); - if !path_segments.is_empty() { - return Some(path_segments[0].to_string()); - } else { - return None; - } + // Container name is the first part of the path + return Some( + path_segments + .first() + .expect("unexpected error during parsing azure blob uri") + .to_string(), + ); } None @@ -77,14 +79,17 @@ fn parse_s3_bucket(uri: &Url) -> Option { // https://s3.amazonaws.com/{bucket}/key else if host == "s3.amazonaws.com" { let path_segments: Vec<&str> = uri.path_segments()?.collect(); - if !path_segments.is_empty() { - return Some(path_segments[0].to_string()); // Bucket name is the first part of the path - } else { - return None; - } + + // Bucket name is the first part of the path + return Some( + path_segments + .first() + .expect("unexpected error during parsing s3 uri") + .to_string(), + ); } // https://{bucket}.s3.amazonaws.com/key - else if host.ends_with("s3.amazonaws.com") { + else if host.ends_with(".s3.amazonaws.com") { let bucket_name = host.split('.').next()?; return Some(bucket_name.to_string()); } @@ -94,12 +99,14 @@ fn parse_s3_bucket(uri: &Url) -> Option { fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc, Path) { let (scheme, path) = - ObjectStoreScheme::parse(uri).unwrap_or_else(|_| panic!("unsupported uri {}", uri)); + ObjectStoreScheme::parse(uri).unwrap_or_else(|_| panic!("unrecognized uri {}", uri)); + // object_store crate can recognize a bunch of different schemes and paths, but we only support + // local, azure, and s3 schemes with a subset of all supported paths. 
match scheme { ObjectStoreScheme::AmazonS3 => { let bucket_name = parse_s3_bucket(uri).unwrap_or_else(|| { - panic!("failed to parse bucket name from uri: {}", uri); + panic!("unsupported s3 uri: {}", uri); }); let storage_container = PG_BACKEND_TOKIO_RUNTIME @@ -109,7 +116,7 @@ fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc { let container_name = parse_azure_blob_container(uri).unwrap_or_else(|| { - panic!("failed to parse container name from uri: {}", uri); + panic!("unsupported azure blob storage uri: {}", uri); }); let storage_container = PG_BACKEND_TOKIO_RUNTIME @@ -137,7 +144,7 @@ fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc { - panic!("unsupported uri {}", uri); + panic!("unsupported scheme {} in uri {}", uri.scheme(), uri); } } } @@ -252,29 +259,7 @@ pub(crate) fn parse_uri(uri: &str) -> Url { .unwrap_or_else(|_| panic!("not a valid file path: {}", uri)); } - let uri = Url::parse(uri).unwrap_or_else(|e| panic!("{}", e)); - - let (scheme, _) = - ObjectStoreScheme::parse(&uri).unwrap_or_else(|_| panic!("unsupported uri {}", uri)); - - if scheme == ObjectStoreScheme::AmazonS3 { - parse_s3_bucket(&uri) - .unwrap_or_else(|| panic!("failed to parse bucket name from s3 uri {}", uri)); - } else if scheme == ObjectStoreScheme::MicrosoftAzure { - parse_azure_blob_container(&uri).unwrap_or_else(|| { - panic!( - "failed to parse container name from azure blob storage uri {}", - uri - ) - }); - } else { - panic!( - "unsupported uri {}. Only Azure and S3 uris are supported.", - uri - ); - }; - - uri + Url::parse(uri).unwrap_or_else(|e| panic!("{}", e)) } pub(crate) fn uri_as_string(uri: &Url) -> String { diff --git a/src/pgrx_tests/object_store.rs b/src/pgrx_tests/object_store.rs index 19d0dc6..cbab4fd 100644 --- a/src/pgrx_tests/object_store.rs +++ b/src/pgrx_tests/object_store.rs @@ -291,7 +291,7 @@ mod tests { } #[pg_test] - #[should_panic(expected = "failed to parse bucket name")] + #[should_panic(expected = "unsupported s3 uri")] fn test_s3_unsupported_uri() { let cloudflare_s3_uri = "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket".into(); @@ -478,7 +478,7 @@ mod tests { } #[pg_test] - #[should_panic(expected = "failed to parse container name")] + #[should_panic(expected = "unsupported azure blob storage uri")] fn test_azure_blob_unsupported_uri() { let fabric_azure_blob_uri = "https://ACCOUNT.dfs.fabric.microsoft.com".into(); @@ -489,7 +489,7 @@ mod tests { } #[pg_test] - #[should_panic(expected = "unsupported uri gs://testbucket")] + #[should_panic(expected = "unsupported scheme gs in uri gs://testbucket")] fn test_unsupported_uri() { let test_table = TestTable::::new("int4".into()).with_uri("gs://testbucket".to_string()); From 24ff5231902813ee1a7e0560d76d0bcb540b0006 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Mon, 6 Jan 2025 19:29:21 +0300 Subject: [PATCH 10/15] support connection string --- README.md | 2 +- src/arrow_parquet/parquet_reader.rs | 3 +- src/arrow_parquet/parquet_writer.rs | 3 +- src/arrow_parquet/uri_utils.rs | 243 +--------------------------- src/lib.rs | 13 ++ src/object_store.rs | 58 +++++++ src/object_store/aws.rs | 92 +++++++++++ src/object_store/azure.rs | 202 +++++++++++++++++++++++ src/object_store/local_file.rs | 21 +++ src/pgrx_tests/object_store.rs | 91 +++++++++-- 10 files changed, 477 insertions(+), 251 deletions(-) create mode 100644 src/object_store.rs create mode 100644 src/object_store/aws.rs create mode 100644 src/object_store/azure.rs create mode 100644 src/object_store/local_file.rs diff --git 
a/README.md b/README.md index cc61366..8b38bfc 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,6 @@ Alternatively, you can use the following environment variables when starting pos Supported S3 uri formats are shown below: - s3:// \ / \ -- s3a:// \ / \ - https:// \.s3.amazonaws.com / \ - https:// s3.amazonaws.com / \ / \ @@ -209,6 +208,7 @@ key = Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/ Alternatively, you can use the following environment variables when starting postgres to configure the Azure Blob Storage client: - `AZURE_STORAGE_ACCOUNT`: the storage account name of the Azure Blob - `AZURE_STORAGE_KEY`: the storage key of the Azure Blob +- `AZURE_STORAGE_CONNECTION_STRING`: the connection string for the Azure Blob (this can be set instead of specifying account name and key) - `AZURE_STORAGE_SAS_TOKEN`: the storage SAS token for the Azure Blob - `AZURE_STORAGE_ENDPOINT`: the endpoint **(only via environment variables)** - `AZURE_CONFIG_FILE`: an alternative location for the config file **(only via environment variables)** diff --git a/src/arrow_parquet/parquet_reader.rs b/src/arrow_parquet/parquet_reader.rs index 6790513..b64b238 100644 --- a/src/arrow_parquet/parquet_reader.rs +++ b/src/arrow_parquet/parquet_reader.rs @@ -25,6 +25,7 @@ use crate::{ }, pgrx_utils::{collect_attributes_for, CollectAttributesFor}, type_compat::{geometry::reset_postgis_context, map::reset_map_context}, + PG_BACKEND_TOKIO_RUNTIME, }; use super::{ @@ -33,7 +34,7 @@ use super::{ schema_parser::{ ensure_file_schema_match_tupledesc_schema, parse_arrow_schema_from_attributes, }, - uri_utils::{parquet_reader_from_uri, PG_BACKEND_TOKIO_RUNTIME}, + uri_utils::parquet_reader_from_uri, }; pub(crate) struct ParquetReaderContext { diff --git a/src/arrow_parquet/parquet_writer.rs b/src/arrow_parquet/parquet_writer.rs index e93ea8b..4f5713f 100644 --- a/src/arrow_parquet/parquet_writer.rs +++ b/src/arrow_parquet/parquet_writer.rs @@ -15,10 +15,11 @@ use crate::{ schema_parser::{ parquet_schema_string_from_attributes, parse_arrow_schema_from_attributes, }, - uri_utils::{parquet_writer_from_uri, PG_BACKEND_TOKIO_RUNTIME}, + uri_utils::parquet_writer_from_uri, }, pgrx_utils::{collect_attributes_for, CollectAttributesFor}, type_compat::{geometry::reset_postgis_context, map::reset_map_context}, + PG_BACKEND_TOKIO_RUNTIME, }; use super::pg_to_arrow::{ diff --git a/src/arrow_parquet/uri_utils.rs b/src/arrow_parquet/uri_utils.rs index c15ac5f..6eadc91 100644 --- a/src/arrow_parquet/uri_utils.rs +++ b/src/arrow_parquet/uri_utils.rs @@ -1,20 +1,6 @@ -use std::{ - panic, - sync::{Arc, LazyLock}, -}; +use std::{panic, sync::Arc}; use arrow::datatypes::SchemaRef; -use aws_config::BehaviorVersion; -use aws_credential_types::provider::ProvideCredentials; -use home::home_dir; -use ini::Ini; -use object_store::{ - aws::{AmazonS3, AmazonS3Builder}, - azure::{AzureConfigKey, MicrosoftAzure, MicrosoftAzureBuilder}, - local::LocalFileSystem, - path::Path, - ObjectStore, ObjectStoreScheme, -}; use parquet::{ arrow::{ arrow_to_parquet_schema, @@ -29,229 +15,16 @@ use pgrx::{ ereport, pg_sys::{get_role_oid, has_privs_of_role, superuser, AsPgCStr, GetUserId}, }; -use tokio::runtime::Runtime; use url::Url; -use crate::arrow_parquet::parquet_writer::DEFAULT_ROW_GROUP_SIZE; +use crate::{ + arrow_parquet::parquet_writer::DEFAULT_ROW_GROUP_SIZE, object_store::create_object_store, + PG_BACKEND_TOKIO_RUNTIME, +}; const PARQUET_OBJECT_STORE_READ_ROLE: &str = "parquet_object_store_read"; const 
PARQUET_OBJECT_STORE_WRITE_ROLE: &str = "parquet_object_store_write"; -// PG_BACKEND_TOKIO_RUNTIME creates a tokio runtime that uses the current thread -// to run the tokio reactor. This uses the same thread that is running the Postgres backend. -pub(crate) static PG_BACKEND_TOKIO_RUNTIME: LazyLock = LazyLock::new(|| { - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)) -}); - -fn parse_azure_blob_container(uri: &Url) -> Option { - let host = uri.host_str()?; - - // az(ure)://{container}/key - if uri.scheme() == "az" || uri.scheme() == "azure" { - return Some(host.to_string()); - } - // https://{account}.blob.core.windows.net/{container} - else if host.ends_with(".blob.core.windows.net") { - let path_segments: Vec<&str> = uri.path_segments()?.collect(); - - // Container name is the first part of the path - return Some( - path_segments - .first() - .expect("unexpected error during parsing azure blob uri") - .to_string(), - ); - } - - None -} - -fn parse_s3_bucket(uri: &Url) -> Option { - let host = uri.host_str()?; - - // s3(a)://{bucket}/key - if uri.scheme() == "s3" || uri.scheme() == "s3a" { - return Some(host.to_string()); - } - // https://s3.amazonaws.com/{bucket}/key - else if host == "s3.amazonaws.com" { - let path_segments: Vec<&str> = uri.path_segments()?.collect(); - - // Bucket name is the first part of the path - return Some( - path_segments - .first() - .expect("unexpected error during parsing s3 uri") - .to_string(), - ); - } - // https://{bucket}.s3.amazonaws.com/key - else if host.ends_with(".s3.amazonaws.com") { - let bucket_name = host.split('.').next()?; - return Some(bucket_name.to_string()); - } - - None -} - -fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc, Path) { - let (scheme, path) = - ObjectStoreScheme::parse(uri).unwrap_or_else(|_| panic!("unrecognized uri {}", uri)); - - // object_store crate can recognize a bunch of different schemes and paths, but we only support - // local, azure, and s3 schemes with a subset of all supported paths. - match scheme { - ObjectStoreScheme::AmazonS3 => { - let bucket_name = parse_s3_bucket(uri).unwrap_or_else(|| { - panic!("unsupported s3 uri: {}", uri); - }); - - let storage_container = PG_BACKEND_TOKIO_RUNTIME - .block_on(async { Arc::new(get_s3_object_store(&bucket_name).await) }); - - (storage_container, path) - } - ObjectStoreScheme::MicrosoftAzure => { - let container_name = parse_azure_blob_container(uri).unwrap_or_else(|| { - panic!("unsupported azure blob storage uri: {}", uri); - }); - - let storage_container = PG_BACKEND_TOKIO_RUNTIME - .block_on(async { Arc::new(get_azure_object_store(&container_name).await) }); - - (storage_container, path) - } - ObjectStoreScheme::Local => { - let uri = uri_as_string(uri); - - if !copy_from { - // create or overwrite the local file - std::fs::OpenOptions::new() - .write(true) - .truncate(true) - .create(true) - .open(&uri) - .unwrap_or_else(|e| panic!("{}", e)); - } - - let storage_container = Arc::new(LocalFileSystem::new()); - - let path = Path::from_filesystem_path(&uri).unwrap_or_else(|e| panic!("{}", e)); - - (storage_container, path) - } - _ => { - panic!("unsupported scheme {} in uri {}", uri.scheme(), uri); - } - } -} - -// get_s3_object_store creates an AmazonS3 object store with the given bucket name. -// It is configured by environment variables and aws config files as fallback method. 
-// We need to read the config files to make the fallback method work since object_store -// does not provide a way to read them. Currently, we only support to extract -// "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN", "AWS_ENDPOINT_URL", -// and "AWS_REGION" from the config files. -async fn get_s3_object_store(bucket_name: &str) -> AmazonS3 { - let mut aws_s3_builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name); - - // first tries environment variables and then the config files - let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) - .load() - .await; - - if let Some(credential_provider) = sdk_config.credentials_provider() { - if let Ok(credentials) = credential_provider.provide_credentials().await { - // AWS_ACCESS_KEY_ID - aws_s3_builder = aws_s3_builder.with_access_key_id(credentials.access_key_id()); - - // AWS_SECRET_ACCESS_KEY - aws_s3_builder = aws_s3_builder.with_secret_access_key(credentials.secret_access_key()); - - if let Some(token) = credentials.session_token() { - // AWS_SESSION_TOKEN - aws_s3_builder = aws_s3_builder.with_token(token); - } - } - } - - // AWS_ENDPOINT_URL - if let Some(aws_endpoint_url) = sdk_config.endpoint_url() { - aws_s3_builder = aws_s3_builder.with_endpoint(aws_endpoint_url); - } - - // AWS_REGION - if let Some(aws_region) = sdk_config.region() { - aws_s3_builder = aws_s3_builder.with_region(aws_region.as_ref()); - } - - aws_s3_builder.build().unwrap_or_else(|e| panic!("{}", e)) -} - -async fn get_azure_object_store(container_name: &str) -> MicrosoftAzure { - let mut azure_builder = MicrosoftAzureBuilder::from_env().with_container_name(container_name); - - // ~/.azure/config - let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( - home_dir() - .expect("failed to get home directory") - .join(".azure") - .join("config") - .to_str() - .expect("failed to convert path to string") - .to_string(), - ); - - let azure_config_content = Ini::load_from_file(&azure_config_file_path).ok(); - - // storage account - let azure_blob_account = match std::env::var("AZURE_STORAGE_ACCOUNT") { - Ok(account) => Some(account), - Err(_) => azure_config_content - .as_ref() - .and_then(|ini| ini.section(Some("storage"))) - .and_then(|section| section.get("account")) - .map(|account| account.to_string()), - }; - - if let Some(azure_blob_account) = azure_blob_account { - azure_builder = azure_builder.with_account(azure_blob_account); - } - - // storage key - let azure_blob_key = match std::env::var("AZURE_STORAGE_KEY") { - Ok(key) => Some(key), - Err(_) => azure_config_content - .as_ref() - .and_then(|ini| ini.section(Some("storage"))) - .and_then(|section| section.get("key")) - .map(|key| key.to_string()), - }; - - if let Some(azure_blob_key) = azure_blob_key { - azure_builder = azure_builder.with_access_key(azure_blob_key); - } - - // sas token - let azure_blob_sas_token = match std::env::var("AZURE_STORAGE_SAS_TOKEN") { - Ok(token) => Some(token), - Err(_) => azure_config_content - .as_ref() - .and_then(|ini| ini.section(Some("storage"))) - .and_then(|section| section.get("sas_token")) - .map(|token| token.to_string()), - }; - - if let Some(azure_blob_sas_token) = azure_blob_sas_token { - azure_builder = azure_builder.with_config(AzureConfigKey::SasKey, azure_blob_sas_token); - } - - azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) -} - pub(crate) fn parse_uri(uri: &str) -> Url { if !uri.contains("://") { // local file @@ -285,7 +58,7 @@ pub(crate) fn parquet_schema_from_uri(uri: &Url) -> 
SchemaDescriptor { pub(crate) fn parquet_metadata_from_uri(uri: &Url) -> Arc { let copy_from = true; - let (parquet_object_store, location) = object_store_with_location(uri, copy_from); + let (parquet_object_store, location) = create_object_store(uri, copy_from); PG_BACKEND_TOKIO_RUNTIME.block_on(async { let object_store_meta = parquet_object_store @@ -308,7 +81,7 @@ pub(crate) fn parquet_metadata_from_uri(uri: &Url) -> Arc { pub(crate) fn parquet_reader_from_uri(uri: &Url) -> ParquetRecordBatchStream { let copy_from = true; - let (parquet_object_store, location) = object_store_with_location(uri, copy_from); + let (parquet_object_store, location) = create_object_store(uri, copy_from); PG_BACKEND_TOKIO_RUNTIME.block_on(async { let object_store_meta = parquet_object_store @@ -340,7 +113,7 @@ pub(crate) fn parquet_writer_from_uri( writer_props: WriterProperties, ) -> AsyncArrowWriter { let copy_from = false; - let (parquet_object_store, location) = object_store_with_location(uri, copy_from); + let (parquet_object_store, location) = create_object_store(uri, copy_from); let parquet_object_writer = ParquetObjectWriter::new(parquet_object_store, location); diff --git a/src/lib.rs b/src/lib.rs index 100c80b..16aa1d9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,12 @@ +use std::sync::LazyLock; + use parquet_copy_hook::hook::{init_parquet_copy_hook, ENABLE_PARQUET_COPY_HOOK}; use parquet_copy_hook::pg_compat::MarkGUCPrefixReserved; use pgrx::{prelude::*, GucContext, GucFlags, GucRegistry}; +use tokio::runtime::Runtime; mod arrow_parquet; +mod object_store; mod parquet_copy_hook; mod parquet_udfs; #[cfg(any(test, feature = "pg_test"))] @@ -20,6 +24,15 @@ pgrx::pg_module_magic!(); extension_sql_file!("../sql/bootstrap.sql", name = "role_setup", bootstrap); +// PG_BACKEND_TOKIO_RUNTIME creates a tokio runtime that uses the current thread +// to run the tokio reactor. This uses the same thread that is running the Postgres backend. +pub(crate) static PG_BACKEND_TOKIO_RUNTIME: LazyLock = LazyLock::new(|| { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)) +}); + #[pg_guard] pub extern "C" fn _PG_init() { GucRegistry::define_bool_guc( diff --git a/src/object_store.rs b/src/object_store.rs new file mode 100644 index 0000000..50cf230 --- /dev/null +++ b/src/object_store.rs @@ -0,0 +1,58 @@ +use std::sync::Arc; + +use object_store::{path::Path, ObjectStore, ObjectStoreScheme}; +use url::Url; + +use crate::{ + arrow_parquet::uri_utils::uri_as_string, + object_store::{ + aws::create_s3_object_store, azure::create_azure_object_store, + local_file::create_local_file_object_store, + }, + PG_BACKEND_TOKIO_RUNTIME, +}; + +pub(crate) mod aws; +pub(crate) mod azure; +pub(crate) mod local_file; + +pub(crate) fn create_object_store(uri: &Url, copy_from: bool) -> (Arc, Path) { + let (scheme, path) = ObjectStoreScheme::parse(uri).unwrap_or_else(|_| { + panic!( + "unrecognized uri {}. pg_parquet supports s3:// or azure:// schemes.", + uri + ) + }); + + // object_store crate can recognize a bunch of different schemes and paths, but we only support + // local, azure, and s3 schemes with a subset of all supported paths. 
+ match scheme { + ObjectStoreScheme::AmazonS3 => { + let storage_container = PG_BACKEND_TOKIO_RUNTIME + .block_on(async { Arc::new(create_s3_object_store(uri).await) }); + + (storage_container, path) + } + ObjectStoreScheme::MicrosoftAzure => { + let storage_container = PG_BACKEND_TOKIO_RUNTIME + .block_on(async { Arc::new(create_azure_object_store(uri).await) }); + + (storage_container, path) + } + ObjectStoreScheme::Local => { + let storage_container = Arc::new(create_local_file_object_store(uri, copy_from)); + + let path = + Path::from_filesystem_path(uri_as_string(uri)).unwrap_or_else(|e| panic!("{}", e)); + + (storage_container, path) + } + _ => { + panic!( + "unsupported scheme {} in uri {}. pg_parquet supports s3:// or azure:// schemes.", + uri.scheme(), + uri + ); + } + } +} diff --git a/src/object_store/aws.rs b/src/object_store/aws.rs new file mode 100644 index 0000000..076e3d4 --- /dev/null +++ b/src/object_store/aws.rs @@ -0,0 +1,92 @@ +use aws_config::BehaviorVersion; +use aws_sdk_sts::config::ProvideCredentials; +use object_store::aws::{AmazonS3, AmazonS3Builder}; +use url::Url; + +// create_s3_object_store creates an AmazonS3 object store with the given bucket name. +// It is configured by environment variables and aws config files as fallback method. +// We need to read the config files to make the fallback method work since object_store +// does not provide a way to read them. Currently, we only support following environment +// variables and config parameters: +// - AWS_ACCESS_KEY_ID +// - AWS_SECRET_ACCESS_KEY +// - AWS_SESSION_TOKEN +// - AWS_ENDPOINT_URL +// - AWS_REGION +// - AWS_SHARED_CREDENTIALS_FILE (env var only) +// - AWS_CONFIG_FILE (env var only) +// - AWS_PROFILE (env var only) +// - AWS_ALLOW_HTTP (env var only, object_store specific) +pub(crate) async fn create_s3_object_store(uri: &Url) -> AmazonS3 { + let bucket_name = parse_s3_bucket(uri).unwrap_or_else(|| { + panic!("unsupported s3 uri: {}", uri); + }); + + // we do not use builder::from_env() here because not all environment variables have + // a fallback to the config files + let mut aws_s3_builder = AmazonS3Builder::new().with_bucket_name(bucket_name); + + if let Ok(allow_http) = std::env::var("AWS_ALLOW_HTTP") { + aws_s3_builder = aws_s3_builder.with_allow_http(allow_http.parse().unwrap_or(false)); + } + + // first tries environment variables and then the config files + let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) + .load() + .await; + + if let Some(credential_provider) = sdk_config.credentials_provider() { + if let Ok(credentials) = credential_provider.provide_credentials().await { + // AWS_ACCESS_KEY_ID + aws_s3_builder = aws_s3_builder.with_access_key_id(credentials.access_key_id()); + + // AWS_SECRET_ACCESS_KEY + aws_s3_builder = aws_s3_builder.with_secret_access_key(credentials.secret_access_key()); + + if let Some(token) = credentials.session_token() { + // AWS_SESSION_TOKEN + aws_s3_builder = aws_s3_builder.with_token(token); + } + } + } + + // AWS_ENDPOINT_URL + if let Some(aws_endpoint_url) = sdk_config.endpoint_url() { + aws_s3_builder = aws_s3_builder.with_endpoint(aws_endpoint_url); + } + + // AWS_REGION + if let Some(aws_region) = sdk_config.region() { + aws_s3_builder = aws_s3_builder.with_region(aws_region.as_ref()); + } + + aws_s3_builder.build().unwrap_or_else(|e| panic!("{}", e)) +} + +fn parse_s3_bucket(uri: &Url) -> Option { + let host = uri.host_str()?; + + // s3(a)://{bucket}/key + if uri.scheme() == "s3" { + return Some(host.to_string()); + } + // 
https://s3.amazonaws.com/{bucket}/key + else if host == "s3.amazonaws.com" { + let path_segments: Vec<&str> = uri.path_segments()?.collect(); + + // Bucket name is the first part of the path + return Some( + path_segments + .first() + .expect("unexpected error during parsing s3 uri") + .to_string(), + ); + } + // https://{bucket}.s3.amazonaws.com/key + else if host.ends_with(".s3.amazonaws.com") { + let bucket_name = host.split('.').next()?; + return Some(bucket_name.to_string()); + } + + None +} diff --git a/src/object_store/azure.rs b/src/object_store/azure.rs new file mode 100644 index 0000000..33232ad --- /dev/null +++ b/src/object_store/azure.rs @@ -0,0 +1,202 @@ +use home::home_dir; +use ini::Ini; +use object_store::azure::{AzureConfigKey, MicrosoftAzure, MicrosoftAzureBuilder}; +use url::Url; + +// create_azure_object_store creates a MicrosoftAzure object store with the given container name. +// It is configured by environment variables and azure config files as fallback method. +// We need to read the config files to make the fallback method work since object_store +// does not provide a way to read them. Currently, we only support following environment +// variables and config parameters: +// - AZURE_STORAGE_ACCOUNT +// - AZURE_STORAGE_KEY +// - AZURE_STORAGE_CONNECTION_STRING +// - AZURE_STORAGE_SAS_TOKEN +// - AZURE_CONFIG_FILE (env var only, object_store specific) +// - AZURE_STORAGE_ENDPOINT (env var only, object_store specific) +// - AZURE_ALLOW_HTTP (env var only, object_store specific) +pub(crate) async fn create_azure_object_store(uri: &Url) -> MicrosoftAzure { + let container_name = parse_azure_blob_container(uri).unwrap_or_else(|| { + panic!("unsupported azure blob storage uri: {}", uri); + }); + + let mut azure_builder = MicrosoftAzureBuilder::new().with_container_name(container_name); + + let azure_blob_config = AzureStorageConfig::with_provider_chain(); + + // account name + if let Some(account_name) = azure_blob_config.account_name { + azure_builder = azure_builder.with_account(account_name); + } + + // account key + if let Some(account_key) = azure_blob_config.account_key { + azure_builder = azure_builder.with_access_key(account_key); + } + + // sas token + if let Some(sas_token) = azure_blob_config.sas_token { + azure_builder = azure_builder.with_config(AzureConfigKey::SasKey, sas_token); + } + + // allow http + azure_builder = azure_builder.with_allow_http(azure_blob_config.allow_http); + + // endpoint + if let Some(endpoint) = azure_blob_config.endpoint { + azure_builder = azure_builder.with_endpoint(endpoint); + } + + azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) +} + +fn parse_azure_blob_container(uri: &Url) -> Option { + let host = uri.host_str()?; + + // az(ure)://{container}/key + if uri.scheme() == "az" || uri.scheme() == "azure" { + return Some(host.to_string()); + } + // https://{account}.blob.core.windows.net/{container} + else if host.ends_with(".blob.core.windows.net") { + let path_segments: Vec<&str> = uri.path_segments()?.collect(); + + // Container name is the first part of the path + return Some( + path_segments + .first() + .expect("unexpected error during parsing azure blob uri") + .to_string(), + ); + } + + None +} + +// AzureStorageConfig represents the configuration for Azure Blob Storage. +// There is no proper azure sdk config crate that can read the config files. +// So, we need to read the config files manually from azure's ini config. 
+// See https://learn.microsoft.com/en-us/cli/azure/azure-cli-configuration?view=azure-cli-latest +struct AzureStorageConfig { + account_name: Option, + account_key: Option, + sas_token: Option, + endpoint: Option, + allow_http: bool, +} + +impl AzureStorageConfig { + fn with_provider_chain() -> Self { + // ~/.azure/config + let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( + home_dir() + .expect("failed to get home directory") + .join(".azure") + .join("config") + .to_str() + .expect("failed to convert path to string") + .to_string(), + ); + + let azure_config_content = Ini::load_from_file(&azure_config_file_path).ok(); + + // connection string + let connection_string = match std::env::var("AZURE_STORAGE_CONNECTION_STRING") { + Ok(connection_string) => Some(connection_string), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("connection_string")) + .map(|connection_string| connection_string.to_string()), + }; + + // connection string has the highest priority + if let Some(connection_string) = connection_string { + return Self::from_connection_string(&connection_string); + } + + // account name + let account_name = match std::env::var("AZURE_STORAGE_ACCOUNT") { + Ok(account) => Some(account), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("account")) + .map(|account| account.to_string()), + }; + + // account key + let account_key = match std::env::var("AZURE_STORAGE_KEY") { + Ok(key) => Some(key), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("key")) + .map(|key| key.to_string()), + }; + + // sas token + let sas_token = match std::env::var("AZURE_STORAGE_SAS_TOKEN") { + Ok(token) => Some(token), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("sas_token")) + .map(|token| token.to_string()), + }; + + // endpoint + let endpoint = std::env::var("AZURE_STORAGE_ENDPOINT").ok(); + + // allow http + let allow_http = std::env::var("AZURE_ALLOW_HTTP") + .ok() + .map(|allow_http| allow_http.parse().unwrap_or(false)) + .unwrap_or(false); + + AzureStorageConfig { + account_name, + account_key, + sas_token, + endpoint, + allow_http, + } + } + + // from_connection_string parses AzureBlobConfig from the given connection string. 
+ // See https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string#create-a-connection-string-for-an-explicit-storage-endpoint + fn from_connection_string(connection_string: &str) -> Self { + let mut account_name = None; + let mut account_key = None; + let mut sas_token = None; + let mut endpoint = None; + let mut allow_http = false; + + for pair in connection_string.trim_end_matches(';').split(';') { + let (key, value) = pair + .split_once('=') + .expect("invalid azure connection string format"); + + match key { + "AccountName" => account_name = Some(value.to_string()), + "AccountKey" => account_key = Some(value.to_string()), + "SharedAccessSignature" => sas_token = Some(value.to_string()), + "BlobEndpoint" => endpoint = Some(value.to_string()), + "DefaultEndpointsProtocol" => { + allow_http = value.to_lowercase() == "http"; + } + _ => { + panic!("unsupported config key in azure connection string: {}", key); + } + } + } + + AzureStorageConfig { + account_name, + account_key, + sas_token, + endpoint, + allow_http, + } + } +} diff --git a/src/object_store/local_file.rs b/src/object_store/local_file.rs new file mode 100644 index 0000000..938dde4 --- /dev/null +++ b/src/object_store/local_file.rs @@ -0,0 +1,21 @@ +use object_store::local::LocalFileSystem; +use url::Url; + +use super::uri_as_string; + +// create_local_file_object_store creates a LocalFileSystem object store with the given path. +pub(crate) fn create_local_file_object_store(uri: &Url, copy_from: bool) -> LocalFileSystem { + let path = uri_as_string(uri); + + if !copy_from { + // create or overwrite the local file + std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(path) + .unwrap_or_else(|e| panic!("{}", e)); + } + + LocalFileSystem::new() +} diff --git a/src/pgrx_tests/object_store.rs b/src/pgrx_tests/object_store.rs index cbab4fd..08fc988 100644 --- a/src/pgrx_tests/object_store.rs +++ b/src/pgrx_tests/object_store.rs @@ -14,7 +14,6 @@ mod tests { let s3_uris = [ format!("s3://{}/pg_parquet_test.parquet", test_bucket_name), - format!("s3a://{}/pg_parquet_test.parquet", test_bucket_name), format!( "https://s3.amazonaws.com/{}/pg_parquet_test.parquet", test_bucket_name @@ -81,7 +80,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_s3_with_wrong_access_key_id() { + fn test_s3_wrong_access_key_id() { std::env::set_var("AWS_ACCESS_KEY_ID", "wrong_access_key_id"); let test_bucket_name: String = @@ -97,7 +96,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_s3_with_wrong_secret_access_key() { + fn test_s3_wrong_secret_access_key() { std::env::set_var("AWS_SECRET_ACCESS_KEY", "wrong_secret_access_key"); let test_bucket_name: String = @@ -216,7 +215,7 @@ mod tests { } #[pg_test] - fn test_s3_object_store_with_temporary_token() { + fn test_s3_temporary_token() { let tokio_rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() @@ -254,7 +253,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_s3_object_store_with_missing_temporary_token_fail() { + fn test_s3_missing_temporary_token() { let tokio_rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() @@ -303,6 +302,9 @@ mod tests { #[pg_test] fn test_azure_blob_from_env() { + // unset AZURE_STORAGE_CONNECTION_STRING to make sure the account name and key are used + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + let test_container_name: String = 
std::env::var("AZURE_TEST_CONTAINER_NAME") .expect("AZURE_TEST_CONTAINER_NAME not found"); @@ -336,6 +338,7 @@ mod tests { std::env::remove_var("AZURE_STORAGE_ACCOUNT"); let account_key = std::env::var("AZURE_STORAGE_KEY").unwrap(); std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); // create a config file let azure_config_file_content = format!( @@ -365,9 +368,66 @@ mod tests { test_table.assert_expected_and_result_rows(); } + #[pg_test] + fn test_azure_from_env_via_connection_string() { + // unset AZURE_STORAGE_ACCOUNT AND AZURE_STORAGE_KEY to make sure the connection string is used + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); + std::env::remove_var("AZURE_STORAGE_KEY"); + + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + let azure_blob_uri = format!("az://{}/pg_parquet_test.parquet", test_container_name); + + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + fn test_azure_from_config_via_connection_string() { + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + // remove these to make sure the config file is used + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); + std::env::remove_var("AZURE_STORAGE_KEY"); + let connection_string = std::env::var("AZURE_STORAGE_CONNECTION_STRING").unwrap(); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + + // create a config file + let azure_config_file_content = + format!("[storage]\nconnection_string = {}\n", connection_string); + + let azure_config_file = "/tmp/pg_parquet_azure_config"; + std::env::set_var("AZURE_CONFIG_FILE", azure_config_file); + + let mut azure_config_file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(azure_config_file) + .unwrap(); + + azure_config_file + .write_all(azure_config_file_content.as_bytes()) + .unwrap(); + + let azure_blob_uri = format!("az://{}/pg_parquet_test.parquet", test_container_name); + + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + #[pg_test] #[should_panic(expected = "Account must be specified")] - fn test_azure_with_no_storage_account() { + fn test_azure_no_storage_account() { + // unset AZURE_STORAGE_CONNECTION_STRING to make sure the account name and key are used + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") @@ -383,7 +443,10 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_azure_blob_with_wrong_storage_key() { + fn test_azure_wrong_storage_key() { + // unset AZURE_STORAGE_CONNECTION_STRING to make sure the account name and key are used + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + let wrong_account_key = String::from("FFy8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="); std::env::set_var("AZURE_STORAGE_KEY", wrong_account_key); @@ -406,7 +469,7 @@ mod tests { #[pg_test] #[should_panic(expected = "404 Not Found")] - fn test_azure_blob_write_wrong_container() { + fn 
test_azure_write_wrong_container() { let test_account_name: String = std::env::var("AZURE_STORAGE_ACCOUNT").expect("AZURE_STORAGE_ACCOUNT not found"); @@ -423,7 +486,7 @@ mod tests { } #[pg_test] - fn test_azure_blob_read_write_sas() { + fn test_azure_read_write_sas() { let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") .expect("AZURE_TEST_CONTAINER_NAME not found"); @@ -433,8 +496,9 @@ mod tests { let read_write_sas_token = std::env::var("AZURE_TEST_READ_WRITE_SAS") .expect("AZURE_TEST_READ_WRITE_SAS not found"); - // remove account key to make sure the sas token is used + // remove account key and connection string to make sure the sas token is used std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); std::env::set_var("AZURE_STORAGE_SAS_TOKEN", read_write_sas_token); let azure_blob_uri = format!( @@ -451,7 +515,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_azure_blob_read_only_sas() { + fn test_azure_read_only_sas() { let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") .expect("AZURE_TEST_CONTAINER_NAME not found"); @@ -461,8 +525,9 @@ mod tests { let read_only_sas_token: String = std::env::var("AZURE_TEST_READ_ONLY_SAS").expect("AZURE_TEST_READ_ONLY_SAS not found"); - // remove account key to make sure the sas token is used + // remove account key and connection string to make sure the sas token is used std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); std::env::set_var("AZURE_STORAGE_SAS_TOKEN", read_only_sas_token); let azure_blob_uri = format!( @@ -479,7 +544,7 @@ mod tests { #[pg_test] #[should_panic(expected = "unsupported azure blob storage uri")] - fn test_azure_blob_unsupported_uri() { + fn test_azure_unsupported_uri() { let fabric_azure_blob_uri = "https://ACCOUNT.dfs.fabric.microsoft.com".into(); let test_table = TestTable::::new("int4".into()).with_uri(fabric_azure_blob_uri); From e0df256a0520395fcd2930fbef01b8dbbb0ca312 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Mon, 6 Jan 2025 19:29:21 +0300 Subject: [PATCH 11/15] support connection string --- README.md | 2 +- src/arrow_parquet/parquet_reader.rs | 3 +- src/arrow_parquet/parquet_writer.rs | 3 +- src/arrow_parquet/uri_utils.rs | 243 +--------------------------- src/lib.rs | 13 ++ src/object_store.rs | 56 +++++++ src/object_store/aws.rs | 98 +++++++++++ src/object_store/azure.rs | 202 +++++++++++++++++++++++ src/object_store/local_file.rs | 21 +++ src/pgrx_tests/object_store.rs | 91 +++++++++-- 10 files changed, 481 insertions(+), 251 deletions(-) create mode 100644 src/object_store.rs create mode 100644 src/object_store/aws.rs create mode 100644 src/object_store/azure.rs create mode 100644 src/object_store/local_file.rs diff --git a/README.md b/README.md index cc61366..8b38bfc 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,6 @@ Alternatively, you can use the following environment variables when starting pos Supported S3 uri formats are shown below: - s3:// \ / \ -- s3a:// \ / \ - https:// \.s3.amazonaws.com / \ - https:// s3.amazonaws.com / \ / \ @@ -209,6 +208,7 @@ key = Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/ Alternatively, you can use the following environment variables when starting postgres to configure the Azure Blob Storage client: - `AZURE_STORAGE_ACCOUNT`: the storage account name of the Azure Blob - `AZURE_STORAGE_KEY`: the storage key of the Azure Blob +- 
`AZURE_STORAGE_CONNECTION_STRING`: the connection string for the Azure Blob (this can be set instead of specifying account name and key) - `AZURE_STORAGE_SAS_TOKEN`: the storage SAS token for the Azure Blob - `AZURE_STORAGE_ENDPOINT`: the endpoint **(only via environment variables)** - `AZURE_CONFIG_FILE`: an alternative location for the config file **(only via environment variables)** diff --git a/src/arrow_parquet/parquet_reader.rs b/src/arrow_parquet/parquet_reader.rs index 6790513..b64b238 100644 --- a/src/arrow_parquet/parquet_reader.rs +++ b/src/arrow_parquet/parquet_reader.rs @@ -25,6 +25,7 @@ use crate::{ }, pgrx_utils::{collect_attributes_for, CollectAttributesFor}, type_compat::{geometry::reset_postgis_context, map::reset_map_context}, + PG_BACKEND_TOKIO_RUNTIME, }; use super::{ @@ -33,7 +34,7 @@ use super::{ schema_parser::{ ensure_file_schema_match_tupledesc_schema, parse_arrow_schema_from_attributes, }, - uri_utils::{parquet_reader_from_uri, PG_BACKEND_TOKIO_RUNTIME}, + uri_utils::parquet_reader_from_uri, }; pub(crate) struct ParquetReaderContext { diff --git a/src/arrow_parquet/parquet_writer.rs b/src/arrow_parquet/parquet_writer.rs index e93ea8b..4f5713f 100644 --- a/src/arrow_parquet/parquet_writer.rs +++ b/src/arrow_parquet/parquet_writer.rs @@ -15,10 +15,11 @@ use crate::{ schema_parser::{ parquet_schema_string_from_attributes, parse_arrow_schema_from_attributes, }, - uri_utils::{parquet_writer_from_uri, PG_BACKEND_TOKIO_RUNTIME}, + uri_utils::parquet_writer_from_uri, }, pgrx_utils::{collect_attributes_for, CollectAttributesFor}, type_compat::{geometry::reset_postgis_context, map::reset_map_context}, + PG_BACKEND_TOKIO_RUNTIME, }; use super::pg_to_arrow::{ diff --git a/src/arrow_parquet/uri_utils.rs b/src/arrow_parquet/uri_utils.rs index c15ac5f..6eadc91 100644 --- a/src/arrow_parquet/uri_utils.rs +++ b/src/arrow_parquet/uri_utils.rs @@ -1,20 +1,6 @@ -use std::{ - panic, - sync::{Arc, LazyLock}, -}; +use std::{panic, sync::Arc}; use arrow::datatypes::SchemaRef; -use aws_config::BehaviorVersion; -use aws_credential_types::provider::ProvideCredentials; -use home::home_dir; -use ini::Ini; -use object_store::{ - aws::{AmazonS3, AmazonS3Builder}, - azure::{AzureConfigKey, MicrosoftAzure, MicrosoftAzureBuilder}, - local::LocalFileSystem, - path::Path, - ObjectStore, ObjectStoreScheme, -}; use parquet::{ arrow::{ arrow_to_parquet_schema, @@ -29,229 +15,16 @@ use pgrx::{ ereport, pg_sys::{get_role_oid, has_privs_of_role, superuser, AsPgCStr, GetUserId}, }; -use tokio::runtime::Runtime; use url::Url; -use crate::arrow_parquet::parquet_writer::DEFAULT_ROW_GROUP_SIZE; +use crate::{ + arrow_parquet::parquet_writer::DEFAULT_ROW_GROUP_SIZE, object_store::create_object_store, + PG_BACKEND_TOKIO_RUNTIME, +}; const PARQUET_OBJECT_STORE_READ_ROLE: &str = "parquet_object_store_read"; const PARQUET_OBJECT_STORE_WRITE_ROLE: &str = "parquet_object_store_write"; -// PG_BACKEND_TOKIO_RUNTIME creates a tokio runtime that uses the current thread -// to run the tokio reactor. This uses the same thread that is running the Postgres backend. 
-pub(crate) static PG_BACKEND_TOKIO_RUNTIME: LazyLock = LazyLock::new(|| { - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)) -}); - -fn parse_azure_blob_container(uri: &Url) -> Option { - let host = uri.host_str()?; - - // az(ure)://{container}/key - if uri.scheme() == "az" || uri.scheme() == "azure" { - return Some(host.to_string()); - } - // https://{account}.blob.core.windows.net/{container} - else if host.ends_with(".blob.core.windows.net") { - let path_segments: Vec<&str> = uri.path_segments()?.collect(); - - // Container name is the first part of the path - return Some( - path_segments - .first() - .expect("unexpected error during parsing azure blob uri") - .to_string(), - ); - } - - None -} - -fn parse_s3_bucket(uri: &Url) -> Option { - let host = uri.host_str()?; - - // s3(a)://{bucket}/key - if uri.scheme() == "s3" || uri.scheme() == "s3a" { - return Some(host.to_string()); - } - // https://s3.amazonaws.com/{bucket}/key - else if host == "s3.amazonaws.com" { - let path_segments: Vec<&str> = uri.path_segments()?.collect(); - - // Bucket name is the first part of the path - return Some( - path_segments - .first() - .expect("unexpected error during parsing s3 uri") - .to_string(), - ); - } - // https://{bucket}.s3.amazonaws.com/key - else if host.ends_with(".s3.amazonaws.com") { - let bucket_name = host.split('.').next()?; - return Some(bucket_name.to_string()); - } - - None -} - -fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc, Path) { - let (scheme, path) = - ObjectStoreScheme::parse(uri).unwrap_or_else(|_| panic!("unrecognized uri {}", uri)); - - // object_store crate can recognize a bunch of different schemes and paths, but we only support - // local, azure, and s3 schemes with a subset of all supported paths. - match scheme { - ObjectStoreScheme::AmazonS3 => { - let bucket_name = parse_s3_bucket(uri).unwrap_or_else(|| { - panic!("unsupported s3 uri: {}", uri); - }); - - let storage_container = PG_BACKEND_TOKIO_RUNTIME - .block_on(async { Arc::new(get_s3_object_store(&bucket_name).await) }); - - (storage_container, path) - } - ObjectStoreScheme::MicrosoftAzure => { - let container_name = parse_azure_blob_container(uri).unwrap_or_else(|| { - panic!("unsupported azure blob storage uri: {}", uri); - }); - - let storage_container = PG_BACKEND_TOKIO_RUNTIME - .block_on(async { Arc::new(get_azure_object_store(&container_name).await) }); - - (storage_container, path) - } - ObjectStoreScheme::Local => { - let uri = uri_as_string(uri); - - if !copy_from { - // create or overwrite the local file - std::fs::OpenOptions::new() - .write(true) - .truncate(true) - .create(true) - .open(&uri) - .unwrap_or_else(|e| panic!("{}", e)); - } - - let storage_container = Arc::new(LocalFileSystem::new()); - - let path = Path::from_filesystem_path(&uri).unwrap_or_else(|e| panic!("{}", e)); - - (storage_container, path) - } - _ => { - panic!("unsupported scheme {} in uri {}", uri.scheme(), uri); - } - } -} - -// get_s3_object_store creates an AmazonS3 object store with the given bucket name. -// It is configured by environment variables and aws config files as fallback method. -// We need to read the config files to make the fallback method work since object_store -// does not provide a way to read them. Currently, we only support to extract -// "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN", "AWS_ENDPOINT_URL", -// and "AWS_REGION" from the config files. 
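
The same credential flow reappears in `src/object_store/aws.rs` below, just wrapped in `block_on` calls. A minimal sketch of it, assuming the same crates the patch uses (`aws-config`, `aws-credential-types`, `object_store`) and a placeholder bucket name:

```rust
use aws_config::BehaviorVersion;
use aws_credential_types::provider::ProvideCredentials;
use object_store::aws::{AmazonS3, AmazonS3Builder};

// Copy whatever the AWS SDK resolved (env vars first, config files as fallback)
// into an AmazonS3Builder. "my-bucket" style names are placeholders.
async fn s3_store_from_sdk_config(bucket: &str) -> AmazonS3 {
    let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28())
        .load()
        .await;

    let mut builder = AmazonS3Builder::new().with_bucket_name(bucket);

    if let Some(provider) = sdk_config.credentials_provider() {
        if let Ok(creds) = provider.provide_credentials().await {
            builder = builder
                .with_access_key_id(creds.access_key_id())
                .with_secret_access_key(creds.secret_access_key());
            if let Some(token) = creds.session_token() {
                builder = builder.with_token(token);
            }
        }
    }
    if let Some(endpoint) = sdk_config.endpoint_url() {
        builder = builder.with_endpoint(endpoint);
    }
    if let Some(region) = sdk_config.region() {
        builder = builder.with_region(region.as_ref());
    }

    builder.build().expect("failed to build AmazonS3 store")
}
```

Loading the SDK config is what provides the config-file fallback that `AmazonS3Builder::from_env` alone would not give.
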
-async fn get_s3_object_store(bucket_name: &str) -> AmazonS3 { - let mut aws_s3_builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name); - - // first tries environment variables and then the config files - let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) - .load() - .await; - - if let Some(credential_provider) = sdk_config.credentials_provider() { - if let Ok(credentials) = credential_provider.provide_credentials().await { - // AWS_ACCESS_KEY_ID - aws_s3_builder = aws_s3_builder.with_access_key_id(credentials.access_key_id()); - - // AWS_SECRET_ACCESS_KEY - aws_s3_builder = aws_s3_builder.with_secret_access_key(credentials.secret_access_key()); - - if let Some(token) = credentials.session_token() { - // AWS_SESSION_TOKEN - aws_s3_builder = aws_s3_builder.with_token(token); - } - } - } - - // AWS_ENDPOINT_URL - if let Some(aws_endpoint_url) = sdk_config.endpoint_url() { - aws_s3_builder = aws_s3_builder.with_endpoint(aws_endpoint_url); - } - - // AWS_REGION - if let Some(aws_region) = sdk_config.region() { - aws_s3_builder = aws_s3_builder.with_region(aws_region.as_ref()); - } - - aws_s3_builder.build().unwrap_or_else(|e| panic!("{}", e)) -} - -async fn get_azure_object_store(container_name: &str) -> MicrosoftAzure { - let mut azure_builder = MicrosoftAzureBuilder::from_env().with_container_name(container_name); - - // ~/.azure/config - let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( - home_dir() - .expect("failed to get home directory") - .join(".azure") - .join("config") - .to_str() - .expect("failed to convert path to string") - .to_string(), - ); - - let azure_config_content = Ini::load_from_file(&azure_config_file_path).ok(); - - // storage account - let azure_blob_account = match std::env::var("AZURE_STORAGE_ACCOUNT") { - Ok(account) => Some(account), - Err(_) => azure_config_content - .as_ref() - .and_then(|ini| ini.section(Some("storage"))) - .and_then(|section| section.get("account")) - .map(|account| account.to_string()), - }; - - if let Some(azure_blob_account) = azure_blob_account { - azure_builder = azure_builder.with_account(azure_blob_account); - } - - // storage key - let azure_blob_key = match std::env::var("AZURE_STORAGE_KEY") { - Ok(key) => Some(key), - Err(_) => azure_config_content - .as_ref() - .and_then(|ini| ini.section(Some("storage"))) - .and_then(|section| section.get("key")) - .map(|key| key.to_string()), - }; - - if let Some(azure_blob_key) = azure_blob_key { - azure_builder = azure_builder.with_access_key(azure_blob_key); - } - - // sas token - let azure_blob_sas_token = match std::env::var("AZURE_STORAGE_SAS_TOKEN") { - Ok(token) => Some(token), - Err(_) => azure_config_content - .as_ref() - .and_then(|ini| ini.section(Some("storage"))) - .and_then(|section| section.get("sas_token")) - .map(|token| token.to_string()), - }; - - if let Some(azure_blob_sas_token) = azure_blob_sas_token { - azure_builder = azure_builder.with_config(AzureConfigKey::SasKey, azure_blob_sas_token); - } - - azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) -} - pub(crate) fn parse_uri(uri: &str) -> Url { if !uri.contains("://") { // local file @@ -285,7 +58,7 @@ pub(crate) fn parquet_schema_from_uri(uri: &Url) -> SchemaDescriptor { pub(crate) fn parquet_metadata_from_uri(uri: &Url) -> Arc { let copy_from = true; - let (parquet_object_store, location) = object_store_with_location(uri, copy_from); + let (parquet_object_store, location) = create_object_store(uri, copy_from); PG_BACKEND_TOKIO_RUNTIME.block_on(async { let 
object_store_meta = parquet_object_store @@ -308,7 +81,7 @@ pub(crate) fn parquet_metadata_from_uri(uri: &Url) -> Arc { pub(crate) fn parquet_reader_from_uri(uri: &Url) -> ParquetRecordBatchStream { let copy_from = true; - let (parquet_object_store, location) = object_store_with_location(uri, copy_from); + let (parquet_object_store, location) = create_object_store(uri, copy_from); PG_BACKEND_TOKIO_RUNTIME.block_on(async { let object_store_meta = parquet_object_store @@ -340,7 +113,7 @@ pub(crate) fn parquet_writer_from_uri( writer_props: WriterProperties, ) -> AsyncArrowWriter { let copy_from = false; - let (parquet_object_store, location) = object_store_with_location(uri, copy_from); + let (parquet_object_store, location) = create_object_store(uri, copy_from); let parquet_object_writer = ParquetObjectWriter::new(parquet_object_store, location); diff --git a/src/lib.rs b/src/lib.rs index 100c80b..16aa1d9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,12 @@ +use std::sync::LazyLock; + use parquet_copy_hook::hook::{init_parquet_copy_hook, ENABLE_PARQUET_COPY_HOOK}; use parquet_copy_hook::pg_compat::MarkGUCPrefixReserved; use pgrx::{prelude::*, GucContext, GucFlags, GucRegistry}; +use tokio::runtime::Runtime; mod arrow_parquet; +mod object_store; mod parquet_copy_hook; mod parquet_udfs; #[cfg(any(test, feature = "pg_test"))] @@ -20,6 +24,15 @@ pgrx::pg_module_magic!(); extension_sql_file!("../sql/bootstrap.sql", name = "role_setup", bootstrap); +// PG_BACKEND_TOKIO_RUNTIME creates a tokio runtime that uses the current thread +// to run the tokio reactor. This uses the same thread that is running the Postgres backend. +pub(crate) static PG_BACKEND_TOKIO_RUNTIME: LazyLock = LazyLock::new(|| { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)) +}); + #[pg_guard] pub extern "C" fn _PG_init() { GucRegistry::define_bool_guc( diff --git a/src/object_store.rs b/src/object_store.rs new file mode 100644 index 0000000..c978cd6 --- /dev/null +++ b/src/object_store.rs @@ -0,0 +1,56 @@ +use std::sync::Arc; + +use object_store::{path::Path, ObjectStore, ObjectStoreScheme}; +use url::Url; + +use crate::{ + arrow_parquet::uri_utils::uri_as_string, + object_store::{ + aws::create_s3_object_store, azure::create_azure_object_store, + local_file::create_local_file_object_store, + }, + PG_BACKEND_TOKIO_RUNTIME, +}; + +pub(crate) mod aws; +pub(crate) mod azure; +pub(crate) mod local_file; + +pub(crate) fn create_object_store(uri: &Url, copy_from: bool) -> (Arc, Path) { + let (scheme, path) = ObjectStoreScheme::parse(uri).unwrap_or_else(|_| { + panic!( + "unrecognized uri {}. pg_parquet supports local paths, s3:// or azure:// schemes.", + uri + ) + }); + + // object_store crate can recognize a bunch of different schemes and paths, but we only support + // local, azure, and s3 schemes with a subset of all supported paths. + match scheme { + ObjectStoreScheme::AmazonS3 => { + let storage_container = Arc::new(create_s3_object_store(uri)); + + (storage_container, path) + } + ObjectStoreScheme::MicrosoftAzure => { + let storage_container = Arc::new(create_azure_object_store(uri)); + + (storage_container, path) + } + ObjectStoreScheme::Local => { + let storage_container = Arc::new(create_local_file_object_store(uri, copy_from)); + + let path = + Path::from_filesystem_path(uri_as_string(uri)).unwrap_or_else(|e| panic!("{}", e)); + + (storage_container, path) + } + _ => { + panic!( + "unsupported scheme {} in uri {}. 
pg_parquet supports s3:// or azure:// schemes.", + uri.scheme(), + uri + ); + } + } +} diff --git a/src/object_store/aws.rs b/src/object_store/aws.rs new file mode 100644 index 0000000..c738f8a --- /dev/null +++ b/src/object_store/aws.rs @@ -0,0 +1,98 @@ +use aws_config::BehaviorVersion; +use aws_sdk_sts::config::ProvideCredentials; +use object_store::aws::{AmazonS3, AmazonS3Builder}; +use url::Url; + +use super::PG_BACKEND_TOKIO_RUNTIME; + +// create_s3_object_store creates an AmazonS3 object store with the given bucket name. +// It is configured by environment variables and aws config files as fallback method. +// We need to read the config files to make the fallback method work since object_store +// does not provide a way to read them. Currently, we only support following environment +// variables and config parameters: +// - AWS_ACCESS_KEY_ID +// - AWS_SECRET_ACCESS_KEY +// - AWS_SESSION_TOKEN +// - AWS_ENDPOINT_URL +// - AWS_REGION +// - AWS_SHARED_CREDENTIALS_FILE (env var only) +// - AWS_CONFIG_FILE (env var only) +// - AWS_PROFILE (env var only) +// - AWS_ALLOW_HTTP (env var only, object_store specific) +pub(crate) fn create_s3_object_store(uri: &Url) -> AmazonS3 { + let bucket_name = parse_s3_bucket(uri).unwrap_or_else(|| { + panic!("unsupported s3 uri: {}", uri); + }); + + // we do not use builder::from_env() here because not all environment variables have + // a fallback to the config files + let mut aws_s3_builder = AmazonS3Builder::new().with_bucket_name(bucket_name); + + if let Ok(allow_http) = std::env::var("AWS_ALLOW_HTTP") { + aws_s3_builder = aws_s3_builder.with_allow_http(allow_http.parse().unwrap_or(false)); + } + + // first tries environment variables and then the config files + let sdk_config = PG_BACKEND_TOKIO_RUNTIME.block_on(async { + aws_config::defaults(BehaviorVersion::v2024_03_28()) + .load() + .await + }); + + if let Some(credential_provider) = sdk_config.credentials_provider() { + if let Ok(credentials) = PG_BACKEND_TOKIO_RUNTIME + .block_on(async { credential_provider.provide_credentials().await }) + { + // AWS_ACCESS_KEY_ID + aws_s3_builder = aws_s3_builder.with_access_key_id(credentials.access_key_id()); + + // AWS_SECRET_ACCESS_KEY + aws_s3_builder = aws_s3_builder.with_secret_access_key(credentials.secret_access_key()); + + if let Some(token) = credentials.session_token() { + // AWS_SESSION_TOKEN + aws_s3_builder = aws_s3_builder.with_token(token); + } + } + } + + // AWS_ENDPOINT_URL + if let Some(aws_endpoint_url) = sdk_config.endpoint_url() { + aws_s3_builder = aws_s3_builder.with_endpoint(aws_endpoint_url); + } + + // AWS_REGION + if let Some(aws_region) = sdk_config.region() { + aws_s3_builder = aws_s3_builder.with_region(aws_region.as_ref()); + } + + aws_s3_builder.build().unwrap_or_else(|e| panic!("{}", e)) +} + +fn parse_s3_bucket(uri: &Url) -> Option { + let host = uri.host_str()?; + + // s3(a)://{bucket}/key + if uri.scheme() == "s3" { + return Some(host.to_string()); + } + // https://s3.amazonaws.com/{bucket}/key + else if host == "s3.amazonaws.com" { + let path_segments: Vec<&str> = uri.path_segments()?.collect(); + + // Bucket name is the first part of the path + return Some( + path_segments + .first() + .expect("unexpected error during parsing s3 uri") + .to_string(), + ); + } + // https://{bucket}.s3.amazonaws.com/key + else if host.ends_with(".s3.amazonaws.com") { + let bucket_name = host.split('.').next()?; + return Some(bucket_name.to_string()); + } + + None +} diff --git a/src/object_store/azure.rs b/src/object_store/azure.rs new file 
mode 100644 index 0000000..638dfe6 --- /dev/null +++ b/src/object_store/azure.rs @@ -0,0 +1,202 @@ +use home::home_dir; +use ini::Ini; +use object_store::azure::{AzureConfigKey, MicrosoftAzure, MicrosoftAzureBuilder}; +use url::Url; + +// create_azure_object_store creates a MicrosoftAzure object store with the given container name. +// It is configured by environment variables and azure config files as fallback method. +// We need to read the config files to make the fallback method work since object_store +// does not provide a way to read them. Currently, we only support following environment +// variables and config parameters: +// - AZURE_STORAGE_ACCOUNT +// - AZURE_STORAGE_KEY +// - AZURE_STORAGE_CONNECTION_STRING +// - AZURE_STORAGE_SAS_TOKEN +// - AZURE_CONFIG_FILE (env var only, object_store specific) +// - AZURE_STORAGE_ENDPOINT (env var only, object_store specific) +// - AZURE_ALLOW_HTTP (env var only, object_store specific) +pub(crate) fn create_azure_object_store(uri: &Url) -> MicrosoftAzure { + let container_name = parse_azure_blob_container(uri).unwrap_or_else(|| { + panic!("unsupported azure blob storage uri: {}", uri); + }); + + let mut azure_builder = MicrosoftAzureBuilder::new().with_container_name(container_name); + + let azure_blob_config = AzureStorageConfig::with_provider_chain(); + + // account name + if let Some(account_name) = azure_blob_config.account_name { + azure_builder = azure_builder.with_account(account_name); + } + + // account key + if let Some(account_key) = azure_blob_config.account_key { + azure_builder = azure_builder.with_access_key(account_key); + } + + // sas token + if let Some(sas_token) = azure_blob_config.sas_token { + azure_builder = azure_builder.with_config(AzureConfigKey::SasKey, sas_token); + } + + // allow http + azure_builder = azure_builder.with_allow_http(azure_blob_config.allow_http); + + // endpoint + if let Some(endpoint) = azure_blob_config.endpoint { + azure_builder = azure_builder.with_endpoint(endpoint); + } + + azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) +} + +fn parse_azure_blob_container(uri: &Url) -> Option { + let host = uri.host_str()?; + + // az(ure)://{container}/key + if uri.scheme() == "az" || uri.scheme() == "azure" { + return Some(host.to_string()); + } + // https://{account}.blob.core.windows.net/{container} + else if host.ends_with(".blob.core.windows.net") { + let path_segments: Vec<&str> = uri.path_segments()?.collect(); + + // Container name is the first part of the path + return Some( + path_segments + .first() + .expect("unexpected error during parsing azure blob uri") + .to_string(), + ); + } + + None +} + +// AzureStorageConfig represents the configuration for Azure Blob Storage. +// There is no proper azure sdk config crate that can read the config files. +// So, we need to read the config files manually from azure's ini config. 
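
Each of the non-connection-string settings below follows the same lookup order: environment variable first, then the `[storage]` section of the CLI config file. A minimal sketch of that fallback, assuming the `rust-ini` crate used here and a hypothetical config path:

```rust
use ini::Ini;

// Env-var-first lookup with the Azure CLI config file as fallback.
// `config_path` is a stand-in for AZURE_CONFIG_FILE / ~/.azure/config.
fn storage_setting(env_var: &str, ini_key: &str, config_path: &str) -> Option<String> {
    std::env::var(env_var).ok().or_else(|| {
        Ini::load_from_file(config_path).ok().and_then(|ini| {
            ini.section(Some("storage"))
                .and_then(|section| section.get(ini_key))
                .map(|value| value.to_string())
        })
    })
}

fn main() {
    // e.g. resolve the account name the same way AzureStorageConfig does
    println!(
        "{:?}",
        storage_setting("AZURE_STORAGE_ACCOUNT", "account", "/tmp/azure-config")
    );
}
```
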
+// See https://learn.microsoft.com/en-us/cli/azure/azure-cli-configuration?view=azure-cli-latest +struct AzureStorageConfig { + account_name: Option, + account_key: Option, + sas_token: Option, + endpoint: Option, + allow_http: bool, +} + +impl AzureStorageConfig { + fn with_provider_chain() -> Self { + // ~/.azure/config + let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( + home_dir() + .expect("failed to get home directory") + .join(".azure") + .join("config") + .to_str() + .expect("failed to convert path to string") + .to_string(), + ); + + let azure_config_content = Ini::load_from_file(&azure_config_file_path).ok(); + + // connection string + let connection_string = match std::env::var("AZURE_STORAGE_CONNECTION_STRING") { + Ok(connection_string) => Some(connection_string), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("connection_string")) + .map(|connection_string| connection_string.to_string()), + }; + + // connection string has the highest priority + if let Some(connection_string) = connection_string { + return Self::from_connection_string(&connection_string); + } + + // account name + let account_name = match std::env::var("AZURE_STORAGE_ACCOUNT") { + Ok(account) => Some(account), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("account")) + .map(|account| account.to_string()), + }; + + // account key + let account_key = match std::env::var("AZURE_STORAGE_KEY") { + Ok(key) => Some(key), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("key")) + .map(|key| key.to_string()), + }; + + // sas token + let sas_token = match std::env::var("AZURE_STORAGE_SAS_TOKEN") { + Ok(token) => Some(token), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("sas_token")) + .map(|token| token.to_string()), + }; + + // endpoint + let endpoint = std::env::var("AZURE_STORAGE_ENDPOINT").ok(); + + // allow http + let allow_http = std::env::var("AZURE_ALLOW_HTTP") + .ok() + .map(|allow_http| allow_http.parse().unwrap_or(false)) + .unwrap_or(false); + + AzureStorageConfig { + account_name, + account_key, + sas_token, + endpoint, + allow_http, + } + } + + // from_connection_string parses AzureBlobConfig from the given connection string. 
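
As a standalone illustration (std only, made-up placeholder values rather than real credentials), the same `key=value;` splitting looks like this:

```rust
// Parse an Azure connection string of the documented form into its parts.
// The account name, key and endpoint below are placeholder values.
fn main() {
    let connection_string = "DefaultEndpointsProtocol=http;AccountName=exampleaccount;\
                             AccountKey=PLACEHOLDERKEY==;BlobEndpoint=http://localhost:10000/exampleaccount;";

    let mut account_name = None;
    let mut account_key = None;
    let mut endpoint = None;
    let mut allow_http = false;

    for pair in connection_string.trim_end_matches(';').split(';') {
        // split on the first '=' only, so base64 keys ending in '=' stay intact
        let (key, value) = pair.split_once('=').expect("invalid connection string");
        match key {
            "AccountName" => account_name = Some(value.to_string()),
            "AccountKey" => account_key = Some(value.to_string()),
            "BlobEndpoint" => endpoint = Some(value.to_string()),
            "DefaultEndpointsProtocol" => allow_http = value.to_lowercase() == "http",
            _ => {} // SharedAccessSignature etc. are handled the same way in the patch
        }
    }

    println!(
        "account={account_name:?} key_set={} endpoint={endpoint:?} allow_http={allow_http}",
        account_key.is_some()
    );
}
```
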
+ // See https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string#create-a-connection-string-for-an-explicit-storage-endpoint + fn from_connection_string(connection_string: &str) -> Self { + let mut account_name = None; + let mut account_key = None; + let mut sas_token = None; + let mut endpoint = None; + let mut allow_http = false; + + for pair in connection_string.trim_end_matches(';').split(';') { + let (key, value) = pair + .split_once('=') + .expect("invalid azure connection string format"); + + match key { + "AccountName" => account_name = Some(value.to_string()), + "AccountKey" => account_key = Some(value.to_string()), + "SharedAccessSignature" => sas_token = Some(value.to_string()), + "BlobEndpoint" => endpoint = Some(value.to_string()), + "DefaultEndpointsProtocol" => { + allow_http = value.to_lowercase() == "http"; + } + _ => { + panic!("unsupported config key in azure connection string: {}", key); + } + } + } + + AzureStorageConfig { + account_name, + account_key, + sas_token, + endpoint, + allow_http, + } + } +} diff --git a/src/object_store/local_file.rs b/src/object_store/local_file.rs new file mode 100644 index 0000000..938dde4 --- /dev/null +++ b/src/object_store/local_file.rs @@ -0,0 +1,21 @@ +use object_store::local::LocalFileSystem; +use url::Url; + +use super::uri_as_string; + +// create_local_file_object_store creates a LocalFileSystem object store with the given path. +pub(crate) fn create_local_file_object_store(uri: &Url, copy_from: bool) -> LocalFileSystem { + let path = uri_as_string(uri); + + if !copy_from { + // create or overwrite the local file + std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(path) + .unwrap_or_else(|e| panic!("{}", e)); + } + + LocalFileSystem::new() +} diff --git a/src/pgrx_tests/object_store.rs b/src/pgrx_tests/object_store.rs index cbab4fd..08fc988 100644 --- a/src/pgrx_tests/object_store.rs +++ b/src/pgrx_tests/object_store.rs @@ -14,7 +14,6 @@ mod tests { let s3_uris = [ format!("s3://{}/pg_parquet_test.parquet", test_bucket_name), - format!("s3a://{}/pg_parquet_test.parquet", test_bucket_name), format!( "https://s3.amazonaws.com/{}/pg_parquet_test.parquet", test_bucket_name @@ -81,7 +80,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_s3_with_wrong_access_key_id() { + fn test_s3_wrong_access_key_id() { std::env::set_var("AWS_ACCESS_KEY_ID", "wrong_access_key_id"); let test_bucket_name: String = @@ -97,7 +96,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_s3_with_wrong_secret_access_key() { + fn test_s3_wrong_secret_access_key() { std::env::set_var("AWS_SECRET_ACCESS_KEY", "wrong_secret_access_key"); let test_bucket_name: String = @@ -216,7 +215,7 @@ mod tests { } #[pg_test] - fn test_s3_object_store_with_temporary_token() { + fn test_s3_temporary_token() { let tokio_rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() @@ -254,7 +253,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_s3_object_store_with_missing_temporary_token_fail() { + fn test_s3_missing_temporary_token() { let tokio_rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() @@ -303,6 +302,9 @@ mod tests { #[pg_test] fn test_azure_blob_from_env() { + // unset AZURE_STORAGE_CONNECTION_STRING to make sure the account name and key are used + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + let test_container_name: String = 
std::env::var("AZURE_TEST_CONTAINER_NAME") .expect("AZURE_TEST_CONTAINER_NAME not found"); @@ -336,6 +338,7 @@ mod tests { std::env::remove_var("AZURE_STORAGE_ACCOUNT"); let account_key = std::env::var("AZURE_STORAGE_KEY").unwrap(); std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); // create a config file let azure_config_file_content = format!( @@ -365,9 +368,66 @@ mod tests { test_table.assert_expected_and_result_rows(); } + #[pg_test] + fn test_azure_from_env_via_connection_string() { + // unset AZURE_STORAGE_ACCOUNT AND AZURE_STORAGE_KEY to make sure the connection string is used + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); + std::env::remove_var("AZURE_STORAGE_KEY"); + + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + let azure_blob_uri = format!("az://{}/pg_parquet_test.parquet", test_container_name); + + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + fn test_azure_from_config_via_connection_string() { + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + // remove these to make sure the config file is used + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); + std::env::remove_var("AZURE_STORAGE_KEY"); + let connection_string = std::env::var("AZURE_STORAGE_CONNECTION_STRING").unwrap(); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + + // create a config file + let azure_config_file_content = + format!("[storage]\nconnection_string = {}\n", connection_string); + + let azure_config_file = "/tmp/pg_parquet_azure_config"; + std::env::set_var("AZURE_CONFIG_FILE", azure_config_file); + + let mut azure_config_file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(azure_config_file) + .unwrap(); + + azure_config_file + .write_all(azure_config_file_content.as_bytes()) + .unwrap(); + + let azure_blob_uri = format!("az://{}/pg_parquet_test.parquet", test_container_name); + + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + #[pg_test] #[should_panic(expected = "Account must be specified")] - fn test_azure_with_no_storage_account() { + fn test_azure_no_storage_account() { + // unset AZURE_STORAGE_CONNECTION_STRING to make sure the account name and key are used + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") @@ -383,7 +443,10 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_azure_blob_with_wrong_storage_key() { + fn test_azure_wrong_storage_key() { + // unset AZURE_STORAGE_CONNECTION_STRING to make sure the account name and key are used + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + let wrong_account_key = String::from("FFy8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="); std::env::set_var("AZURE_STORAGE_KEY", wrong_account_key); @@ -406,7 +469,7 @@ mod tests { #[pg_test] #[should_panic(expected = "404 Not Found")] - fn test_azure_blob_write_wrong_container() { + fn 
test_azure_write_wrong_container() { let test_account_name: String = std::env::var("AZURE_STORAGE_ACCOUNT").expect("AZURE_STORAGE_ACCOUNT not found"); @@ -423,7 +486,7 @@ mod tests { } #[pg_test] - fn test_azure_blob_read_write_sas() { + fn test_azure_read_write_sas() { let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") .expect("AZURE_TEST_CONTAINER_NAME not found"); @@ -433,8 +496,9 @@ mod tests { let read_write_sas_token = std::env::var("AZURE_TEST_READ_WRITE_SAS") .expect("AZURE_TEST_READ_WRITE_SAS not found"); - // remove account key to make sure the sas token is used + // remove account key and connection string to make sure the sas token is used std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); std::env::set_var("AZURE_STORAGE_SAS_TOKEN", read_write_sas_token); let azure_blob_uri = format!( @@ -451,7 +515,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_azure_blob_read_only_sas() { + fn test_azure_read_only_sas() { let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") .expect("AZURE_TEST_CONTAINER_NAME not found"); @@ -461,8 +525,9 @@ mod tests { let read_only_sas_token: String = std::env::var("AZURE_TEST_READ_ONLY_SAS").expect("AZURE_TEST_READ_ONLY_SAS not found"); - // remove account key to make sure the sas token is used + // remove account key and connection string to make sure the sas token is used std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); std::env::set_var("AZURE_STORAGE_SAS_TOKEN", read_only_sas_token); let azure_blob_uri = format!( @@ -479,7 +544,7 @@ mod tests { #[pg_test] #[should_panic(expected = "unsupported azure blob storage uri")] - fn test_azure_blob_unsupported_uri() { + fn test_azure_unsupported_uri() { let fabric_azure_blob_uri = "https://ACCOUNT.dfs.fabric.microsoft.com".into(); let test_table = TestTable::::new("int4".into()).with_uri(fabric_azure_blob_uri); From e9d0cdd5c2c50e2da4769afdb5edc37f28376782 Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Mon, 6 Jan 2025 19:29:21 +0300 Subject: [PATCH 12/15] support connection string --- README.md | 2 +- src/arrow_parquet/parquet_reader.rs | 3 +- src/arrow_parquet/parquet_writer.rs | 3 +- src/arrow_parquet/uri_utils.rs | 243 +--------------------------- src/lib.rs | 13 ++ src/object_store.rs | 56 +++++++ src/object_store/aws.rs | 98 +++++++++++ src/object_store/azure.rs | 203 +++++++++++++++++++++++ src/object_store/local_file.rs | 21 +++ src/pgrx_tests/object_store.rs | 91 +++++++++-- 10 files changed, 482 insertions(+), 251 deletions(-) create mode 100644 src/object_store.rs create mode 100644 src/object_store/aws.rs create mode 100644 src/object_store/azure.rs create mode 100644 src/object_store/local_file.rs diff --git a/README.md b/README.md index cc61366..8b38bfc 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,6 @@ Alternatively, you can use the following environment variables when starting pos Supported S3 uri formats are shown below: - s3:// \ / \ -- s3a:// \ / \ - https:// \.s3.amazonaws.com / \ - https:// s3.amazonaws.com / \ / \ @@ -209,6 +208,7 @@ key = Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/ Alternatively, you can use the following environment variables when starting postgres to configure the Azure Blob Storage client: - `AZURE_STORAGE_ACCOUNT`: the storage account name of the Azure Blob - `AZURE_STORAGE_KEY`: the storage key of the Azure Blob +- 
`AZURE_STORAGE_CONNECTION_STRING`: the connection string for the Azure Blob (this can be set instead of specifying account name and key) - `AZURE_STORAGE_SAS_TOKEN`: the storage SAS token for the Azure Blob - `AZURE_STORAGE_ENDPOINT`: the endpoint **(only via environment variables)** - `AZURE_CONFIG_FILE`: an alternative location for the config file **(only via environment variables)** diff --git a/src/arrow_parquet/parquet_reader.rs b/src/arrow_parquet/parquet_reader.rs index 6790513..b64b238 100644 --- a/src/arrow_parquet/parquet_reader.rs +++ b/src/arrow_parquet/parquet_reader.rs @@ -25,6 +25,7 @@ use crate::{ }, pgrx_utils::{collect_attributes_for, CollectAttributesFor}, type_compat::{geometry::reset_postgis_context, map::reset_map_context}, + PG_BACKEND_TOKIO_RUNTIME, }; use super::{ @@ -33,7 +34,7 @@ use super::{ schema_parser::{ ensure_file_schema_match_tupledesc_schema, parse_arrow_schema_from_attributes, }, - uri_utils::{parquet_reader_from_uri, PG_BACKEND_TOKIO_RUNTIME}, + uri_utils::parquet_reader_from_uri, }; pub(crate) struct ParquetReaderContext { diff --git a/src/arrow_parquet/parquet_writer.rs b/src/arrow_parquet/parquet_writer.rs index e93ea8b..4f5713f 100644 --- a/src/arrow_parquet/parquet_writer.rs +++ b/src/arrow_parquet/parquet_writer.rs @@ -15,10 +15,11 @@ use crate::{ schema_parser::{ parquet_schema_string_from_attributes, parse_arrow_schema_from_attributes, }, - uri_utils::{parquet_writer_from_uri, PG_BACKEND_TOKIO_RUNTIME}, + uri_utils::parquet_writer_from_uri, }, pgrx_utils::{collect_attributes_for, CollectAttributesFor}, type_compat::{geometry::reset_postgis_context, map::reset_map_context}, + PG_BACKEND_TOKIO_RUNTIME, }; use super::pg_to_arrow::{ diff --git a/src/arrow_parquet/uri_utils.rs b/src/arrow_parquet/uri_utils.rs index c15ac5f..6eadc91 100644 --- a/src/arrow_parquet/uri_utils.rs +++ b/src/arrow_parquet/uri_utils.rs @@ -1,20 +1,6 @@ -use std::{ - panic, - sync::{Arc, LazyLock}, -}; +use std::{panic, sync::Arc}; use arrow::datatypes::SchemaRef; -use aws_config::BehaviorVersion; -use aws_credential_types::provider::ProvideCredentials; -use home::home_dir; -use ini::Ini; -use object_store::{ - aws::{AmazonS3, AmazonS3Builder}, - azure::{AzureConfigKey, MicrosoftAzure, MicrosoftAzureBuilder}, - local::LocalFileSystem, - path::Path, - ObjectStore, ObjectStoreScheme, -}; use parquet::{ arrow::{ arrow_to_parquet_schema, @@ -29,229 +15,16 @@ use pgrx::{ ereport, pg_sys::{get_role_oid, has_privs_of_role, superuser, AsPgCStr, GetUserId}, }; -use tokio::runtime::Runtime; use url::Url; -use crate::arrow_parquet::parquet_writer::DEFAULT_ROW_GROUP_SIZE; +use crate::{ + arrow_parquet::parquet_writer::DEFAULT_ROW_GROUP_SIZE, object_store::create_object_store, + PG_BACKEND_TOKIO_RUNTIME, +}; const PARQUET_OBJECT_STORE_READ_ROLE: &str = "parquet_object_store_read"; const PARQUET_OBJECT_STORE_WRITE_ROLE: &str = "parquet_object_store_write"; -// PG_BACKEND_TOKIO_RUNTIME creates a tokio runtime that uses the current thread -// to run the tokio reactor. This uses the same thread that is running the Postgres backend. 
-pub(crate) static PG_BACKEND_TOKIO_RUNTIME: LazyLock = LazyLock::new(|| { - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)) -}); - -fn parse_azure_blob_container(uri: &Url) -> Option { - let host = uri.host_str()?; - - // az(ure)://{container}/key - if uri.scheme() == "az" || uri.scheme() == "azure" { - return Some(host.to_string()); - } - // https://{account}.blob.core.windows.net/{container} - else if host.ends_with(".blob.core.windows.net") { - let path_segments: Vec<&str> = uri.path_segments()?.collect(); - - // Container name is the first part of the path - return Some( - path_segments - .first() - .expect("unexpected error during parsing azure blob uri") - .to_string(), - ); - } - - None -} - -fn parse_s3_bucket(uri: &Url) -> Option { - let host = uri.host_str()?; - - // s3(a)://{bucket}/key - if uri.scheme() == "s3" || uri.scheme() == "s3a" { - return Some(host.to_string()); - } - // https://s3.amazonaws.com/{bucket}/key - else if host == "s3.amazonaws.com" { - let path_segments: Vec<&str> = uri.path_segments()?.collect(); - - // Bucket name is the first part of the path - return Some( - path_segments - .first() - .expect("unexpected error during parsing s3 uri") - .to_string(), - ); - } - // https://{bucket}.s3.amazonaws.com/key - else if host.ends_with(".s3.amazonaws.com") { - let bucket_name = host.split('.').next()?; - return Some(bucket_name.to_string()); - } - - None -} - -fn object_store_with_location(uri: &Url, copy_from: bool) -> (Arc, Path) { - let (scheme, path) = - ObjectStoreScheme::parse(uri).unwrap_or_else(|_| panic!("unrecognized uri {}", uri)); - - // object_store crate can recognize a bunch of different schemes and paths, but we only support - // local, azure, and s3 schemes with a subset of all supported paths. - match scheme { - ObjectStoreScheme::AmazonS3 => { - let bucket_name = parse_s3_bucket(uri).unwrap_or_else(|| { - panic!("unsupported s3 uri: {}", uri); - }); - - let storage_container = PG_BACKEND_TOKIO_RUNTIME - .block_on(async { Arc::new(get_s3_object_store(&bucket_name).await) }); - - (storage_container, path) - } - ObjectStoreScheme::MicrosoftAzure => { - let container_name = parse_azure_blob_container(uri).unwrap_or_else(|| { - panic!("unsupported azure blob storage uri: {}", uri); - }); - - let storage_container = PG_BACKEND_TOKIO_RUNTIME - .block_on(async { Arc::new(get_azure_object_store(&container_name).await) }); - - (storage_container, path) - } - ObjectStoreScheme::Local => { - let uri = uri_as_string(uri); - - if !copy_from { - // create or overwrite the local file - std::fs::OpenOptions::new() - .write(true) - .truncate(true) - .create(true) - .open(&uri) - .unwrap_or_else(|e| panic!("{}", e)); - } - - let storage_container = Arc::new(LocalFileSystem::new()); - - let path = Path::from_filesystem_path(&uri).unwrap_or_else(|e| panic!("{}", e)); - - (storage_container, path) - } - _ => { - panic!("unsupported scheme {} in uri {}", uri.scheme(), uri); - } - } -} - -// get_s3_object_store creates an AmazonS3 object store with the given bucket name. -// It is configured by environment variables and aws config files as fallback method. -// We need to read the config files to make the fallback method work since object_store -// does not provide a way to read them. Currently, we only support to extract -// "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN", "AWS_ENDPOINT_URL", -// and "AWS_REGION" from the config files. 
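
The bucket name handed to this builder comes from `parse_s3_bucket`, which recognizes the three URI forms kept by this series (the `s3a://` form is dropped). A small sketch of that extraction, assuming the `url` crate and placeholder bucket/key names:

```rust
use url::Url;

// Pull the bucket name out of the supported S3 URI forms.
fn bucket_of(uri: &str) -> Option<String> {
    let uri = Url::parse(uri).ok()?;
    let host = uri.host_str()?;

    if uri.scheme() == "s3" {
        Some(host.to_string())
    } else if host == "s3.amazonaws.com" {
        // bucket is the first path segment
        uri.path_segments()?.next().map(|s| s.to_string())
    } else if host.ends_with(".s3.amazonaws.com") {
        // bucket is the leading host label
        host.split('.').next().map(|s| s.to_string())
    } else {
        None
    }
}

fn main() {
    for uri in [
        "s3://mybucket/pg_parquet_test.parquet",
        "https://s3.amazonaws.com/mybucket/pg_parquet_test.parquet",
        "https://mybucket.s3.amazonaws.com/pg_parquet_test.parquet",
    ] {
        println!("{uri} -> {:?}", bucket_of(uri));
    }
}
```
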
-async fn get_s3_object_store(bucket_name: &str) -> AmazonS3 { - let mut aws_s3_builder = AmazonS3Builder::from_env().with_bucket_name(bucket_name); - - // first tries environment variables and then the config files - let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) - .load() - .await; - - if let Some(credential_provider) = sdk_config.credentials_provider() { - if let Ok(credentials) = credential_provider.provide_credentials().await { - // AWS_ACCESS_KEY_ID - aws_s3_builder = aws_s3_builder.with_access_key_id(credentials.access_key_id()); - - // AWS_SECRET_ACCESS_KEY - aws_s3_builder = aws_s3_builder.with_secret_access_key(credentials.secret_access_key()); - - if let Some(token) = credentials.session_token() { - // AWS_SESSION_TOKEN - aws_s3_builder = aws_s3_builder.with_token(token); - } - } - } - - // AWS_ENDPOINT_URL - if let Some(aws_endpoint_url) = sdk_config.endpoint_url() { - aws_s3_builder = aws_s3_builder.with_endpoint(aws_endpoint_url); - } - - // AWS_REGION - if let Some(aws_region) = sdk_config.region() { - aws_s3_builder = aws_s3_builder.with_region(aws_region.as_ref()); - } - - aws_s3_builder.build().unwrap_or_else(|e| panic!("{}", e)) -} - -async fn get_azure_object_store(container_name: &str) -> MicrosoftAzure { - let mut azure_builder = MicrosoftAzureBuilder::from_env().with_container_name(container_name); - - // ~/.azure/config - let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( - home_dir() - .expect("failed to get home directory") - .join(".azure") - .join("config") - .to_str() - .expect("failed to convert path to string") - .to_string(), - ); - - let azure_config_content = Ini::load_from_file(&azure_config_file_path).ok(); - - // storage account - let azure_blob_account = match std::env::var("AZURE_STORAGE_ACCOUNT") { - Ok(account) => Some(account), - Err(_) => azure_config_content - .as_ref() - .and_then(|ini| ini.section(Some("storage"))) - .and_then(|section| section.get("account")) - .map(|account| account.to_string()), - }; - - if let Some(azure_blob_account) = azure_blob_account { - azure_builder = azure_builder.with_account(azure_blob_account); - } - - // storage key - let azure_blob_key = match std::env::var("AZURE_STORAGE_KEY") { - Ok(key) => Some(key), - Err(_) => azure_config_content - .as_ref() - .and_then(|ini| ini.section(Some("storage"))) - .and_then(|section| section.get("key")) - .map(|key| key.to_string()), - }; - - if let Some(azure_blob_key) = azure_blob_key { - azure_builder = azure_builder.with_access_key(azure_blob_key); - } - - // sas token - let azure_blob_sas_token = match std::env::var("AZURE_STORAGE_SAS_TOKEN") { - Ok(token) => Some(token), - Err(_) => azure_config_content - .as_ref() - .and_then(|ini| ini.section(Some("storage"))) - .and_then(|section| section.get("sas_token")) - .map(|token| token.to_string()), - }; - - if let Some(azure_blob_sas_token) = azure_blob_sas_token { - azure_builder = azure_builder.with_config(AzureConfigKey::SasKey, azure_blob_sas_token); - } - - azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) -} - pub(crate) fn parse_uri(uri: &str) -> Url { if !uri.contains("://") { // local file @@ -285,7 +58,7 @@ pub(crate) fn parquet_schema_from_uri(uri: &Url) -> SchemaDescriptor { pub(crate) fn parquet_metadata_from_uri(uri: &Url) -> Arc { let copy_from = true; - let (parquet_object_store, location) = object_store_with_location(uri, copy_from); + let (parquet_object_store, location) = create_object_store(uri, copy_from); PG_BACKEND_TOKIO_RUNTIME.block_on(async { let 
object_store_meta = parquet_object_store @@ -308,7 +81,7 @@ pub(crate) fn parquet_metadata_from_uri(uri: &Url) -> Arc { pub(crate) fn parquet_reader_from_uri(uri: &Url) -> ParquetRecordBatchStream { let copy_from = true; - let (parquet_object_store, location) = object_store_with_location(uri, copy_from); + let (parquet_object_store, location) = create_object_store(uri, copy_from); PG_BACKEND_TOKIO_RUNTIME.block_on(async { let object_store_meta = parquet_object_store @@ -340,7 +113,7 @@ pub(crate) fn parquet_writer_from_uri( writer_props: WriterProperties, ) -> AsyncArrowWriter { let copy_from = false; - let (parquet_object_store, location) = object_store_with_location(uri, copy_from); + let (parquet_object_store, location) = create_object_store(uri, copy_from); let parquet_object_writer = ParquetObjectWriter::new(parquet_object_store, location); diff --git a/src/lib.rs b/src/lib.rs index 100c80b..16aa1d9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,12 @@ +use std::sync::LazyLock; + use parquet_copy_hook::hook::{init_parquet_copy_hook, ENABLE_PARQUET_COPY_HOOK}; use parquet_copy_hook::pg_compat::MarkGUCPrefixReserved; use pgrx::{prelude::*, GucContext, GucFlags, GucRegistry}; +use tokio::runtime::Runtime; mod arrow_parquet; +mod object_store; mod parquet_copy_hook; mod parquet_udfs; #[cfg(any(test, feature = "pg_test"))] @@ -20,6 +24,15 @@ pgrx::pg_module_magic!(); extension_sql_file!("../sql/bootstrap.sql", name = "role_setup", bootstrap); +// PG_BACKEND_TOKIO_RUNTIME creates a tokio runtime that uses the current thread +// to run the tokio reactor. This uses the same thread that is running the Postgres backend. +pub(crate) static PG_BACKEND_TOKIO_RUNTIME: LazyLock = LazyLock::new(|| { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)) +}); + #[pg_guard] pub extern "C" fn _PG_init() { GucRegistry::define_bool_guc( diff --git a/src/object_store.rs b/src/object_store.rs new file mode 100644 index 0000000..8d36032 --- /dev/null +++ b/src/object_store.rs @@ -0,0 +1,56 @@ +use std::sync::Arc; + +use object_store::{path::Path, ObjectStore, ObjectStoreScheme}; +use url::Url; + +use crate::{ + arrow_parquet::uri_utils::uri_as_string, + object_store::{ + aws::create_s3_object_store, azure::create_azure_object_store, + local_file::create_local_file_object_store, + }, + PG_BACKEND_TOKIO_RUNTIME, +}; + +pub(crate) mod aws; +pub(crate) mod azure; +pub(crate) mod local_file; + +pub(crate) fn create_object_store(uri: &Url, copy_from: bool) -> (Arc, Path) { + let (scheme, path) = ObjectStoreScheme::parse(uri).unwrap_or_else(|_| { + panic!( + "unrecognized uri {}. pg_parquet supports local paths, s3:// or azure:// schemes.", + uri + ) + }); + + // object_store crate can recognize a bunch of different schemes and paths, but we only support + // local, azure, and s3 schemes with a subset of all supported paths. + match scheme { + ObjectStoreScheme::AmazonS3 => { + let storage_container = Arc::new(create_s3_object_store(uri)); + + (storage_container, path) + } + ObjectStoreScheme::MicrosoftAzure => { + let storage_container = Arc::new(create_azure_object_store(uri)); + + (storage_container, path) + } + ObjectStoreScheme::Local => { + let storage_container = Arc::new(create_local_file_object_store(uri, copy_from)); + + let path = + Path::from_filesystem_path(uri_as_string(uri)).unwrap_or_else(|e| panic!("{}", e)); + + (storage_container, path) + } + _ => { + panic!( + "unsupported scheme {} in uri {}. 
pg_parquet supports local paths, s3:// or azure:// schemes.", + uri.scheme(), + uri + ); + } + } +} diff --git a/src/object_store/aws.rs b/src/object_store/aws.rs new file mode 100644 index 0000000..c738f8a --- /dev/null +++ b/src/object_store/aws.rs @@ -0,0 +1,98 @@ +use aws_config::BehaviorVersion; +use aws_sdk_sts::config::ProvideCredentials; +use object_store::aws::{AmazonS3, AmazonS3Builder}; +use url::Url; + +use super::PG_BACKEND_TOKIO_RUNTIME; + +// create_s3_object_store creates an AmazonS3 object store with the given bucket name. +// It is configured by environment variables and aws config files as fallback method. +// We need to read the config files to make the fallback method work since object_store +// does not provide a way to read them. Currently, we only support following environment +// variables and config parameters: +// - AWS_ACCESS_KEY_ID +// - AWS_SECRET_ACCESS_KEY +// - AWS_SESSION_TOKEN +// - AWS_ENDPOINT_URL +// - AWS_REGION +// - AWS_SHARED_CREDENTIALS_FILE (env var only) +// - AWS_CONFIG_FILE (env var only) +// - AWS_PROFILE (env var only) +// - AWS_ALLOW_HTTP (env var only, object_store specific) +pub(crate) fn create_s3_object_store(uri: &Url) -> AmazonS3 { + let bucket_name = parse_s3_bucket(uri).unwrap_or_else(|| { + panic!("unsupported s3 uri: {}", uri); + }); + + // we do not use builder::from_env() here because not all environment variables have + // a fallback to the config files + let mut aws_s3_builder = AmazonS3Builder::new().with_bucket_name(bucket_name); + + if let Ok(allow_http) = std::env::var("AWS_ALLOW_HTTP") { + aws_s3_builder = aws_s3_builder.with_allow_http(allow_http.parse().unwrap_or(false)); + } + + // first tries environment variables and then the config files + let sdk_config = PG_BACKEND_TOKIO_RUNTIME.block_on(async { + aws_config::defaults(BehaviorVersion::v2024_03_28()) + .load() + .await + }); + + if let Some(credential_provider) = sdk_config.credentials_provider() { + if let Ok(credentials) = PG_BACKEND_TOKIO_RUNTIME + .block_on(async { credential_provider.provide_credentials().await }) + { + // AWS_ACCESS_KEY_ID + aws_s3_builder = aws_s3_builder.with_access_key_id(credentials.access_key_id()); + + // AWS_SECRET_ACCESS_KEY + aws_s3_builder = aws_s3_builder.with_secret_access_key(credentials.secret_access_key()); + + if let Some(token) = credentials.session_token() { + // AWS_SESSION_TOKEN + aws_s3_builder = aws_s3_builder.with_token(token); + } + } + } + + // AWS_ENDPOINT_URL + if let Some(aws_endpoint_url) = sdk_config.endpoint_url() { + aws_s3_builder = aws_s3_builder.with_endpoint(aws_endpoint_url); + } + + // AWS_REGION + if let Some(aws_region) = sdk_config.region() { + aws_s3_builder = aws_s3_builder.with_region(aws_region.as_ref()); + } + + aws_s3_builder.build().unwrap_or_else(|e| panic!("{}", e)) +} + +fn parse_s3_bucket(uri: &Url) -> Option { + let host = uri.host_str()?; + + // s3(a)://{bucket}/key + if uri.scheme() == "s3" { + return Some(host.to_string()); + } + // https://s3.amazonaws.com/{bucket}/key + else if host == "s3.amazonaws.com" { + let path_segments: Vec<&str> = uri.path_segments()?.collect(); + + // Bucket name is the first part of the path + return Some( + path_segments + .first() + .expect("unexpected error during parsing s3 uri") + .to_string(), + ); + } + // https://{bucket}.s3.amazonaws.com/key + else if host.ends_with(".s3.amazonaws.com") { + let bucket_name = host.split('.').next()?; + return Some(bucket_name.to_string()); + } + + None +} diff --git a/src/object_store/azure.rs 
b/src/object_store/azure.rs new file mode 100644 index 0000000..58cf22c --- /dev/null +++ b/src/object_store/azure.rs @@ -0,0 +1,203 @@ +use home::home_dir; +use ini::Ini; +use object_store::azure::{AzureConfigKey, MicrosoftAzure, MicrosoftAzureBuilder}; +use url::Url; + +// create_azure_object_store creates a MicrosoftAzure object store with the given container name. +// It is configured by environment variables and azure config files as fallback method. +// We need to read the config files to make the fallback method work since object_store +// does not provide a way to read them. Currently, we only support following environment +// variables and config parameters: +// - AZURE_STORAGE_ACCOUNT +// - AZURE_STORAGE_KEY +// - AZURE_STORAGE_CONNECTION_STRING +// - AZURE_STORAGE_SAS_TOKEN +// - AZURE_CONFIG_FILE (env var only, object_store specific) +// - AZURE_STORAGE_ENDPOINT (env var only, object_store specific) +// - AZURE_ALLOW_HTTP (env var only, object_store specific) +pub(crate) fn create_azure_object_store(uri: &Url) -> MicrosoftAzure { + let container_name = parse_azure_blob_container(uri).unwrap_or_else(|| { + panic!("unsupported azure blob storage uri: {}", uri); + }); + + let mut azure_builder = MicrosoftAzureBuilder::new().with_container_name(container_name); + + let azure_blob_config = AzureStorageConfig::load(); + + // account name + if let Some(account_name) = azure_blob_config.account_name { + azure_builder = azure_builder.with_account(account_name); + } + + // account key + if let Some(account_key) = azure_blob_config.account_key { + azure_builder = azure_builder.with_access_key(account_key); + } + + // sas token + if let Some(sas_token) = azure_blob_config.sas_token { + azure_builder = azure_builder.with_config(AzureConfigKey::SasKey, sas_token); + } + + // allow http + azure_builder = azure_builder.with_allow_http(azure_blob_config.allow_http); + + // endpoint + if let Some(endpoint) = azure_blob_config.endpoint { + azure_builder = azure_builder.with_endpoint(endpoint); + } + + azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) +} + +fn parse_azure_blob_container(uri: &Url) -> Option { + let host = uri.host_str()?; + + // az(ure)://{container}/key + if uri.scheme() == "az" || uri.scheme() == "azure" { + return Some(host.to_string()); + } + // https://{account}.blob.core.windows.net/{container} + else if host.ends_with(".blob.core.windows.net") { + let path_segments: Vec<&str> = uri.path_segments()?.collect(); + + // Container name is the first part of the path + return Some( + path_segments + .first() + .expect("unexpected error during parsing azure blob uri") + .to_string(), + ); + } + + None +} + +// AzureStorageConfig represents the configuration for Azure Blob Storage. +// There is no proper azure sdk config crate that can read the config files. +// So, we need to read the config files manually from azure's ini config. +// See https://learn.microsoft.com/en-us/cli/azure/azure-cli-configuration?view=azure-cli-latest +struct AzureStorageConfig { + account_name: Option, + account_key: Option, + sas_token: Option, + endpoint: Option, + allow_http: bool, +} + +impl AzureStorageConfig { + // load reads the azure config from the environment variables first and config files as fallback. 
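
In short, `load` gives the connection string (whether it comes from `AZURE_STORAGE_CONNECTION_STRING` or the config file's `connection_string` key) priority over everything else; only when no connection string is found are the account, key and SAS token resolved individually, each from its environment variable first with the `[storage]` section as a fallback, while the endpoint and `AZURE_ALLOW_HTTP` are environment-only, object_store-specific settings. That precedence is why the tests below unset the connection string before exercising the account-key and SAS paths.
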
+ fn load() -> Self { + // ~/.azure/config + let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( + home_dir() + .expect("failed to get home directory") + .join(".azure") + .join("config") + .to_str() + .expect("failed to convert path to string") + .to_string(), + ); + + let azure_config_content = Ini::load_from_file(&azure_config_file_path).ok(); + + // connection string + let connection_string = match std::env::var("AZURE_STORAGE_CONNECTION_STRING") { + Ok(connection_string) => Some(connection_string), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("connection_string")) + .map(|connection_string| connection_string.to_string()), + }; + + // connection string has the highest priority + if let Some(connection_string) = connection_string { + return Self::from_connection_string(&connection_string); + } + + // account name + let account_name = match std::env::var("AZURE_STORAGE_ACCOUNT") { + Ok(account) => Some(account), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("account")) + .map(|account| account.to_string()), + }; + + // account key + let account_key = match std::env::var("AZURE_STORAGE_KEY") { + Ok(key) => Some(key), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("key")) + .map(|key| key.to_string()), + }; + + // sas token + let sas_token = match std::env::var("AZURE_STORAGE_SAS_TOKEN") { + Ok(token) => Some(token), + Err(_) => azure_config_content + .as_ref() + .and_then(|ini| ini.section(Some("storage"))) + .and_then(|section| section.get("sas_token")) + .map(|token| token.to_string()), + }; + + // endpoint, object_store specific + let endpoint = std::env::var("AZURE_STORAGE_ENDPOINT").ok(); + + // allow http, object_store specific + let allow_http = std::env::var("AZURE_ALLOW_HTTP") + .ok() + .map(|allow_http| allow_http.parse().unwrap_or(false)) + .unwrap_or(false); + + AzureStorageConfig { + account_name, + account_key, + sas_token, + endpoint, + allow_http, + } + } + + // from_connection_string parses AzureBlobConfig from the given connection string. 
+ // See https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string#create-a-connection-string-for-an-explicit-storage-endpoint + fn from_connection_string(connection_string: &str) -> Self { + let mut account_name = None; + let mut account_key = None; + let mut sas_token = None; + let mut endpoint = None; + let mut allow_http = false; + + for pair in connection_string.trim_end_matches(';').split(';') { + let (key, value) = pair + .split_once('=') + .expect("invalid azure connection string format"); + + match key { + "AccountName" => account_name = Some(value.to_string()), + "AccountKey" => account_key = Some(value.to_string()), + "SharedAccessSignature" => sas_token = Some(value.to_string()), + "BlobEndpoint" => endpoint = Some(value.to_string()), + "DefaultEndpointsProtocol" => { + allow_http = value.to_lowercase() == "http"; + } + _ => { + panic!("unsupported config key in azure connection string: {}", key); + } + } + } + + AzureStorageConfig { + account_name, + account_key, + sas_token, + endpoint, + allow_http, + } + } +} diff --git a/src/object_store/local_file.rs b/src/object_store/local_file.rs new file mode 100644 index 0000000..938dde4 --- /dev/null +++ b/src/object_store/local_file.rs @@ -0,0 +1,21 @@ +use object_store::local::LocalFileSystem; +use url::Url; + +use super::uri_as_string; + +// create_local_file_object_store creates a LocalFileSystem object store with the given path. +pub(crate) fn create_local_file_object_store(uri: &Url, copy_from: bool) -> LocalFileSystem { + let path = uri_as_string(uri); + + if !copy_from { + // create or overwrite the local file + std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(path) + .unwrap_or_else(|e| panic!("{}", e)); + } + + LocalFileSystem::new() +} diff --git a/src/pgrx_tests/object_store.rs b/src/pgrx_tests/object_store.rs index cbab4fd..08fc988 100644 --- a/src/pgrx_tests/object_store.rs +++ b/src/pgrx_tests/object_store.rs @@ -14,7 +14,6 @@ mod tests { let s3_uris = [ format!("s3://{}/pg_parquet_test.parquet", test_bucket_name), - format!("s3a://{}/pg_parquet_test.parquet", test_bucket_name), format!( "https://s3.amazonaws.com/{}/pg_parquet_test.parquet", test_bucket_name @@ -81,7 +80,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_s3_with_wrong_access_key_id() { + fn test_s3_wrong_access_key_id() { std::env::set_var("AWS_ACCESS_KEY_ID", "wrong_access_key_id"); let test_bucket_name: String = @@ -97,7 +96,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_s3_with_wrong_secret_access_key() { + fn test_s3_wrong_secret_access_key() { std::env::set_var("AWS_SECRET_ACCESS_KEY", "wrong_secret_access_key"); let test_bucket_name: String = @@ -216,7 +215,7 @@ mod tests { } #[pg_test] - fn test_s3_object_store_with_temporary_token() { + fn test_s3_temporary_token() { let tokio_rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() @@ -254,7 +253,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_s3_object_store_with_missing_temporary_token_fail() { + fn test_s3_missing_temporary_token() { let tokio_rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() @@ -303,6 +302,9 @@ mod tests { #[pg_test] fn test_azure_blob_from_env() { + // unset AZURE_STORAGE_CONNECTION_STRING to make sure the account name and key are used + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + let test_container_name: String = 
std::env::var("AZURE_TEST_CONTAINER_NAME") .expect("AZURE_TEST_CONTAINER_NAME not found"); @@ -336,6 +338,7 @@ mod tests { std::env::remove_var("AZURE_STORAGE_ACCOUNT"); let account_key = std::env::var("AZURE_STORAGE_KEY").unwrap(); std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); // create a config file let azure_config_file_content = format!( @@ -365,9 +368,66 @@ mod tests { test_table.assert_expected_and_result_rows(); } + #[pg_test] + fn test_azure_from_env_via_connection_string() { + // unset AZURE_STORAGE_ACCOUNT AND AZURE_STORAGE_KEY to make sure the connection string is used + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); + std::env::remove_var("AZURE_STORAGE_KEY"); + + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + let azure_blob_uri = format!("az://{}/pg_parquet_test.parquet", test_container_name); + + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + + #[pg_test] + fn test_azure_from_config_via_connection_string() { + let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") + .expect("AZURE_TEST_CONTAINER_NAME not found"); + + // remove these to make sure the config file is used + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); + std::env::remove_var("AZURE_STORAGE_KEY"); + let connection_string = std::env::var("AZURE_STORAGE_CONNECTION_STRING").unwrap(); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + + // create a config file + let azure_config_file_content = + format!("[storage]\nconnection_string = {}\n", connection_string); + + let azure_config_file = "/tmp/pg_parquet_azure_config"; + std::env::set_var("AZURE_CONFIG_FILE", azure_config_file); + + let mut azure_config_file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(azure_config_file) + .unwrap(); + + azure_config_file + .write_all(azure_config_file_content.as_bytes()) + .unwrap(); + + let azure_blob_uri = format!("az://{}/pg_parquet_test.parquet", test_container_name); + + let test_table = TestTable::::new("int4".into()).with_uri(azure_blob_uri); + + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } + #[pg_test] #[should_panic(expected = "Account must be specified")] - fn test_azure_with_no_storage_account() { + fn test_azure_no_storage_account() { + // unset AZURE_STORAGE_CONNECTION_STRING to make sure the account name and key are used + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + std::env::remove_var("AZURE_STORAGE_ACCOUNT"); let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") @@ -383,7 +443,10 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_azure_blob_with_wrong_storage_key() { + fn test_azure_wrong_storage_key() { + // unset AZURE_STORAGE_CONNECTION_STRING to make sure the account name and key are used + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); + let wrong_account_key = String::from("FFy8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="); std::env::set_var("AZURE_STORAGE_KEY", wrong_account_key); @@ -406,7 +469,7 @@ mod tests { #[pg_test] #[should_panic(expected = "404 Not Found")] - fn test_azure_blob_write_wrong_container() { + fn 
test_azure_write_wrong_container() { let test_account_name: String = std::env::var("AZURE_STORAGE_ACCOUNT").expect("AZURE_STORAGE_ACCOUNT not found"); @@ -423,7 +486,7 @@ mod tests { } #[pg_test] - fn test_azure_blob_read_write_sas() { + fn test_azure_read_write_sas() { let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") .expect("AZURE_TEST_CONTAINER_NAME not found"); @@ -433,8 +496,9 @@ mod tests { let read_write_sas_token = std::env::var("AZURE_TEST_READ_WRITE_SAS") .expect("AZURE_TEST_READ_WRITE_SAS not found"); - // remove account key to make sure the sas token is used + // remove account key and connection string to make sure the sas token is used std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); std::env::set_var("AZURE_STORAGE_SAS_TOKEN", read_write_sas_token); let azure_blob_uri = format!( @@ -451,7 +515,7 @@ mod tests { #[pg_test] #[should_panic(expected = "403 Forbidden")] - fn test_azure_blob_read_only_sas() { + fn test_azure_read_only_sas() { let test_container_name: String = std::env::var("AZURE_TEST_CONTAINER_NAME") .expect("AZURE_TEST_CONTAINER_NAME not found"); @@ -461,8 +525,9 @@ mod tests { let read_only_sas_token: String = std::env::var("AZURE_TEST_READ_ONLY_SAS").expect("AZURE_TEST_READ_ONLY_SAS not found"); - // remove account key to make sure the sas token is used + // remove account key and connection string to make sure the sas token is used std::env::remove_var("AZURE_STORAGE_KEY"); + std::env::remove_var("AZURE_STORAGE_CONNECTION_STRING"); std::env::set_var("AZURE_STORAGE_SAS_TOKEN", read_only_sas_token); let azure_blob_uri = format!( @@ -479,7 +544,7 @@ mod tests { #[pg_test] #[should_panic(expected = "unsupported azure blob storage uri")] - fn test_azure_blob_unsupported_uri() { + fn test_azure_unsupported_uri() { let fabric_azure_blob_uri = "https://ACCOUNT.dfs.fabric.microsoft.com".into(); let test_table = TestTable::::new("int4".into()).with_uri(fabric_azure_blob_uri); From d5f5ef2397db24df0742e255ee41ff6614a75b6a Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Fri, 10 Jan 2025 14:08:30 +0300 Subject: [PATCH 13/15] - support azure bearer token via client secret - get rid of aws-sts dep in test as aws_config already supports it --- Cargo.lock | 350 ++++++++++++++++++++++++++++++--- Cargo.toml | 4 +- README.md | 22 ++- src/object_store/aws.rs | 108 +++++++--- src/object_store/azure.rs | 119 ++++++----- src/pgrx_tests/object_store.rs | 150 +++++++------- 6 files changed, 574 insertions(+), 179 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5f3927f..bfe857f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "RustyXML" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" + [[package]] name = "addr2line" version = "0.24.2" @@ -25,7 +31,7 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", "zerocopy", @@ -264,6 +270,28 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener 2.5.3", + "futures-core", +] + +[[package]] +name = "async-lock" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18" +dependencies = [ + "event-listener 5.4.0", + "event-listener-strategy", + "pin-project-lite", +] + [[package]] name = "async-trait" version = "0.1.83" @@ -323,7 +351,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "time", "tokio", @@ -358,7 +386,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http-body 0.4.6", "once_cell", @@ -484,7 +512,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand", + "fastrand 2.3.0", "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", @@ -526,7 +554,6 @@ dependencies = [ "base64-simd", "bytes", "bytes-utils", - "futures-core", "http 0.2.12", "http 1.2.0", "http-body 0.4.6", @@ -539,8 +566,6 @@ dependencies = [ "ryu", "serde", "time", - "tokio", - "tokio-util", ] [[package]] @@ -566,6 +591,52 @@ dependencies = [ "tracing", ] +[[package]] +name = "azure_core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b552ad43a45a746461ec3d3a51dfb6466b4759209414b439c165eb6a6b7729e" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "dyn-clone", + "futures", + "getrandom 0.2.15", + "http-types", + "once_cell", + "paste", + "pin-project", + "quick-xml 0.31.0", + "rand 0.8.5", + "rustc_version 0.4.1", + "serde", + "serde_json", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_storage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f838159f4d29cb400a14d9d757578ba495ae64feb07a7516bf9e4415127126" +dependencies = [ + "RustyXML", + "async-lock", + "async-trait", + "azure_core", + "bytes", + "serde", + "serde_derive", + "time", + "tracing", + "url", + "uuid", +] + [[package]] name = "backtrace" version = "0.3.74" @@ -581,6 +652,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -876,6 +953,15 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "const-random" version = "0.1.18" @@ -891,7 +977,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "tiny-keccak", ] @@ -997,6 +1083,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" dependencies = [ "powerfmt", + "serde", ] [[package]] @@ -1030,6 +1117,12 @@ dependencies = [ "const-random", ] +[[package]] +name = "dyn-clone" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" + [[package]] name = "either" version = "1.13.0" @@ -1072,6 +1165,33 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "event-listener" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3492acde4c3fc54c845eaab3eed8bd00c7a7d881f78bfc801e43a93dec1331ae" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c3e4e0dd3673c1139bf041f3008816d9cf2946bbfac2945c09e523b8d7b05b2" +dependencies = [ + "event-listener 5.4.0", + "pin-project-lite", +] + [[package]] name = "eyre" version = "0.6.12" @@ -1088,6 +1208,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -1189,6 +1318,21 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + [[package]] name = "futures-macro" version = "0.3.31" @@ -1240,6 +1384,17 @@ dependencies = [ "version_check", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -1249,7 +1404,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -1443,6 +1598,26 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel", + "base64 0.13.1", + "futures-lite", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.9.5" @@ -1736,6 +1911,21 @@ dependencies = [ "hashbrown 0.15.2", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -1972,7 +2162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -2101,8 +2291,8 @@ dependencies = [ "md-5", "parking_lot", "percent-encoding", - "quick-xml", - "rand", + "quick-xml 0.36.2", + "rand 0.8.5", "reqwest", "ring", "serde", @@ -2161,6 +2351,12 @@ dependencies = [ "supports-color 3.0.2", ] +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.3" @@ -2272,7 +2468,7 @@ dependencies = [ "arrow-schema", "aws-config", "aws-credential-types", - "aws-sdk-sts", + "azure_storage", "futures", "home", "object_store", @@ -2404,7 +2600,7 @@ dependencies = [ "pgrx-pg-config", "postgres", "proptest", - "rand", + "rand 0.8.5", "regex", "serde", "serde_json", @@ -2430,6 +2626,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "pin-project-lite" version = "0.2.15" @@ -2475,7 +2691,7 @@ dependencies = [ "hmac", "md-5", "memchr", - "rand", + "rand 0.8.5", "sha2", "stringprep", ] @@ -2526,8 +2742,8 @@ dependencies = [ "bitflags 2.6.0", "lazy_static", "num-traits", - "rand", - "rand_chacha", + "rand 0.8.5", + "rand_chacha 0.3.1", "rand_xorshift", "regex-syntax", "rusty-fork", @@ -2541,6 +2757,16 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quick-xml" version = "0.36.2" @@ -2576,8 +2802,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", - "getrandom", - "rand", + "getrandom 0.2.15", + "rand 0.8.5", "ring", "rustc-hash 2.1.0", "rustls 0.23.20", @@ -2618,6 +2844,19 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -2625,8 +2864,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", ] [[package]] @@ -2636,7 +2885,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", ] [[package]] @@ -2645,7 +2903,16 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", ] [[package]] @@ -2654,7 +2921,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" dependencies = [ - "rand_core", + "rand_core 0.6.4", ] [[package]] @@ -2774,7 +3041,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "spin", "untrusted", @@ -3108,6 +3375,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror 1.0.69", +] + [[package]] name = "serde_spanned" version = "0.6.8" @@ -3323,7 +3601,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", - "fastrand", + "fastrand 2.3.0", "once_cell", "rustix", "windows-sys 0.59.0", @@ -3387,6 +3665,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" dependencies = [ "deranged", + 
"itoa", + "js-sys", "num-conv", "powerfmt", "serde", @@ -3490,7 +3770,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "rand", + "rand 0.8.5", "socket2", "tokio", "tokio-util", @@ -3701,6 +3981,7 @@ dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -3727,7 +4008,8 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ - "getrandom", + "getrandom 0.2.15", + "serde", ] [[package]] @@ -3751,6 +4033,12 @@ dependencies = [ "libc", ] +[[package]] +name = "waker-fn" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "walkdir" version = "2.5.0" @@ -3770,6 +4058,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/Cargo.toml b/Cargo.toml index 892ec8d..c545672 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,9 +23,9 @@ pg_test = [] arrow = {version = "53", default-features = false} arrow-cast = {version = "53", default-features = false} arrow-schema = {version = "53", default-features = false} -aws-config = { version = "1", default-features = false, features = ["rustls"]} +aws-config = { version = "1", default-features = false, features = ["rustls","rt-tokio"] } aws-credential-types = {version = "1", default-features = false} -aws-sdk-sts = "1" +azure_storage = {version = "0.21", default-features = false} futures = "0.3" home = "0.5" object_store = {version = "0.11", default-features = false, features = ["aws", "azure"]} diff --git a/README.md b/README.md index 8b38bfc..1190d42 100644 --- a/README.md +++ b/README.md @@ -188,12 +188,19 @@ Alternatively, you can use the following environment variables when starting pos - `AWS_PROFILE`: the name of the profile from the credentials and config file (default profile name is `default`) **(only via environment variables)** - `AWS_ALLOW_HTTP`: allows http endpoints **(only via environment variables)** +Config source priority order is shown below: +1. Environment variables, +2. Config file. Supported S3 uri formats are shown below: - s3:// \ / \ - https:// \.s3.amazonaws.com / \ - https:// s3.amazonaws.com / \ / \ +Supported authorization methods' priority order is shown below: +1. Temporary session tokens by assuming roles, +2. Long term credentials. 
+ #### Azure Blob Storage The simplest way to configure object storage is by creating the standard [`~/.azure/config`](https://learn.microsoft.com/en-us/cli/azure/azure-cli-configuration?view=azure-cli-latest) file: @@ -208,17 +215,30 @@ key = Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/ Alternatively, you can use the following environment variables when starting postgres to configure the Azure Blob Storage client: - `AZURE_STORAGE_ACCOUNT`: the storage account name of the Azure Blob - `AZURE_STORAGE_KEY`: the storage key of the Azure Blob -- `AZURE_STORAGE_CONNECTION_STRING`: the connection string for the Azure Blob (this can be set instead of specifying account name and key) +- `AZURE_STORAGE_CONNECTION_STRING`: the connection string for the Azure Blob (overrides any other config) - `AZURE_STORAGE_SAS_TOKEN`: the storage SAS token for the Azure Blob +- `AZURE_TENANT_ID`: the tenant id for client secret auth **(only via environment variables)** +- `AZURE_CLIENT_ID`: the client id for client secret auth **(only via environment variables)** +- `AZURE_CLIENT_SECRET`: the client secret for client secret auth **(only via environment variables)** - `AZURE_STORAGE_ENDPOINT`: the endpoint **(only via environment variables)** - `AZURE_CONFIG_FILE`: an alternative location for the config file **(only via environment variables)** - `AZURE_ALLOW_HTTP`: allows http endpoints **(only via environment variables)** +Config source priority order is shown below: +1. Connection string (read from environment variable or config file), +2. Environment variables, +3. Config file. + Supported Azure Blob Storage uri formats are shown below: - az:// \ / \ - azure:// \ / \ - https:// \.blob.core.windows.net / \ +Supported authorization methods' priority order is shown below: +1. Bearer token via client secret, +2. Sas token, +3. Storage key. 
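As a minimal illustrative sketch (not part of this patch), the connection-string priority described above could be exercised roughly as follows; the container name, table name, config file path, and connection string value are placeholders:

```bash
# hypothetical sketch: a connection string in the config file overrides any
# account/key settings, whether they come from the environment or the file
cat > /tmp/pg_parquet_azure_config <<'EOF'
[storage]
connection_string = DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;BlobEndpoint=https://<account>.blob.core.windows.net;
EOF

export AZURE_CONFIG_FILE=/tmp/pg_parquet_azure_config

# after restarting postgres with the variable above
psql -c "COPY my_table FROM 'az://testcontainer/my_table.parquet';"
```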
+ ## Copy Options `pg_parquet` supports the following options in the `COPY TO` command: - `format parquet`: you need to specify this option to read or write Parquet files which does not end with `.parquet[.]` extension, diff --git a/src/object_store/aws.rs b/src/object_store/aws.rs index c738f8a..ed6a85e 100644 --- a/src/object_store/aws.rs +++ b/src/object_store/aws.rs @@ -1,5 +1,5 @@ use aws_config::BehaviorVersion; -use aws_sdk_sts::config::ProvideCredentials; +use aws_credential_types::provider::ProvideCredentials; use object_store::aws::{AmazonS3, AmazonS3Builder}; use url::Url; @@ -28,42 +28,34 @@ pub(crate) fn create_s3_object_store(uri: &Url) -> AmazonS3 { // a fallback to the config files let mut aws_s3_builder = AmazonS3Builder::new().with_bucket_name(bucket_name); - if let Ok(allow_http) = std::env::var("AWS_ALLOW_HTTP") { - aws_s3_builder = aws_s3_builder.with_allow_http(allow_http.parse().unwrap_or(false)); - } + let aws_s3_config = AwsS3Config::load(); - // first tries environment variables and then the config files - let sdk_config = PG_BACKEND_TOKIO_RUNTIME.block_on(async { - aws_config::defaults(BehaviorVersion::v2024_03_28()) - .load() - .await - }); + // allow http + aws_s3_builder = aws_s3_builder.with_allow_http(aws_s3_config.allow_http); - if let Some(credential_provider) = sdk_config.credentials_provider() { - if let Ok(credentials) = PG_BACKEND_TOKIO_RUNTIME - .block_on(async { credential_provider.provide_credentials().await }) - { - // AWS_ACCESS_KEY_ID - aws_s3_builder = aws_s3_builder.with_access_key_id(credentials.access_key_id()); + // access key id + if let Some(access_key_id) = aws_s3_config.access_key_id { + aws_s3_builder = aws_s3_builder.with_access_key_id(access_key_id); + } - // AWS_SECRET_ACCESS_KEY - aws_s3_builder = aws_s3_builder.with_secret_access_key(credentials.secret_access_key()); + // secret access key + if let Some(secret_access_key) = aws_s3_config.secret_access_key { + aws_s3_builder = aws_s3_builder.with_secret_access_key(secret_access_key); + } - if let Some(token) = credentials.session_token() { - // AWS_SESSION_TOKEN - aws_s3_builder = aws_s3_builder.with_token(token); - } - } + // session token + if let Some(session_token) = aws_s3_config.session_token { + aws_s3_builder = aws_s3_builder.with_token(session_token); } - // AWS_ENDPOINT_URL - if let Some(aws_endpoint_url) = sdk_config.endpoint_url() { - aws_s3_builder = aws_s3_builder.with_endpoint(aws_endpoint_url); + // endpoint url + if let Some(endpoint_url) = aws_s3_config.endpoint_url { + aws_s3_builder = aws_s3_builder.with_endpoint(endpoint_url); } - // AWS_REGION - if let Some(aws_region) = sdk_config.region() { - aws_s3_builder = aws_s3_builder.with_region(aws_region.as_ref()); + // region + if let Some(region) = aws_s3_config.region { + aws_s3_builder = aws_s3_builder.with_region(region); } aws_s3_builder.build().unwrap_or_else(|e| panic!("{}", e)) @@ -96,3 +88,61 @@ fn parse_s3_bucket(uri: &Url) -> Option { None } + +// AwsS3Config is a struct that holds the configuration that is +// used to configure the AmazonS3 object store. object_store does +// not provide a way to read the config files, so we need to read +// them ourselves via aws sdk. +struct AwsS3Config { + region: Option, + access_key_id: Option, + secret_access_key: Option, + session_token: Option, + endpoint_url: Option, + allow_http: bool, +} + +impl AwsS3Config { + // load reads the s3 config from the environment variables first and config files as fallback. 
+ fn load() -> Self { + let allow_http = if let Ok(allow_http) = std::env::var("AWS_ALLOW_HTTP") { + allow_http.parse().unwrap_or(false) + } else { + false + }; + + // first tries environment variables and then the config files + let sdk_config = PG_BACKEND_TOKIO_RUNTIME.block_on(async { + aws_config::defaults(BehaviorVersion::v2024_03_28()) + .load() + .await + }); + + let mut access_key_id = None; + let mut secret_access_key = None; + let mut session_token = None; + + if let Some(credential_provider) = sdk_config.credentials_provider() { + if let Ok(credentials) = PG_BACKEND_TOKIO_RUNTIME + .block_on(async { credential_provider.provide_credentials().await }) + { + access_key_id = Some(credentials.access_key_id().to_string()); + secret_access_key = Some(credentials.secret_access_key().to_string()); + session_token = credentials.session_token().map(|t| t.to_string()); + } + } + + let endpoint_url = sdk_config.endpoint_url().map(|u| u.to_string()); + + let region = sdk_config.region().map(|r| r.as_ref().to_string()); + + Self { + region, + access_key_id, + secret_access_key, + session_token, + endpoint_url, + allow_http, + } + } +} diff --git a/src/object_store/azure.rs b/src/object_store/azure.rs index 58cf22c..c41c854 100644 --- a/src/object_store/azure.rs +++ b/src/object_store/azure.rs @@ -1,3 +1,4 @@ +use azure_storage::{ConnectionString, EndpointProtocol}; use home::home_dir; use ini::Ini; use object_store::azure::{AzureConfigKey, MicrosoftAzure, MicrosoftAzureBuilder}; @@ -24,6 +25,19 @@ pub(crate) fn create_azure_object_store(uri: &Url) -> MicrosoftAzure { let azure_blob_config = AzureStorageConfig::load(); + // allow http + azure_builder = azure_builder.with_allow_http(azure_blob_config.allow_http); + + // endpoint + if let Some(endpoint) = azure_blob_config.endpoint { + azure_builder = azure_builder.with_endpoint(endpoint); + } + + // sas token + if let Some(sas_token) = azure_blob_config.sas_token { + azure_builder = azure_builder.with_config(AzureConfigKey::SasKey, sas_token); + } + // account name if let Some(account_name) = azure_blob_config.account_name { azure_builder = azure_builder.with_account(account_name); @@ -34,17 +48,19 @@ pub(crate) fn create_azure_object_store(uri: &Url) -> MicrosoftAzure { azure_builder = azure_builder.with_access_key(account_key); } - // sas token - if let Some(sas_token) = azure_blob_config.sas_token { - azure_builder = azure_builder.with_config(AzureConfigKey::SasKey, sas_token); + // tenant id + if let Some(tenant_id) = azure_blob_config.tenant_id { + azure_builder = azure_builder.with_tenant_id(tenant_id); } - // allow http - azure_builder = azure_builder.with_allow_http(azure_blob_config.allow_http); + // client id + if let Some(client_id) = azure_blob_config.client_id { + azure_builder = azure_builder.with_client_id(client_id); + } - // endpoint - if let Some(endpoint) = azure_blob_config.endpoint { - azure_builder = azure_builder.with_endpoint(endpoint); + // client secret + if let Some(client_secret) = azure_blob_config.client_secret { + azure_builder = azure_builder.with_client_secret(client_secret); } azure_builder.build().unwrap_or_else(|e| panic!("{}", e)) @@ -73,20 +89,26 @@ fn parse_azure_blob_container(uri: &Url) -> Option { None } -// AzureStorageConfig represents the configuration for Azure Blob Storage. -// There is no proper azure sdk config crate that can read the config files. -// So, we need to read the config files manually from azure's ini config. 
-// See https://learn.microsoft.com/en-us/cli/azure/azure-cli-configuration?view=azure-cli-latest +// AzureStorageConfig is a struct that holds the configuration that is +// used to configure the Azure Blob Storage object store. object_store does +// not provide a way to read the config files, so we need to read +// them ourselves via rust-ini and azure sdk. struct AzureStorageConfig { account_name: Option, account_key: Option, sas_token: Option, + tenant_id: Option, + client_id: Option, + client_secret: Option, endpoint: Option, allow_http: bool, } impl AzureStorageConfig { // load reads the azure config from the environment variables first and config files as fallback. + // There is no proper azure sdk config crate that can read the config files. + // So, we need to read the config files manually from azure's ini config. + // See https://learn.microsoft.com/en-us/cli/azure/azure-cli-configuration?view=azure-cli-latest fn load() -> Self { // ~/.azure/config let azure_config_file_path = std::env::var("AZURE_CONFIG_FILE").unwrap_or( @@ -101,7 +123,6 @@ impl AzureStorageConfig { let azure_config_content = Ini::load_from_file(&azure_config_file_path).ok(); - // connection string let connection_string = match std::env::var("AZURE_STORAGE_CONNECTION_STRING") { Ok(connection_string) => Some(connection_string), Err(_) => azure_config_content @@ -111,12 +132,13 @@ impl AzureStorageConfig { .map(|connection_string| connection_string.to_string()), }; - // connection string has the highest priority + // connection string overrides everything if let Some(connection_string) = connection_string { - return Self::from_connection_string(&connection_string); + if let Ok(connection_string) = ConnectionString::new(&connection_string) { + return connection_string.into(); + } } - // account name let account_name = match std::env::var("AZURE_STORAGE_ACCOUNT") { Ok(account) => Some(account), Err(_) => azure_config_content @@ -126,7 +148,6 @@ impl AzureStorageConfig { .map(|account| account.to_string()), }; - // account key let account_key = match std::env::var("AZURE_STORAGE_KEY") { Ok(key) => Some(key), Err(_) => azure_config_content @@ -136,7 +157,6 @@ impl AzureStorageConfig { .map(|key| key.to_string()), }; - // sas token let sas_token = match std::env::var("AZURE_STORAGE_SAS_TOKEN") { Ok(token) => Some(token), Err(_) => azure_config_content @@ -155,47 +175,56 @@ impl AzureStorageConfig { .map(|allow_http| allow_http.parse().unwrap_or(false)) .unwrap_or(false); + // tenant id, object_store specific + let tenant_id = std::env::var("AZURE_TENANT_ID").ok(); + + // client id, object_store specific + let client_id = std::env::var("AZURE_CLIENT_ID").ok(); + + // client secret, object_store specific + let client_secret = std::env::var("AZURE_CLIENT_SECRET").ok(); + AzureStorageConfig { account_name, account_key, sas_token, + tenant_id, + client_id, + client_secret, endpoint, allow_http, } } +} - // from_connection_string parses AzureBlobConfig from the given connection string. 
- // See https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string#create-a-connection-string-for-an-explicit-storage-endpoint - fn from_connection_string(connection_string: &str) -> Self { - let mut account_name = None; - let mut account_key = None; - let mut sas_token = None; - let mut endpoint = None; - let mut allow_http = false; - - for pair in connection_string.trim_end_matches(';').split(';') { - let (key, value) = pair - .split_once('=') - .expect("invalid azure connection string format"); - - match key { - "AccountName" => account_name = Some(value.to_string()), - "AccountKey" => account_key = Some(value.to_string()), - "SharedAccessSignature" => sas_token = Some(value.to_string()), - "BlobEndpoint" => endpoint = Some(value.to_string()), - "DefaultEndpointsProtocol" => { - allow_http = value.to_lowercase() == "http"; - } - _ => { - panic!("unsupported config key in azure connection string: {}", key); - } - } - } +impl From> for AzureStorageConfig { + fn from(connection_string: ConnectionString) -> Self { + let account_name = connection_string + .account_name + .map(|account_name| account_name.to_string()); + + let account_key = connection_string + .account_key + .map(|account_key| account_key.to_string()); + + let sas_token = connection_string.sas.map(|sas| sas.to_string()); + + let endpoint = connection_string + .blob_endpoint + .map(|blob_endpoint| blob_endpoint.to_string()); + + let allow_http = matches!( + connection_string.default_endpoints_protocol, + Some(EndpointProtocol::Http) + ); AzureStorageConfig { account_name, account_key, sas_token, + tenant_id: None, + client_id: None, + client_secret: None, endpoint, allow_http, } diff --git a/src/pgrx_tests/object_store.rs b/src/pgrx_tests/object_store.rs index 08fc988..c5c9e83 100644 --- a/src/pgrx_tests/object_store.rs +++ b/src/pgrx_tests/object_store.rs @@ -2,7 +2,6 @@ mod tests { use std::io::Write; - use aws_config::BehaviorVersion; use pgrx::{pg_test, Spi}; use crate::pgrx_tests::common::TestTable; @@ -51,19 +50,22 @@ mod tests { // create a config file let aws_config_file_content = format!( - "[profile {profile}]\nregion = {}\naws_access_key_id = {}\naws_secret_access_key = {}\nendpoint_url = {}\n", - region, access_key_id, secret_access_key, endpoint + "[profile {profile}]\n\ + region={region}\n\ + aws_access_key_id={access_key_id}\n\ + aws_secret_access_key={secret_access_key}\n\ + endpoint_url={endpoint}\n" ); std::env::set_var("AWS_PROFILE", profile); - let aws_config_file = "/tmp/pg_parquet_aws_config"; - std::env::set_var("AWS_CONFIG_FILE", aws_config_file); + let aws_config_file_path = "/tmp/pg_parquet_aws_config"; + std::env::set_var("AWS_CONFIG_FILE", aws_config_file_path); let mut aws_config_file = std::fs::OpenOptions::new() .write(true) .truncate(true) .create(true) - .open(aws_config_file) + .open(aws_config_file_path) .unwrap(); aws_config_file @@ -76,6 +78,9 @@ mod tests { test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); test_table.assert_expected_and_result_rows(); + + // remove the config file + std::fs::remove_file(aws_config_file_path).unwrap(); } #[pg_test] @@ -216,77 +221,58 @@ mod tests { #[pg_test] fn test_s3_temporary_token() { - let tokio_rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)); - - let s3_uri = tokio_rt.block_on(async { - let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let client = 
aws_sdk_sts::Client::new(&config); - - let assume_role_result = client - .assume_role() - .role_session_name("testsession") - .role_arn("arn:xxx:xxx:xxx:xxxx") - .send() - .await - .unwrap(); - - let assumed_creds = assume_role_result.credentials().unwrap(); - - std::env::set_var("AWS_ACCESS_KEY_ID", assumed_creds.access_key_id()); - std::env::set_var("AWS_SECRET_ACCESS_KEY", assumed_creds.secret_access_key()); - std::env::set_var("AWS_SESSION_TOKEN", assumed_creds.session_token()); - - let test_bucket_name: String = - std::env::var("AWS_S3_TEST_BUCKET").expect("AWS_S3_TEST_BUCKET not found"); - - format!("s3://{}/pg_parquet_test.parquet", test_bucket_name) - }); - - let test_table = TestTable::::new("int4".into()).with_uri(s3_uri); - - test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); - test_table.assert_expected_and_result_rows(); - } + let test_bucket_name: String = + std::env::var("AWS_S3_TEST_BUCKET").expect("AWS_S3_TEST_BUCKET not found"); - #[pg_test] - #[should_panic(expected = "403 Forbidden")] - fn test_s3_missing_temporary_token() { - let tokio_rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap_or_else(|e| panic!("failed to create tokio runtime: {}", e)); + // remove these to make sure the config file is used + let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").unwrap(); + std::env::remove_var("AWS_ACCESS_KEY_ID"); + let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").unwrap(); + std::env::remove_var("AWS_SECRET_ACCESS_KEY"); + let region = std::env::var("AWS_REGION").unwrap(); + std::env::remove_var("AWS_REGION"); + let endpoint = std::env::var("AWS_ENDPOINT_URL").unwrap(); + std::env::remove_var("AWS_ENDPOINT_URL"); - let s3_uri = tokio_rt.block_on(async { - let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let client = aws_sdk_sts::Client::new(&config); + let profile = "pg_parquet_test"; - let assume_role_result = client - .assume_role() - .role_session_name("testsession") - .role_arn("arn:xxx:xxx:xxx:xxxx") - .send() - .await - .unwrap(); + // create a config file + let aws_config_file_content = format!( + "[profile {profile}-source]\n\ + aws_access_key_id={access_key_id}\n\ + aws_secret_access_key={secret_access_key}\n\ + \n\ + [profile {profile}]\n\ + region={region}\n\ + source_profile={profile}-source\n\ + role_arn=arn:aws:iam::123456789012:dummy\n\ + endpoint_url={endpoint}\n" + ); + std::env::set_var("AWS_PROFILE", profile); - let assumed_creds = assume_role_result.credentials().unwrap(); + let aws_config_file_path = "/tmp/pg_parquet_aws_config"; + std::env::set_var("AWS_CONFIG_FILE", aws_config_file_path); - // we do not set the session token on purpose - std::env::set_var("AWS_ACCESS_KEY_ID", assumed_creds.access_key_id()); - std::env::set_var("AWS_SECRET_ACCESS_KEY", assumed_creds.secret_access_key()); + let mut aws_config_file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(aws_config_file_path) + .unwrap(); - let test_bucket_name: String = - std::env::var("AWS_S3_TEST_BUCKET").expect("AWS_S3_TEST_BUCKET not found"); + aws_config_file + .write_all(aws_config_file_content.as_bytes()) + .unwrap(); - format!("s3://{}/pg_parquet_test.parquet", test_bucket_name) - }); + let s3_uri = format!("s3://{}/pg_parquet_test.parquet", test_bucket_name); let test_table = TestTable::::new("int4".into()).with_uri(s3_uri); test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); 
test_table.assert_expected_and_result_rows(); + + // remove the config file + std::fs::remove_file(aws_config_file_path).unwrap(); } #[pg_test] @@ -342,18 +328,19 @@ mod tests { // create a config file let azure_config_file_content = format!( - "[storage]\naccount = {}\nkey = {}\n", - account_name, account_key + "[storage]\n\ + account={account_name}\n\ + key={account_key}" ); - let azure_config_file = "/tmp/pg_parquet_azure_config"; - std::env::set_var("AZURE_CONFIG_FILE", azure_config_file); + let azure_config_file_path = "/tmp/pg_parquet_azure_config"; + std::env::set_var("AZURE_CONFIG_FILE", azure_config_file_path); let mut azure_config_file = std::fs::OpenOptions::new() .write(true) .truncate(true) .create(true) - .open(azure_config_file) + .open(azure_config_file_path) .unwrap(); azure_config_file @@ -366,6 +353,9 @@ mod tests { test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); test_table.assert_expected_and_result_rows(); + + // remove the config file + std::fs::remove_file(azure_config_file_path).unwrap(); } #[pg_test] @@ -398,16 +388,16 @@ mod tests { // create a config file let azure_config_file_content = - format!("[storage]\nconnection_string = {}\n", connection_string); + format!("[storage]\nconnection_string = {connection_string}\n"); - let azure_config_file = "/tmp/pg_parquet_azure_config"; - std::env::set_var("AZURE_CONFIG_FILE", azure_config_file); + let azure_config_file_path = "/tmp/pg_parquet_azure_config"; + std::env::set_var("AZURE_CONFIG_FILE", azure_config_file_path); let mut azure_config_file = std::fs::OpenOptions::new() .write(true) .truncate(true) .create(true) - .open(azure_config_file) + .open(azure_config_file_path) .unwrap(); azure_config_file @@ -420,6 +410,9 @@ mod tests { test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); test_table.assert_expected_and_result_rows(); + + // remove the config file + std::fs::remove_file(azure_config_file_path).unwrap(); } #[pg_test] @@ -561,4 +554,13 @@ mod tests { test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); test_table.assert_expected_and_result_rows(); } + + #[pg_test] + #[should_panic(expected = "unrecognized uri dummy://testbucket")] + fn test_unrecognized_uri() { + let test_table = + TestTable::::new("int4".into()).with_uri("dummy://testbucket".to_string()); + test_table.insert("INSERT INTO test_expected (a) VALUES (1), (2), (null);"); + test_table.assert_expected_and_result_rows(); + } } From 5955d57ceb146df7f0d16f863579e8d4c5fa340b Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Fri, 10 Jan 2025 15:53:41 +0300 Subject: [PATCH 14/15] google config to pass to object store --- src/object_store/gcs.rs | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/object_store/gcs.rs b/src/object_store/gcs.rs index 14833a7..00e13b5 100644 --- a/src/object_store/gcs.rs +++ b/src/object_store/gcs.rs @@ -3,7 +3,7 @@ use url::Url; // create_gcs_object_store a GoogleCloudStorage object store from given uri. // It is configured by environment variables. 
Currently, we only support -// following environment variables and config parameters: +// following environment variables: // - GOOGLE_SERVICE_ACCOUNT_KEY // - GOOGLE_SERVICE_ACCOUNT_PATH pub(crate) fn create_gcs_object_store(uri: &Url) -> GoogleCloudStorage { @@ -13,13 +13,15 @@ pub(crate) fn create_gcs_object_store(uri: &Url) -> GoogleCloudStorage { let mut gcs_builder = GoogleCloudStorageBuilder::new().with_bucket_name(bucket_name); - // GOOGLE_SERVICE_ACCOUNT_KEY - if let Ok(service_account_key) = std::env::var("GOOGLE_SERVICE_ACCOUNT_KEY") { + let gcs_config = GoogleStorageConfig::load(); + + // service account key + if let Some(service_account_key) = gcs_config.service_account_key { gcs_builder = gcs_builder.with_service_account_key(&service_account_key); } - // GOOGLE_SERVICE_ACCOUNT_PATH - if let Ok(service_account_path) = std::env::var("GOOGLE_SERVICE_ACCOUNT_PATH") { + // service account path + if let Some(service_account_path) = gcs_config.service_account_path { gcs_builder = gcs_builder.with_service_account_path(&service_account_path); } @@ -36,3 +38,20 @@ fn parse_gcs_bucket(uri: &Url) -> Option { None } + +// GoogleStorageConfig is a struct that holds the configuration that is +// used to configure the Google Storage object store. +struct GoogleStorageConfig { + service_account_key: Option, + service_account_path: Option, +} + +impl GoogleStorageConfig { + // load loads the Google Storage configuration from the environment. + fn load() -> Self { + Self { + service_account_key: std::env::var("GOOGLE_SERVICE_ACCOUNT_KEY").ok(), + service_account_path: std::env::var("GOOGLE_SERVICE_ACCOUNT_PATH").ok(), + } + } +} From dd3d1577248ccc3ff3dc92fe2b1a8b7e768791cd Mon Sep 17 00:00:00 2001 From: Aykut Bozkurt Date: Fri, 10 Jan 2025 16:18:50 +0300 Subject: [PATCH 15/15] fix ci for fake google storage server --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db7f7fb..7445eab 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -144,7 +144,7 @@ jobs: docker run -d \ --env-file .devcontainer/.env \ -p 4443:4443 \ - tustvold/fake-gcs-server + tustvold/fake-gcs-server -scheme http -public-host localhost:4443 while ! curl $GOOGLE_SERVICE_ENDPOINT; do echo "Waiting for $GOOGLE_SERVICE_ENDPOINT..."